forked from Elara6331/itd
		
	
		
			
				
	
	
		
			473 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			473 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package translit
 | 
						|
 | 
						|
import (
 | 
						|
	"strings"
 | 
						|
	"unicode"
 | 
						|
 | 
						|
	"golang.org/x/text/unicode/norm"
 | 
						|
)
 | 
						|
 | 
						|
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
 | 
						|
var jamoBlock = &unicode.RangeTable{
 | 
						|
	R16: []unicode.Range16{{
 | 
						|
		Lo:     0x1100,
 | 
						|
		Hi:     0x11FF,
 | 
						|
		Stride: 1,
 | 
						|
	}},
 | 
						|
}
 | 
						|
 | 
						|
// https://en.wikipedia.org/wiki/Hangul_Syllables
 | 
						|
var syllablesBlock = &unicode.RangeTable{
 | 
						|
	R16: []unicode.Range16{{
 | 
						|
		Lo:     0xAC00,
 | 
						|
		Hi:     0xD7A3,
 | 
						|
		Stride: 1,
 | 
						|
	}},
 | 
						|
}
 | 
						|
 | 
						|
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
 | 
						|
var compatJamoBlock = &unicode.RangeTable{
 | 
						|
	R16: []unicode.Range16{{
 | 
						|
		Lo:     0x3131,
 | 
						|
		Hi:     0x318E,
 | 
						|
		Stride: 1,
 | 
						|
	}},
 | 
						|
}
 | 
						|
 | 
						|
// KoreanTranslit implements transliteration for Korean.
 | 
						|
//
 | 
						|
// This was translated to Go from the code in https://codeberg.org/Freeyourgadget/Gadgetbridge
 | 
						|
type KoreanTranslit struct{}
 | 
						|
 | 
						|
func (KoreanTranslit) Init() {}
 | 
						|
 | 
						|
// User input consisting of isolated jamo is usually mapped to the KS X 1001 compatibility
 | 
						|
// block, but jamo resulting from decomposed syllables are mapped to the modern one. This
 | 
						|
// function maps compat jamo to modern ones where possible and returns all other characters
 | 
						|
// unmodified.
 | 
						|
//
 | 
						|
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
 | 
						|
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
 | 
						|
func decompatJamo(jamo rune) rune {
 | 
						|
	// KS X 1001 Hangul filler, not used in modern Unicode. A useful landmark in the
 | 
						|
	// compatibility jamo block.
 | 
						|
	// https://en.wikipedia.org/wiki/KS_X_1001#Hangul_Filler
 | 
						|
	var hangulFiller rune = 0x3164
 | 
						|
 | 
						|
	// Ignore characters outside compatibility jamo block
 | 
						|
	if !unicode.In(jamo, compatJamoBlock) {
 | 
						|
		return jamo
 | 
						|
	}
 | 
						|
 | 
						|
	// Vowels are contiguous, in the same order, and unambiguous so it's a simple offset.
 | 
						|
	if jamo >= 0x314F && jamo < hangulFiller {
 | 
						|
		return jamo - 0x1FEE
 | 
						|
	}
 | 
						|
 | 
						|
	// Consonants are organized differently. No clean way to do this.
 | 
						|
	// The compatibility jamo block doesn't distinguish between Choseong (leading) and Jongseong
 | 
						|
	// (final) positions, but the modern block does. We map to Choseong here.
 | 
						|
	switch jamo {
 | 
						|
	case 0x3131:
 | 
						|
		return 0x1100 // ㄱ
 | 
						|
	case 0x3132:
 | 
						|
		return 0x1101 // ㄲ
 | 
						|
	case 0x3134:
 | 
						|
		return 0x1102 // ㄴ
 | 
						|
	case 0x3137:
 | 
						|
		return 0x1103 // ㄷ
 | 
						|
	case 0x3138:
 | 
						|
		return 0x1104 // ㄸ
 | 
						|
	case 0x3139:
 | 
						|
		return 0x1105 // ㄹ
 | 
						|
	case 0x3141:
 | 
						|
		return 0x1106 // ㅁ
 | 
						|
	case 0x3142:
 | 
						|
		return 0x1107 // ㅂ
 | 
						|
	case 0x3143:
 | 
						|
		return 0x1108 // ㅃ
 | 
						|
	case 0x3145:
 | 
						|
		return 0x1109 // ㅅ
 | 
						|
	case 0x3146:
 | 
						|
		return 0x110A // ㅆ
 | 
						|
	case 0x3147:
 | 
						|
		return 0x110B // ㅇ
 | 
						|
	case 0x3148:
 | 
						|
		return 0x110C // ㅈ
 | 
						|
	case 0x3149:
 | 
						|
		return 0x110D // ㅉ
 | 
						|
	case 0x314A:
 | 
						|
		return 0x110E // ㅊ
 | 
						|
	case 0x314B:
 | 
						|
		return 0x110F // ㅋ
 | 
						|
	case 0x314C:
 | 
						|
		return 0x1110 // ㅌ
 | 
						|
	case 0x314D:
 | 
						|
		return 0x1111 // ㅍ
 | 
						|
	case 0x314E:
 | 
						|
		return 0x1112 // ㅎ
 | 
						|
	}
 | 
						|
 | 
						|
	// The rest of the compatibility block consists of archaic compounds that are
 | 
						|
	// unlikely to be encountered in modern systems. Just leave them alone.
 | 
						|
	return jamo
 | 
						|
}
 | 
						|
 | 
						|
// Transliterates one jamo at a time.
 | 
						|
// Does nothing if it isn't in the modern jamo block.
 | 
						|
func translitSingleJamo(jamo rune) string {
 | 
						|
	jamo = decompatJamo(jamo)
 | 
						|
 | 
						|
	switch jamo {
 | 
						|
	// Choseong (leading position consonants)
 | 
						|
	case 0x1100:
 | 
						|
		return "g" // ㄱ
 | 
						|
	case 0x1101:
 | 
						|
		return "kk" // ㄲ
 | 
						|
	case 0x1102:
 | 
						|
		return "n" // ㄴ
 | 
						|
	case 0x1103:
 | 
						|
		return "d" // ㄷ
 | 
						|
	case 0x1104:
 | 
						|
		return "tt" // ㄸ
 | 
						|
	case 0x1105:
 | 
						|
		return "r" // ㄹ
 | 
						|
	case 0x1106:
 | 
						|
		return "m" // ㅁ
 | 
						|
	case 0x1107:
 | 
						|
		return "b" // ㅂ
 | 
						|
	case 0x1108:
 | 
						|
		return "pp" // ㅃ
 | 
						|
	case 0x1109:
 | 
						|
		return "s" // ㅅ
 | 
						|
	case 0x110A:
 | 
						|
		return "ss" // ㅆ
 | 
						|
	case 0x110B:
 | 
						|
		return "" // ㅇ
 | 
						|
	case 0x110C:
 | 
						|
		return "j" // ㅈ
 | 
						|
	case 0x110D:
 | 
						|
		return "jj" // ㅉ
 | 
						|
	case 0x110E:
 | 
						|
		return "ch" // ㅊ
 | 
						|
	case 0x110F:
 | 
						|
		return "k" // ㅋ
 | 
						|
	case 0x1110:
 | 
						|
		return "t" // ㅌ
 | 
						|
	case 0x1111:
 | 
						|
		return "p" // ㅍ
 | 
						|
	case 0x1112:
 | 
						|
		return "h" // ㅎ
 | 
						|
	// Jungseong (vowels)
 | 
						|
	case 0x1161:
 | 
						|
		return "a" // ㅏ
 | 
						|
	case 0x1162:
 | 
						|
		return "ae" // ㅐ
 | 
						|
	case 0x1163:
 | 
						|
		return "ya" // ㅑ
 | 
						|
	case 0x1164:
 | 
						|
		return "yae" // ㅒ
 | 
						|
	case 0x1165:
 | 
						|
		return "eo" // ㅓ
 | 
						|
	case 0x1166:
 | 
						|
		return "e" // ㅔ
 | 
						|
	case 0x1167:
 | 
						|
		return "yeo" // ㅕ
 | 
						|
	case 0x1168:
 | 
						|
		return "ye" // ㅖ
 | 
						|
	case 0x1169:
 | 
						|
		return "o" // ㅗ
 | 
						|
	case 0x116A:
 | 
						|
		return "wa" // ㅘ
 | 
						|
	case 0x116B:
 | 
						|
		return "wae" // ㅙ
 | 
						|
	case 0x116C:
 | 
						|
		return "oe" // ㅚ
 | 
						|
	case 0x116D:
 | 
						|
		return "yo" // ㅛ
 | 
						|
	case 0x116E:
 | 
						|
		return "u" // ㅜ
 | 
						|
	case 0x116F:
 | 
						|
		return "wo" // ㅝ
 | 
						|
	case 0x1170:
 | 
						|
		return "we" // ㅞ
 | 
						|
	case 0x1171:
 | 
						|
		return "wi" // ㅟ
 | 
						|
	case 0x1172:
 | 
						|
		return "yu" // ㅠ
 | 
						|
	case 0x1173:
 | 
						|
		return "eu" // ㅡ
 | 
						|
	case 0x1174:
 | 
						|
		return "ui" // ㅢ
 | 
						|
	case 0x1175:
 | 
						|
		return "i" // ㅣ
 | 
						|
	// Jongseong (final position consonants)
 | 
						|
	case 0x11A8:
 | 
						|
		return "k" // ㄱ
 | 
						|
	case 0x11A9:
 | 
						|
		return "k" // ㄲ
 | 
						|
	case 0x11AB:
 | 
						|
		return "n" // ㄴ
 | 
						|
	case 0x11AE:
 | 
						|
		return "t" // ㄷ
 | 
						|
	case 0x11AF:
 | 
						|
		return "l" // ㄹ
 | 
						|
	case 0x11B7:
 | 
						|
		return "m" // ㅁ
 | 
						|
	case 0x11B8:
 | 
						|
		return "p" // ㅂ
 | 
						|
	case 0x11BA:
 | 
						|
		return "t" // ㅅ
 | 
						|
	case 0x11BB:
 | 
						|
		return "t" // ㅆ
 | 
						|
	case 0x11BC:
 | 
						|
		return "ng" // ㅇ
 | 
						|
	case 0x11BD:
 | 
						|
		return "t" // ㅈ
 | 
						|
	case 0x11BE:
 | 
						|
		return "t" // ㅊ
 | 
						|
	case 0x11BF:
 | 
						|
		return "k" // ㅋ
 | 
						|
	case 0x11C0:
 | 
						|
		return "t" // ㅌ
 | 
						|
	case 0x11C1:
 | 
						|
		return "p" // ㅍ
 | 
						|
	case 0x11C2:
 | 
						|
		return "t" // ㅎ
 | 
						|
	}
 | 
						|
 | 
						|
	return string(jamo)
 | 
						|
}
 | 
						|
 | 
						|
// Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
 | 
						|
// irregularly. These exceptions are called "special provisions". In cases where multiple
 | 
						|
// romanizations are permitted, we use the one that's least commonly used elsewhere.
 | 
						|
//
 | 
						|
// Returns empty strring and false if either character is not in the modern jamo block,
 | 
						|
// or if there is no special provision for that pair of jamo.
 | 
						|
func translitSpecialProvisions(previousEnding rune, nextInitial rune) (string, bool) {
 | 
						|
	// Return false if previousEnding not in modern jamo block
 | 
						|
	if !unicode.In(previousEnding, jamoBlock) {
 | 
						|
		return "", false
 | 
						|
	}
 | 
						|
	// Return false if nextInitial not in modern jamo block
 | 
						|
	if !unicode.In(nextInitial, jamoBlock) {
 | 
						|
		return "", false
 | 
						|
	}
 | 
						|
 | 
						|
	// Jongseong (final position) ㅎ has a number of special provisions.
 | 
						|
	if previousEnding == 0x11C2 {
 | 
						|
		switch nextInitial {
 | 
						|
		case 0x110B:
 | 
						|
			return "h", true // ㅇ
 | 
						|
		case 0x1100:
 | 
						|
			return "k", true // ㄱ
 | 
						|
		case 0x1102:
 | 
						|
			return "nn", true // ㄴ
 | 
						|
		case 0x1103:
 | 
						|
			return "t", true // ㄷ
 | 
						|
		case 0x1105:
 | 
						|
			return "nn", true // ㄹ
 | 
						|
		case 0x1106:
 | 
						|
			return "nm", true // ㅁ
 | 
						|
		case 0x1107:
 | 
						|
			return "p", true // ㅂ
 | 
						|
		case 0x1109:
 | 
						|
			return "hs", true // ㅅ
 | 
						|
		case 0x110C:
 | 
						|
			return "ch", true // ㅈ
 | 
						|
		case 0x1112:
 | 
						|
			return "t", true // ㅎ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	// Otherwise, special provisions are denser when grouped by the second jamo.
 | 
						|
	switch nextInitial {
 | 
						|
	case 0x1100: // ㄱ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11AB:
 | 
						|
			return "n-g", true // ㄴ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x1102: // ㄴ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11A8:
 | 
						|
			return "ngn", true // ㄱ
 | 
						|
		case 0x11AE:
 | 
						|
			fallthrough // ㄷ
 | 
						|
		case 0x11BA:
 | 
						|
			fallthrough // ㅅ
 | 
						|
		case 0x11BD:
 | 
						|
			fallthrough // ㅈ
 | 
						|
		case 0x11BE:
 | 
						|
			fallthrough // ㅊ
 | 
						|
		case 0x11C0: // ㅌ
 | 
						|
			return "nn", true
 | 
						|
		case 0x11AF:
 | 
						|
			return "ll", true // ㄹ
 | 
						|
		case 0x11B8:
 | 
						|
			return "mn", true // ㅂ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x1105: // ㄹ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11A8:
 | 
						|
			fallthrough // ㄱ
 | 
						|
		case 0x11AB:
 | 
						|
			fallthrough // ㄴ
 | 
						|
		case 0x11AF: // ㄹ
 | 
						|
			return "ll", true
 | 
						|
		case 0x11AE:
 | 
						|
			fallthrough // ㄷ
 | 
						|
		case 0x11BA:
 | 
						|
			fallthrough // ㅅ
 | 
						|
		case 0x11BD:
 | 
						|
			fallthrough // ㅈ
 | 
						|
		case 0x11BE:
 | 
						|
			fallthrough // ㅊ
 | 
						|
		case 0x11C0: // ㅌ
 | 
						|
			return "nn", true
 | 
						|
		case 0x11B7:
 | 
						|
			fallthrough // ㅁ
 | 
						|
		case 0x11B8: // ㅂ
 | 
						|
			return "mn", true
 | 
						|
		case 0x11BC:
 | 
						|
			return "ngn", true // ㅇ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x1106: // ㅁ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11A8:
 | 
						|
			return "ngm", true // ㄱ
 | 
						|
		case 0x11AE:
 | 
						|
			fallthrough // ㄷ
 | 
						|
		case 0x11BA:
 | 
						|
			fallthrough // ㅅ
 | 
						|
		case 0x11BD:
 | 
						|
			fallthrough // ㅈ
 | 
						|
		case 0x11BE:
 | 
						|
			fallthrough // ㅊ
 | 
						|
		case 0x11C0: // ㅌ
 | 
						|
			return "nm", true
 | 
						|
		case 0x11B8:
 | 
						|
			return "mm", true // ㅂ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x110B: // ㅇ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11A8:
 | 
						|
			return "g", true // ㄱ
 | 
						|
		case 0x11AE:
 | 
						|
			return "d", true // ㄷ
 | 
						|
		case 0x11AF:
 | 
						|
			return "r", true // ㄹ
 | 
						|
		case 0x11B8:
 | 
						|
			return "b", true // ㅂ
 | 
						|
		case 0x11BA:
 | 
						|
			return "s", true // ㅅ
 | 
						|
		case 0x11BC:
 | 
						|
			return "ng-", true // ㅇ
 | 
						|
		case 0x11BD:
 | 
						|
			return "j", true // ㅈ
 | 
						|
		case 0x11BE:
 | 
						|
			return "ch", true // ㅊ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x110F: // ㅋ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11A8:
 | 
						|
			return "k-k", true // ㄱ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x1110: // ㅌ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11AE:
 | 
						|
			fallthrough // ㄷ
 | 
						|
		case 0x11BA:
 | 
						|
			fallthrough // ㅅ
 | 
						|
		case 0x11BD:
 | 
						|
			fallthrough // ㅈ
 | 
						|
		case 0x11BE:
 | 
						|
			fallthrough // ㅊ
 | 
						|
		case 0x11C0: // ㅌ
 | 
						|
			return "t-t", true
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	case 0x1111: // ㅍ
 | 
						|
		switch previousEnding {
 | 
						|
		case 0x11B8:
 | 
						|
			return "p-p", true // ㅂ
 | 
						|
		default:
 | 
						|
			return "", false
 | 
						|
		}
 | 
						|
	default:
 | 
						|
		return "", false
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// Decompose a syllable into several jamo. Does nothing if that isn't possible.
 | 
						|
func decompose(syllable rune) string {
 | 
						|
	return norm.NFD.String(string(syllable))
 | 
						|
}
 | 
						|
 | 
						|
// Transliterate any Hangul in the given string.
 | 
						|
// Leaves any non-Hangul characters unmodified.
 | 
						|
func (kt *KoreanTranslit) Transliterate(s string) string {
 | 
						|
	if len(s) == 0 {
 | 
						|
		return s
 | 
						|
	}
 | 
						|
 | 
						|
	builder := &strings.Builder{}
 | 
						|
 | 
						|
	nextInitialJamoConsumed := false
 | 
						|
 | 
						|
	for i, syllable := range s {
 | 
						|
		// If character not in blocks, leave it unmodified
 | 
						|
		if !unicode.In(syllable, jamoBlock, syllablesBlock, compatJamoBlock) {
 | 
						|
			builder.WriteRune(syllable)
 | 
						|
			continue
 | 
						|
		}
 | 
						|
 | 
						|
		jamo := decompose(syllable)
 | 
						|
		for j, char := range jamo {
 | 
						|
			// If we already transliterated the first jamo of this syllable as part of a special
 | 
						|
			// provision, skip it. Otherwise, handle it in the unconditional else branch.
 | 
						|
			if j == 0 && nextInitialJamoConsumed {
 | 
						|
				nextInitialJamoConsumed = false
 | 
						|
				continue
 | 
						|
			}
 | 
						|
 | 
						|
			// If this is the last jamo of this syllable and not the last syllable of the
 | 
						|
			// string, check for special provisions. If the next char is whitespace or not
 | 
						|
			// Hangul, run translitSpecialProvisions() should return no value.
 | 
						|
			if j == len(jamo)-1 && i < len(s)-1 {
 | 
						|
				nextSyllable := s[i+1]
 | 
						|
				nextJamo := decompose(rune(nextSyllable))[0]
 | 
						|
 | 
						|
				// Attempt to handle special provision
 | 
						|
				specialProvision, ok := translitSpecialProvisions(char, rune(nextJamo))
 | 
						|
				if ok {
 | 
						|
					builder.WriteString(specialProvision)
 | 
						|
					nextInitialJamoConsumed = true
 | 
						|
				} else {
 | 
						|
					// Not a special provision, transliterate normally
 | 
						|
					builder.WriteString(translitSingleJamo(char))
 | 
						|
				}
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			// Transliterate normally
 | 
						|
			builder.WriteString(translitSingleJamo(char))
 | 
						|
		}
 | 
						|
	}
 | 
						|
	return builder.String()
 | 
						|
}
 |