forked from Elara6331/itd
		
	Add Korean transliteration
This commit is contained in:
		
							
								
								
									
										452
									
								
								translit/korean.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										452
									
								
								translit/korean.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,452 @@ | ||||
| package translit | ||||
|  | ||||
| import ( | ||||
| 	"strings" | ||||
| 	"unicode" | ||||
|  | ||||
| 	"golang.org/x/text/unicode/norm" | ||||
| ) | ||||
|  | ||||
// jamoBlock is a range table covering U+1100 through U+11FF, the Hangul Jamo
// Unicode block (the "modern" jamo produced when a syllable is decomposed).
//
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
var jamoBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x1100, Hi: 0x11FF, Stride: 1},
	},
}
|  | ||||
// syllablesBlock is a range table covering U+AC00 through U+D7A3, the
// precomposed syllables of the Hangul Syllables Unicode block.
//
// https://en.wikipedia.org/wiki/Hangul_Syllables
var syllablesBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0xAC00, Hi: 0xD7A3, Stride: 1},
	},
}
|  | ||||
// compatJamoBlock is a range table covering U+3131 through U+318E, the
// assigned code points of the Hangul Compatibility Jamo Unicode block.
//
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
var compatJamoBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x3131, Hi: 0x318E, Stride: 1},
	},
}
|  | ||||
| // KoreanTranslit implements transliteration for Korean: its Transliterate | ||||
| // method rewrites Hangul in Latin letters and leaves every other character | ||||
| // unmodified. The type is an empty struct, so a value carries no state. | ||||
| // | ||||
| // This was translated to Go from the code in https://codeberg.org/Freeyourgadget/Gadgetbridge | ||||
| type KoreanTranslit struct{} | ||||
|  | ||||
// decompatJamo maps a KS X 1001 compatibility jamo onto the equivalent jamo in
// the modern Hangul Jamo block, and returns every other rune unchanged.
//
// Isolated jamo typed by a user normally arrive as compatibility jamo, whereas
// jamo obtained by decomposing a syllable are modern jamo, so the two have to
// be brought onto the same code points before they can be romanized.
//
// The compatibility block has a single code point per consonant, while the
// modern block has separate Choseong (leading) and Jongseong (final) forms;
// consonants are mapped to their Choseong form. Compatibility code points that
// are not listed below (consonant clusters, archaic letters, the Hangul filler
// U+3164) are returned as they are.
//
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
func decompatJamo(jamo rune) rune {
	// Each case is a run of compatibility jamo whose modern counterparts are
	// contiguous and in the same order, so one offset converts the whole run.
	// Runes outside the compatibility block match no case at all.
	switch {
	case jamo >= 0x314F && jamo <= 0x3163:
		// Vowels ㅏ…ㅣ -> U+1161…U+1175. The run stops right before the
		// Hangul filler at U+3164.
		return jamo - 0x1FEE
	case jamo == 0x3131 || jamo == 0x3132:
		// ㄱ ㄲ -> U+1100, U+1101
		return jamo - 0x2031
	case jamo == 0x3134:
		// ㄴ -> U+1102
		return 0x1102
	case jamo >= 0x3137 && jamo <= 0x3139:
		// ㄷ ㄸ ㄹ -> U+1103…U+1105
		return jamo - 0x2034
	case jamo >= 0x3141 && jamo <= 0x3143:
		// ㅁ ㅂ ㅃ -> U+1106…U+1108
		return jamo - 0x203B
	case jamo >= 0x3145 && jamo <= 0x314E:
		// ㅅ ㅆ ㅇ ㅈ ㅉ ㅊ ㅋ ㅌ ㅍ ㅎ -> U+1109…U+1112
		return jamo - 0x203C
	}

	return jamo
}
|  | ||||
| // Transliterates one jamo at a time. | ||||
| // Does nothing if it isn't in the modern jamo block. | ||||
| func translitSingleJamo(jamo rune) string { | ||||
| 	jamo = decompatJamo(jamo) | ||||
|  | ||||
| 	switch jamo { | ||||
| 	// Choseong (leading position consonants) | ||||
| 	case 0x1100: | ||||
| 		return "g" // ㄱ | ||||
| 	case 0x1101: | ||||
| 		return "kk" // ㄲ | ||||
| 	case 0x1102: | ||||
| 		return "n" // ㄴ | ||||
| 	case 0x1103: | ||||
| 		return "d" // ㄷ | ||||
| 	case 0x1104: | ||||
| 		return "tt" // ㄸ | ||||
| 	case 0x1105: | ||||
| 		return "r" // ㄹ | ||||
| 	case 0x1106: | ||||
| 		return "m" // ㅁ | ||||
| 	case 0x1107: | ||||
| 		return "b" // ㅂ | ||||
| 	case 0x1108: | ||||
| 		return "pp" // ㅃ | ||||
| 	case 0x1109: | ||||
| 		return "s" // ㅅ | ||||
| 	case 0x110A: | ||||
| 		return "ss" // ㅆ | ||||
| 	case 0x110B: | ||||
| 		return "" // ㅇ | ||||
| 	case 0x110C: | ||||
| 		return "j" // ㅈ | ||||
| 	case 0x110D: | ||||
| 		return "jj" // ㅉ | ||||
| 	case 0x110E: | ||||
| 		return "ch" // ㅊ | ||||
| 	case 0x110F: | ||||
| 		return "k" // ㅋ | ||||
| 	case 0x1110: | ||||
| 		return "t" // ㅌ | ||||
| 	case 0x1111: | ||||
| 		return "p" // ㅍ | ||||
| 	case 0x1112: | ||||
| 		return "h" // ㅎ | ||||
| 	// Jungseong (vowels) | ||||
| 	case 0x1161: | ||||
| 		return "a" // ㅏ | ||||
| 	case 0x1162: | ||||
| 		return "ae" // ㅐ | ||||
| 	case 0x1163: | ||||
| 		return "ya" // ㅑ | ||||
| 	case 0x1164: | ||||
| 		return "yae" // ㅒ | ||||
| 	case 0x1165: | ||||
| 		return "eo" // ㅓ | ||||
| 	case 0x1166: | ||||
| 		return "e" // ㅔ | ||||
| 	case 0x1167: | ||||
| 		return "yeo" // ㅕ | ||||
| 	case 0x1168: | ||||
| 		return "ye" // ㅖ | ||||
| 	case 0x1169: | ||||
| 		return "o" // ㅗ | ||||
| 	case 0x116A: | ||||
| 		return "wa" // ㅘ | ||||
| 	case 0x116B: | ||||
| 		return "wae" // ㅙ | ||||
| 	case 0x116C: | ||||
| 		return "oe" // ㅚ | ||||
| 	case 0x116D: | ||||
| 		return "yo" // ㅛ | ||||
| 	case 0x116E: | ||||
| 		return "u" // ㅜ | ||||
| 	case 0x116F: | ||||
| 		return "wo" // ㅝ | ||||
| 	case 0x1170: | ||||
| 		return "we" // ㅞ | ||||
| 	case 0x1171: | ||||
| 		return "wi" // ㅟ | ||||
| 	case 0x1172: | ||||
| 		return "yu" // ㅠ | ||||
| 	case 0x1173: | ||||
| 		return "eu" // ㅡ | ||||
| 	case 0x1174: | ||||
| 		return "ui" // ㅢ | ||||
| 	case 0x1175: | ||||
| 		return "i" // ㅣ | ||||
| 	// Jongseong (final position consonants) | ||||
| 	case 0x11A8: | ||||
| 		return "k" // ㄱ | ||||
| 	case 0x11A9: | ||||
| 		return "k" // ㄲ | ||||
| 	case 0x11AB: | ||||
| 		return "n" // ㄴ | ||||
| 	case 0x11AE: | ||||
| 		return "t" // ㄷ | ||||
| 	case 0x11AF: | ||||
| 		return "l" // ㄹ | ||||
| 	case 0x11B7: | ||||
| 		return "m" // ㅁ | ||||
| 	case 0x11B8: | ||||
| 		return "p" // ㅂ | ||||
| 	case 0x11BA: | ||||
| 		return "t" // ㅅ | ||||
| 	case 0x11BB: | ||||
| 		return "t" // ㅆ | ||||
| 	case 0x11BC: | ||||
| 		return "ng" // ㅇ | ||||
| 	case 0x11BD: | ||||
| 		return "t" // ㅈ | ||||
| 	case 0x11BE: | ||||
| 		return "t" // ㅊ | ||||
| 	case 0x11BF: | ||||
| 		return "k" // ㅋ | ||||
| 	case 0x11C0: | ||||
| 		return "t" // ㅌ | ||||
| 	case 0x11C1: | ||||
| 		return "p" // ㅍ | ||||
| 	case 0x11C2: | ||||
| 		return "t" // ㅎ | ||||
| 	} | ||||
|  | ||||
| 	return string(jamo) | ||||
| } | ||||
|  | ||||
// translitSpecialProvisions romanizes the boundary between two syllables.
//
// Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
// irregularly. These exceptions are called "special provisions". In cases where multiple
// romanizations are permitted, we use the one that's least commonly used elsewhere.
//
// previousEnding is the Jongseong (final consonant) of the first syllable and nextInitial is
// the Choseong (leading consonant) of the second one, both as modern jamo. The returned string
// replaces the romanization of BOTH jamo.
//
// Returns empty string and false if either character is not in the modern jamo block,
// or if there is no special provision for that pair of jamo.
func translitSpecialProvisions(previousEnding rune, nextInitial rune) (string, bool) {
	// No explicit block check is needed: every case below names a specific modern jamo, so a
	// rune from anywhere else simply matches nothing and ends up at the final return.

	// Jongseong (final position) ㅎ has a number of special provisions.
	if previousEnding == 0x11C2 {
		switch nextInitial {
		case 0x110B: // ㅇ
			return "h", true
		case 0x1100: // ㄱ
			return "k", true
		case 0x1102, 0x1105: // ㄴ, ㄹ
			return "nn", true
		case 0x1103, 0x1112: // ㄷ, ㅎ
			return "t", true
		case 0x1106: // ㅁ
			return "nm", true
		case 0x1107: // ㅂ
			return "p", true
		case 0x1109: // ㅅ
			return "hs", true
		case 0x110C: // ㅈ
			return "ch", true
		}
		return "", false
	}

	// Otherwise, special provisions are denser when grouped by the second jamo.
	//
	// Go's switch has no implicit fall-through, so endings that share a romanization must be
	// listed in a single comma-separated case rather than as stacked empty cases.
	switch nextInitial {
	case 0x1100: // ㄱ
		if previousEnding == 0x11AB { // ㄴ
			return "n-g", true
		}
	case 0x1102: // ㄴ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "ngn", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nn", true
		case 0x11AF: // ㄹ
			return "ll", true
		case 0x11B8: // ㅂ
			return "mn", true
		}
	case 0x1105: // ㄹ
		switch previousEnding {
		case 0x11A8, 0x11AB, 0x11AF: // ㄱ, ㄴ, ㄹ
			return "ll", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nn", true
		case 0x11B7, 0x11B8: // ㅁ, ㅂ
			return "mn", true
		case 0x11BC: // ㅇ
			return "ngn", true
		}
	case 0x1106: // ㅁ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "ngm", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nm", true
		case 0x11B8: // ㅂ
			return "mm", true
		}
	case 0x110B: // ㅇ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "g", true
		case 0x11AE: // ㄷ
			return "d", true
		case 0x11AF: // ㄹ
			return "r", true
		case 0x11B8: // ㅂ
			return "b", true
		case 0x11BA: // ㅅ
			return "s", true
		case 0x11BC: // ㅇ
			return "ng-", true
		case 0x11BD: // ㅈ
			return "j", true
		case 0x11BE: // ㅊ
			return "ch", true
		}
	case 0x110F: // ㅋ
		if previousEnding == 0x11A8 { // ㄱ
			return "k-k", true
		}
	case 0x1110: // ㅌ
		switch previousEnding {
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "t-t", true
		}
	case 0x1111: // ㅍ
		if previousEnding == 0x11B8 { // ㅂ
			return "p-p", true
		}
	}

	return "", false
}
|  | ||||
| // Decompose a syllable into several jamo. Does nothing if that isn't possible. | ||||
| func decompose(syllable rune) string { | ||||
| 	return norm.NFD.String(string(syllable)) | ||||
| } | ||||
|  | ||||
| // Transliterate any Hangul in the given string. | ||||
| // Leaves any non-Hangul characters unmodified. | ||||
| func (kt *KoreanTranslit) Transliterate(s string) string { | ||||
| 	if len(s) == 0 { | ||||
| 		return s | ||||
| 	} | ||||
|  | ||||
| 	builder := &strings.Builder{} | ||||
|  | ||||
| 	nextInitialJamoConsumed := false | ||||
|  | ||||
| 	for i, syllable := range s { | ||||
| 		// If character not in blocks, leave it unmodified | ||||
| 		if !unicode.In(syllable, jamoBlock, syllablesBlock, compatJamoBlock) { | ||||
| 			builder.WriteRune(syllable) | ||||
| 			continue | ||||
| 		} | ||||
|  | ||||
| 		jamo := decompose(syllable) | ||||
| 		for j, char := range jamo { | ||||
| 			// If we already transliterated the first jamo of this syllable as part of a special | ||||
| 			// provision, skip it. Otherwise, handle it in the unconditional else branch. | ||||
| 			if j == 0 && nextInitialJamoConsumed { | ||||
| 				nextInitialJamoConsumed = false | ||||
| 				continue | ||||
| 			} | ||||
|  | ||||
| 			// If this is the last jamo of this syllable and not the last syllable of the | ||||
| 			// string, check for special provisions. If the next char is whitespace or not | ||||
| 			// Hangul, run translitSpecialProvisions() should return no value. | ||||
| 			if j == len(jamo)-1 && i < len(s)-1 { | ||||
| 				nextSyllable := s[i+1] | ||||
| 				nextJamo := decompose(rune(nextSyllable))[0] | ||||
|  | ||||
| 				// Attempt to handle special provision | ||||
| 				specialProvision, ok := translitSpecialProvisions(char, rune(nextJamo)) | ||||
| 				if ok { | ||||
| 					builder.WriteString(specialProvision) | ||||
| 					nextInitialJamoConsumed = true | ||||
| 				} else { | ||||
| 					// Not a special provision, transliterate normally | ||||
| 					builder.WriteString(translitSingleJamo(char)) | ||||
| 				} | ||||
| 				continue | ||||
| 			} | ||||
| 			// Transliterate normally | ||||
| 			builder.WriteString(translitSingleJamo(char)) | ||||
| 		} | ||||
| 	} | ||||
| 	return builder.String() | ||||
| } | ||||
| @@ -464,4 +464,5 @@ var Maps = map[string]Transliterator{ | ||||
| 		"😴", ":zzz:", | ||||
| 		"💤", ":zzz:", | ||||
| 	}, | ||||
| 	"Korean": &KoreanTranslit{}, | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user