itd/translit/korean.go

471 lines
10 KiB
Go
Raw Normal View History

2021-10-05 02:07:54 +00:00
package translit
import (
"strings"
"unicode"
"golang.org/x/text/unicode/norm"
)
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
var jamoBlock = &unicode.RangeTable{
R16: []unicode.Range16{{
Lo: 0x1100,
Hi: 0x11FF,
Stride: 1,
}},
}
// https://en.wikipedia.org/wiki/Hangul_Syllables
var syllablesBlock = &unicode.RangeTable{
R16: []unicode.Range16{{
Lo: 0xAC00,
Hi: 0xD7A3,
Stride: 1,
}},
}
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
var compatJamoBlock = &unicode.RangeTable{
R16: []unicode.Range16{{
Lo: 0x3131,
Hi: 0x318E,
Stride: 1,
}},
}
// KoreanTranslit implements transliteration for Korean.
//
// This was translated to Go from the code in https://codeberg.org/Freeyourgadget/Gadgetbridge
type KoreanTranslit struct{}
// User input consisting of isolated jamo is usually mapped to the KS X 1001 compatibility
// block, but jamo resulting from decomposed syllables are mapped to the modern one. This
// function maps compat jamo to modern ones where possible and returns all other characters
// unmodified.
//
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
func decompatJamo(jamo rune) rune {
// KS X 1001 Hangul filler, not used in modern Unicode. A useful landmark in the
// compatibility jamo block.
// https://en.wikipedia.org/wiki/KS_X_1001#Hangul_Filler
var hangulFiller rune = 0x3164
// Ignore characters outside compatibility jamo block
if !unicode.In(jamo, compatJamoBlock) {
return jamo
}
// Vowels are contiguous, in the same order, and unambiguous so it's a simple offset.
if jamo >= 0x314F && jamo < hangulFiller {
return jamo - 0x1FEE
}
// Consonants are organized differently. No clean way to do this.
// The compatibility jamo block doesn't distinguish between Choseong (leading) and Jongseong
// (final) positions, but the modern block does. We map to Choseong here.
switch jamo {
case 0x3131:
return 0x1100 // ㄱ
case 0x3132:
return 0x1101 // ㄲ
case 0x3134:
return 0x1102 // ㄴ
case 0x3137:
return 0x1103 // ㄷ
case 0x3138:
return 0x1104 // ㄸ
case 0x3139:
return 0x1105 // ㄹ
case 0x3141:
return 0x1106 // ㅁ
case 0x3142:
return 0x1107 // ㅂ
case 0x3143:
return 0x1108 // ㅃ
case 0x3145:
return 0x1109 // ㅅ
case 0x3146:
return 0x110A // ㅆ
case 0x3147:
return 0x110B // ㅇ
case 0x3148:
return 0x110C // ㅈ
case 0x3149:
return 0x110D // ㅉ
case 0x314A:
return 0x110E // ㅊ
case 0x314B:
return 0x110F // ㅋ
case 0x314C:
return 0x1110 // ㅌ
case 0x314D:
return 0x1111 // ㅍ
case 0x314E:
return 0x1112 // ㅎ
}
// The rest of the compatibility block consists of archaic compounds that are
// unlikely to be encountered in modern systems. Just leave them alone.
return jamo
}
// Transliterates one jamo at a time.
// Does nothing if it isn't in the modern jamo block.
func translitSingleJamo(jamo rune) string {
jamo = decompatJamo(jamo)
switch jamo {
// Choseong (leading position consonants)
case 0x1100:
return "g" // ㄱ
case 0x1101:
return "kk" // ㄲ
case 0x1102:
return "n" // ㄴ
case 0x1103:
return "d" // ㄷ
case 0x1104:
return "tt" // ㄸ
case 0x1105:
return "r" // ㄹ
case 0x1106:
return "m" // ㅁ
case 0x1107:
return "b" // ㅂ
case 0x1108:
return "pp" // ㅃ
case 0x1109:
return "s" // ㅅ
case 0x110A:
return "ss" // ㅆ
case 0x110B:
return "" // ㅇ
case 0x110C:
return "j" // ㅈ
case 0x110D:
return "jj" // ㅉ
case 0x110E:
return "ch" // ㅊ
case 0x110F:
return "k" // ㅋ
case 0x1110:
return "t" // ㅌ
case 0x1111:
return "p" // ㅍ
case 0x1112:
return "h" // ㅎ
// Jungseong (vowels)
case 0x1161:
return "a" // ㅏ
case 0x1162:
return "ae" // ㅐ
case 0x1163:
return "ya" // ㅑ
case 0x1164:
return "yae" // ㅒ
case 0x1165:
return "eo" // ㅓ
case 0x1166:
return "e" // ㅔ
case 0x1167:
return "yeo" // ㅕ
case 0x1168:
return "ye" // ㅖ
case 0x1169:
return "o" // ㅗ
case 0x116A:
return "wa" // ㅘ
case 0x116B:
return "wae" // ㅙ
case 0x116C:
return "oe" // ㅚ
case 0x116D:
return "yo" // ㅛ
case 0x116E:
return "u" // ㅜ
case 0x116F:
return "wo" // ㅝ
case 0x1170:
return "we" // ㅞ
case 0x1171:
return "wi" // ㅟ
case 0x1172:
return "yu" // ㅠ
case 0x1173:
return "eu" // ㅡ
case 0x1174:
return "ui" // ㅢ
case 0x1175:
return "i" // ㅣ
// Jongseong (final position consonants)
case 0x11A8:
return "k" // ㄱ
case 0x11A9:
return "k" // ㄲ
case 0x11AB:
return "n" // ㄴ
case 0x11AE:
return "t" // ㄷ
case 0x11AF:
return "l" // ㄹ
case 0x11B7:
return "m" // ㅁ
case 0x11B8:
return "p" // ㅂ
case 0x11BA:
return "t" // ㅅ
case 0x11BB:
return "t" // ㅆ
case 0x11BC:
return "ng" // ㅇ
case 0x11BD:
return "t" // ㅈ
case 0x11BE:
return "t" // ㅊ
case 0x11BF:
return "k" // ㅋ
case 0x11C0:
return "t" // ㅌ
case 0x11C1:
return "p" // ㅍ
case 0x11C2:
return "t" // ㅎ
}
return string(jamo)
}
// Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
// irregularly. These exceptions are called "special provisions". In cases where multiple
// romanizations are permitted, we use the one that's least commonly used elsewhere.
//
// Returns empty strring and false if either character is not in the modern jamo block,
// or if there is no special provision for that pair of jamo.
func translitSpecialProvisions(previousEnding rune, nextInitial rune) (string, bool) {
// Return false if previousEnding not in modern jamo block
if !unicode.In(previousEnding, jamoBlock) {
return "", false
}
// Return false if nextInitial not in modern jamo block
if !unicode.In(nextInitial, jamoBlock) {
return "", false
}
// Jongseong (final position) ㅎ has a number of special provisions.
if previousEnding == 0x11C2 {
switch nextInitial {
case 0x110B:
return "h", true // ㅇ
case 0x1100:
return "k", true // ㄱ
case 0x1102:
return "nn", true // ㄴ
case 0x1103:
return "t", true // ㄷ
case 0x1105:
return "nn", true // ㄹ
case 0x1106:
return "nm", true // ㅁ
case 0x1107:
return "p", true // ㅂ
case 0x1109:
return "hs", true // ㅅ
case 0x110C:
return "ch", true // ㅈ
case 0x1112:
return "t", true // ㅎ
default:
return "", false
}
}
// Otherwise, special provisions are denser when grouped by the second jamo.
switch nextInitial {
case 0x1100: // ㄱ
switch previousEnding {
case 0x11AB:
return "n-g", true // ㄴ
default:
return "", false
}
case 0x1102: // ㄴ
switch previousEnding {
case 0x11A8:
return "ngn", true // ㄱ
2021-10-05 03:06:08 +00:00
case 0x11AE:
fallthrough // ㄷ
case 0x11BA:
fallthrough // ㅅ
case 0x11BD:
fallthrough // ㅈ
case 0x11BE:
fallthrough // ㅊ
2021-10-05 02:07:54 +00:00
case 0x11C0: // ㅌ
return "nn", true
case 0x11AF:
return "ll", true // ㄹ
case 0x11B8:
return "mn", true // ㅂ
default:
return "", false
}
case 0x1105: // ㄹ
switch previousEnding {
2021-10-05 03:06:08 +00:00
case 0x11A8:
fallthrough // ㄱ
case 0x11AB:
fallthrough // ㄴ
2021-10-05 02:07:54 +00:00
case 0x11AF: // ㄹ
return "ll", true
2021-10-05 03:06:08 +00:00
case 0x11AE:
fallthrough // ㄷ
case 0x11BA:
fallthrough // ㅅ
case 0x11BD:
fallthrough // ㅈ
case 0x11BE:
fallthrough // ㅊ
2021-10-05 02:07:54 +00:00
case 0x11C0: // ㅌ
return "nn", true
2021-10-05 03:06:08 +00:00
case 0x11B7:
fallthrough // ㅁ
2021-10-05 02:07:54 +00:00
case 0x11B8: // ㅂ
return "mn", true
case 0x11BC:
return "ngn", true // ㅇ
default:
return "", false
}
case 0x1106: // ㅁ
switch previousEnding {
case 0x11A8:
return "ngm", true // ㄱ
2021-10-05 03:06:08 +00:00
case 0x11AE:
fallthrough // ㄷ
case 0x11BA:
fallthrough // ㅅ
case 0x11BD:
fallthrough // ㅈ
case 0x11BE:
fallthrough // ㅊ
2021-10-05 02:07:54 +00:00
case 0x11C0: // ㅌ
return "nm", true
case 0x11B8:
return "mm", true // ㅂ
default:
return "", false
}
case 0x110B: // ㅇ
switch previousEnding {
case 0x11A8:
return "g", true // ㄱ
case 0x11AE:
return "d", true // ㄷ
case 0x11AF:
return "r", true // ㄹ
case 0x11B8:
return "b", true // ㅂ
case 0x11BA:
return "s", true // ㅅ
case 0x11BC:
return "ng-", true // ㅇ
case 0x11BD:
return "j", true // ㅈ
case 0x11BE:
return "ch", true // ㅊ
default:
return "", false
}
case 0x110F: // ㅋ
switch previousEnding {
case 0x11A8:
return "k-k", true // ㄱ
default:
return "", false
}
case 0x1110: // ㅌ
switch previousEnding {
2021-10-05 03:06:08 +00:00
case 0x11AE:
fallthrough // ㄷ
case 0x11BA:
fallthrough // ㅅ
case 0x11BD:
fallthrough // ㅈ
case 0x11BE:
fallthrough // ㅊ
2021-10-05 02:07:54 +00:00
case 0x11C0: // ㅌ
return "t-t", true
default:
return "", false
}
case 0x1111: // ㅍ
switch previousEnding {
case 0x11B8:
return "p-p", true // ㅂ
default:
return "", false
}
default:
return "", false
}
}
// Decompose a syllable into several jamo. Does nothing if that isn't possible.
func decompose(syllable rune) string {
return norm.NFD.String(string(syllable))
}
// Transliterate any Hangul in the given string.
// Leaves any non-Hangul characters unmodified.
func (kt *KoreanTranslit) Transliterate(s string) string {
if len(s) == 0 {
return s
}
builder := &strings.Builder{}
nextInitialJamoConsumed := false
for i, syllable := range s {
// If character not in blocks, leave it unmodified
if !unicode.In(syllable, jamoBlock, syllablesBlock, compatJamoBlock) {
builder.WriteRune(syllable)
continue
}
jamo := decompose(syllable)
for j, char := range jamo {
// If we already transliterated the first jamo of this syllable as part of a special
// provision, skip it. Otherwise, handle it in the unconditional else branch.
if j == 0 && nextInitialJamoConsumed {
nextInitialJamoConsumed = false
continue
}
// If this is the last jamo of this syllable and not the last syllable of the
// string, check for special provisions. If the next char is whitespace or not
// Hangul, run translitSpecialProvisions() should return no value.
if j == len(jamo)-1 && i < len(s)-1 {
nextSyllable := s[i+1]
nextJamo := decompose(rune(nextSyllable))[0]
// Attempt to handle special provision
specialProvision, ok := translitSpecialProvisions(char, rune(nextJamo))
if ok {
builder.WriteString(specialProvision)
nextInitialJamoConsumed = true
} else {
// Not a special provision, transliterate normally
builder.WriteString(translitSingleJamo(char))
}
continue
}
// Transliterate normally
builder.WriteString(translitSingleJamo(char))
}
}
return builder.String()
}