/* Copyright (C) 1995 Bjoern Beutel. */ /* Description. =============================================================*/ /* This module handles the splitting of Hangul syllables and multi-letter * Jamos into single Hangul letters (and back). * It also converts Romanised Hangul to single-letter Hangul (and back). * * In this conversion module, we use four representations of * Hangul letters and syllables: * 1. Unicode Hangul syllables, which occupy code points 0xac00-0xd7a3. * The syllables are sorted by their constituting letters. The sort criteria * are (from major to minor): * - the 19 different initial consonant combinations (Choseong) as defined * in the table CHOSEONGS * - the 21 different vowel combinations (Jungseong) as defined in the table * JUNGSEONGS * - the 28 different final consonant combinations (Jonseong) as defined in * the table JONSEONGS (including the empty string) * 2. Unicode alternative Jamo characters, which occupy the code points * 0x3131-0x3163. For internal representation, only the Jamos that represent * single characters are used. * 3. Roman code, which is a latin transcription; it adopts the Yale standard * for Hangul romanization. * Here, every syllable begins with a dot ".". Transcripted Hangul is * enclosed in curly brackets in order to distinguish it from original * latin characters. */ /* Includes. ================================================================*/ #include #include #include #include #include #include "basic.h" #include "pools.h" #include "tries.h" #include "hangul.h" /* Constants. 
===============================================================*/

/* Intermediate representation of single Jamo letters as strings.
 * Every macro expands to a string literal holding exactly one code byte, so
 * writing several macros next to each other (for example K K) lets the
 * compiler concatenate them into one multi-letter string. */
#define A "\x01" /* a */
#define B "\x02" /* phieuph */
#define C "\x03" /* cieuc */
#define D "\x04" /* thieuth */
#define E "\x05" /* eo */
#define G "\x06" /* kieukh */
#define H "\x07" /* hieuh */
#define I "\x08" /* i */
#define K "\x09" /* kiyeok */
#define L "\x0a" /* rieul */
#define M "\x0b" /* mieum */
#define N "\x0c" /* nieun */
#define O "\x0d" /* o */
#define P "\x0e" /* pieup */
#define S "\x0f" /* sios */
#define T "\x10" /* tikeut */
#define U "\x11" /* eu */
#define W "\x12" /* u */
#define X "\x13" /* ieung */
#define Z "\x14" /* chieuch */
#define DOT "\x15" /* syllable start */

/* The codes of all vowel letters, concatenated into a single string. */
#define VOWELS A E I O U W

/* Intermediate representation of single Jamo letters as characters.
 * Each value is the same code byte as in the string macro of the same name
 * (without the "_C" suffix). */
#define A_C '\x01' /* a */
#define B_C '\x02' /* phieuph */
#define C_C '\x03' /* cieuc */
#define D_C '\x04' /* thieuth */
#define E_C '\x05' /* eo */
#define G_C '\x06' /* kieukh */
#define H_C '\x07' /* hieuh */
#define I_C '\x08' /* i */
#define K_C '\x09' /* kiyeok */
#define L_C '\x0a' /* rieul */
#define M_C '\x0b' /* mieum */
#define N_C '\x0c' /* nieun */
#define O_C '\x0d' /* o */
#define P_C '\x0e' /* pieup */
#define S_C '\x0f' /* sios */
#define T_C '\x10' /* tikeut */
#define U_C '\x11' /* eu */
#define W_C '\x12' /* u */
#define X_C '\x13' /* ieung */
#define Z_C '\x14' /* chieuch */
#define DOT_C '\x15' /* syllable start */

/* Unicode representation of single Jamo letters.
 * The table is indexed by the intermediate code of a letter (0x01-0x15);
 * index 0 belongs to the string terminator. */
static const u_short_t jamos_unicodes[22] =
{
  0, /* EOS */
  0x314f, /* a */
  0x314d, /* phieuph */
  0x3148, /* cieuc */
  0x314c, /* thieuth */
  0x3153, /* eo */
  0x314b, /* kieukh */
  0x314e, /* hieuh */
  0x3163, /* i */
  0x3131, /* kiyeok */
  0x3139, /* rieul */
  0x3141, /* mieum */
  0x3134, /* nieun */
  0x3157, /* o */
  0x3142, /* pieup */
  0x3145, /* sios */
  0x3137, /* tikeut */
  0x3161, /* eu */
  0x315c, /* u */
  0x3147, /* ieung */
  0x314a, /* chieuch */
  0x3164, /* syllable start */
};

/* Code point that marks the start of a syllable in a Jamos string.
 * It is the code point directly behind LAST_JAMO. */
enum {SYLLABLE_START = 0x3164};

/* Composition of Jamo characters.
 * Entry N spells the Jamo with code point FIRST_JAMO + N as a sequence of
 * single letters: 30 consonant entries are followed by 21 vowel entries.
 * NOTE(review): the entry for 0x3159 is spelled (O I E), not (O A I); the
 * table "jungseongs" uses the same spelling, so both tables agree -- confirm
 * that this decomposition is intended. */
enum {FIRST_JAMO = 0x3131, LAST_JAMO = 0x3163,
      JAMO_COUNT = (LAST_JAMO - FIRST_JAMO + 1) };
static string_t jamos[JAMO_COUNT] =
{
  (K), (K K), (K S), (N), (N C), (N H), (T), (T T), (L), (L K),
  (L M), (L P), (L S), (L D), (L B), (L H), (M), (P), (P P), (P S),
  (S), (S S), (X), (C), (C C), (Z), (G), (D), (B), (H),
  (A), (A I), (I A), (I A I), (E), (E I), (I E), (I E I), (O), (O A),
  (O I E), (O I), (I O), (W), (W E), (W E I), (W I), (I W), (U), (U I), (I)
};

/* Range of the Unicode Hangul syllables. */
enum {FIRST_SYLLABLE = 0xac00, LAST_SYLLABLE = 0xd7a3,
      SYLLABLE_COUNT = (LAST_SYLLABLE - FIRST_SYLLABLE + 1)};

/* The initial consonants in a syllable. */
enum {CHOSEONG_COUNT = 19};
static string_t choseongs[CHOSEONG_COUNT] =
{
  (K), (K K), (N), (T), (T T), (L), (M), (P), (P P), (S),
  (S S), (X), (C), (C C), (Z), (G), (D), (B), (H),
};

/* The vowels in a syllable. */
enum {JUNGSEONG_COUNT = 21};
static string_t jungseongs[JUNGSEONG_COUNT] =
{
  (A), (A I), (I A), (I A I), (E), (E I), (I E), (I E I), (O), (O A),
  (O I E), (O I), (I O), (W), (W E), (W E I), (W I), (I W), (U), (U I), (I)
};

/* The final consonants in a syllable.
 * The first entry is the empty string (syllable without final consonant). */
enum {JONSEONG_COUNT = 28};
static string_t jonseongs[JONSEONG_COUNT] =
{
  "", (K), (K K), (K S), (N), (N C), (N H), (T), (L), (L K),
  (L M), (L P), (L S), (L D), (L B), (L H), (M), (P), (P S), (S),
  (S S), (X), (C), (Z), (G), (D), (B), (H),
};

/* Number of strings in the split table: one per syllable, followed by one
 * per Jamo. */
enum {SPLIT_TABLE_SIZE = (SYLLABLE_COUNT + JAMO_COUNT)};

/* Global variables.
========================================================*/ bool_t roman_hangul; /* Indicates whether Hangul output is transcribed in latin script. */ /* Variables. ===============================================================*/ static byte_t jamo_enc[JAMO_COUNT + 1]; /* Intermediate encoding of Jamo single letters and syllable start. */ static string_t split_strings[SPLIT_TABLE_SIZE]; static pool_t string_pool; /* String pool with split Syllables and Jamos. */ static int_t *split_trie; /* Trie used to segmentise Hangul syllables. */ static int_t split_trie_root; /* Root node index of SPLIT_TRIE. */ /* Functions. ===============================================================*/ static int compare_trie_entries( const void *entry1, const void *entry2 ) /* Compare two trie entries. */ { return strcmp_no_case( ((trie_entry_t *) entry1)->key, ((trie_entry_t *) entry2)->key ); } /*---------------------------------------------------------------------------*/ static void add_jamos_to_text( text_t *text, const char *jamos ) { for (; *jamos != EOS; jamos++ ) add_unichar_to_text( text, jamos_unicodes[ (int_t) *jamos ]); } /*---------------------------------------------------------------------------*/ void init_hangul( void ) /* Initialise the hangul module. */ { trie_entry_t trie_entries[ SPLIT_TABLE_SIZE ]; /* Syllable/Jamos. */ int_t c, i, choseong, jungseong, jonseong; pool_t trie_pool; text_t *text; if (! split_hangul_syllables) return; /* Fill table to convert from Jamo letter to intermediate encoding. */ for (i = A_C; i <= DOT_C; i++) jamo_enc[ jamos_unicodes[i] - FIRST_JAMO ] = i; text = new_text(); string_pool = new_pool( sizeof( char_t ) ); for (i = 0; i < SYLLABLE_COUNT; i++) { /* Build Jamos string for syllable i. 
*/ c = i / JONSEONG_COUNT; jonseong = i % JONSEONG_COUNT; jungseong = c % JUNGSEONG_COUNT; choseong = c / JUNGSEONG_COUNT; clear_text( text ); add_unichar_to_text( text, SYLLABLE_START ); add_jamos_to_text( text, choseongs[ choseong ] ); add_jamos_to_text( text, jungseongs[ jungseong ] ); add_jamos_to_text( text, jonseongs[ jonseong ] ); split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL ); /* Copy to trie entry table. */ trie_entries[i].key = split_strings[i]; trie_entries[i].content = i + FIRST_SYLLABLE; } for (i = SYLLABLE_COUNT; i < SPLIT_TABLE_SIZE; i++) { clear_text( text ); add_jamos_to_text( text, jamos[ i - SYLLABLE_COUNT ] ); split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL ); trie_entries[i].key = split_strings[i]; trie_entries[i].content = i - SYLLABLE_COUNT + FIRST_JAMO; } free_text( &text ); /* Sort the Jamos strings and build the trie. */ qsort( trie_entries, SPLIT_TABLE_SIZE, sizeof( trie_entry_t ), compare_trie_entries); new_trie( SPLIT_TABLE_SIZE, trie_entries, &trie_pool, &split_trie_root ); split_trie = pool_to_vector( trie_pool ); free_pool( &trie_pool ); roman_hangul = FALSE; } /*---------------------------------------------------------------------------*/ void terminate_hangul( void ) /* Terminate the hangul module. */ { if (! split_hangul_syllables) return; free_mem( &split_trie ); free_pool( &string_pool ); } /* Conversion of Jamos to romanised Hangul. =================================*/ static char_t * jamos_to_roman( string_t jamos_string ) /* Convert Jamos string JAMOS_STRING to romanised Hangul. */ { /* Modified Yale roman representation for each of the Jamos letters. */ static string_t romans[24] = { NULL, "a", "ph", "c", "th", "e", "kh", "h", "i", "k", "l", "m", "n", "o", "p", "s", "t", "u", "wu", "ng", "ch", "." 
}; string_t roman_segment; text_t *roman_text; int_t c; char_t enc, prev, next; roman_text = new_text(); while (*jamos_string != EOS) { c = g_utf8_get_char( jamos_string ); if (c >= FIRST_JAMO && c <= SYLLABLE_START) { /* Convert Jamos. */ add_char_to_text( roman_text, '{' ); enc = jamo_enc[ c - FIRST_JAMO ]; prev = EOS; do { jamos_string = g_utf8_next_char( jamos_string ); c = g_utf8_get_char( jamos_string ); if (c >= FIRST_JAMO && c <= SYLLABLE_START) next = jamo_enc[ c - FIRST_JAMO ]; else next = EOS; /* Convert ENC to roman. */ roman_segment = romans[ (int_t) enc ]; switch (enc) { case X_C: if (prev == DOT_C) roman_segment = ""; break; case I_C: if ((prev != EOS && strchr( VOWELS, prev ) != NULL) || (next != EOS && strchr( VOWELS, next ) != NULL)) { roman_segment = "y"; } break; case O_C: if (next == A_C) roman_segment = "w"; break; case W_C: if (prev == I_C) roman_segment = "u"; else if (next != EOS && strchr( VOWELS, next ) != NULL) { roman_segment = "w"; } break; default: break; } add_to_text( roman_text, roman_segment ); prev = enc; enc = next; } while (enc != 0); add_char_to_text( roman_text, '}' ); } else { add_unichar_to_text( roman_text, c ); jamos_string = g_utf8_next_char( jamos_string ); } } return text_to_string( &roman_text ); } /* Conversion of romanised Hangul to Jamos. =================================*/ static char_t * roman_to_jamos( string_t roman_string ) /* Convert transcribed Hangul string ROMAN_STRING to Jamos. */ { /* All letter sequences that can be converted to hancode. */ static struct {string_t roman; string_t jamos;} romans[] = { /* Two-letter strings must come first. 
*/ {"ch", Z}, {"kh", G}, {"th", D}, {"ph", B}, {"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W}, {"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L}, {"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T}, {"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT}, {NULL, NULL} }; int_t i; text_t *jamos_text; jamos_text = new_text(); while (*roman_string != EOS) { if (*roman_string == '{') { roman_string++; while (*roman_string != '}') { if (*roman_string == EOS) complain( "Missing \"}\" in romanised Hangul." ); /* Insert an "x" at beginning of syllable if vowel is following. */ if (roman_string[-1] == '.' && strrchr( "aeiouwy", roman_string[0] ) != NULL) { add_jamos_to_text( jamos_text, X ); } for (i = 0; romans[i].roman != NULL; i++) { if (strncmp( roman_string, romans[i].roman, strlen( romans[i].roman ) ) == 0) { add_jamos_to_text( jamos_text, romans[i].jamos ); roman_string += strlen( romans[i].roman ); break; } } if (romans[i].roman == NULL) { complain( "\"%c\" is not a romanised Hangul letter.", *roman_string ); } } /* Jump over closing "}" */ roman_string++; } else add_char_to_text( jamos_text, *roman_string++ ); } return text_to_string( &jamos_text ); } /* Conversion of Jamos to Hangul syllables. =================================*/ static char_t * jamos_to_syllables( string_t jamos_string ) /* Convert Jamos JAMOS_STRING to Unicode Hangul syllables. */ { text_t *syl_text; int_t trie_code, code, unicode; int_t trie_node; string_t string_p; syl_text = new_text(); while (*jamos_string != EOS) { /* Try to combine a syllable or a multi-consonant Jamo. */ code = g_utf8_get_char( jamos_string ); if (code >= FIRST_JAMO && code <= SYLLABLE_START) { /* Search the trie until we have found the longest segment. 
*/ trie_node = split_trie_root; string_p = jamos_string; unicode = 0; while (lookup_trie( split_trie, &trie_node, &string_p, &trie_code )) { jamos_string = string_p; unicode = trie_code; } if (unicode != 0) add_unichar_to_text( syl_text, unicode ); else if (code == SYLLABLE_START) { add_to_text( syl_text, "{.}" ); jamos_string = g_utf8_next_char( jamos_string ); } else complain( "Internal error." ); } else { add_unichar_to_text( syl_text, code ); jamos_string = g_utf8_next_char( jamos_string ); } } return text_to_string( &syl_text ); } /* Conversion of Hangul syllables to Jamos. =================================*/ static char_t * syllables_to_jamos( string_t syl_string ) /* Convert Hangul syllables in SYL_STRING to Jamos. * The returned string remains valid until this function is called again. */ { text_t *jamos_text; int_t c; jamos_text = new_text(); while (*syl_string != EOS) { c = g_utf8_get_char( syl_string ); syl_string = g_utf8_next_char( syl_string ); if (c >= FIRST_SYLLABLE && c <= LAST_SYLLABLE) add_to_text( jamos_text, split_strings[ c - FIRST_SYLLABLE ] ); else if (c >= FIRST_JAMO && c <= LAST_JAMO) { add_to_text( jamos_text, split_strings[ c - FIRST_JAMO + SYLLABLE_COUNT] ); } else add_unichar_to_text( jamos_text, c ); } return text_to_string( &jamos_text ); } /* Global conversion routines. ==============================================*/ void decode_hangul( char_t **string_p ) /* Decode *STRING_P to external format. * *STRING_P must be a string on the heap. * It will be replaced by the new string which is also on the heap. */ { char_t *string; if (! split_hangul_syllables) return; if (roman_hangul) string = jamos_to_roman( *string_p ); else string = jamos_to_syllables( *string_p ); free_mem( string_p ); *string_p = string; } /*---------------------------------------------------------------------------*/ void encode_hangul( char_t **string_p ) /* Encode *STRING_P to internal format. * *STRING_P must be a string on the heap. 
* It will be replaced by the new string which is also on the heap. */ { char_t *string; if (! split_hangul_syllables) return; string = syllables_to_jamos( *string_p ); free_mem( string_p ); *string_p = roman_to_jamos( string ); free_mem( &string ); } /* End of file. =============================================================*/