/* Copyright (C) 1995 Bjoern Beutel. */
/* Description. =============================================================*/
/* This module handles the splitting of Hangul syllables and multi-letter
* Jamos into single Hangul letters (and back).
* It also converts Romanised Hangul to single-letter Hangul (and back).
*
* In this conversion module, we use four representations of
* Hangul letters and syllables:
* 1. Unicode Hangul syllables, which occupy code points 0xac00-0xd7a3.
* The syllables are sorted by their constituting letters. The sort criteria
* are (from major to minor):
* - the 19 different initial consonant combinations (Choseong) as defined
* in the table CHOSEONGS
* - the 21 different vowel combinations (Jungseong) as defined in the table
* JUNGSEONGS
* - the 28 different final consonant combinations (Jonseong) as defined in
* the table JONSEONGS (including the empty string)
* 2. Unicode alternative Jamo characters, which occupy the code points
* 0x3131-0x3163. For internal representation, only the Jamos that represent
* single characters are used.
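* 3. Roman code, which is a Latin transcription; it adopts the Yale standard
* for Hangul romanization.
* Here, every syllable begins with a dot ".". Transcribed Hangul is
* enclosed in curly brackets in order to distinguish it from original
* Latin characters.
* 4. An intermediate encoding in which each single Jamo letter and the
* syllable start marker is one byte (see the constants A to DOT below). */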
/* Includes. ================================================================*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "pools.h"
#include "tries.h"
#include "hangul.h"
/* Constants. ===============================================================*/
/* Intermediate representation of single Jamo letters as strings.
 * Each macro expands to a one-byte string literal holding the letter's
 * intermediate code (0x01..0x15). Adjacent macros concatenate into one
 * string, which is how the letter sequences in the tables below, e.g.
 * (K K), and VOWELS are spelled.
 * The code of a letter is also its index in JAMOS_UNICODES. */
#define A "\x01" /* a */
#define B "\x02" /* phieuph */
#define C "\x03" /* cieuc */
#define D "\x04" /* thieuth */
#define E "\x05" /* eo */
#define G "\x06" /* kieukh */
#define H "\x07" /* hieuh */
#define I "\x08" /* i */
#define K "\x09" /* kiyeok */
#define L "\x0a" /* rieul */
#define M "\x0b" /* mieum */
#define N "\x0c" /* nieun */
#define O "\x0d" /* o */
#define P "\x0e" /* pieup */
#define S "\x0f" /* sios */
#define T "\x10" /* tikeut */
#define U "\x11" /* eu */
#define W "\x12" /* u */
#define X "\x13" /* ieung */
#define Z "\x14" /* chieuch */
#define DOT "\x15" /* syllable start */
/* All single vowel letters as one string, used with strchr() to test whether
 * an intermediate code is a vowel. */
#define VOWELS A E I O U W
/* Intermediate representation of single Jamo letters as characters.
 * These are the same codes as the string macros A .. DOT, written as
 * character constants so they can be compared with single bytes and used
 * as "case" labels. */
#define A_C '\x01' /* a */
#define B_C '\x02' /* phieuph */
#define C_C '\x03' /* cieuc */
#define D_C '\x04' /* thieuth */
#define E_C '\x05' /* eo */
#define G_C '\x06' /* kieukh */
#define H_C '\x07' /* hieuh */
#define I_C '\x08' /* i */
#define K_C '\x09' /* kiyeok */
#define L_C '\x0a' /* rieul */
#define M_C '\x0b' /* mieum */
#define N_C '\x0c' /* nieun */
#define O_C '\x0d' /* o */
#define P_C '\x0e' /* pieup */
#define S_C '\x0f' /* sios */
#define T_C '\x10' /* tikeut */
#define U_C '\x11' /* eu */
#define W_C '\x12' /* u */
#define X_C '\x13' /* ieung */
#define Z_C '\x14' /* chieuch */
#define DOT_C '\x15' /* syllable start */
/* Unicode representation of single Jamo letters.
 * The table is indexed by intermediate code (A_C .. DOT_C). Index 0 is the
 * string terminator and has no Jamo. The last entry, for DOT_C, has the
 * same value as SYLLABLE_START. */
static const u_short_t jamos_unicodes[22] =
{
0, /* EOS */
0x314f, /* a */
0x314d, /* phieuph */
0x3148, /* cieuc */
0x314c, /* thieuth */
0x3153, /* eo */
0x314b, /* kieukh */
0x314e, /* hieuh */
0x3163, /* i */
0x3131, /* kiyeok */
0x3139, /* rieul */
0x3141, /* mieum */
0x3134, /* nieun */
0x3157, /* o */
0x3142, /* pieup */
0x3145, /* sios */
0x3137, /* tikeut */
0x3161, /* eu */
0x315c, /* u */
0x3147, /* ieung */
0x314a, /* chieuch */
0x3164, /* syllable start */
};
/* Code point that marks the start of a syllable in Jamo strings. It is
 * LAST_JAMO + 1, so FIRST_JAMO .. SYLLABLE_START is one contiguous range. */
enum {SYLLABLE_START = 0x3164};
/* Composition of Jamo characters.
 * FIRST_JAMO .. LAST_JAMO is the code point range covered by the table.
 * JAMOS[i] is the sequence of single letters (in intermediate encoding) that
 * the Jamo with code point FIRST_JAMO + i is split into; a single-letter
 * Jamo maps to itself. */
enum {FIRST_JAMO = 0x3131,
LAST_JAMO = 0x3163,
JAMO_COUNT = (LAST_JAMO - FIRST_JAMO + 1) };
static string_t jamos[JAMO_COUNT] =
{
(K), (K K), (K S), (N), (N C), (N H), (T), (T T), (L), (L K), (L M), (L P),
(L S), (L D), (L B), (L H), (M), (P), (P P), (P S), (S), (S S), (X), (C),
(C C), (Z), (G), (D), (B), (H), (A), (A I), (I A), (I A I), (E), (E I),
(I E), (I E I), (O), (O A), (O I E), (O I), (I O), (W), (W E), (W E I),
(W I), (I W), (U), (U I), (I)
};
/* Code point range of the Unicode Hangul syllables and the number of
 * syllables in that range. */
enum {FIRST_SYLLABLE = 0xac00,
LAST_SYLLABLE = 0xd7a3,
SYLLABLE_COUNT = (LAST_SYLLABLE - FIRST_SYLLABLE + 1)};
/* The initial consonants in a syllable.
 * For the syllable with offset i from FIRST_SYLLABLE, the entry used is
 * CHOSEONGS[ (i / JONSEONG_COUNT) / JUNGSEONG_COUNT ]. Each entry is a
 * sequence of single letters in intermediate encoding. */
enum {CHOSEONG_COUNT = 19};
static string_t choseongs[CHOSEONG_COUNT] =
{
(K), (K K), (N), (T), (T T), (L), (M), (P), (P P), (S), (S S), (X), (C),
(C C), (Z), (G), (D), (B), (H),
};
/* The vowels in a syllable.
 * For the syllable with offset i from FIRST_SYLLABLE, the entry used is
 * JUNGSEONGS[ (i / JONSEONG_COUNT) % JUNGSEONG_COUNT ]. Each entry is a
 * sequence of single letters in intermediate encoding.
 * NOTE(review): entry 10 is spelled (O I E), while its neighbours (O A) and
 * (O I) would suggest (O A I) -- confirm that this spelling is intended. */
enum {JUNGSEONG_COUNT = 21};
static string_t jungseongs[JUNGSEONG_COUNT] =
{
(A), (A I), (I A), (I A I), (E), (E I), (I E), (I E I), (O), (O A), (O I E),
(O I), (I O), (W), (W E), (W E I), (W I), (I W), (U), (U I), (I)
};
/* The final consonants in a syllable.
 * For the syllable with offset i from FIRST_SYLLABLE, the entry used is
 * JONSEONGS[ i % JONSEONG_COUNT ]. Entry 0 is the empty string, for
 * syllables without a final consonant. Each entry is a sequence of single
 * letters in intermediate encoding. */
enum {JONSEONG_COUNT = 28};
static string_t jonseongs[JONSEONG_COUNT] =
{
"", (K), (K K), (K S), (N), (N C), (N H), (T), (L), (L K), (L M), (L P),
(L S), (L D), (L B), (L H), (M), (P), (P S), (S), (S S), (X), (C), (Z), (G),
(D), (B), (H),
};
/* Number of entries in the split table: one for every Hangul syllable,
 * followed by one for every Jamo. */
enum {SPLIT_TABLE_SIZE = (SYLLABLE_COUNT + JAMO_COUNT)};
/* Global variables. ========================================================*/
bool_t roman_hangul;
/* Indicates whether Hangul output is transcribed in latin script.
 * decode_hangul reads this flag; init_hangul resets it to FALSE. */
/* Variables. ===============================================================*/
static byte_t jamo_enc[JAMO_COUNT + 1];
/* Intermediate encoding of Jamo single letters and syllable start.
 * Indexed by code point minus FIRST_JAMO. Filled by init_hangul; the entries
 * of Jamos that are not single letters are never written and stay 0. */
static string_t split_strings[SPLIT_TABLE_SIZE];
/* Jamo strings (UTF-8) for every syllable, at index code - FIRST_SYLLABLE,
 * followed by those for every Jamo, at index
 * SYLLABLE_COUNT + code - FIRST_JAMO. The strings live in STRING_POOL. */
static pool_t string_pool; /* String pool with split Syllables and Jamos. */
static int_t *split_trie; /* Trie used to segmentise Hangul syllables. */
static int_t split_trie_root; /* Root node index of SPLIT_TRIE. */
/* Functions. ===============================================================*/
static int
compare_trie_entries( const void *entry1, const void *entry2 )
/* Compare two trie entries. */
{
return strcmp_no_case( ((trie_entry_t *) entry1)->key,
((trie_entry_t *) entry2)->key );
}
/*---------------------------------------------------------------------------*/
static void
add_jamos_to_text( text_t *text, const char *jamos )
/* Append to TEXT the Unicode Jamo of every letter in JAMOS, a string of
 * single letters in intermediate encoding. */
{
  const char *letter_p = jamos;

  while (*letter_p != EOS)
  {
    /* The intermediate code is the index into JAMOS_UNICODES. */
    add_unichar_to_text( text, jamos_unicodes[ (int_t) *letter_p ] );
    letter_p++;
  }
}
/*---------------------------------------------------------------------------*/
void
init_hangul( void )
/* Initialise the hangul module. */
{
trie_entry_t trie_entries[ SPLIT_TABLE_SIZE ]; /* Syllable/Jamos. */
int_t c, i, choseong, jungseong, jonseong;
pool_t trie_pool;
text_t *text;
if (! split_hangul_syllables)
return;
/* Fill table to convert from Jamo letter to intermediate encoding. */
for (i = A_C; i <= DOT_C; i++)
jamo_enc[ jamos_unicodes[i] - FIRST_JAMO ] = i;
text = new_text();
string_pool = new_pool( sizeof( char_t ) );
for (i = 0; i < SYLLABLE_COUNT; i++)
{
/* Build Jamos string for syllable i. */
c = i / JONSEONG_COUNT;
jonseong = i % JONSEONG_COUNT;
jungseong = c % JUNGSEONG_COUNT;
choseong = c / JUNGSEONG_COUNT;
clear_text( text );
add_unichar_to_text( text, SYLLABLE_START );
add_jamos_to_text( text, choseongs[ choseong ] );
add_jamos_to_text( text, jungseongs[ jungseong ] );
add_jamos_to_text( text, jonseongs[ jonseong ] );
split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
/* Copy to trie entry table. */
trie_entries[i].key = split_strings[i];
trie_entries[i].content = i + FIRST_SYLLABLE;
}
for (i = SYLLABLE_COUNT; i < SPLIT_TABLE_SIZE; i++)
{
clear_text( text );
add_jamos_to_text( text, jamos[ i - SYLLABLE_COUNT ] );
split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
trie_entries[i].key = split_strings[i];
trie_entries[i].content = i - SYLLABLE_COUNT + FIRST_JAMO;
}
free_text( &text );
/* Sort the Jamos strings and build the trie. */
qsort( trie_entries, SPLIT_TABLE_SIZE, sizeof( trie_entry_t ),
compare_trie_entries);
new_trie( SPLIT_TABLE_SIZE, trie_entries, &trie_pool, &split_trie_root );
split_trie = pool_to_vector( trie_pool );
free_pool( &trie_pool );
roman_hangul = FALSE;
}
/*---------------------------------------------------------------------------*/
void
terminate_hangul( void )
/* Terminate the hangul module: release SPLIT_TRIE and STRING_POOL.
 * Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{
  if (split_hangul_syllables)
  {
    free_mem( &split_trie );
    free_pool( &string_pool );
  }
}
/* Conversion of Jamos to romanised Hangul. =================================*/
static char_t *
jamos_to_roman( string_t jamos_string )
/* Convert Jamos string JAMOS_STRING to romanised Hangul.
 * Every maximal run of single-letter Jamos (the syllable start marker
 * included) is transcribed and enclosed in "{" and "}". All other
 * characters are copied unchanged; this includes compound Jamos, which have
 * no intermediate code.
 * Returns the string produced by text_to_string. */
{
  /* Modified Yale roman representation for each of the Jamos letters,
   * indexed by intermediate code. Index 0 stands for "no letter" and has no
   * transcription. */
  static string_t romans[24] =
  { NULL, "a", "ph", "c", "th", "e", "kh", "h", "i", "k", "l", "m", "n", "o",
    "p", "s", "t", "u", "wu", "ng", "ch", "."
  };
  string_t roman_segment;
  text_t *roman_text;
  int_t c;
  char_t enc, prev, next;

  roman_text = new_text();
  while (*jamos_string != EOS)
  {
    c = g_utf8_get_char( jamos_string );

    /* Only Jamos that have an intermediate code can be transcribed.
     * Compound Jamos lie in the same code point range but have JAMO_ENC 0.
     * Without the extra test they would open a "{" group and hand
     * ROMANS[0], which is NULL, to add_to_text. */
    if (c >= FIRST_JAMO && c <= SYLLABLE_START
        && jamo_enc[ c - FIRST_JAMO ] != 0)
    {
      /* Convert Jamos. */
      add_char_to_text( roman_text, '{' );
      enc = jamo_enc[ c - FIRST_JAMO ];
      prev = EOS;
      do
      {
        /* Look one letter ahead; NEXT is EOS when the run ends here. */
        jamos_string = g_utf8_next_char( jamos_string );
        c = g_utf8_get_char( jamos_string );
        if (c >= FIRST_JAMO && c <= SYLLABLE_START)
          next = jamo_enc[ c - FIRST_JAMO ];
        else
          next = EOS;

        /* Convert ENC to roman. */
        roman_segment = romans[ (int_t) enc ];
        switch (enc)
        {
        case X_C:
          /* Ieung directly after the syllable start is not written. */
          if (prev == DOT_C)
            roman_segment = "";
          break;
        case I_C:
          /* "i" next to another vowel is written "y". */
          if ((prev != EOS && strchr( VOWELS, prev ) != NULL)
              || (next != EOS && strchr( VOWELS, next ) != NULL))
          {
            roman_segment = "y";
          }
          break;
        case O_C:
          /* "o" before "a" is written "w". */
          if (next == A_C)
            roman_segment = "w";
          break;
        case W_C:
          /* "wu" is shortened to "u" after "i" and to "w" before a
           * vowel. */
          if (prev == I_C)
            roman_segment = "u";
          else if (next != EOS && strchr( VOWELS, next ) != NULL)
          {
            roman_segment = "w";
          }
          break;
        default:
          break;
        }
        add_to_text( roman_text, roman_segment );
        prev = enc;
        enc = next;
      } while (enc != 0);
      add_char_to_text( roman_text, '}' );
    }
    else
    {
      add_unichar_to_text( roman_text, c );
      jamos_string = g_utf8_next_char( jamos_string );
    }
  }
  return text_to_string( &roman_text );
}
/* Conversion of romanised Hangul to Jamos. =================================*/
static char_t *
roman_to_jamos( string_t roman_string )
/* Convert transcribed Hangul string ROMAN_STRING to Jamos.
 * Only the text between "{" and "}" is converted; everything else is copied
 * byte by byte. Calls complain if a "}" is missing or if a letter inside
 * the brackets is not part of the transcription.
 * Returns the string produced by text_to_string. */
{
  /* All letter sequences that can be converted, with their Jamo letters in
   * intermediate encoding. The table is searched from the top. */
  static struct {string_t roman; string_t jamos;} romans[] =
  {
    /* Two-letter strings must come first. */
    {"ch", Z}, {"kh", G}, {"th", D}, {"ph", B},
    {"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W},
    {"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L},
    {"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T},
    {"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT},
    {NULL, NULL}
  };
  text_t *jamos_text = new_text();
  int_t entry;
  size_t length;

  while (*roman_string != EOS)
  {
    if (*roman_string != '{')
    {
      /* Outside of curly brackets: plain copy. */
      add_char_to_text( jamos_text, *roman_string++ );
      continue;
    }

    /* Jump over opening "{" and convert up to the closing "}". */
    roman_string++;
    while (*roman_string != '}')
    {
      if (*roman_string == EOS)
        complain( "Missing \"}\" in romanised Hangul." );

      /* Insert an "x" at beginning of syllable if vowel is following. */
      if (roman_string[-1] == '.'
          && strchr( "aeiouwy", roman_string[0] ) != NULL)
      {
        add_jamos_to_text( jamos_text, X );
      }

      /* Find the first table entry that is a prefix of the input. */
      length = 0;
      for (entry = 0; romans[ entry ].roman != NULL; entry++)
      {
        length = strlen( romans[ entry ].roman );
        if (strncmp( roman_string, romans[ entry ].roman, length ) == 0)
          break;
      }

      if (romans[ entry ].roman == NULL)
      {
        complain( "\"%c\" is not a romanised Hangul letter.",
                  *roman_string );
      }
      else
      {
        add_jamos_to_text( jamos_text, romans[ entry ].jamos );
        roman_string += length;
      }
    }

    /* Jump over closing "}" */
    roman_string++;
  }
  return text_to_string( &jamos_text );
}
/* Conversion of Jamos to Hangul syllables. =================================*/
static char_t *
jamos_to_syllables( string_t jamos_string )
/* Convert Jamos JAMOS_STRING to Unicode Hangul syllables.
 * At every Jamo or syllable start marker, the longest letter sequence known
 * to SPLIT_TRIE is replaced by its syllable or Jamo. A syllable start marker
 * that begins no known sequence is written as "{.}". All other characters
 * are copied unchanged.
 * Returns the string produced by text_to_string. */
{
  text_t *syl_text = new_text();
  int_t code, found_code, best_code;
  int_t node;
  string_t scan_p;

  while (*jamos_string != EOS)
  {
    code = g_utf8_get_char( jamos_string );
    if (code < FIRST_JAMO || code > SYLLABLE_START)
    {
      /* Not Hangul: plain copy. */
      add_unichar_to_text( syl_text, code );
      jamos_string = g_utf8_next_char( jamos_string );
      continue;
    }

    /* Try to combine a syllable or a multi-consonant Jamo: search the trie
     * until we have found the longest segment. JAMOS_STRING only advances
     * when a segment has been found. */
    node = split_trie_root;
    scan_p = jamos_string;
    best_code = 0;
    while (lookup_trie( split_trie, &node, &scan_p, &found_code ))
    {
      jamos_string = scan_p;
      best_code = found_code;
    }

    if (best_code != 0)
      add_unichar_to_text( syl_text, best_code );
    else if (code == SYLLABLE_START)
    {
      add_to_text( syl_text, "{.}" );
      jamos_string = g_utf8_next_char( jamos_string );
    }
    else
      complain( "Internal error." );
  }
  return text_to_string( &syl_text );
}
/* Conversion of Hangul syllables to Jamos. =================================*/
static char_t *
syllables_to_jamos( string_t syl_string )
/* Convert Hangul syllables in SYL_STRING to Jamos.
 * Every Hangul syllable and every Jamo is replaced by its entry in
 * SPLIT_STRINGS; all other characters are copied unchanged.
 * Returns the string produced by text_to_string; encode_hangul releases it
 * with free_mem. */
{
  text_t *jamos_text = new_text();
  int_t code, split_index;

  while (*syl_string != EOS)
  {
    code = g_utf8_get_char( syl_string );
    syl_string = g_utf8_next_char( syl_string );

    /* Syllables occupy the first part of SPLIT_STRINGS, Jamos the second. */
    if (code >= FIRST_SYLLABLE && code <= LAST_SYLLABLE)
      split_index = code - FIRST_SYLLABLE;
    else if (code >= FIRST_JAMO && code <= LAST_JAMO)
      split_index = code - FIRST_JAMO + SYLLABLE_COUNT;
    else
      split_index = -1;

    if (split_index >= 0)
      add_to_text( jamos_text, split_strings[ split_index ] );
    else
      add_unichar_to_text( jamos_text, code );
  }
  return text_to_string( &jamos_text );
}
/* Global conversion routines. ==============================================*/
void
decode_hangul( char_t **string_p )
/* Decode *STRING_P to external format: romanised Hangul if ROMAN_HANGUL is
 * set, Hangul syllables otherwise.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap.
 * Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{
  char_t *decoded;

  if (! split_hangul_syllables)
    return;

  decoded = (roman_hangul
             ? jamos_to_roman( *string_p )
             : jamos_to_syllables( *string_p ));
  free_mem( string_p );
  *string_p = decoded;
}
/*---------------------------------------------------------------------------*/
void
encode_hangul( char_t **string_p )
/* Encode *STRING_P to internal format: syllables and Jamos are split into
 * single letters first, then romanised Hangul in curly brackets is
 * converted.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap.
 * Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{
  char_t *split_string;

  if (split_hangul_syllables)
  {
    split_string = syllables_to_jamos( *string_p );
    free_mem( string_p );
    *string_p = roman_to_jamos( split_string );
    free_mem( &split_string );
  }
}
/* End of file. =============================================================*/