|
Packit |
d394d9 |
/* Copyright (C) 1995 Bjoern Beutel. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Description. =============================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* This module handles the splitting of Hangul syllables and multi-letter
|
|
Packit |
d394d9 |
* Jamos into single Hangul letters (and back).
|
|
Packit |
d394d9 |
* It also converts Romanised Hangul to single-letter Hangul (and back).
|
|
Packit |
d394d9 |
*
|
|
Packit |
d394d9 |
* In this conversion module, we use four representations of
|
|
Packit |
d394d9 |
* Hangul letters and syllables:
|
|
Packit |
d394d9 |
* 1. Unicode Hangul syllables, which occupy code points 0xac00-0xd7a3.
|
|
Packit |
d394d9 |
* The syllables are sorted by their constituting letters. The sort criteria
|
|
Packit |
d394d9 |
* are (from major to minor):
|
|
Packit |
d394d9 |
* - the 19 different initial consonant combinations (Choseong) as defined
|
|
Packit |
d394d9 |
* in the table CHOSEONGS
|
|
Packit |
d394d9 |
* - the 21 different vowel combinations (Jungseong) as defined in the table
|
|
Packit |
d394d9 |
* JUNGSEONGS
|
|
Packit |
d394d9 |
* - the 28 different final consonant combinations (Jonseong) as defined in
|
|
Packit |
d394d9 |
* the table JONSEONGS (including the empty string)
|
|
Packit |
d394d9 |
* 2. Unicode alternative Jamo characters, which occupy the code points
|
|
Packit |
d394d9 |
* 0x3131-0x3163. For internal representation, only the Jamos that represent
|
|
Packit |
d394d9 |
* single characters are used.
|
|
Packit |
d394d9 |
* 3. Roman code, which is a latin transcription; it adopts the Yale standard
|
|
Packit |
d394d9 |
* for Hangul romanization.
|
|
Packit |
d394d9 |
* Here, every syllable begins with a dot ".". Transcribed Hangul is
|
|
Packit |
d394d9 |
* enclosed in curly brackets in order to distinguish it from original
|
|
Packit |
d394d9 |
* latin characters. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Includes. ================================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
#include <stdio.h>
|
|
Packit |
d394d9 |
#include <string.h>
|
|
Packit |
d394d9 |
#include <stdlib.h>
|
|
Packit |
d394d9 |
#include <setjmp.h>
|
|
Packit |
d394d9 |
#include <glib.h>
|
|
Packit |
d394d9 |
#include "basic.h"
|
|
Packit |
d394d9 |
#include "pools.h"
|
|
Packit |
d394d9 |
#include "tries.h"
|
|
Packit |
d394d9 |
#include "hangul.h"
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Constants. ===============================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Intermediate representation of single Jamo letters as one-byte strings.
 * Adjacent string literals concatenate, so a sequence of letters can be
 * written as, for example, (K S).
 * The byte value of each letter is also its index in JAMOS_UNICODES. */

/* Vowel letters. */
#define A "\x01" /* a */
#define E "\x05" /* eo */
#define I "\x08" /* i */
#define O "\x0d" /* o */
#define U "\x11" /* eu */
#define W "\x12" /* u */

/* Consonant letters. */
#define B "\x02" /* phieuph */
#define C "\x03" /* cieuc */
#define D "\x04" /* thieuth */
#define G "\x06" /* kieukh */
#define H "\x07" /* hieuh */
#define K "\x09" /* kiyeok */
#define L "\x0a" /* rieul */
#define M "\x0b" /* mieum */
#define N "\x0c" /* nieun */
#define P "\x0e" /* pieup */
#define S "\x0f" /* sios */
#define T "\x10" /* tikeut */
#define X "\x13" /* ieung */
#define Z "\x14" /* chieuch */

/* Marker. */
#define DOT "\x15" /* syllable start */

/* All vowel letters in one string, suitable as the set argument of strchr. */
#define VOWELS A E I O U W
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Intermediate representation of single Jamo letters as characters.
 * These are the same codes as the one-byte string macros above, in a form
 * that can be compared against a character or used as a switch label. */

/* Vowel letters. */
#define A_C '\x01' /* a */
#define E_C '\x05' /* eo */
#define I_C '\x08' /* i */
#define O_C '\x0d' /* o */
#define U_C '\x11' /* eu */
#define W_C '\x12' /* u */

/* Consonant letters. */
#define B_C '\x02' /* phieuph */
#define C_C '\x03' /* cieuc */
#define D_C '\x04' /* thieuth */
#define G_C '\x06' /* kieukh */
#define H_C '\x07' /* hieuh */
#define K_C '\x09' /* kiyeok */
#define L_C '\x0a' /* rieul */
#define M_C '\x0b' /* mieum */
#define N_C '\x0c' /* nieun */
#define P_C '\x0e' /* pieup */
#define S_C '\x0f' /* sios */
#define T_C '\x10' /* tikeut */
#define X_C '\x13' /* ieung */
#define Z_C '\x14' /* chieuch */

/* Marker. */
#define DOT_C '\x15' /* syllable start */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Unicode representation of single Jamo letters. */
|
|
Packit |
d394d9 |
static const u_short_t jamos_unicodes[22] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
0, /* EOS */
|
|
Packit |
d394d9 |
0x314f, /* a */
|
|
Packit |
d394d9 |
0x314d, /* phieuph */
|
|
Packit |
d394d9 |
0x3148, /* cieuc */
|
|
Packit |
d394d9 |
0x314c, /* thieuth */
|
|
Packit |
d394d9 |
0x3153, /* eo */
|
|
Packit |
d394d9 |
0x314b, /* kieukh */
|
|
Packit |
d394d9 |
0x314e, /* hieuh */
|
|
Packit |
d394d9 |
0x3163, /* i */
|
|
Packit |
d394d9 |
0x3131, /* kiyeok */
|
|
Packit |
d394d9 |
0x3139, /* rieul */
|
|
Packit |
d394d9 |
0x3141, /* mieum */
|
|
Packit |
d394d9 |
0x3134, /* nieun */
|
|
Packit |
d394d9 |
0x3157, /* o */
|
|
Packit |
d394d9 |
0x3142, /* pieup */
|
|
Packit |
d394d9 |
0x3145, /* sios */
|
|
Packit |
d394d9 |
0x3137, /* tikeut */
|
|
Packit |
d394d9 |
0x3161, /* eu */
|
|
Packit |
d394d9 |
0x315c, /* u */
|
|
Packit |
d394d9 |
0x3147, /* ieung */
|
|
Packit |
d394d9 |
0x314a, /* chieuch */
|
|
Packit |
d394d9 |
0x3164, /* syllable start */
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Code point that marks the start of a syllable in a Jamo string. */
enum
{
  SYLLABLE_START = 0x3164
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Composition of Jamo characters */
|
|
Packit |
d394d9 |
enum {FIRST_JAMO = 0x3131,
|
|
Packit |
d394d9 |
LAST_JAMO = 0x3163,
|
|
Packit |
d394d9 |
JAMO_COUNT = (LAST_JAMO - FIRST_JAMO + 1) };
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static string_t jamos[JAMO_COUNT] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
(K), (K K), (K S), (N), (N C), (N H), (T), (T T), (L), (L K), (L M), (L P),
|
|
Packit |
d394d9 |
(L S), (L D), (L B), (L H), (M), (P), (P P), (P S), (S), (S S), (X), (C),
|
|
Packit |
d394d9 |
(C C), (Z), (G), (D), (B), (H), (A), (A I), (I A), (I A I), (E), (E I),
|
|
Packit |
d394d9 |
(I E), (I E I), (O), (O A), (O I E), (O I), (I O), (W), (W E), (W E I),
|
|
Packit |
d394d9 |
(W I), (I W), (U), (U I), (I)
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Range of the precomposed Unicode Hangul syllables. */
enum
{
  FIRST_SYLLABLE = 0xac00,
  LAST_SYLLABLE = 0xd7a3,
  SYLLABLE_COUNT = (LAST_SYLLABLE - FIRST_SYLLABLE + 1)
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* The initial consonants in a syllable. */
|
|
Packit |
d394d9 |
enum {CHOSEONG_COUNT = 19};
|
|
Packit |
d394d9 |
static string_t choseongs[CHOSEONG_COUNT] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
(K), (K K), (N), (T), (T T), (L), (M), (P), (P P), (S), (S S), (X), (C),
|
|
Packit |
d394d9 |
(C C), (Z), (G), (D), (B), (H),
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* The vowels in a syllable. */
|
|
Packit |
d394d9 |
enum {JUNGSEONG_COUNT = 21};
|
|
Packit |
d394d9 |
static string_t jungseongs[JUNGSEONG_COUNT] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
(A), (A I), (I A), (I A I), (E), (E I), (I E), (I E I), (O), (O A), (O I E),
|
|
Packit |
d394d9 |
(O I), (I O), (W), (W E), (W E I), (W I), (I W), (U), (U I), (I)
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* The final consonants in a syllable. */
|
|
Packit |
d394d9 |
enum {JONSEONG_COUNT = 28};
|
|
Packit |
d394d9 |
static string_t jonseongs[JONSEONG_COUNT] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
"", (K), (K K), (K S), (N), (N C), (N H), (T), (L), (L K), (L M), (L P),
|
|
Packit |
d394d9 |
(L S), (L D), (L B), (L H), (M), (P), (P S), (S), (S S), (X), (C), (Z), (G),
|
|
Packit |
d394d9 |
(D), (B), (H),
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Number of entries in the split table: one for each syllable, followed by
 * one for each Jamo character. */
enum
{
  SPLIT_TABLE_SIZE = (SYLLABLE_COUNT + JAMO_COUNT)
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Global variables. ========================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Indicates whether Hangul output is transcribed in latin script.
 * It is read by decode_hangul and reset to FALSE by init_hangul. */
bool_t roman_hangul;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Variables. ===============================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static byte_t jamo_enc[JAMO_COUNT + 1];
|
|
Packit |
d394d9 |
/* Intermediate encoding of Jamo single letters and syllable start. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static string_t split_strings[SPLIT_TABLE_SIZE];
|
|
Packit |
d394d9 |
static pool_t string_pool; /* String pool with split Syllables and Jamos. */
|
|
Packit |
d394d9 |
static int_t *split_trie; /* Trie used to segmentise Hangul syllables. */
|
|
Packit |
d394d9 |
static int_t split_trie_root; /* Root node index of SPLIT_TRIE. */
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Functions. ===============================================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static int
|
|
Packit |
d394d9 |
compare_trie_entries( const void *entry1, const void *entry2 )
|
|
Packit |
d394d9 |
/* Compare two trie entries. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
return strcmp_no_case( ((trie_entry_t *) entry1)->key,
|
|
Packit |
d394d9 |
((trie_entry_t *) entry2)->key );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static void
|
|
Packit |
d394d9 |
add_jamos_to_text( text_t *text, const char *jamos )
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
for (; *jamos != EOS; jamos++ )
|
|
Packit |
d394d9 |
add_unichar_to_text( text, jamos_unicodes[ (int_t) *jamos ]);
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
void
|
|
Packit |
d394d9 |
init_hangul( void )
|
|
Packit |
d394d9 |
/* Initialise the hangul module. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
trie_entry_t trie_entries[ SPLIT_TABLE_SIZE ]; /* Syllable/Jamos. */
|
|
Packit |
d394d9 |
int_t c, i, choseong, jungseong, jonseong;
|
|
Packit |
d394d9 |
pool_t trie_pool;
|
|
Packit |
d394d9 |
text_t *text;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
if (! split_hangul_syllables)
|
|
Packit |
d394d9 |
return;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Fill table to convert from Jamo letter to intermediate encoding. */
|
|
Packit |
d394d9 |
for (i = A_C; i <= DOT_C; i++)
|
|
Packit |
d394d9 |
jamo_enc[ jamos_unicodes[i] - FIRST_JAMO ] = i;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
text = new_text();
|
|
Packit |
d394d9 |
string_pool = new_pool( sizeof( char_t ) );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
for (i = 0; i < SYLLABLE_COUNT; i++)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Build Jamos string for syllable i. */
|
|
Packit |
d394d9 |
c = i / JONSEONG_COUNT;
|
|
Packit |
d394d9 |
jonseong = i % JONSEONG_COUNT;
|
|
Packit |
d394d9 |
jungseong = c % JUNGSEONG_COUNT;
|
|
Packit |
d394d9 |
choseong = c / JUNGSEONG_COUNT;
|
|
Packit |
d394d9 |
clear_text( text );
|
|
Packit |
d394d9 |
add_unichar_to_text( text, SYLLABLE_START );
|
|
Packit |
d394d9 |
add_jamos_to_text( text, choseongs[ choseong ] );
|
|
Packit |
d394d9 |
add_jamos_to_text( text, jungseongs[ jungseong ] );
|
|
Packit |
d394d9 |
add_jamos_to_text( text, jonseongs[ jonseong ] );
|
|
Packit |
d394d9 |
split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Copy to trie entry table. */
|
|
Packit |
d394d9 |
trie_entries[i].key = split_strings[i];
|
|
Packit |
d394d9 |
trie_entries[i].content = i + FIRST_SYLLABLE;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
for (i = SYLLABLE_COUNT; i < SPLIT_TABLE_SIZE; i++)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
clear_text( text );
|
|
Packit |
d394d9 |
add_jamos_to_text( text, jamos[ i - SYLLABLE_COUNT ] );
|
|
Packit |
d394d9 |
split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
|
|
Packit |
d394d9 |
trie_entries[i].key = split_strings[i];
|
|
Packit |
d394d9 |
trie_entries[i].content = i - SYLLABLE_COUNT + FIRST_JAMO;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
free_text( &text );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Sort the Jamos strings and build the trie. */
|
|
Packit |
d394d9 |
qsort( trie_entries, SPLIT_TABLE_SIZE, sizeof( trie_entry_t ),
|
|
Packit |
d394d9 |
compare_trie_entries);
|
|
Packit |
d394d9 |
new_trie( SPLIT_TABLE_SIZE, trie_entries, &trie_pool, &split_trie_root );
|
|
Packit |
d394d9 |
split_trie = pool_to_vector( trie_pool );
|
|
Packit |
d394d9 |
free_pool( &trie_pool );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
roman_hangul = FALSE;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
void
|
|
Packit |
d394d9 |
terminate_hangul( void )
|
|
Packit |
d394d9 |
/* Terminate the hangul module. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (! split_hangul_syllables)
|
|
Packit |
d394d9 |
return;
|
|
Packit |
d394d9 |
free_mem( &split_trie );
|
|
Packit |
d394d9 |
free_pool( &string_pool );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Conversion of Jamos to romanised Hangul. =================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static char_t *
|
|
Packit |
d394d9 |
jamos_to_roman( string_t jamos_string )
|
|
Packit |
d394d9 |
/* Convert Jamos string JAMOS_STRING to romanised Hangul. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Modified Yale roman representation for each of the Jamos letters. */
|
|
Packit |
d394d9 |
static string_t romans[24] =
|
|
Packit |
d394d9 |
{ NULL, "a", "ph", "c", "th", "e", "kh", "h", "i", "k", "l", "m", "n", "o",
|
|
Packit |
d394d9 |
"p", "s", "t", "u", "wu", "ng", "ch", "."
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
string_t roman_segment;
|
|
Packit |
d394d9 |
text_t *roman_text;
|
|
Packit |
d394d9 |
int_t c;
|
|
Packit |
d394d9 |
char_t enc, prev, next;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
roman_text = new_text();
|
|
Packit |
d394d9 |
while (*jamos_string != EOS)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
c = g_utf8_get_char( jamos_string );
|
|
Packit |
d394d9 |
if (c >= FIRST_JAMO && c <= SYLLABLE_START)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Convert Jamos. */
|
|
Packit |
d394d9 |
add_char_to_text( roman_text, '{' );
|
|
Packit |
d394d9 |
enc = jamo_enc[ c - FIRST_JAMO ];
|
|
Packit |
d394d9 |
prev = EOS;
|
|
Packit |
d394d9 |
do
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
jamos_string = g_utf8_next_char( jamos_string );
|
|
Packit |
d394d9 |
c = g_utf8_get_char( jamos_string );
|
|
Packit |
d394d9 |
if (c >= FIRST_JAMO && c <= SYLLABLE_START)
|
|
Packit |
d394d9 |
next = jamo_enc[ c - FIRST_JAMO ];
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
next = EOS;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Convert ENC to roman. */
|
|
Packit |
d394d9 |
roman_segment = romans[ (int_t) enc ];
|
|
Packit |
d394d9 |
switch (enc)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
case X_C:
|
|
Packit |
d394d9 |
if (prev == DOT_C)
|
|
Packit |
d394d9 |
roman_segment = "";
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
case I_C:
|
|
Packit |
d394d9 |
if ((prev != EOS && strchr( VOWELS, prev ) != NULL)
|
|
Packit |
d394d9 |
|| (next != EOS && strchr( VOWELS, next ) != NULL))
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
roman_segment = "y";
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
case O_C:
|
|
Packit |
d394d9 |
if (next == A_C)
|
|
Packit |
d394d9 |
roman_segment = "w";
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
case W_C:
|
|
Packit |
d394d9 |
if (prev == I_C)
|
|
Packit |
d394d9 |
roman_segment = "u";
|
|
Packit |
d394d9 |
else if (next != EOS && strchr( VOWELS, next ) != NULL)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
roman_segment = "w";
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
default:
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
add_to_text( roman_text, roman_segment );
|
|
Packit |
d394d9 |
prev = enc;
|
|
Packit |
d394d9 |
enc = next;
|
|
Packit |
d394d9 |
} while (enc != 0);
|
|
Packit |
d394d9 |
add_char_to_text( roman_text, '}' );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_unichar_to_text( roman_text, c );
|
|
Packit |
d394d9 |
jamos_string = g_utf8_next_char( jamos_string );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
return text_to_string( &roman_text );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Conversion of romanised Hangul to Jamos. =================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static char_t *
|
|
Packit |
d394d9 |
roman_to_jamos( string_t roman_string )
|
|
Packit |
d394d9 |
/* Convert transcribed Hangul string ROMAN_STRING to Jamos. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* All letter sequences that can be converted to hancode. */
|
|
Packit |
d394d9 |
static struct {string_t roman; string_t jamos;} romans[] =
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Two-letter strings must come first. */
|
|
Packit |
d394d9 |
{"ch", Z}, {"kh", G}, {"th", D}, {"ph", B},
|
|
Packit |
d394d9 |
{"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W},
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
{"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L},
|
|
Packit |
d394d9 |
{"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T},
|
|
Packit |
d394d9 |
{"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT},
|
|
Packit |
d394d9 |
{NULL, NULL}
|
|
Packit |
d394d9 |
};
|
|
Packit |
d394d9 |
int_t i;
|
|
Packit |
d394d9 |
text_t *jamos_text;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
jamos_text = new_text();
|
|
Packit |
d394d9 |
while (*roman_string != EOS)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (*roman_string == '{')
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
roman_string++;
|
|
Packit |
d394d9 |
while (*roman_string != '}')
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (*roman_string == EOS)
|
|
Packit |
d394d9 |
complain( "Missing \"}\" in romanised Hangul." );
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Insert an "x" at beginning of syllable if vowel is following. */
|
|
Packit |
d394d9 |
if (roman_string[-1] == '.'
|
|
Packit |
d394d9 |
&& strrchr( "aeiouwy", roman_string[0] ) != NULL)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_jamos_to_text( jamos_text, X );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
for (i = 0; romans[i].roman != NULL; i++)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
if (strncmp( roman_string, romans[i].roman,
|
|
Packit |
d394d9 |
strlen( romans[i].roman ) ) == 0)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_jamos_to_text( jamos_text, romans[i].jamos );
|
|
Packit |
d394d9 |
roman_string += strlen( romans[i].roman );
|
|
Packit |
d394d9 |
break;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
if (romans[i].roman == NULL)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
complain( "\"%c\" is not a romanised Hangul letter.",
|
|
Packit |
d394d9 |
*roman_string );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
/* Jump over closing "}" */
|
|
Packit |
d394d9 |
roman_string++;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
add_char_to_text( jamos_text, *roman_string++ );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
return text_to_string( &jamos_text );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Conversion of Jamos to Hangul syllables. =================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static char_t *
|
|
Packit |
d394d9 |
jamos_to_syllables( string_t jamos_string )
|
|
Packit |
d394d9 |
/* Convert Jamos JAMOS_STRING to Unicode Hangul syllables. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
text_t *syl_text;
|
|
Packit |
d394d9 |
int_t trie_code, code, unicode;
|
|
Packit |
d394d9 |
int_t trie_node;
|
|
Packit |
d394d9 |
string_t string_p;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
syl_text = new_text();
|
|
Packit |
d394d9 |
while (*jamos_string != EOS)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Try to combine a syllable or a multi-consonant Jamo. */
|
|
Packit |
d394d9 |
code = g_utf8_get_char( jamos_string );
|
|
Packit |
d394d9 |
if (code >= FIRST_JAMO && code <= SYLLABLE_START)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
/* Search the trie until we have found the longest segment. */
|
|
Packit |
d394d9 |
trie_node = split_trie_root;
|
|
Packit |
d394d9 |
string_p = jamos_string;
|
|
Packit |
d394d9 |
unicode = 0;
|
|
Packit |
d394d9 |
while (lookup_trie( split_trie, &trie_node, &string_p, &trie_code ))
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
jamos_string = string_p;
|
|
Packit |
d394d9 |
unicode = trie_code;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
if (unicode != 0)
|
|
Packit |
d394d9 |
add_unichar_to_text( syl_text, unicode );
|
|
Packit |
d394d9 |
else if (code == SYLLABLE_START)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_to_text( syl_text, "{.}" );
|
|
Packit |
d394d9 |
jamos_string = g_utf8_next_char( jamos_string );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
complain( "Internal error." );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_unichar_to_text( syl_text, code );
|
|
Packit |
d394d9 |
jamos_string = g_utf8_next_char( jamos_string );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
return text_to_string( &syl_text );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Conversion of Hangul syllables to Jamos. =================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
static char_t *
|
|
Packit |
d394d9 |
syllables_to_jamos( string_t syl_string )
|
|
Packit |
d394d9 |
/* Convert Hangul syllables in SYL_STRING to Jamos.
|
|
Packit |
d394d9 |
* The returned string remains valid until this function is called again. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
text_t *jamos_text;
|
|
Packit |
d394d9 |
int_t c;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
jamos_text = new_text();
|
|
Packit |
d394d9 |
while (*syl_string != EOS)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
c = g_utf8_get_char( syl_string );
|
|
Packit |
d394d9 |
syl_string = g_utf8_next_char( syl_string );
|
|
Packit |
d394d9 |
if (c >= FIRST_SYLLABLE && c <= LAST_SYLLABLE)
|
|
Packit |
d394d9 |
add_to_text( jamos_text, split_strings[ c - FIRST_SYLLABLE ] );
|
|
Packit |
d394d9 |
else if (c >= FIRST_JAMO && c <= LAST_JAMO)
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
add_to_text( jamos_text,
|
|
Packit |
d394d9 |
split_strings[ c - FIRST_JAMO + SYLLABLE_COUNT] );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
add_unichar_to_text( jamos_text, c );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
return text_to_string( &jamos_text );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* Global conversion routines. ==============================================*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
void
|
|
Packit |
d394d9 |
decode_hangul( char_t **string_p )
|
|
Packit |
d394d9 |
/* Decode *STRING_P to external format.
|
|
Packit |
d394d9 |
* *STRING_P must be a string on the heap.
|
|
Packit |
d394d9 |
* It will be replaced by the new string which is also on the heap. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
char_t *string;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
if (! split_hangul_syllables)
|
|
Packit |
d394d9 |
return;
|
|
Packit |
d394d9 |
if (roman_hangul)
|
|
Packit |
d394d9 |
string = jamos_to_roman( *string_p );
|
|
Packit |
d394d9 |
else
|
|
Packit |
d394d9 |
string = jamos_to_syllables( *string_p );
|
|
Packit |
d394d9 |
free_mem( string_p );
|
|
Packit |
d394d9 |
*string_p = string;
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/*---------------------------------------------------------------------------*/
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
void
|
|
Packit |
d394d9 |
encode_hangul( char_t **string_p )
|
|
Packit |
d394d9 |
/* Encode *STRING_P to internal format.
|
|
Packit |
d394d9 |
* *STRING_P must be a string on the heap.
|
|
Packit |
d394d9 |
* It will be replaced by the new string which is also on the heap. */
|
|
Packit |
d394d9 |
{
|
|
Packit |
d394d9 |
char_t *string;
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
if (! split_hangul_syllables)
|
|
Packit |
d394d9 |
return;
|
|
Packit |
d394d9 |
string = syllables_to_jamos( *string_p );
|
|
Packit |
d394d9 |
free_mem( string_p );
|
|
Packit |
d394d9 |
*string_p = roman_to_jamos( string );
|
|
Packit |
d394d9 |
free_mem( &string );
|
|
Packit |
d394d9 |
}
|
|
Packit |
d394d9 |
|
|
Packit |
d394d9 |
/* End of file. =============================================================*/
|