Blob Blame History Raw
/* Copyright (C) 1995 Bjoern Beutel. */

/* Description. =============================================================*/

/* This module handles the splitting of Hangul syllables and multi-letter
 * Jamos into single Hangul letters (and back).
 * It also converts Romanised Hangul to single-letter Hangul (and back).
 * 
 * In this conversion module, we use four representations of
 * Hangul letters and syllables:
 * 1. Unicode Hangul syllables, which occupy code points 0xac00-0xd7a3.
 *    The syllables are sorted by their constituting letters. The sort criteria
 *    are (from major to minor):
 *    - the 19 different initial consonant combinations (Choseong) as defined 
 *      in the table CHOSEONGS
 *    - the 21 different vowel combinations (Jungseong) as defined in the table
 *      JUNGSEONGS
 *    - the 28 different final consonant combinations (Jonseong) as defined in
 *      the table JONSEONGS (including the empty string)
 * 2. Unicode alternative Jamo characters, which occupy the code points 
 *    0x3131-0x3163. For internal representation, only the Jamos that represent
 *    single characters are used.
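 * 3. Roman code, which is a Latin transcription; it adopts the Yale standard 
 *    for Hangul romanization.
 *    Here, every syllable begins with a dot ".". Transcribed Hangul is 
 *    enclosed in curly brackets in order to distinguish it from original
 *    Latin characters.
 * 4. An intermediate encoding used only inside this module, in which each
 *    single Jamo letter and the syllable start marker is one byte in the
 *    range 0x01-0x15 (see the constants below). */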

/* Includes. ================================================================*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "pools.h"
#include "tries.h"
#include "hangul.h"

/* Constants. ===============================================================*/

/* Intermediate representation of single Jamo letters as strings.
 * Each macro expands to a one-character string literal, so several macros
 * written next to each other (for example K K) are concatenated by the
 * compiler into one multi-letter Jamo string.
 * The character value of each letter is also its index in the table
 * JAMOS_UNICODES. */
#define A "\x01" /* a */
#define B "\x02" /* phieuph */
#define C "\x03" /* cieuc */
#define D "\x04" /* thieuth */
#define E "\x05" /* eo */
#define G "\x06" /* kieukh */
#define H "\x07" /* hieuh */
#define I "\x08" /* i */
#define K "\x09" /* kiyeok */
#define L "\x0a" /* rieul */
#define M "\x0b" /* mieum */
#define N "\x0c" /* nieun */
#define O "\x0d" /* o */
#define P "\x0e" /* pieup */
#define S "\x0f" /* sios */ 
#define T "\x10" /* tikeut */
#define U "\x11" /* eu */ 
#define W "\x12" /* u */
#define X "\x13" /* ieung */
#define Z "\x14" /* chieuch */
#define DOT "\x15" /* syllable start */

/* All vowel letters concatenated into one string, so that a single "strchr"
 * call can test whether a letter is a vowel. */
#define VOWELS A E I O U W

/* Intermediate representation of single Jamo letters as characters.
 * These are the character-constant counterparts of the string macros
 * A .. DOT and carry the same values (0x01 .. 0x15). */
#define A_C '\x01' /* a */
#define B_C '\x02' /* phieuph */
#define C_C '\x03' /* cieuc */
#define D_C '\x04' /* thieuth */
#define E_C '\x05' /* eo */
#define G_C '\x06' /* kieukh */
#define H_C '\x07' /* hieuh */
#define I_C '\x08' /* i */
#define K_C '\x09' /* kiyeok */
#define L_C '\x0a' /* rieul */
#define M_C '\x0b' /* mieum */
#define N_C '\x0c' /* nieun */
#define O_C '\x0d' /* o */
#define P_C '\x0e' /* pieup */
#define S_C '\x0f' /* sios */ 
#define T_C '\x10' /* tikeut */
#define U_C '\x11' /* eu */ 
#define W_C '\x12' /* u */
#define X_C '\x13' /* ieung */
#define Z_C '\x14' /* chieuch */
#define DOT_C '\x15' /* syllable start */

/* Unicode representation of single Jamo letters.
 * The table is indexed by the intermediate encoding (A_C .. DOT_C);
 * index 0 is the string terminator and has no Jamo. */
static const u_short_t jamos_unicodes[22] =
{
  0, /* EOS */
  0x314f, /* a */
  0x314d, /* phieuph */
  0x3148, /* cieuc */
  0x314c, /* thieuth */
  0x3153, /* eo */
  0x314b, /* kieukh */
  0x314e, /* hieuh */
  0x3163, /* i */
  0x3131, /* kiyeok */
  0x3139, /* rieul */
  0x3141, /* mieum */
  0x3134, /* nieun */
  0x3157, /* o */
  0x3142, /* pieup */
  0x3145, /* sios */ 
  0x3137, /* tikeut */
  0x3161, /* eu */ 
  0x315c, /* u */
  0x3147, /* ieung */
  0x314a, /* chieuch */
  0x3164, /* syllable start */
};

/* Code point that marks the start of a syllable in a Jamos string.
 * It is the code point directly after LAST_JAMO, so range checks of the form
 * FIRST_JAMO <= c <= SYLLABLE_START cover all Jamos plus this marker. */
enum {SYLLABLE_START = 0x3164};

/* Composition of Jamo characters: first and last code point of the Unicode
 * Jamo block handled here, and the number of code points in that range. */
enum {FIRST_JAMO = 0x3131, 
      LAST_JAMO = 0x3163, 
      JAMO_COUNT = (LAST_JAMO - FIRST_JAMO + 1) };

/* For each Jamo code point FIRST_JAMO .. LAST_JAMO (in code point order),
 * its decomposition into single letters in intermediate encoding.
 * Entry i belongs to code point FIRST_JAMO + i. */
static string_t jamos[JAMO_COUNT] =
{
  (K), (K K), (K S), (N), (N C), (N H), (T), (T T), (L), (L K), (L M), (L P),
  (L S), (L D), (L B), (L H), (M), (P), (P P), (P S), (S), (S S), (X), (C),
  (C C), (Z), (G), (D), (B), (H), (A), (A I), (I A), (I A I), (E), (E I), 
  (I E), (I E I), (O), (O A), (O I E), (O I), (I O), (W), (W E), (W E I), 
  (W I), (I W), (U), (U I), (I) 
};

/* First and last code point of the Unicode Hangul syllables, and the number
 * of syllables in that range. */
enum {FIRST_SYLLABLE = 0xac00,
      LAST_SYLLABLE = 0xd7a3,
      SYLLABLE_COUNT = (LAST_SYLLABLE - FIRST_SYLLABLE + 1)};

/* The initial consonants in a syllable, as single letters in intermediate
 * encoding. The table is indexed by the Choseong index that init_hangul
 * derives from the syllable number. */
enum {CHOSEONG_COUNT = 19};
static string_t choseongs[CHOSEONG_COUNT] = 
{ 
  (K), (K K), (N), (T), (T T), (L), (M), (P), (P P), (S), (S S), (X), (C), 
  (C C), (Z), (G), (D), (B), (H),
};

/* The vowels in a syllable, as single letters in intermediate encoding.
 * The table is indexed by the Jungseong index that init_hangul derives from
 * the syllable number.
 * NOTE(review): entry 10 (WAE in Unicode order) is spelled (O I E), the same
 * as in the table JAMOS -- confirm that this decomposition is intended
 * rather than (O A I). */
enum {JUNGSEONG_COUNT = 21};
static string_t jungseongs[JUNGSEONG_COUNT] = 
{ 
  (A), (A I), (I A), (I A I), (E), (E I), (I E), (I E I), (O), (O A), (O I E),
  (O I), (I O), (W), (W E), (W E I), (W I), (I W), (U), (U I), (I)
};

/* The final consonants in a syllable, as single letters in intermediate
 * encoding. Entry 0 is the empty string, for syllables without a final
 * consonant. The table is indexed by the Jonseong index that init_hangul
 * derives from the syllable number. */
enum {JONSEONG_COUNT = 28};
static string_t jonseongs[JONSEONG_COUNT] = 
{
  "", (K), (K K), (K S), (N), (N C), (N H), (T), (L), (L K), (L M), (L P), 
  (L S), (L D), (L B), (L H), (M), (P), (P S), (S), (S S), (X), (C), (Z), (G), 
  (D), (B), (H),
};

/* Number of entries in the split table: one per Hangul syllable, followed by
 * one per Jamo code point. */
enum {SPLIT_TABLE_SIZE = (SYLLABLE_COUNT + JAMO_COUNT)};

/* Global variables. ========================================================*/

bool_t roman_hangul;
/* Indicates whether Hangul output is transcribed in latin script.
 * init_hangul sets it to FALSE when the module is active; decode_hangul
 * consults it to choose between romanised output and Hangul syllables. */

/* Variables. ===============================================================*/

static byte_t jamo_enc[JAMO_COUNT + 1]; 
/* Intermediate encoding of Jamo single letters and syllable start.
 * Indexed by (code point - FIRST_JAMO); the extra entry is for
 * SYLLABLE_START. init_hangul fills in only the single letters, so the
 * entries of multi-letter Jamos remain 0. */

static string_t split_strings[SPLIT_TABLE_SIZE];
/* Split form of each syllable (entries 0 .. SYLLABLE_COUNT-1) and of each
 * Jamo code point (the remaining entries); filled by init_hangul. */
static pool_t string_pool; /* String pool with split Syllables and Jamos. */
static int_t *split_trie; /* Trie used to segmentise Hangul syllables. */
static int_t split_trie_root; /* Root node index of SPLIT_TRIE. */

/* Functions. ===============================================================*/

static int
compare_trie_entries( const void *entry1, const void *entry2 )
/* Compare two trie entries. */
{ 
  return strcmp_no_case( ((trie_entry_t *) entry1)->key,
                         ((trie_entry_t *) entry2)->key );
}

/*---------------------------------------------------------------------------*/

static void
add_jamos_to_text( text_t *text, const char *jamos )
/* Append to TEXT the Unicode Jamo of every letter in JAMOS.
 * JAMOS is a string of single letters in intermediate encoding; each letter
 * is used as index into the table JAMOS_UNICODES. */
{
  const char *letter = jamos;

  while (*letter != EOS)
  {
    add_unichar_to_text( text, jamos_unicodes[ (int_t) *letter ] );
    letter++;
  }
}

/*---------------------------------------------------------------------------*/

void 
init_hangul( void )
/* Initialise the hangul module. */
{ 
  trie_entry_t trie_entries[ SPLIT_TABLE_SIZE ]; /* Syllable/Jamos. */
  int_t c, i, choseong, jungseong, jonseong;
  pool_t trie_pool;
  text_t *text;

  if (! split_hangul_syllables) 
    return;

  /* Fill table to convert from Jamo letter to intermediate encoding. */
  for (i = A_C; i <= DOT_C; i++)
    jamo_enc[ jamos_unicodes[i] - FIRST_JAMO ] = i;

  text = new_text();
  string_pool = new_pool( sizeof( char_t ) );

  for (i = 0; i < SYLLABLE_COUNT; i++) 
  { 
    /* Build Jamos string for syllable i. */
    c = i / JONSEONG_COUNT;
    jonseong = i % JONSEONG_COUNT;
    jungseong = c % JUNGSEONG_COUNT;
    choseong = c / JUNGSEONG_COUNT;
    clear_text( text );
    add_unichar_to_text( text, SYLLABLE_START );
    add_jamos_to_text( text, choseongs[ choseong ] );
    add_jamos_to_text( text, jungseongs[ jungseong ] );
    add_jamos_to_text( text, jonseongs[ jonseong ] );
    split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
    
    /* Copy to trie entry table. */
    trie_entries[i].key = split_strings[i];
    trie_entries[i].content = i + FIRST_SYLLABLE;
  }

  for (i = SYLLABLE_COUNT; i < SPLIT_TABLE_SIZE; i++)
  {
    clear_text( text );
    add_jamos_to_text( text, jamos[ i  - SYLLABLE_COUNT ] );
    split_strings[i] = copy_string_to_pool( string_pool, text->buffer, NULL );
    trie_entries[i].key = split_strings[i];
    trie_entries[i].content = i - SYLLABLE_COUNT + FIRST_JAMO;
  }

  free_text( &text );

  /* Sort the Jamos strings and build the trie. */ 
  qsort( trie_entries, SPLIT_TABLE_SIZE, sizeof( trie_entry_t ), 
         compare_trie_entries);
  new_trie( SPLIT_TABLE_SIZE, trie_entries, &trie_pool, &split_trie_root );
  split_trie = pool_to_vector( trie_pool );
  free_pool( &trie_pool );

  roman_hangul = FALSE;
}

/*---------------------------------------------------------------------------*/

void 
terminate_hangul( void )
/* Terminate the hangul module: release the split trie and the pool holding
 * the split strings. Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{ 
  if (split_hangul_syllables) 
  {
    free_mem( &split_trie );
    free_pool( &string_pool );
  }
}  

/* Conversion of Jamos to romanised Hangul. =================================*/

static char_t *
jamos_to_roman( string_t jamos_string )
/* Convert Jamos string JAMOS_STRING to romanised Hangul.
 * Every run of single-letter Jamos and syllable start markers is transcribed
 * and enclosed in "{" and "}". All other characters are copied unchanged;
 * this includes code points in the Jamo range that have no single-letter
 * encoding (multi-letter Jamos).
 * Returns a new string obtained from text_to_string. */
{ 
  /* Modified Yale roman representation for each of the Jamos letters,
   * indexed by intermediate encoding. Index 0 has no representation. */
  static string_t romans[24] = 
  { NULL, "a", "ph", "c", "th", "e", "kh", "h", "i", "k", "l", "m", "n", "o", 
    "p", "s", "t", "u", "wu", "ng", "ch", "."
  };

  string_t roman_segment;
  text_t *roman_text;
  int_t c;
  char_t enc, prev, next;

  roman_text = new_text();
  while (*jamos_string != EOS) 
  { 
    c = g_utf8_get_char( jamos_string );

    /* Look up the intermediate encoding; 0 means "no single Jamo letter". */
    enc = 0;
    if (c >= FIRST_JAMO && c <= SYLLABLE_START)
      enc = jamo_enc[ c - FIRST_JAMO ];

    if (enc == 0)
    { 
      /* JAMO_ENC is 0 for multi-letter Jamos and ROMANS[0] is NULL, so such
       * a character must not enter the transcription loop below. Copy it
       * like any other non-Jamo character. */
      add_unichar_to_text( roman_text, c );
      jamos_string = g_utf8_next_char( jamos_string );
      continue;
    }

    /* Convert a run of Jamos. */
    add_char_to_text( roman_text, '{' );
    prev = EOS;
    do
    { 
      /* Fetch the encoding of the following character first, because the
       * spelling of some letters depends on their neighbours. */
      jamos_string = g_utf8_next_char( jamos_string );
      c = g_utf8_get_char( jamos_string );
      if (c >= FIRST_JAMO && c <= SYLLABLE_START)
        next = jamo_enc[ c - FIRST_JAMO ];
      else
        next = EOS;

      /* Convert ENC to roman. */
      roman_segment = romans[ (int_t) enc ];
      switch (enc) 
      {
      case X_C:
        /* Ieung directly after the syllable start is not written. */
        if (prev == DOT_C)
          roman_segment = "";
        break;
      case I_C:
        /* "i" next to another vowel is written "y". */
        if ((prev != EOS && strchr( VOWELS, prev ) != NULL)
            || (next != EOS && strchr( VOWELS, next ) != NULL)) 
        { 
          roman_segment = "y"; 
        }
        break;
      case O_C:
        /* "o" before "a" is written "w". */
        if (next == A_C) 
          roman_segment = "w";
        break;
      case W_C:
        /* After "i" the letter is written "u", before a vowel "w";
         * otherwise it keeps its table spelling "wu". */
        if (prev == I_C)
          roman_segment = "u"; 
        else if (next != EOS && strchr( VOWELS, next ) != NULL) 
        { 
          roman_segment = "w"; 
        }
        break;
      default:
        break;
      } 
      add_to_text( roman_text, roman_segment );
      prev = enc;
      enc = next;
    } while (enc != 0);
    add_char_to_text( roman_text, '}' );
  }
  
  return text_to_string( &roman_text );
}

/* Conversion of romanised Hangul to Jamos. =================================*/

static char_t * 
roman_to_jamos( string_t roman_string )
/* Convert transcribed Hangul string ROMAN_STRING to Jamos.
 * Text between "{" and "}" is read as romanised Hangul and replaced by the
 * corresponding Jamos; everything else is copied unchanged.
 * Calls "complain" if a "}" is missing or if a letter sequence inside the
 * brackets is not romanised Hangul.
 * Returns a new string obtained from text_to_string. */
{ 
  /* All letter sequences that can be converted to hancode. */
  static struct {string_t roman; string_t jamos;} romans[] = 
  { 
    /* Two-letter strings must come first. */
    {"ch", Z}, {"kh", G}, {"th", D}, {"ph", B}, 
    {"wu", W}, {"ng", X}, {"wa", O A}, {"yu", I W},
    
    {"a", A}, {"c", C}, {"e", E}, {"h", H}, {"i", I}, {"k", K}, {"l", L}, 
    {"m", M}, {"n", N}, {"o", O}, {"p", P}, {"r", L}, {"s", S}, {"t", T}, 
    {"u", U}, {"w", W}, {"x", X}, {"y", I}, {".", DOT},
    {NULL, NULL}
  };
  text_t *jamos_text;
  string_t cursor;
  size_t roman_length;
  int_t entry;

  jamos_text = new_text();
  cursor = roman_string;
  while (*cursor != EOS) 
  { 
    /* Outside of curly brackets, text is copied byte by byte. */
    if (*cursor != '{') 
    { 
      add_char_to_text( jamos_text, *cursor++ );
      continue;
    }

    /* Jump over opening "{" and convert up to the closing "}". */
    cursor++;
    while (*cursor != '}') 
    { 
      if (*cursor == EOS) 
        complain( "Missing \"}\" in romanised Hangul." );

      /* Insert an "x" at beginning of syllable if vowel is following. */
      if (cursor[-1] == '.' && strchr( "aeiouwy", cursor[0] ) != NULL) 
        add_jamos_to_text( jamos_text, X ); 

      /* Take the first table entry that is a prefix of the input. */
      for (entry = 0; romans[ entry ].roman != NULL; entry++) 
      { 
        roman_length = strlen( romans[ entry ].roman );
        if (strncmp( cursor, romans[ entry ].roman, roman_length ) == 0) 
        { 
          add_jamos_to_text( jamos_text, romans[ entry ].jamos );
          cursor += roman_length;
          break;
        }
      }
      
      /* The loop above ran into the end marker: nothing matched. */
      if (romans[ entry ].roman == NULL)
      { 
        complain( "\"%c\" is not a romanised Hangul letter.", *cursor );
      }
    }

    /* Jump over closing "}" */
    cursor++;
  }
  
  return text_to_string( &jamos_text );
}

/* Conversion of Jamos to Hangul syllables. =================================*/

static char_t *
jamos_to_syllables( string_t jamos_string )
/* Convert Jamos JAMOS_STRING to Unicode Hangul syllables.
 * At each Jamo or syllable start marker, the longest segment found in the
 * split trie is replaced by its code point. A syllable start marker that
 * starts no such segment is written as "{.}". Characters outside the Jamo
 * range are copied unchanged.
 * Returns a new string obtained from text_to_string. */
{ 
  text_t *syl_text;
  string_t scan_p;
  int_t first_code, found_code, best_code;
  int_t node;

  syl_text = new_text();
  while (*jamos_string != EOS) 
  { 
    first_code = g_utf8_get_char( jamos_string );
    if (first_code < FIRST_JAMO || first_code > SYLLABLE_START)
    { 
      /* Not a Jamo: copy the character as it is. */
      add_unichar_to_text( syl_text, first_code );
      jamos_string = g_utf8_next_char( jamos_string );
      continue;
    }

    /* Try to combine a syllable or a multi-consonant Jamo: follow the trie
     * as long as lookups succeed; the last success is the longest segment. */
    node = split_trie_root;
    scan_p = jamos_string;
    best_code = 0;
    while (lookup_trie( split_trie, &node, &scan_p, &found_code )) 
    { 
      jamos_string = scan_p; 
      best_code = found_code;
    }

    if (best_code != 0) 
      add_unichar_to_text( syl_text, best_code );
    else if (first_code == SYLLABLE_START)
    { 
      /* A lone syllable start marker is written in its roman form. */
      add_to_text( syl_text, "{.}" );
      jamos_string = g_utf8_next_char( jamos_string );
    } 
    else 
      complain( "Internal error." );
  }     
  return text_to_string( &syl_text );
}

/* Conversion of Hangul syllables to Jamos. =================================*/

static char_t * 
syllables_to_jamos( string_t syl_string )
/* Convert Hangul syllables and Jamos in SYL_STRING to single-letter Jamos.
 * Each syllable and each Jamo code point is replaced by its entry in
 * SPLIT_STRINGS; all other characters are copied unchanged.
 * Returns a new string obtained from text_to_string; encode_hangul releases
 * it with free_mem. */
{ 
  text_t *jamos_text = new_text();
  string_t rest;
  int_t code, slot;

  for (rest = syl_string; *rest != EOS; rest = g_utf8_next_char( rest )) 
  { 
    code = g_utf8_get_char( rest );

    /* Find the split table slot for this character, if it has one. */
    if (code >= FIRST_SYLLABLE && code <= LAST_SYLLABLE)
      slot = code - FIRST_SYLLABLE;
    else if (code >= FIRST_JAMO && code <= LAST_JAMO)
      slot = code - FIRST_JAMO + SYLLABLE_COUNT;
    else
      slot = -1;

    if (slot >= 0)
      add_to_text( jamos_text, split_strings[ slot ] );
    else
      add_unichar_to_text( jamos_text, code );
  }
  return text_to_string( &jamos_text );
}

/* Global conversion routines. ==============================================*/

void 
decode_hangul( char_t **string_p )
/* Decode *STRING_P to external format: romanised Hangul if ROMAN_HANGUL is
 * set, Hangul syllables otherwise.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap.
 * Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{ 
  char_t *decoded;

  if (split_hangul_syllables)
  {
    decoded = (roman_hangul 
               ? jamos_to_roman( *string_p ) 
               : jamos_to_syllables( *string_p ));
    free_mem( string_p );
    *string_p = decoded;
  }
}

/*---------------------------------------------------------------------------*/

void 
encode_hangul( char_t **string_p )
/* Encode *STRING_P to internal format: first split syllables and Jamos into
 * single letters, then convert romanised Hangul in the result.
 * *STRING_P must be a string on the heap.
 * It will be replaced by the new string which is also on the heap.
 * Does nothing unless SPLIT_HANGUL_SYLLABLES is set. */
{ 
  char_t *split_form;

  if (split_hangul_syllables) 
  {
    split_form = syllables_to_jamos( *string_p );
    free_mem( string_p );
    *string_p = roman_to_jamos( split_form );

    /* The intermediate string is no longer needed. */
    free_mem( &split_form );
  }
}

/* End of file. =============================================================*/