Blame bibutils/gb18030.c

Packit 89ede9
#include <stdio.h>
Packit 89ede9
#include "gb18030.h"
Packit 89ede9
Packit 89ede9
/* GB18030-2000 is an encoding of Unicode characters used in China
 *
 * {0x00-0x7f} are one byte characters identical to US-ASCII
 * {0x80} is properly undefined, but many GB18030 encodings make
 *      it the Euro sign (Unicode 0x20AC), so use that
 * {0x81-0xFE}{0x40-0x7E,0x80-0xFE} a full superset of GBK (with fallback
 *      mappings)
 * {0x81-0xFE}{0x30-0x39}{0x81-0xFE}{0x30-0x39} maps linearly to ISO 10646
 *      GB+81308130 = U+0080 up to U+FFFF
 *      GB+90308130 = U+10000 up to U+10FFFF skipping mappings already
 *                     defined in 1-byte and 2-byte areas.
 *
 * The truth is that it is a bit of a mess algorithmically: no character
 * is encoded more than once, so there are holes in the linear Unicode
 * mapping that must be avoided.
 */
Packit 89ede9
Packit 89ede9
/* This is a "small" region that needs explicit enumeration */
Packit 89ede9
#include "gb18030_enumeration.c"
Packit 89ede9
Packit 89ede9
/*
 * in_range()
 *
 * Returns 1 when low <= n <= high (both bounds inclusive), 0 otherwise.
 */
static int
in_range( unsigned char n, unsigned char low, unsigned char high )
{
	return ( n >= low && n <= high );
}
Packit 89ede9
Packit 89ede9
Packit 89ede9
/* Get GB 18030 from Unicode Value in Table */
Packit 89ede9
static int
Packit 89ede9
gb18030_unicode_table_lookup( unsigned int unicode, unsigned char out[4] )
Packit 89ede9
{
Packit 89ede9
	int i, j;
Packit 89ede9
	if ( unicode >= 0x0080 && unicode <= 0xFFE5 ) {
Packit 89ede9
		/* list is sorted, so should do binary search here */
Packit 89ede9
		for ( i=0; i
Packit 89ede9
			if ( unicode == gb18030_enums[i].unicode ) {
Packit 89ede9
				for ( j=0; j
Packit 89ede9
					out[j] = gb18030_enums[i].bytes[j];
Packit 89ede9
				return gb18030_enums[i].len;
Packit 89ede9
			}
Packit 89ede9
		}
Packit 89ede9
	}
Packit 89ede9
	return 0;
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
/*
 * gb18030_match()
 *
 * Compare the first len bytes of s against bytes.
 *
 * Returns 1 if all len bytes are equal (trivially 1 when len is 0),
 * 0 at the first mismatch.
 */
static int
gb18030_match( unsigned char *s, const unsigned char *bytes, unsigned char len )
{
	int i;
	for ( i=0; i<len; ++i ) {
		if ( s[i] != bytes[i] ) return 0;
	}
	return 1;
}
Packit 89ede9
Packit 89ede9
static unsigned int
Packit 89ede9
gb18030_table_lookup( unsigned char *uc, unsigned char len, int *found )
Packit 89ede9
{
Packit 89ede9
	unsigned int i;
Packit 89ede9
	*found = 0;
Packit 89ede9
	for ( i=0; i
Packit 89ede9
		if ( gb18030_enums[i].len!=len ) continue;
Packit 89ede9
		if ( gb18030_match( &(uc[0]), gb18030_enums[i].bytes, len ) ) {
Packit 89ede9
			*found = 1;
Packit 89ede9
			return gb18030_enums[i].unicode;
Packit 89ede9
		}
Packit 89ede9
	}
Packit 89ede9
	return '?';
Packit 89ede9
}
Packit 89ede9
Packit 89ede9
Packit 89ede9
/*
 * gb18030_unicode_range_lookup()
 *
 * Encode a Unicode code point that lies in one of the linearly mapped
 * four-byte GB 18030 ranges.
 *
 * A four-byte sequence b0 b1 b2 b3 has the linear index
 *     ((b0-0x81)*10 + (b1-0x30))*126 + (b2-0x81))*10 + (b3-0x30)
 * and within each range the index grows one-for-one with the code point.
 *
 * unicode - code point to encode
 * out     - receives the four bytes on success
 *
 * Returns 4 if the code point is inside one of the ranges, 0 otherwise
 * (out is left untouched in that case).
 */
static int
gb18030_unicode_range_lookup( unsigned int unicode, unsigned char out[4] )
{
	/* first/last code point of each range and the bytes of the first one */
	static const struct {
		unsigned int  ufirst, ulast;
		unsigned char bfirst[4];
	} ranges[] = {
		{ 0x0452,  0x200F,   { 0x81, 0x30, 0xD3, 0x30 } },
		{ 0x2643,  0x2E80,   { 0x81, 0x37, 0xA8, 0x39 } },
		{ 0x361B,  0x3917,   { 0x82, 0x30, 0xA6, 0x33 } },
		{ 0x3CE1,  0x4055,   { 0x82, 0x31, 0xD4, 0x38 } },
		{ 0x4160,  0x4336,   { 0x82, 0x32, 0xC9, 0x37 } },
		{ 0x44D7,  0x464B,   { 0x82, 0x33, 0xA3, 0x39 } },
		{ 0x478E,  0x4946,   { 0x82, 0x33, 0xE8, 0x38 } },
		{ 0x49B8,  0x4C76,   { 0x82, 0x34, 0xA1, 0x31 } },
		{ 0x9FA6,  0xD7FF,   { 0x82, 0x35, 0x8F, 0x33 } },
		{ 0xE865,  0xF92B,   { 0x83, 0x36, 0xD0, 0x30 } },
		{ 0xFA2A,  0xFE2F,   { 0x84, 0x30, 0x9C, 0x38 } },
		{ 0xFFE6,  0xFFFF,   { 0x84, 0x31, 0xA2, 0x34 } },
		{ 0x10000, 0x10FFFF, { 0x90, 0x30, 0x81, 0x30 } },
	};
	const unsigned int nranges =
		(unsigned int)( sizeof( ranges ) / sizeof( ranges[0] ) );
	unsigned int i, lin;

	for ( i=0; i<nranges; ++i ) {
		if ( unicode < ranges[i].ufirst || unicode > ranges[i].ulast )
			continue;

		/* linear index of the range start, advanced by our offset */
		lin = ( ( ( ranges[i].bfirst[0] - 0x81u ) * 10u
		        + ( ranges[i].bfirst[1] - 0x30u ) ) * 126u
		        + ( ranges[i].bfirst[2] - 0x81u ) ) * 10u
		        + ( ranges[i].bfirst[3] - 0x30u );
		lin += unicode - ranges[i].ufirst;

		/* peel the index apart, least significant byte first */
		out[3] = (unsigned char)( 0x30u + lin % 10u );  lin /= 10u;
		out[2] = (unsigned char)( 0x81u + lin % 126u ); lin /= 126u;
		out[1] = (unsigned char)( 0x30u + lin % 10u );  lin /= 10u;
		out[0] = (unsigned char)( 0x81u + lin );
		return 4;
	}

	return 0;
}
Packit 89ede9
Packit 89ede9
/*
 * gb18030_fourbyte_linear()
 *
 * Linear index of a four-byte GB 18030 sequence; the caller must have
 * checked that every byte is inside its legal span.
 */
static unsigned int
gb18030_fourbyte_linear( const unsigned char *b )
{
	return ( ( ( b[0] - 0x81u ) * 10u + ( b[1] - 0x30u ) ) * 126u
	         + ( b[2] - 0x81u ) ) * 10u + ( b[3] - 0x30u );
}

/*
 * gb18030_range_lookup()
 *
 * Decode a four-byte GB 18030 sequence that falls in one of the linearly
 * mapped ranges.
 *
 * GB 18030 defines roundtrip mappings for all Unicode code points
 * U+0000..U+10FFFF, which would need about 1.1 million table entries.
 * Most four-byte mappings can instead be enumerated within distinct
 * ranges, so ranges are used for everything but the explicitly
 * enumerated assignments.
 *
 * s     - the four bytes to decode
 * found - set to 1 if the sequence lies in a range, 0 otherwise
 *
 * Returns the Unicode code point when found, 0 otherwise.
 */
static int
gb18030_range_lookup( unsigned char *s, /* unsigned char len = 4 only */ int *found )
{
	/* first/last code point of each range and the bytes of the first one */
	static const struct {
		unsigned int  ufirst, ulast;
		unsigned char bfirst[4];
	} ranges[] = {
		{ 0x0452,  0x200F,   { 0x81, 0x30, 0xD3, 0x30 } },
		{ 0x2643,  0x2E80,   { 0x81, 0x37, 0xA8, 0x39 } },
		{ 0x361B,  0x3917,   { 0x82, 0x30, 0xA6, 0x33 } },
		{ 0x3CE1,  0x4055,   { 0x82, 0x31, 0xD4, 0x38 } },
		{ 0x4160,  0x4336,   { 0x82, 0x32, 0xC9, 0x37 } },
		{ 0x44D7,  0x464B,   { 0x82, 0x33, 0xA3, 0x39 } },
		{ 0x478E,  0x4946,   { 0x82, 0x33, 0xE8, 0x38 } },
		{ 0x49B8,  0x4C76,   { 0x82, 0x34, 0xA1, 0x31 } },
		{ 0x9FA6,  0xD7FF,   { 0x82, 0x35, 0x8F, 0x33 } },
		{ 0xE865,  0xF92B,   { 0x83, 0x36, 0xD0, 0x30 } },
		{ 0xFA2A,  0xFE2F,   { 0x84, 0x30, 0x9C, 0x38 } },
		{ 0xFFE6,  0xFFFF,   { 0x84, 0x31, 0xA2, 0x34 } },
		{ 0x10000, 0x10FFFF, { 0x90, 0x30, 0x81, 0x30 } },
	};
	const unsigned int nranges =
		(unsigned int)( sizeof( ranges ) / sizeof( ranges[0] ) );
	unsigned int i, lin, first;

	*found = 0;

	/* reject malformed sequences so the index arithmetic cannot wrap */
	if ( s[0] < 0x81 || s[0] > 0xFE ) return 0;
	if ( s[1] < 0x30 || s[1] > 0x39 ) return 0;
	if ( s[2] < 0x81 || s[2] > 0xFE ) return 0;
	if ( s[3] < 0x30 || s[3] > 0x39 ) return 0;

	lin = gb18030_fourbyte_linear( s );
	for ( i=0; i<nranges; ++i ) {
		first = gb18030_fourbyte_linear( ranges[i].bfirst );
		if ( lin < first ) continue;
		if ( lin - first > ranges[i].ulast - ranges[i].ufirst ) continue;
		*found = 1;
		return (int)( ranges[i].ufirst + ( lin - first ) );
	}

	return 0;
}
Packit 89ede9
Packit 89ede9
/*
 * gb18030_to_unicode()
 *
 * Convert one GB 18030 byte sequence to a Unicode value.
 *
 * s   - the bytes of the character
 * len - number of bytes in s
 *
 * The explicit table is consulted first; only a four-byte sequence that
 * is missing from the table is passed on to the range lookup.
 *
 * Returns the Unicode value, or '?' if no mapping was found.
 */
unsigned int
gb18030_to_unicode( unsigned char *s, unsigned char len )
{
	int found = 0;
	unsigned int ret = gb18030_table_lookup( s, len, &found );

	/* table hit, or a length the range lookup does not handle */
	if ( found || len != 4 ) return ret;

	ret = gb18030_range_lookup( s, &found );
	return found ? ret : '?';
}
Packit 89ede9
Packit 89ede9
/*
 * Convert unicode character to gb18030
 *
 * unicode - code point to encode
 * out     - receives the encoded bytes
 *
 * Values below 0x80 are copied through as a single byte. Everything
 * else is looked up in the explicit table first and, if the table has
 * no entry, handed to the range lookup.
 *
 * returns number of bytes written to out (0 if neither lookup produced
 * an encoding)
 */
int
gb18030_encode( unsigned int unicode, unsigned char out[4] )
{
	int len;

	if ( unicode < 0x80 ) {
		out[0] = (unsigned char) unicode;
		return 1;
	}

	len = gb18030_unicode_table_lookup( unicode, out );
	if ( len == 0 )
		len = gb18030_unicode_range_lookup( unicode, out );
	return len;
}
Packit 89ede9
Packit 89ede9
/*
 * Decode a gb18030 character into unicode
 *
 * s  - input bytes; NOTE(review): no length is passed, so this assumes s
 *      is NUL-terminated -- confirm against callers
 * pi - in: index of the character's first byte in s
 *      out: index of the first byte after the consumed character
 *
 * Trailing bytes are read one at a time and only while the preceding
 * byte is a legal part of a multi-byte sequence. A NUL byte is never
 * legal in a trailing position, so a sequence cut short by the end of
 * the string is not read past its terminator.
 *
 * Returns the Unicode value; illegal input yields '?' and consumes
 * exactly one byte.
 */
unsigned int
gb18030_decode( char *s, unsigned int *pi )
{
	unsigned int c = '?';     /* result for any illegal sequence */
	unsigned int used = 1;    /* bytes consumed; 1 unless a sequence matches */
	unsigned int i = *pi;
	unsigned char uc[4] = { 0, 0, 0, 0 };

	uc[0] = ( unsigned char ) s[i];

	if ( ( uc[0] & 128 ) == 0 ) {
		/* one-byte character, identical to US-ASCII */
		c = ( unsigned int ) uc[0];
	} else if ( uc[0] == 0x80 ) {
		/* treated as the Euro sign */
		c = 0x20AC;
	} else if ( uc[0] != 0xFF ) { /* multi-byte character */
		uc[1] = ( unsigned char ) s[i+1];
		if ( in_range( uc[1], 0x40, 0x7e ) || in_range( uc[1], 0x80, 0xfe ) ) {
			/* two-byte character */
			c = gb18030_to_unicode( &(uc[0]), 2 );
			used = 2;
		} else if ( in_range( uc[1], 0x30, 0x39 ) ) {
			/* possible four-byte character; fetch the rest lazily */
			uc[2] = ( unsigned char ) s[i+2];
			if ( in_range( uc[2], 0x81, 0xfe ) ) {
				uc[3] = ( unsigned char ) s[i+3];
				if ( in_range( uc[3], 0x30, 0x39 ) ) {
					c = gb18030_to_unicode( &(uc[0]), 4 );
					used = 4;
				}
			}
		}
		/* anything else is an illegal character: keep '?' and 1 byte */
	}
	/* uc[0]==0xFF is an illegal character: keep '?' and 1 byte */

	*pi = i + used;
	return c;
}