#include <stdio.h>
#include "gb18030.h"
/* GB18030-2000 is an encoding of Unicode characters used in China
*
* {0x00-0x7f} are one byte characters identical to US-ASCII
* {0x80} is properly undefined, but many GB18030 encodings make
* it the Euro sign (Unicode 0x20AC), so use that
* {0x81-0xFE}{0x40-0x7E,0x80-0xFE} a full superset of GBK (with fallback
* mappings)
* {0x81-0xFE}{0x30-0x39}{0x81-0xFE}{0x30-0x39} maps linearly to ISO 10646
* GB+81308130 = U+0080 up to U+FFFF
* GB+90308130 = U+10000 up to U+10FFFF skipping mappings already
* defined in 1-byte and 2-byte areas.
*
* Truth is it's a bit of a mess algorithmically as it doesn't multiply
* encode characters, so there are holes in the Unicode mapping that
* should be avoided.
*/
/* This is a "small" region that needs explicit enumeration */
#include "gb18030_enumeration.c"
/*
 * Test whether n lies in the closed interval [low, high].
 *
 * returns 1 when low <= n <= high, otherwise 0
 */
static int
in_range( unsigned char n, unsigned char low, unsigned char high )
{
	return ( low <= n && n <= high ) ? 1 : 0;
}
/*
 * Look a Unicode value up in the explicit enumeration table.
 *
 * On a hit the table entry's bytes are copied into out and the entry's
 * byte count is returned.  Returns 0 when the value is outside
 * 0x0080..0xFFE5 or has no entry in the table.
 */
static int
gb18030_unicode_table_lookup( unsigned int unicode, unsigned char out[4] )
{
	int entry, k;

	/* values outside this window are never searched for */
	if ( unicode < 0x0080 || unicode > 0xFFE5 ) return 0;

	/* linear scan; the list is sorted, so a binary search could be used */
	for ( entry = 0; entry < ngb18030_enums; entry++ ) {
		if ( gb18030_enums[entry].unicode != unicode ) continue;
		for ( k = 0; k < gb18030_enums[entry].len; k++ )
			out[k] = gb18030_enums[entry].bytes[k];
		return gb18030_enums[entry].len;
	}

	return 0;
}
/*
 * Compare the first len bytes of s against bytes.
 *
 * returns 1 when all len bytes are equal (including len == 0), else 0
 */
static int
gb18030_match( unsigned char *s, const unsigned char *bytes, unsigned char len )
{
	unsigned char *stop = s + len;

	while ( s != stop ) {
		if ( *s++ != *bytes++ ) return 0;
	}
	return 1;
}
/*
 * Look a GB18030 byte sequence up in the explicit enumeration table.
 *
 * uc    - the bytes of one encoded character
 * len   - how many bytes of uc make up the character
 * found - set to 1 when an entry of the same length and bytes exists,
 *         otherwise set to 0
 *
 * returns the entry's Unicode value on a hit, '?' on a miss
 */
static unsigned int
gb18030_table_lookup( unsigned char *uc, unsigned char len, int *found )
{
	unsigned int idx = 0;

	*found = 0;
	while ( idx < ngb18030_enums ) {
		/* only entries of the requested length are compared byte-wise */
		if ( gb18030_enums[idx].len == len &&
		     gb18030_match( uc, gb18030_enums[idx].bytes, len ) ) {
			*found = 1;
			return gb18030_enums[idx].unicode;
		}
		idx++;
	}
	return '?';
}
/*
 * Roundtrip mappings that can be enumerated.
 *
 * GB 18030 defines roundtrip mappings for all Unicode code points
 * U+0000..U+10FFFF.  Listing each one would take about 1.1 million
 * entries, but most four-byte mappings fall into a few runs in which
 * consecutive code points map to consecutive four-byte sequences.
 * Those runs are listed here; everything else is left to the explicit
 * enumeration table.
 *
 * Four-byte sequences count like a mixed-radix number between
 * 81 30 81 30 and FE 39 FE 39: bytes 1 and 3 have 126 values
 * (0x81..0xFE), bytes 2 and 4 have 10 values (0x30..0x39).
 * The last sequence of each run follows from that arithmetic and is
 * kept only as a comment for cross-checking.
 */
struct gb18030_linear_range {
	unsigned int ufirst;       /* first code point of the run */
	unsigned int ulast;        /* last code point of the run (inclusive) */
	unsigned char bfirst[4];   /* four-byte sequence encoding ufirst */
};

static const struct gb18030_linear_range gb18030_linear_ranges[] = {
	{ 0x0452,  0x200F,   { 0x81, 0x30, 0xD3, 0x30 } }, /* .. 81 36 A5 31 */
	{ 0x2643,  0x2E80,   { 0x81, 0x37, 0xA8, 0x39 } }, /* .. 81 38 FD 38 */
	{ 0x361B,  0x3917,   { 0x82, 0x30, 0xA6, 0x33 } }, /* .. 82 30 F2 37 */
	{ 0x3CE1,  0x4055,   { 0x82, 0x31, 0xD4, 0x38 } }, /* .. 82 32 AF 32 */
	{ 0x4160,  0x4336,   { 0x82, 0x32, 0xC9, 0x37 } }, /* .. 82 32 F8 37 */
	{ 0x44D7,  0x464B,   { 0x82, 0x33, 0xA3, 0x39 } }, /* .. 82 33 C9 31 */
	{ 0x478E,  0x4946,   { 0x82, 0x33, 0xE8, 0x38 } }, /* .. 82 34 96 38 */
	{ 0x49B8,  0x4C76,   { 0x82, 0x34, 0xA1, 0x31 } }, /* .. 82 34 E7 33 */
	{ 0x9FA6,  0xD7FF,   { 0x82, 0x35, 0x8F, 0x33 } }, /* .. 83 36 C7 38 */
	{ 0xE865,  0xF92B,   { 0x83, 0x36, 0xD0, 0x30 } }, /* .. 84 30 85 34 */
	{ 0xFA2A,  0xFE2F,   { 0x84, 0x30, 0x9C, 0x38 } }, /* .. 84 31 85 37 */
	{ 0xFFE6,  0xFFFF,   { 0x84, 0x31, 0xA2, 0x34 } }, /* .. 84 31 A4 39 */
	{ 0x10000, 0x10FFFF, { 0x90, 0x30, 0x81, 0x30 } }  /* .. E3 32 9A 35 */
};

/*
 * Position of a well-formed four-byte sequence counted from
 * 81 30 81 30 (which is position 0).  The caller must have checked
 * that every byte is inside its legal interval.
 */
static unsigned long
gb18030_linear_index( const unsigned char b[4] )
{
	unsigned long pos;

	pos = ( unsigned long ) ( b[0] - 0x81 );
	pos = pos * 10  + ( unsigned long ) ( b[1] - 0x30 );
	pos = pos * 126 + ( unsigned long ) ( b[2] - 0x81 );
	pos = pos * 10  + ( unsigned long ) ( b[3] - 0x30 );
	return pos;
}

/*
 * Encode a code point that lies in one of the linear four-byte runs.
 *
 * unicode - code point to encode
 * out     - receives the four bytes on success, untouched otherwise
 *
 * returns 4 when unicode belongs to a run, 0 when it does not
 */
static int
gb18030_unicode_range_lookup( unsigned int unicode, unsigned char out[4] )
{
	const unsigned int nranges =
		sizeof( gb18030_linear_ranges ) / sizeof( gb18030_linear_ranges[0] );
	unsigned int r;

	for ( r = 0; r < nranges; ++r ) {
		const struct gb18030_linear_range *rg = &gb18030_linear_ranges[r];
		unsigned long pos;

		if ( unicode < rg->ufirst || unicode > rg->ulast ) continue;

		/* offset inside the run equals offset in the byte numbering */
		pos = gb18030_linear_index( rg->bfirst ) + ( unicode - rg->ufirst );

		/* peel the mixed-radix digits off, least significant first */
		out[3] = ( unsigned char ) ( 0x30 + pos % 10 );
		pos /= 10;
		out[2] = ( unsigned char ) ( 0x81 + pos % 126 );
		pos /= 126;
		out[1] = ( unsigned char ) ( 0x30 + pos % 10 );
		pos /= 10;
		out[0] = ( unsigned char ) ( 0x81 + pos );
		return 4;
	}
	return 0;
}

/*
 * Decode a four-byte sequence that lies in one of the linear runs.
 *
 * s     - four bytes of one encoded character
 * found - set to 1 when the sequence belongs to a run, otherwise 0
 *
 * returns the code point on success, 0 when found is cleared
 */
static int
gb18030_range_lookup( unsigned char *s, /* unsigned char len = 4 only */ int *found )
{
	const unsigned int nranges =
		sizeof( gb18030_linear_ranges ) / sizeof( gb18030_linear_ranges[0] );
	unsigned long pos;
	unsigned int r;

	*found = 0;

	/* refuse malformed sequences so the index arithmetic cannot wrap */
	if ( s[0] < 0x81 || s[0] > 0xFE ) return 0;
	if ( s[1] < 0x30 || s[1] > 0x39 ) return 0;
	if ( s[2] < 0x81 || s[2] > 0xFE ) return 0;
	if ( s[3] < 0x30 || s[3] > 0x39 ) return 0;

	pos = gb18030_linear_index( s );
	for ( r = 0; r < nranges; ++r ) {
		const struct gb18030_linear_range *rg = &gb18030_linear_ranges[r];
		unsigned long first = gb18030_linear_index( rg->bfirst );

		if ( pos < first ) continue;
		if ( pos - first > ( unsigned long ) ( rg->ulast - rg->ufirst ) ) continue;

		*found = 1;
		return ( int ) ( rg->ufirst + ( pos - first ) );
	}
	return 0;
}
/*
 * Convert one GB18030 byte sequence of len bytes to a Unicode value.
 *
 * The enumeration table is consulted first; four-byte sequences that
 * are not in the table are then passed to the range lookup.
 *
 * returns the Unicode value, or '?' when no mapping was found
 */
unsigned int
gb18030_to_unicode( unsigned char *s, unsigned char len )
{
	int hit = 0;
	unsigned int code = gb18030_table_lookup( s, len, &hit );

	/* the table result stands unless a four-byte sequence missed */
	if ( hit || len != 4 ) return code;

	code = gb18030_range_lookup( s, &hit );
	return hit ? code : '?';
}
/*
 * Convert a Unicode value to its GB18030 byte sequence.
 *
 * unicode - value to encode
 * out     - receives the encoded bytes
 *
 * returns the number of bytes stored in out; values below 0x80 are
 * stored as a single byte, everything else yields whatever the table
 * lookup or, failing that, the range lookup reports
 */
int
gb18030_encode( unsigned int unicode, unsigned char out[4] )
{
	int nbytes;

	/* one-byte region, identical to US-ASCII */
	if ( unicode < 0x80 ) {
		out[0] = unicode;
		return 1;
	}

	nbytes = gb18030_unicode_table_lookup( unicode, out );
	if ( nbytes == 0 )
		nbytes = gb18030_unicode_range_lookup( unicode, out );
	return nbytes;
}
/*
 * Decode a gb18030 character into unicode
 *
 * s  - input bytes
 * pi - index into s of the character to decode; advanced past the
 *      bytes that were consumed (1, 2 or 4)
 *
 * returns the Unicode value, or '?' for an illegal or unmapped
 * sequence (an illegal sequence consumes exactly one byte)
 *
 * Trail bytes are fetched one at a time, and only after the byte in
 * front of them has been accepted.  A zero byte is never a legal trail
 * byte, so a terminating NUL right after a lead byte (or after a
 * two-byte character) stops the fetch instead of being read across.
 */
unsigned int
gb18030_decode( char *s, unsigned int *pi )
{
	unsigned char uc[4] = { 0, 0, 0, 0 };
	unsigned int c = '?';       /* result for every illegal sequence */
	unsigned int used = 1;      /* illegal sequences skip one byte */
	unsigned int i = *pi;

	uc[0] = ( unsigned char ) s[i];
	if ( ( uc[0] & 128 ) == 0 ) {
		/* one-byte character, identical to US-ASCII */
		c = ( unsigned int ) uc[0];
	} else if ( uc[0] == 0x80 ) {
		/* 0x80 is treated as the Euro sign */
		c = 0x20AC;
	} else if ( uc[0] != 0xFF ) { /* multi-byte character */
		uc[1] = ( unsigned char ) s[i+1];
		if ( in_range( uc[1], 0x40, 0x7e ) || in_range( uc[1], 0x80, 0xfe ) ) {
			/* two-byte character */
			c = gb18030_to_unicode( &(uc[0]), 2 );
			used = 2;
		} else if ( in_range( uc[1], 0x30, 0x39 ) ) {
			/* possible four-byte character: look one byte further at a time */
			uc[2] = ( unsigned char ) s[i+2];
			if ( in_range( uc[2], 0x81, 0xfe ) ) {
				uc[3] = ( unsigned char ) s[i+3];
				if ( in_range( uc[3], 0x30, 0x39 ) ) {
					c = gb18030_to_unicode( &(uc[0]), 4 );
					used = 4;
				}
			}
		}
		/* anything else is an illegal character: c and used keep their defaults */
	}
	/* uc[0]==0xFF is an illegal character as well */

	*pi = i + used;
	return c;
}