|
Packit |
89ede9 |
#include <stdio.h>
|
|
Packit |
89ede9 |
#include "gb18030.h"
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* GB18030-2000 is an encoding of Unicode characters used in China
 *
 * {0x00-0x7f} are one byte characters identical to US-ASCII
 * {0x80} is properly undefined, but many GB18030 encodings make
 *        it the Euro sign (Unicode 0x20AC), so use that
 * {0x81-0xFE}{0x40-0x7E,0x80-0xFE} a full superset of GBK (with fallback
 *        mappings)
 * {0x81-0xFE}{0x30-0x39}{0x81-0xFE}{0x30-0x39} maps linearly to ISO 10646
 *        GB+81308130 = U+0080 up to U+FFFF
 *        GB+90308130 = U+10000 up to U+10FFFF skipping mappings already
 *        defined in 1-byte and 2-byte areas.
 *
 * Truth is it's a bit of a mess algorithmically: no character is encoded
 * more than once, so the linear four-byte mapping has holes wherever a
 * code point already has a one- or two-byte form, and those holes
 * should be avoided.
 */
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* This is a "small" region that needs explicit enumeration */
|
|
Packit |
89ede9 |
#include "gb18030_enumeration.c"
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * in_range()
 *
 * Returns 1 when low <= n <= high (both bounds inclusive), 0 otherwise.
 */
static int
in_range( unsigned char n, unsigned char low, unsigned char high )
{
	return ( n >= low && n <= high ) ? 1 : 0;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* Get GB 18030 from Unicode Value in Table */
|
|
Packit |
89ede9 |
static int
|
|
Packit |
89ede9 |
gb18030_unicode_table_lookup( unsigned int unicode, unsigned char out[4] )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
int i, j;
|
|
Packit |
89ede9 |
if ( unicode >= 0x0080 && unicode <= 0xFFE5 ) {
|
|
Packit |
89ede9 |
/* list is sorted, so should do binary search here */
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
if ( unicode == gb18030_enums[i].unicode ) {
|
|
Packit |
89ede9 |
for ( j=0; j
|
|
Packit |
89ede9 |
out[j] = gb18030_enums[i].bytes[j];
|
|
Packit |
89ede9 |
return gb18030_enums[i].len;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
return 0;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * gb18030_match()
 *
 * Compares the first len bytes of s against bytes.
 * Returns 1 when all of them are equal (trivially so for len == 0),
 * 0 at the first difference.
 */
static int
gb18030_match( unsigned char *s, const unsigned char *bytes, unsigned char len )
{
	unsigned int i;

	for ( i=0; i<len; ++i ) {
		if ( s[i] != bytes[i] ) return 0;
	}
	return 1;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
static unsigned int
|
|
Packit |
89ede9 |
gb18030_table_lookup( unsigned char *uc, unsigned char len, int *found )
|
|
Packit |
89ede9 |
{
|
|
Packit |
89ede9 |
unsigned int i;
|
|
Packit |
89ede9 |
*found = 0;
|
|
Packit |
89ede9 |
for ( i=0; i
|
|
Packit |
89ede9 |
if ( gb18030_enums[i].len!=len ) continue;
|
|
Packit |
89ede9 |
if ( gb18030_match( &(uc[0]), gb18030_enums[i].bytes, len ) ) {
|
|
Packit |
89ede9 |
*found = 1;
|
|
Packit |
89ede9 |
return gb18030_enums[i].unicode;
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
return '?';
|
|
Packit |
89ede9 |
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * Four-byte range mappings.
 *
 * GB 18030 defines roundtrip mappings for all Unicode code points
 * U+0000..U+10FFFF, which would need about 1.1 million table elements.
 * Most four-byte mappings, however, fall into a handful of runs in which
 * consecutive code points map to consecutive four-byte sequences, so
 * everything outside the explicitly enumerated table is described by the
 * ranges below.
 *
 * Four-byte sequences count like a mixed-radix number: bytes 1 and 3 run
 * over 0x81..0xFE (126 values), bytes 2 and 4 over 0x30..0x39 (10 values).
 * Only the first byte sequence of each run is stored; the last one follows
 * from the length of the Unicode run.
 */
struct gb18030_linear_range {
	unsigned int  ufirst;    /* first code point of the run */
	unsigned int  ulast;     /* last code point of the run (inclusive) */
	unsigned char bfirst[4]; /* four-byte sequence encoding ufirst */
};

static const struct gb18030_linear_range gb18030_linear_ranges[] = {
	{ 0x0452,  0x200F,   { 0x81, 0x30, 0xD3, 0x30 } }, /* .. 81 36 A5 31 */
	{ 0x2643,  0x2E80,   { 0x81, 0x37, 0xA8, 0x39 } }, /* .. 81 38 FD 38 */
	{ 0x361B,  0x3917,   { 0x82, 0x30, 0xA6, 0x33 } }, /* .. 82 30 F2 37 */
	{ 0x3CE1,  0x4055,   { 0x82, 0x31, 0xD4, 0x38 } }, /* .. 82 32 AF 32 */
	{ 0x4160,  0x4336,   { 0x82, 0x32, 0xC9, 0x37 } }, /* .. 82 32 F8 37 */
	{ 0x44D7,  0x464B,   { 0x82, 0x33, 0xA3, 0x39 } }, /* .. 82 33 C9 31 */
	{ 0x478E,  0x4946,   { 0x82, 0x33, 0xE8, 0x38 } }, /* .. 82 34 96 38 */
	{ 0x49B8,  0x4C76,   { 0x82, 0x34, 0xA1, 0x31 } }, /* .. 82 34 E7 33 */
	{ 0x9FA6,  0xD7FF,   { 0x82, 0x35, 0x8F, 0x33 } }, /* .. 83 36 C7 38 */
	{ 0xE865,  0xF92B,   { 0x83, 0x36, 0xD0, 0x30 } }, /* .. 84 30 85 34 */
	{ 0xFA2A,  0xFE2F,   { 0x84, 0x30, 0x9C, 0x38 } }, /* .. 84 31 85 37 */
	{ 0xFFE6,  0xFFFF,   { 0x84, 0x31, 0xA2, 0x34 } }, /* .. 84 31 A4 39 */
	{ 0x10000, 0x10FFFF, { 0x90, 0x30, 0x81, 0x30 } }, /* .. E3 32 9A 35 */
};

/* Position of a (well-formed) four-byte sequence counted from 81 30 81 30. */
static unsigned int
gb18030_linear_index( const unsigned char b[4] )
{
	unsigned int n;
	n = ( unsigned int ) b[0] - 0x81u;
	n = n * 10u  + ( ( unsigned int ) b[1] - 0x30u );
	n = n * 126u + ( ( unsigned int ) b[2] - 0x81u );
	n = n * 10u  + ( ( unsigned int ) b[3] - 0x30u );
	return n;
}

/*
 * gb18030_unicode_range_lookup()
 *
 * Encodes a code point that lies in one of the linear four-byte ranges.
 * Writes the four bytes to out[] and returns 4; returns 0 (out[] untouched)
 * when the code point is not covered by any range.
 */
static int
gb18030_unicode_range_lookup( unsigned int unicode, unsigned char out[4] )
{
	const unsigned int nranges = sizeof( gb18030_linear_ranges ) / sizeof( gb18030_linear_ranges[0] );
	unsigned int r, n;

	for ( r=0; r<nranges; ++r ) {
		const struct gb18030_linear_range *p = &gb18030_linear_ranges[r];
		if ( unicode < p->ufirst || unicode > p->ulast ) continue;

		/* offset inside the run carries over unchanged to the byte side */
		n = gb18030_linear_index( p->bfirst ) + ( unicode - p->ufirst );

		out[3] = ( unsigned char )( 0x30u + n % 10u );   n /= 10u;
		out[2] = ( unsigned char )( 0x81u + n % 126u );  n /= 126u;
		out[1] = ( unsigned char )( 0x30u + n % 10u );   n /= 10u;
		out[0] = ( unsigned char )( 0x81u + n );
		return 4;
	}

	return 0;
}

/*
 * gb18030_range_lookup()
 *
 * Decodes a four-byte sequence s[0..3] that lies in one of the linear
 * ranges.  On success *found is set to 1 and the code point is returned;
 * otherwise *found is set to 0 and 0 is returned.
 */
static int
gb18030_range_lookup( unsigned char *s, /* unsigned char len = 4 only */ int *found )
{
	const unsigned int nranges = sizeof( gb18030_linear_ranges ) / sizeof( gb18030_linear_ranges[0] );
	unsigned int r, n, first, last;

	*found = 0;

	/* reject malformed sequences before doing arithmetic on the bytes */
	if ( s[0] < 0x81 || s[0] > 0xFE ) return 0;
	if ( s[1] < 0x30 || s[1] > 0x39 ) return 0;
	if ( s[2] < 0x81 || s[2] > 0xFE ) return 0;
	if ( s[3] < 0x30 || s[3] > 0x39 ) return 0;

	n = gb18030_linear_index( s );

	for ( r=0; r<nranges; ++r ) {
		const struct gb18030_linear_range *p = &gb18030_linear_ranges[r];
		first = gb18030_linear_index( p->bfirst );
		last  = first + ( p->ulast - p->ufirst );
		if ( n < first || n > last ) continue;
		*found = 1;
		return ( int )( p->ufirst + ( n - first ) );
	}

	return 0;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * gb18030_to_unicode()
 *
 * Converts the len-byte GB 18030 sequence at s to a code point.
 * The enumerated table is consulted first; a four-byte sequence that is
 * not in the table is then tried against the range mappings, and '?' is
 * returned if that fails as well.  For any other length the result of
 * the table lookup is returned as is.
 */
unsigned int
gb18030_to_unicode( unsigned char *s, unsigned char len )
{
	int found = 0;
	unsigned int unicode = gb18030_table_lookup( s, len, &found );

	/* only four-byte sequences have a range fallback */
	if ( found || len != 4 ) return unicode;

	unicode = gb18030_range_lookup( s, &found );
	if ( !found ) unicode = '?';
	return unicode;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * Convert unicode character to gb18030
 *
 * Code points below 0x80 are written to out[0] unchanged.  Anything else
 * is looked up in the enumerated table first and, if that yields nothing,
 * in the range mappings.
 *
 * returns number of bytes written to out[] (0 when neither lookup
 * produced an encoding)
 */
int
gb18030_encode( unsigned int unicode, unsigned char out[4] )
{
	int nbytes;

	if ( unicode < 0x80 ) {
		out[0] = ( unsigned char ) unicode;
		return 1;
	}

	nbytes = gb18030_unicode_table_lookup( unicode, out );
	if ( nbytes == 0 )
		nbytes = gb18030_unicode_range_lookup( unicode, out );

	return nbytes;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/*
 * Decode a gb18030 character into unicode
 *
 * Reads one character starting at s[*pi], returns its code point and
 * advances *pi past the bytes consumed (1, 2 or 4).  An illegal sequence
 * yields '?' and consumes a single byte.
 *
 * Trail bytes are fetched one at a time, and only after the preceding byte
 * has been validated; a truncated or malformed sequence therefore never
 * causes a read beyond the first offending byte.
 * NOTE(review): this protects callers that pass NUL-terminated strings
 * (NUL is not a valid trail byte) -- confirm that is how s is supplied.
 */
unsigned int
gb18030_decode( char *s, unsigned int *pi )
{
	unsigned char uc[4] = { 0, 0, 0, 0 };
	unsigned int c = '?';      /* result for every illegal sequence */
	unsigned int used = 1;     /* illegal sequences consume one byte */
	unsigned int i = *pi;

	uc[0] = ( unsigned char ) s[i];

	if ( ( uc[0] & 128 ) == 0 ) {
		/* one-byte character, identical to US-ASCII */
		c = ( unsigned int ) uc[0];
	} else if ( uc[0] == 0x80 ) {
		/* treated as the Euro sign */
		c = 0x20AC;
	} else if ( uc[0] != 0xFF ) { /* multi-byte character */
		uc[1] = ( unsigned char ) s[i+1];
		if ( in_range( uc[1], 0x40, 0x7e ) || in_range( uc[1], 0x80, 0xfe ) ) {
			/* two-byte character */
			c = gb18030_to_unicode( &(uc[0]), 2 );
			used = 2;
		} else if ( in_range( uc[1], 0x30, 0x39 ) ) {
			uc[2] = ( unsigned char ) s[i+2];
			if ( in_range( uc[2], 0x81, 0xfe ) ) {
				uc[3] = ( unsigned char ) s[i+3];
				if ( in_range( uc[3], 0x30, 0x39 ) ) {
					/* four-byte character */
					c = gb18030_to_unicode( &(uc[0]), 4 );
					used = 4;
				}
			}
		}
		/* anything else is an illegal character: defaults apply */
	}
	/* uc[0]==0xFF is an illegal character: defaults apply */

	*pi = i + used;
	return c;
}
|