|
Packit |
89ede9 |
/*
|
|
Packit |
89ede9 |
* utf8.c
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Copyright (c) Chris Putnam 2004-2018
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
* Source code released under the GPL version 2
|
|
Packit |
89ede9 |
*
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
#include <stdio.h>
|
|
Packit |
89ede9 |
#include <string.h>
|
|
Packit |
89ede9 |
#include "utf8.h"
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* UTF-8 encoding
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
U-00000000 - U-0000007F: 0xxxxxxx
|
|
Packit |
89ede9 |
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
|
|
Packit |
89ede9 |
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
|
|
Packit |
89ede9 |
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
Packit |
89ede9 |
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
Packit |
89ede9 |
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
*/
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_build()
 *
 * Copy the low-order bits of value into the payload bit positions of a
 * UTF-8 sequence whose marker bits the caller has already stored in out.
 *
 *   value   - character code supplying the bits
 *   out     - sequence under construction; bits are OR-ed in, never cleared
 *   in_pos  - first bit of value to copy, counted from the most
 *             significant bit (0 = bit 31, 31 = bit 0)
 *   out_pos - first bit of out[0] to fill, counted the same way
 *             (0 = 0x80, 7 = 0x01)
 *
 * Every byte after out[0] is filled starting at position 2, which skips
 * the two marker bits of a 10xxxxxx continuation byte.
 */
static void
utf8_build( unsigned int value, unsigned char out[6], int in_pos, int out_pos )
{
	int byte = 0;

	for ( ; in_pos < 32; ++in_pos ) {
		/* unsigned constants: shifting a signed 1 into bit 31 is undefined */
		unsigned int in_mask  = 1u << ( 31 - in_pos );
		unsigned int out_mask = 1u << ( 7 - out_pos );

		if ( value & in_mask ) out[byte] |= out_mask;

		/* current byte is full: move on to the next continuation byte */
		if ( ++out_pos > 7 ) {
			out_pos = 2;
			byte++;
		}
	}
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_encode()
 *
 * Encode a character code as a UTF-8 byte sequence.
 *
 *   value - character code, 0x0 -> 0x7FFFFFFF
 *   out   - receives the encoded bytes; must have room for 6
 *
 * Returns the number of bytes in the encoding (1 to 6), or 0 if value
 * is 0x80000000 or above and so cannot be encoded. In the error case
 * out[0] is left untouched.
 */
int
utf8_encode( unsigned int value, unsigned char out[6] )
{
	int i;

	/* preload every possible continuation byte with its marker;
	 * utf8_build() only ORs payload bits on top of it */
	for ( i=1; i<6; ++i ) out[i] = 0x80;   /* 10xxxxxx */

	if ( value < 0x80 ) {
		out[0] = 0x0;                  /* 0xxxxxxx,  7 payload bits */
		utf8_build( value, out, 25, 1 );
		return 1;
	}
	if ( value < 0x800 ) {
		out[0] = 0xC0;                 /* 110xxxxx, 11 payload bits */
		utf8_build( value, out, 21, 3 );
		return 2;
	}
	if ( value < 0x10000 ) {
		out[0] = 0xE0;                 /* 1110xxxx, 16 payload bits */
		utf8_build( value, out, 16, 4 );
		return 3;
	}
	if ( value < 0x200000 ) {
		out[0] = 0xF0;                 /* 11110xxx, 21 payload bits */
		utf8_build( value, out, 11, 5 );
		return 4;
	}
	if ( value < 0x4000000 ) {
		out[0] = 0xF8;                 /* 111110xx, 26 payload bits */
		utf8_build( value, out, 6, 6 );
		return 5;
	}
	if ( value < (unsigned int ) 0x80000000 ) {
		out[0] = 0xFC;                 /* 1111110x, 31 payload bits */
		utf8_build( value, out, 1, 7 );
		return 6;
	}

	/* error, more than 31 bits cannot be encoded by UTF-8 */
	return 0;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_encode_str()
 *
 * Generate UTF8 character as null-terminated string.
 *
 *   value  - character code to encode
 *   outstr - receives the result; must have room for 7 chars
 *            (up to 6 encoded bytes plus the terminating '\0')
 *
 * When utf8_encode() reports 0 bytes, outstr becomes the empty string.
 */
void
utf8_encode_str( unsigned int value, char outstr[7] )
{
	unsigned char encoded[6];
	int i, n;

	n = utf8_encode( value, encoded );
	for ( i=0; i<n; ++i )
		outstr[i] = ( char ) encoded[i];
	outstr[n] = '\0';
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_decode()
 *
 * Decode the UTF-8 sequence that starts at s[*pi].
 *
 *   s  - null-terminated string holding UTF-8 data
 *   pi - in: index of the first byte of the sequence
 *        out: index of the first byte after what was consumed
 *
 * Returns the decoded character code. Sequences of 1 to 6 bytes are
 * accepted. If the lead byte is not a valid lead byte, or one of the
 * bytes that should follow it is not a 10xxxxxx continuation byte
 * (including the string's terminating '\0'), the function returns '?'
 * and advances *pi by a single byte, so it never reads or steps past
 * the end of the string.
 */
unsigned int
utf8_decode( char *s, unsigned int *pi )
{
	/* work on unsigned bytes so high-bit characters are not sign-extended */
	const unsigned char *p = ( const unsigned char * ) s + *pi;
	unsigned int c, len, k;

	if ( ( p[0] & 0x80 ) == 0x00 ) {        /* 0xxxxxxx */
		c = p[0];
		len = 1;
	} else if ( ( p[0] & 0xE0 ) == 0xC0 ) { /* 110xxxxx */
		c = p[0] & 0x1F;
		len = 2;
	} else if ( ( p[0] & 0xF0 ) == 0xE0 ) { /* 1110xxxx */
		c = p[0] & 0x0F;
		len = 3;
	} else if ( ( p[0] & 0xF8 ) == 0xF0 ) { /* 11110xxx */
		c = p[0] & 0x07;
		len = 4;
	} else if ( ( p[0] & 0xFC ) == 0xF8 ) { /* 111110xx */
		c = p[0] & 0x03;
		len = 5;
	} else if ( ( p[0] & 0xFE ) == 0xFC ) { /* 1111110x */
		c = p[0] & 0x01;
		len = 6;
	} else {
		/* stray continuation byte, 0xFE or 0xFF */
		*pi += 1;
		return '?';
	}

	for ( k = 1; k < len; ++k ) {
		/* stop at the first byte that is not 10xxxxxx; this also
		 * catches the terminator of a truncated sequence */
		if ( ( p[k] & 0xC0 ) != 0x80 ) {
			*pi += 1;
			return '?';
		}
		c = ( c << 6 ) | ( p[k] & 0x3F );
	}

	*pi += len;
	return c;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_writebom()
 *
 * Write the UTF-8 encoding of the byte order mark (0xFEFF) to outptr.
 */
void
utf8_writebom( FILE *outptr )
{
	unsigned char code[6];
	int i, nc;

	nc = utf8_encode( 0xFEFF, code );
	for ( i=0; i<nc; ++i )
		fprintf(outptr,"%c",code[i]);
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_is_bom()
 *
 * Returns 1 if the null-terminated string p starts with the three
 * bytes 0xEF 0xBB 0xBF, 0 otherwise.
 */
int
utf8_is_bom( char *p )
{
	const unsigned char *up = ( const unsigned char * ) p;

	/* a too-short string is safe: its '\0' fails the comparison at that
	 * position and && stops before any later byte is read */
	return ( up[0]==0xEF && up[1]==0xBB && up[2]==0xBF ) ? 1 : 0;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_is_emdash()
 *
 * emdash = 0xE2 0x80 0x94
 *
 * Returns 1 if the null-terminated string p starts with those three
 * bytes, 0 otherwise.
 */
int
utf8_is_emdash( char *p )
{
	/* compare as unsigned bytes so the result does not depend on
	 * whether plain char is signed */
	const unsigned char *up = ( const unsigned char * ) p;

	/* a too-short string is safe: its '\0' fails the comparison at that
	 * position and && stops before any later byte is read */
	return ( up[0]==0xE2 && up[1]==0x80 && up[2]==0x94 ) ? 1 : 0;
}
|
|
Packit |
89ede9 |
|
|
Packit |
89ede9 |
/* utf8_is_endash()
 *
 * endash = 0xE2 0x80 0x93
 *
 * Returns 1 if the null-terminated string p starts with those three
 * bytes, 0 otherwise.
 */
int
utf8_is_endash( char *p )
{
	/* compare as unsigned bytes so the result does not depend on
	 * whether plain char is signed */
	const unsigned char *up = ( const unsigned char * ) p;

	/* a too-short string is safe: its '\0' fails the comparison at that
	 * position and && stops before any later byte is read */
	return ( up[0]==0xE2 && up[1]==0x80 && up[2]==0x93 ) ? 1 : 0;
}
|
|
Packit |
89ede9 |
|