/*
 * unicode.c
 *
 * Helper Unicode functions and tables for determining the
 * types of Unicode characters.
 */
#include "utf8.h"
#include "unicode.h"

/*
 * unicodeinfo_t
 *
 * One row of the classification table: a code point paired with the
 * UNICODE_* classification stored for it.
 */
typedef struct unicodeinfo_entry {
	unsigned int   value;  /* code point this row describes */
	unsigned short info;   /* UNICODE_* classification for that code point */
} unicodeinfo_t;

/*
 * unicodeinfo[]
 *
 * Classification table mapping code points (decimal) to UNICODE_NUMBER,
 * UNICODE_UPPER, or UNICODE_LOWER.
 *
 * Entries MUST stay sorted by ascending value: unicode_find() locates
 * entries with a binary search.  Code points without an entry are
 * reported as UNICODE_SYMBOL by the classify functions in this file.
 */
static unicodeinfo_t unicodeinfo[] = {
	{  48, UNICODE_NUMBER }, /* 0 */
	{  49, UNICODE_NUMBER }, /* 1 */
	{  50, UNICODE_NUMBER }, /* 2 */
	{  51, UNICODE_NUMBER }, /* 3 */
	{  52, UNICODE_NUMBER }, /* 4 */
	{  53, UNICODE_NUMBER }, /* 5 */
	{  54, UNICODE_NUMBER }, /* 6 */
	{  55, UNICODE_NUMBER }, /* 7 */
	{  56, UNICODE_NUMBER }, /* 8 */
	{  57, UNICODE_NUMBER }, /* 9 */
	{  65, UNICODE_UPPER }, /* Latin Capital A */
	{  66, UNICODE_UPPER }, /* Latin Capital B */
	{  67, UNICODE_UPPER }, /* Latin Capital C */
	{  68, UNICODE_UPPER }, /* Latin Capital D */
	{  69, UNICODE_UPPER }, /* Latin Capital E */
	{  70, UNICODE_UPPER }, /* Latin Capital F */
	{  71, UNICODE_UPPER }, /* Latin Capital G */
	{  72, UNICODE_UPPER }, /* Latin Capital H */
	{  73, UNICODE_UPPER }, /* Latin Capital I */
	{  74, UNICODE_UPPER }, /* Latin Capital J */
	{  75, UNICODE_UPPER }, /* Latin Capital K */
	{  76, UNICODE_UPPER }, /* Latin Capital L */
	{  77, UNICODE_UPPER }, /* Latin Capital M */
	{  78, UNICODE_UPPER }, /* Latin Capital N */
	{  79, UNICODE_UPPER }, /* Latin Capital O */
	{  80, UNICODE_UPPER }, /* Latin Capital P */
	{  81, UNICODE_UPPER }, /* Latin Capital Q */
	{  82, UNICODE_UPPER }, /* Latin Capital R */
	{  83, UNICODE_UPPER }, /* Latin Capital S */
	{  84, UNICODE_UPPER }, /* Latin Capital T */
	{  85, UNICODE_UPPER }, /* Latin Capital U */
	{  86, UNICODE_UPPER }, /* Latin Capital V */
	{  87, UNICODE_UPPER }, /* Latin Capital W */
	{  88, UNICODE_UPPER }, /* Latin Capital X */
	{  89, UNICODE_UPPER }, /* Latin Capital Y */
	{  90, UNICODE_UPPER }, /* Latin Capital Z */
	{  97, UNICODE_LOWER }, /* Latin Small   a */
	{  98, UNICODE_LOWER }, /* Latin Small   b */
	{  99, UNICODE_LOWER }, /* Latin Small   c */
	{ 100, UNICODE_LOWER }, /* Latin Small   d */
	{ 101, UNICODE_LOWER }, /* Latin Small   e */
	{ 102, UNICODE_LOWER }, /* Latin Small   f */
	{ 103, UNICODE_LOWER }, /* Latin Small   g */
	{ 104, UNICODE_LOWER }, /* Latin Small   h */
	{ 105, UNICODE_LOWER }, /* Latin Small   i */
	{ 106, UNICODE_LOWER }, /* Latin Small   j */
	{ 107, UNICODE_LOWER }, /* Latin Small   k */
	{ 108, UNICODE_LOWER }, /* Latin Small   l */
	{ 109, UNICODE_LOWER }, /* Latin Small   m */
	{ 110, UNICODE_LOWER }, /* Latin Small   n */
	{ 111, UNICODE_LOWER }, /* Latin Small   o */
	{ 112, UNICODE_LOWER }, /* Latin Small   p */
	{ 113, UNICODE_LOWER }, /* Latin Small   q */
	{ 114, UNICODE_LOWER }, /* Latin Small   r */
	{ 115, UNICODE_LOWER }, /* Latin Small   s */
	{ 116, UNICODE_LOWER }, /* Latin Small   t */
	{ 117, UNICODE_LOWER }, /* Latin Small   u */
	{ 118, UNICODE_LOWER }, /* Latin Small   v */
	{ 119, UNICODE_LOWER }, /* Latin Small   w */
	{ 120, UNICODE_LOWER }, /* Latin Small   x */
	{ 121, UNICODE_LOWER }, /* Latin Small   y */
	{ 122, UNICODE_LOWER }, /* Latin Small   z */
	{ 192, UNICODE_UPPER }, /* Latin Capital A with grave */
	{ 193, UNICODE_UPPER }, /* Latin Capital A with acute */
	{ 194, UNICODE_UPPER }, /* Latin Capital A with circumflex */
	{ 195, UNICODE_UPPER }, /* Latin Capital A with tilde */
	{ 196, UNICODE_UPPER }, /* Latin Capital A with diaeresis */
	{ 197, UNICODE_UPPER }, /* Latin Capital A with ring above */
	{ 198, UNICODE_UPPER }, /* Latin Capital AE */
	{ 199, UNICODE_UPPER }, /* Latin Capital C with cedilla */
	{ 200, UNICODE_UPPER }, /* Latin Capital E with grave */
	{ 201, UNICODE_UPPER }, /* Latin Capital E with acute */
	{ 202, UNICODE_UPPER }, /* Latin Capital E with circumflex */
	{ 203, UNICODE_UPPER }, /* Latin Capital E with diaeresis */
	{ 204, UNICODE_UPPER }, /* Latin Capital I with grave */
	{ 205, UNICODE_UPPER }, /* Latin Capital I with acute */
	{ 206, UNICODE_UPPER }, /* Latin Capital I with circumflex */
	{ 207, UNICODE_UPPER }, /* Latin Capital I with diaeresis */
	{ 208, UNICODE_UPPER }, /* Latin Capital ETH */
	{ 209, UNICODE_UPPER }, /* Latin Capital N with tilde */
	{ 210, UNICODE_UPPER }, /* Latin Capital O with grave */
	{ 211, UNICODE_UPPER }, /* Latin Capital O with acute */
	{ 212, UNICODE_UPPER }, /* Latin Capital O with circumflex */
	{ 213, UNICODE_UPPER }, /* Latin Capital O with tilde */
	{ 214, UNICODE_UPPER }, /* Latin Capital O with diaeresis */
	{ 216, UNICODE_UPPER }, /* Latin Capital O with stroke */
	{ 217, UNICODE_UPPER }, /* Latin Capital U with grave */
	{ 218, UNICODE_UPPER }, /* Latin Capital U with acute */
	{ 219, UNICODE_UPPER }, /* Latin Capital U with circumflex */
	{ 220, UNICODE_UPPER }, /* Latin Capital U with diaeresis */
	{ 221, UNICODE_UPPER }, /* Latin Capital Y with acute */
	{ 222, UNICODE_UPPER }, /* Latin Capital THORN */
	{ 223, UNICODE_LOWER }, /* Latin Small   sharp s (German sz ligature) */
	{ 224, UNICODE_LOWER }, /* Latin Small   a with grave */
	{ 225, UNICODE_LOWER }, /* Latin Small   a with acute */
	{ 226, UNICODE_LOWER }, /* Latin Small   a with circumflex */
	{ 227, UNICODE_LOWER }, /* Latin Small   a with tilde */
	{ 228, UNICODE_LOWER }, /* Latin Small   a with diaeresis */
	{ 229, UNICODE_LOWER }, /* Latin Small   a with ring above */
	{ 230, UNICODE_LOWER }, /* Latin Small   ae */
	{ 231, UNICODE_LOWER }, /* Latin Small   c with cedilla */
	{ 232, UNICODE_LOWER }, /* Latin Small   e with grave */
	{ 233, UNICODE_LOWER }, /* Latin Small   e with acute */
	{ 234, UNICODE_LOWER }, /* Latin Small   e with circumflex */
	{ 235, UNICODE_LOWER }, /* Latin Small   e with diaeresis */
	{ 236, UNICODE_LOWER }, /* Latin Small   i with grave */
	{ 237, UNICODE_LOWER }, /* Latin Small   i with acute */
	{ 238, UNICODE_LOWER }, /* Latin Small   i with circumflex */
	{ 239, UNICODE_LOWER }, /* Latin Small   i with diaeresis */
	{ 240, UNICODE_LOWER }, /* Latin Small   eth */
	{ 241, UNICODE_LOWER }, /* Latin Small   n with tilde */
	{ 242, UNICODE_LOWER }, /* Latin Small   o with grave */
	{ 243, UNICODE_LOWER }, /* Latin Small   o with acute */
	{ 244, UNICODE_LOWER }, /* Latin Small   o with circumflex */
	{ 245, UNICODE_LOWER }, /* Latin Small   o with tilde */
	{ 246, UNICODE_LOWER }, /* Latin Small   o with diaeresis */
	{ 248, UNICODE_LOWER }, /* Latin Small   o with stroke */
	{ 249, UNICODE_LOWER }, /* Latin Small   u with grave */
	{ 250, UNICODE_LOWER }, /* Latin Small   u with acute */
	{ 251, UNICODE_LOWER }, /* Latin Small   u with circumflex */
	{ 252, UNICODE_LOWER }, /* Latin Small   u with diaeresis */
	{ 253, UNICODE_LOWER }, /* Latin Small   y with acute */
	{ 254, UNICODE_LOWER }, /* Latin Small   thorn */
	{ 255, UNICODE_LOWER }, /* Latin Small   y with diaeresis */
	{ 256, UNICODE_UPPER }, /* Latin Capital A with macron */
	{ 257, UNICODE_LOWER }, /* Latin Small   a with macron */
	{ 258, UNICODE_UPPER }, /* Latin Capital A with breve */
	{ 259, UNICODE_LOWER }, /* Latin Small   a with breve */
	{ 260, UNICODE_UPPER }, /* Latin Capital A with ogonek */
	{ 261, UNICODE_LOWER }, /* Latin Small   a with ogonek */
	{ 262, UNICODE_UPPER }, /* Latin Capital C with acute */
	{ 263, UNICODE_LOWER }, /* Latin Small   c with acute */
	{ 264, UNICODE_UPPER }, /* Latin Capital C with circumflex */
	{ 265, UNICODE_LOWER }, /* Latin Small   c with circumflex */
	{ 266, UNICODE_UPPER }, /* Latin Capital C with dot above */
	{ 267, UNICODE_LOWER }, /* Latin Small   c with dot above */
	{ 268, UNICODE_UPPER }, /* Latin Capital C with caron (hacek) */
	{ 269, UNICODE_LOWER }, /* Latin Small   c with caron (hacek) */
	{ 270, UNICODE_UPPER }, /* Latin Capital D with caron (hacek) */
	{ 271, UNICODE_LOWER }, /* Latin Small   d with caron (hacek) */
	{ 272, UNICODE_UPPER }, /* Latin Capital D with stroke */
	{ 273, UNICODE_LOWER }, /* Latin Small   d with stroke */
	{ 274, UNICODE_UPPER }, /* Latin Capital E with macron */
	{ 275, UNICODE_LOWER }, /* Latin Small   e with macron */
	{ 276, UNICODE_UPPER }, /* Latin Capital E with breve */
	{ 277, UNICODE_LOWER }, /* Latin Small   e with breve */
	{ 278, UNICODE_UPPER }, /* Latin Capital E with dot above */
	{ 279, UNICODE_LOWER }, /* Latin Small   e with dot above */
	{ 280, UNICODE_UPPER }, /* Latin Capital E with ogonek */
	{ 281, UNICODE_LOWER }, /* Latin Small   e with ogonek */
	{ 282, UNICODE_UPPER }, /* Latin Capital E with caron (hacek) */
	{ 283, UNICODE_LOWER }, /* Latin Small   e with caron (hacek) */
	{ 284, UNICODE_UPPER }, /* Latin Capital G with circumflex */
	{ 285, UNICODE_LOWER }, /* Latin Small   g with circumflex */
	{ 286, UNICODE_UPPER }, /* Latin Capital G with breve */
	{ 287, UNICODE_LOWER }, /* Latin Small   g with breve */
	{ 288, UNICODE_UPPER }, /* Latin Capital G with dot above */
	{ 289, UNICODE_LOWER }, /* Latin Small   g with dot above */
	{ 290, UNICODE_UPPER }, /* Latin Capital G with cedilla */
	{ 291, UNICODE_LOWER }, /* Latin Small   g with cedilla */
	{ 292, UNICODE_UPPER }, /* Latin Capital H with circumflex */
	{ 293, UNICODE_LOWER }, /* Latin Small   h with circumflex */
	{ 294, UNICODE_UPPER }, /* Latin Capital H with stroke */
	{ 295, UNICODE_LOWER }, /* Latin Small   h with stroke */
	{ 296, UNICODE_UPPER }, /* Latin Capital I with tilde */
	{ 297, UNICODE_LOWER }, /* Latin Small   i with tilde */
	{ 298, UNICODE_UPPER }, /* Latin Capital I with macron */
	{ 299, UNICODE_LOWER }, /* Latin Small   i with macron */
	{ 300, UNICODE_UPPER }, /* Latin Capital I with breve */
	{ 301, UNICODE_LOWER }, /* Latin Small   i with breve */
	{ 302, UNICODE_UPPER }, /* Latin Capital I with ogonek */
	{ 303, UNICODE_LOWER }, /* Latin Small   i with ogonek */
	{ 304, UNICODE_UPPER }, /* Latin Capital I with dot above */
	{ 305, UNICODE_LOWER }, /* Latin Small   i without dot above */
	{ 306, UNICODE_UPPER }, /* Latin Capital IJ */
	{ 307, UNICODE_LOWER }, /* Latin Small   ij */
	{ 308, UNICODE_UPPER }, /* Latin Capital J with circumflex */
	{ 309, UNICODE_LOWER }, /* Latin Small   j with circumflex */
	{ 310, UNICODE_UPPER }, /* Latin Capital K with cedilla */
	{ 311, UNICODE_LOWER }, /* Latin Small   k with cedilla */
	{ 312, UNICODE_LOWER }, /* Latin Small   kra */
	{ 313, UNICODE_UPPER }, /* Latin Capital L with acute */
	{ 314, UNICODE_LOWER }, /* Latin Small   l with acute */
	{ 315, UNICODE_UPPER }, /* Latin Capital L with cedilla */
	{ 316, UNICODE_LOWER }, /* Latin Small   l with cedilla */
	{ 317, UNICODE_UPPER }, /* Latin Capital L with caron */
	{ 318, UNICODE_LOWER }, /* Latin Small   l with caron */
	{ 319, UNICODE_UPPER }, /* Latin Capital L with middle dot */
	{ 320, UNICODE_LOWER }, /* Latin Small   l with middle dot */
	{ 321, UNICODE_UPPER }, /* Latin Capital L with stroke */
	{ 322, UNICODE_LOWER }, /* Latin Small   l with stroke */
	{ 323, UNICODE_UPPER }, /* Latin Capital N with acute */
	{ 324, UNICODE_LOWER }, /* Latin Small   n with acute */
	{ 325, UNICODE_UPPER }, /* Latin Capital N with cedilla */
	{ 326, UNICODE_LOWER }, /* Latin Small   n with cedilla */
	{ 327, UNICODE_UPPER }, /* Latin Capital N with caron */
	{ 328, UNICODE_LOWER }, /* Latin Small   n with caron */
	{ 329, UNICODE_LOWER }, /* Latin Small   n preceded by apostrophe */
	{ 330, UNICODE_UPPER }, /* Latin Capital Eng */
	{ 331, UNICODE_LOWER }, /* Latin Small   eng */
	{ 332, UNICODE_UPPER }, /* Latin Capital O with macron */
	{ 333, UNICODE_LOWER }, /* Latin Small   o with macron */
	{ 334, UNICODE_UPPER }, /* Latin Capital O with breve */
	{ 335, UNICODE_LOWER }, /* Latin Small   o with breve */
	{ 336, UNICODE_UPPER }, /* Latin Capital O with double acute */
	{ 337, UNICODE_LOWER }, /* Latin Small   o with double acute */
	{ 338, UNICODE_UPPER }, /* Latin Capital OE */
	{ 339, UNICODE_LOWER }, /* Latin Small   oe */
	{ 340, UNICODE_UPPER }, /* Latin Capital R with acute */
	{ 341, UNICODE_LOWER }, /* Latin Small   r with acute */
	{ 342, UNICODE_UPPER }, /* Latin Capital R with cedilla */
	{ 343, UNICODE_LOWER }, /* Latin Small   r with cedilla */
	{ 344, UNICODE_UPPER }, /* Latin Capital R with caron */
	{ 345, UNICODE_LOWER }, /* Latin Small   r with caron */
	{ 346, UNICODE_UPPER }, /* Latin Capital S with acute */
	{ 347, UNICODE_LOWER }, /* Latin Small   s with acute */
	{ 348, UNICODE_UPPER }, /* Latin Capital S with circumflex */
	{ 349, UNICODE_LOWER }, /* Latin Small   s with circumflex */
	{ 350, UNICODE_UPPER }, /* Latin Capital S with cedilla */
	{ 351, UNICODE_LOWER }, /* Latin Small   s with cedilla */
	{ 352, UNICODE_UPPER }, /* Latin Capital S with caron */
	{ 353, UNICODE_LOWER }, /* Latin Small   s with caron */
	{ 354, UNICODE_UPPER }, /* Latin Capital T with cedilla */
	{ 355, UNICODE_LOWER }, /* Latin Small   t with cedilla */
	{ 356, UNICODE_UPPER }, /* Latin Capital T with caron */
	{ 357, UNICODE_LOWER }, /* Latin Small   t with caron */
	{ 358, UNICODE_UPPER }, /* Latin Capital T with stroke */
	{ 359, UNICODE_LOWER }, /* Latin Small   t with stroke */
	{ 360, UNICODE_UPPER }, /* Latin Capital U with tilde */
	{ 361, UNICODE_LOWER }, /* Latin Small   u with tilde */
	{ 362, UNICODE_UPPER }, /* Latin Capital U with macron */
	{ 363, UNICODE_LOWER }, /* Latin Small   u with macron */
	{ 364, UNICODE_UPPER }, /* Latin Capital U with breve */
	{ 365, UNICODE_LOWER }, /* Latin Small   u with breve */
	{ 366, UNICODE_UPPER }, /* Latin Capital U with ring above */
	{ 367, UNICODE_LOWER }, /* Latin Small   u with ring above */
	{ 368, UNICODE_UPPER }, /* Latin Capital U with double acute */
	{ 369, UNICODE_LOWER }, /* Latin Small   u with double acute */
	{ 370, UNICODE_UPPER }, /* Latin Capital U with ogonek */
	{ 371, UNICODE_LOWER }, /* Latin Small   u with ogonek */
	{ 372, UNICODE_UPPER }, /* Latin Capital W with circumflex */
	{ 373, UNICODE_LOWER }, /* Latin Small   w with circumflex */
	{ 374, UNICODE_UPPER }, /* Latin Capital Y with circumflex */
	{ 375, UNICODE_LOWER }, /* Latin Small   y with circumflex */
	{ 376, UNICODE_UPPER }, /* Latin Capital Y with diaeresis */
	{ 377, UNICODE_UPPER }, /* Latin Capital Z with acute */
	{ 378, UNICODE_LOWER }, /* Latin Small   z with acute */
	{ 379, UNICODE_UPPER }, /* Latin Capital Z with dot above */
	{ 380, UNICODE_LOWER }, /* Latin Small   z with dot above */
	{ 381, UNICODE_UPPER }, /* Latin Capital Z with caron */
	{ 382, UNICODE_LOWER }, /* Latin Small   z with caron */
	{ 383, UNICODE_LOWER }, /* Latin Small   long S */

	{ 461, UNICODE_UPPER }, /* Latin Capital A with caron (hacek) */
	{ 462, UNICODE_LOWER }, /* Latin Small   a with caron (hacek) */
	{ 463, UNICODE_UPPER }, /* Latin Capital I with caron (hacek) */
	{ 464, UNICODE_LOWER }, /* Latin Small   i with caron (hacek) */
	{ 465, UNICODE_UPPER }, /* Latin Capital O with caron (hacek) */
	{ 466, UNICODE_LOWER }, /* Latin Small   o with caron (hacek) */
	{ 467, UNICODE_UPPER }, /* Latin Capital U with caron (hacek) */
	{ 468, UNICODE_LOWER }, /* Latin Small   u with caron (hacek) */

	{ 486, UNICODE_UPPER }, /* Latin Capital G with caron */
	{ 487, UNICODE_LOWER }, /* Latin Small   g with caron */
	{ 488, UNICODE_UPPER }, /* Latin Capital K with caron */
	{ 489, UNICODE_LOWER }, /* Latin Small   k with caron */
	{ 490, UNICODE_UPPER }, /* Latin Capital O with ogonek */
	{ 491, UNICODE_LOWER }, /* Latin Small   o with ogonek */

	{ 500, UNICODE_UPPER }, /* Latin Capital G with acute */
	{ 501, UNICODE_LOWER }, /* Latin Small   g with acute */
};

/* Number of entries in unicodeinfo[], computed from the array's size. */
static int nunicodeinfo = (int) ( sizeof unicodeinfo / sizeof unicodeinfo[0] );

/*
 * unicode_find()
 *
 * Binary search of unicodeinfo[] (which must be sorted by ascending
 * value) for unicode_character.
 *
 * Returns the index of the matching entry, or -1 if the character
 * has no entry in the table.
 */
static int
unicode_find( unsigned int unicode_character )
{
	int min = 0, max = nunicodeinfo, mid;

	/* Narrow [min,max) to the first entry whose value is >= the target. */
	while ( min < max ) {
		mid = min + ( max - min ) / 2;
		if ( unicodeinfo[mid].value < unicode_character )
			min = mid + 1;
		else
			max = mid;
	}

	/*
	 * When the target is larger than every entry, the loop ends with
	 * min == nunicodeinfo, one past the last element.  Check the bound
	 * before indexing so the table is never read out of range.
	 */
	if ( min < nunicodeinfo && unicodeinfo[min].value == unicode_character )
		return min;

	return -1;
}

/*
 * unicode_utf8_classify()
 *
 * Decode one character from the start of p with utf8_decode() and
 * look it up in unicodeinfo[].
 *
 * Returns the info value stored for that character, or UNICODE_SYMBOL
 * if the character has no entry in the table.
 */
unsigned short
unicode_utf8_classify( char *p )
{
	unsigned int offset = 0;
	unsigned int codepoint = utf8_decode( p, &offset );
	int idx = unicode_find( codepoint );

	if ( idx != -1 )
		return unicodeinfo[idx].info;

	return UNICODE_SYMBOL;
}

/*
 * unicode_utf8_classify_str()
 *
 * Walk the contents of s, decoding one character at a time with
 * utf8_decode() until the decode position reaches s->len, and
 * bitwise-OR together the classification of every character found.
 * Characters with no entry in unicodeinfo[] contribute UNICODE_SYMBOL.
 *
 * Returns the combined classification bits; 0 if s->len is 0.
 */
unsigned short
unicode_utf8_classify_str( str *s )
{
	unsigned short flags = 0;
	unsigned int codepoint, offset;
	int idx;

	for ( offset = 0; offset < s->len; ) {
		/* utf8_decode() is handed &offset and is what moves the position forward */
		codepoint = utf8_decode( str_cstr( s ), &offset );
		idx = unicode_find( codepoint );
		flags |= ( idx == -1 ) ? UNICODE_SYMBOL : unicodeinfo[idx].info;
	}

	return flags;
}