Blame genUnicode.py

Packit 423ecb
#!/usr/bin/python -u
Packit 423ecb
#
Packit 423ecb
# Original script modified in November 2003 to take advantage of
Packit 423ecb
# the character-validation range routines, and updated to the
Packit 423ecb
# current Unicode information (Version 4.0.1)
Packit 423ecb
#
Packit 423ecb
# NOTE: there is an 'alias' facility for blocks which are not present in
Packit 423ecb
#	the current release, but are needed for ABI compatibility.  This
Packit 423ecb
#	must be accomplished MANUALLY!  Please see the comments below under
Packit 423ecb
#     'blockAliases'
Packit 423ecb
#
Packit 423ecb
import sys
Packit 423ecb
import string
Packit 423ecb
import time
Packit 423ecb
Packit 423ecb
# Reference page of the Unicode Character Database release in use, and the
# two data files (block list first, character data second) that this script
# opens by relative name.  Both strings are echoed into the generated files.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold for emitting a range table: a category gets
# a table only when it has MORE than this many ranges; otherwise inline
# comparisons are generated.
minTableSize = 8

# str.split() is used instead of the deprecated string.split() function.
(blockfile, catfile) = sources.split()
Packit 423ecb
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# Now process the "blocks" file, reducing it to a dictionary
Packit 423ecb
# indexed by blockname, containing a tuple with the applicable
Packit 423ecb
# block range
Packit 423ecb
#
Packit 423ecb
# BlockNames maps a block name (spaces removed) to a list of
# ("0x<start>", "0x<end>") string pairs, one per line naming that block.
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# Data lines are expected in the form "<start>..<end>; <block name>".
# Lines starting with '#' and empty lines are ignored; anything else that
# does not fit the expected form is reported and skipped.
try:
    for line in blocks.readlines():
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # IndexError: no ';' separated name; ValueError: no "a..b" range
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    # close the input even if processing stops unexpectedly
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
Packit 423ecb
Packit 423ecb
# Expand every "OldName:New1[,New2...]" alias: the old name receives the
# ranges of each new block that was actually found in the blocks file.
# Components that are unknown are reported and otherwise ignored.
for entry in blockAliases:
    parts = entry.split(':')
    old_name = parts[0]
    for comp in parts[1].split(','):
        if comp in BlockNames:
            BlockNames.setdefault(old_name, []).extend(BlockNames[comp])
        else:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# Next process the Categories file. This is more complex, since
Packit 423ecb
# the file is in code sequence, and we need to invert it.  We use
Packit 423ecb
# a dictionary with index category-name, with each entry containing
Packit 423ecb
# all the ranges (codepoints) of that category.  Note that category
Packit 423ecb
# names comprise two parts - the general category, and the "subclass"
Packit 423ecb
# within that category.  Therefore, both "general category" (which is
Packit 423ecb
# the first character of the 2-character category-name) and the full
Packit 423ecb
# (2-character) name are entered into this dictionary.
Packit 423ecb
#
Packit 423ecb
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

# nbchar counts the accepted data lines; Categories maps a category name to
# the list of code points (integers, in file order) carrying that category.
nbchar = 0
Categories = {}
try:
    for line in data.readlines():
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # field 0 is the code point in hexadecimal, field 2 the category
            value = int(fields[0].strip(), 16)
            name = fields[2]
            if name == '':
                # an empty name cannot yield a general category (name[0])
                raise ValueError("empty category field")
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # record the code point under the full name and under the
        # one-letter general category (first character of the name)
        for key in (name, name[0]):
            Categories.setdefault(key, []).append(value)
finally:
    # close the character data file (not the blocks file closed earlier)
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# The data is now all read.  Time to process it into a more useful form.
Packit 423ecb
#
Packit 423ecb
# reduce the number list into ranges
Packit 423ecb
# Replace each category's list of code points by a list of inclusive
# (first, last) tuples, merging runs of consecutive values.  -1 marks
# "no run started yet".
for cat in Categories.keys():
    ranges = []
    first = -1
    last = -1
    for code in Categories[cat]:
        if first != -1 and code == last + 1:
            # still inside the current run
            last = code
            continue
        if first != -1:
            # the run is broken: flush it before starting a new one
            ranges.append((first, last))
        first = code
        last = code
    # flush the final run
    ranges.append((first, last))
    Categories[cat] = ranges
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# Assure all data is in alphabetic order, since we will be doing binary
Packit 423ecb
# searches on the tables.
Packit 423ecb
#
Packit 423ecb
# Sorted lists of the block names and of the category names.
bkeys = list(BlockNames)
bkeys.sort()

ckeys = list(Categories)
ckeys.sort()
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# Generate the resulting files
Packit 423ecb
#
Packit 423ecb
# Open the two generated files for writing (paths are relative to the
# current directory).  Only I/O failures are reported as "Failed to open";
# any other exception is left to propagate.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # do not leave the already opened header handle dangling
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Generation timestamp (local time) embedded in both generated files.
date = time.asctime(time.localtime(time.time()))
Packit 423ecb
Packit 423ecb
# Opening part of the generated public header: banner comment, include
# guard, feature test and the extern "C" opener.  The three %s slots take
# the UCD web page, the generation date and the source file names.
header_prologue = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(header_prologue % (webpage, date, sources))
Packit 423ecb
Packit 423ecb
# Opening part of the generated C file: banner comment, includes, the
# name-table types, the forward declaration of xmlUnicodeLookup and the
# opening line of the xmlUnicodeBlocks array.  The three %s slots take the
# UCD web page, the generation date and the source file names.  Tabs in the
# emitted C text are spelled \t here.
source_prologue = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);\t/* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int\t\t    numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(source_prologue % (webpage, date, sources))
Packit 423ecb
Packit 423ecb
# Body of xmlUnicodeBlocks: one {"name", function} initializer per block, in
# sorted order, separated by ",\n".  '-' is dropped from the block name to
# form the C function name.
block_rows = ['  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
              for block in bkeys]
output.write(',\n'.join(block_rows))
output.write('};\n\n')

# Same layout for the category table.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = ['  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(cat_rows))
output.write('};\n\n')
Packit 423ecb
Packit 423ecb
#
Packit 423ecb
# For any categories with more than minTableSize ranges we generate
Packit 423ecb
# a range table suitable for xmlCharInRange
Packit 423ecb
#
Packit 423ecb
# Emit, for every category with more than minTableSize ranges, an
# xmlChSRange array (ranges whose upper bound is below 0x10000), an
# xmlChLRange array (all other ranges) and the xmlChRangeGroup tying them
# together.  An array that would be empty is not emitted and is referenced
# as NULL in the group.
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        continue
    short_count = 0
    long_count = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            if short_count:
                pline += ", "
            else:
                # first short range: open the short array
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            short_count += 1
        elif long_count:
            pline += ", "
            long_count += 1
        else:
            # first long range: finish the short array (if one was opened)
            # and open the long array
            if short_count:
                output.write(pline + " };\n")
            pline = "static const xmlChLRange xml%sL[] = {" % name
            lptr = "xml%sL" % name
            long_count += 1
        # wrap the emitted C line once it grows past 60 characters
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        pline += "{%s, %s}" % (hex(low), hex(high))
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, short_count, long_count, sptr, lptr))
Packit 423ecb
Packit 423ecb
Packit 423ecb
# Emit the two name-table descriptors (with their entry counts) and the
# static binary-search helper xmlUnicodeLookup.  Compared with the previous
# template, the generated doc comment now names the real parameter (@tname)
# and the local cursor is const-qualified to match the const table pointer
# it is assigned from.  Tabs in the emitted C text are spelled \t here.
output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    const xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
\tmid = (low + high) / 2;
\tif ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
\t    return (sptr[mid].func);
\tif (cmp < 0)
\t    high = mid - 1;
\telse
\t    low = mid + 1;
    }
    return (NULL);    
}

""" % (len(BlockNames), len(Categories)))
Packit 423ecb
Packit 423ecb
# For every block: declare xmlUCSIs<Block> in the header and emit its
# definition, a single return statement OR-ing one interval test per range
# recorded for the block.  '-' is dropped from the block name to form the
# C identifier.
for block in bkeys:
    ident = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % ident)
    interval_tests = ["((code >= %s) && (code <= %s))" % (low, high)
                      for (low, high) in BlockNames[block]]
    pieces = [
        "/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (ident),
        " *\n * Check whether the character is part of %s UCS Block\n" % (block),
        " *\n * Returns 1 if true 0 otherwise\n */\n",
        "int\nxmlUCSIs%s(int code) {\n    return(" % ident,
        " ||\n           ".join(interval_tests),
        ");\n}\n\n",
    ]
    output.write("".join(pieces))
Packit 423ecb
Packit 423ecb
# Declare and emit xmlUCSIsBlock, which resolves a block name through
# xmlUnicodeLookup and calls the matching predicate.  The tab in the
# emitted C text is spelled \t here.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
is_block_source = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

"""
output.write(is_block_source)
Packit 423ecb
Packit 423ecb
# For every category: declare xmlUCSIsCat<Name> in the header and emit its
# definition.  Categories with more than minTableSize ranges defer to
# xmlCharInRange on their range group; the others get inline comparisons
# (an equality test for a one-value range, an interval test otherwise).
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    intro = [
        "/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name),
        " *\n * Check whether the character is part of %s UCS Category\n" % (name),
        " *\n * Returns 1 if true 0 otherwise\n */\n",
        "int\nxmlUCSIsCat%s(int code) {\n" % name,
    ]
    output.write("".join(intro))
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    elif ranges:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        output.write("    return(" + " ||\n           ".join(tests))
    # closes the return( opened in either branch above
    output.write(");\n}\n\n")
Packit 423ecb
Packit 423ecb
# Declare and emit xmlUCSIsCat, which resolves a category name through
# xmlUnicodeLookup and calls the matching predicate, followed by the
# closing lines of the generated C file.  The tab in the emitted C text is
# spelled \t here.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
is_cat_source = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(is_cat_source)
Packit 423ecb
Packit 423ecb
# Closing part of the generated header: extern "C" closer and the two
# #endif lines matching the feature test and the include guard.
header_epilogue = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""
header.write(header_epilogue)

# Both generated files are complete; close the header first, then the C file.
for stream in (header, output):
    stream.close()