#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_fold_data.py
# Copyright (c) 2016-2017  K.Kosako
Packit Service bd74e6
Packit Service bd74e6
import sys
Packit Service bd74e6
import re
Packit Service bd74e6
import codecs
Packit Service bd74e6
Packit Service bd74e6
# Case-folding table that the main section opens and parses.
SOURCE_FILE = 'CaseFolding.txt'
# gperf sources written by output_gperf_source(): one keyed by unfolded code
# point, and one per fold length (1, 2 and 3 code points).
GPERF_UNFOLD_KEY_FILE = 'unicode_unfold_key.gperf'
GPERF_FOLD_KEY_FILES  = ['unicode_fold1_key.gperf', 'unicode_fold2_key.gperf', 'unicode_fold3_key.gperf']


# Name prefix of the generated C arrays; the fold length is appended to it.
DataName = 'OnigUnicodeFolds'

ENCODING = 'utf-8'

# Data line: "<unfold>; <type>; <fold1>[ <fold2>[ <fold3>]]; # <comment>".
# Raw strings keep the backslash escapes intact without relying on Python
# passing unknown escapes through unchanged.
LINE_REG = re.compile(r"([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)")
# Header comment line that ends in "-<major>.<minor>.<patch>.txt".
VERSION_REG  = re.compile(r"#.*-(\d+\.\d+\.\d+)\.txt")

# Version string captured by check_version_info(); None until one is seen.
VERSION_INFO = None

# fold key (see fold_key()) -> Entry
FOLDS = {}
TURKISH_FOLDS = {}
LOCALE_FOLDS  = {}

# unfolded code point -> Entry
UNFOLDS = {}
TURKISH_UNFOLDS = {}
LOCALE_UNFOLDS  = {}
Packit Service bd74e6
Packit Service bd74e6
class Entry:
    """One fold sequence together with every code point that folds to it."""

    def __init__(self, fold):
        # fold: list of 1-3 folded code points (built by parse_line()).
        self.fold = fold
        # Code points whose case folding is `fold`.
        self.unfolds = []
        self.fold_len = len(fold)
        # Offset of this entry in the generated C array; -1 until
        # output_data_n1() assigns it.
        self.index = -1
        # Trailing comment of the source line that created the entry.
        self.comment = None
Packit Service bd74e6
Packit Service bd74e6
def fold_key(fold):
    """Return the dictionary key for a fold sequence.

    Each code point is rendered as six lowercase hex digits and the parts
    are joined with ':' (e.g. [0x73, 0x73] -> '000073:000073').
    """
    return ':'.join("%06x" % cp for cp in fold)
Packit Service bd74e6
Packit Service bd74e6
def form16(x, size):
    """Format code point `x` as a hex literal, right-aligned to `size`.

    Values above 0xffff use six hex digits, others four.  The result is
    never truncated: if it is already `size` characters or longer it is
    returned unchanged.
    """
    form = "0x%06x" if x > 0xffff else "0x%04x"
    return (form % x).rjust(size)
Packit Service bd74e6
Packit Service bd74e6
def form3bytes(x):
    """Return `x` as three C-style '\\xNN' escapes, most significant byte first.

    Only the low 24 bits of `x` are used.
    """
    low = x & 0xff
    mid = (x >> 8) & 0xff
    high = (x >> 16) & 0xff
    return "\\x%02x\\x%02x\\x%02x" % (high, mid, low)
Packit Service bd74e6
Packit Service bd74e6
def check_version_info(s):
    """Record the version found in comment line `s`, if none is known yet.

    The first line matching VERSION_REG sets the module-level VERSION_INFO;
    later calls leave it untouched.
    """
    global VERSION_INFO
    if VERSION_INFO is not None:
        return

    m = VERSION_REG.match(s)
    if m is not None:
        VERSION_INFO = m.group(1)
Packit Service bd74e6
Packit Service bd74e6
def parse_line(s):
    """Parse one stripped line of the case-folding table.

    Empty lines, comment lines and simple ('S') foldings are skipped.  Any
    other line is recorded in FOLDS/UNFOLDS, or in TURKISH_FOLDS/
    TURKISH_UNFOLDS when its type is 'T'.

    Returns True when the line contributed a mapping, False when it was
    skipped.  A non-comment line that does not match LINE_REG is echoed to
    stderr and the script exits with status -1.
    """
    if len(s) == 0:
        return False
    if s[0] == '#':
        check_version_info(s)
        return False

    m = LINE_REG.match(s)
    if m is None:
        # Encode only text that is not already the native str type (Python 2
        # unicode); encoding an already-encoded byte string would attempt an
        # implicit ASCII decode first and could raise on non-ASCII input.
        bad_line = s if isinstance(s, str) else s.encode(ENCODING)
        sys.stderr.write(bad_line + "\n")
        sys.exit(-1)

    s_type = m.group(2)
    if s_type == 'S':
        return False

    unfold = int(m.group(1), 16)
    comment = m.group(6)
    # Groups 3-5 hold the one to three folded code points; unused slots are None.
    fold = [int(g, 16) for g in m.group(3, 4, 5) if g is not None]

    if s_type == 'T':
        dic   = TURKISH_FOLDS
        undic = TURKISH_UNFOLDS
    else:
        dic   = FOLDS
        undic = UNFOLDS

    key = fold_key(fold)
    e = dic.get(key)
    if e is None:
        # First code point seen for this fold: its comment labels the entry.
        e = Entry(fold)
        e.comment = comment
        dic[key] = e

    e.unfolds.append(unfold)

    if undic.get(unfold) is not None:
        # Report the duplicate but keep going; the later entry wins.  The
        # message is followed by a blank line, as in the original output.
        sys.stderr.write("unfold dup: 0x%04x %s\n\n" % (unfold, s_type))
    undic[unfold] = e

    return True
Packit Service bd74e6
Packit Service bd74e6
def parse_file(f):
    """Feed every line of the open text file `f` to parse_line().

    Lines are stripped of surrounding whitespace first; parse_line()'s
    return value is ignored.
    """
    for line in f:
        parse_line(line.strip())
Packit Service bd74e6
Packit Service bd74e6
def make_locale():
    """Move mappings that also have a Turkish ('T') variant into the LOCALE tables.

    For every code point in TURKISH_UNFOLDS that is also present in UNFOLDS,
    its mapping is removed from FOLDS/UNFOLDS and stored in LOCALE_FOLDS/
    LOCALE_UNFOLDS instead.
    """
    for unfold in TURKISH_UNFOLDS:
        e = UNFOLDS.get(unfold)
        if e is None:
            continue

        fkey = fold_key(e.fold)
        if len(e.unfolds) == 1:
            # The entry exists only for this code point: move it wholesale.
            del FOLDS[fkey]
        else:
            # The entry is shared: detach this code point and give it a
            # separate entry with the same fold (the comment is not copied).
            e.unfolds.remove(unfold)
            e = Entry(e.fold)
            e.unfolds.append(unfold)

        LOCALE_FOLDS[fkey] = e
        LOCALE_UNFOLDS[unfold] = e
        del UNFOLDS[unfold]
Packit Service bd74e6
Packit Service bd74e6
def output_typedef(f):
    """Write the OnigCodePoint typedef, followed by a blank line, to `f`."""
    s = """\
typedef unsigned long OnigCodePoint;
"""
    # The extra newline reproduces the one the print statement used to add.
    f.write(s + "\n")
Packit Service bd74e6
Packit Service bd74e6
def divide_by_fold_len(d):
    """Split a fold dictionary into three key-sorted lists by fold length.

    `d` maps fold keys to entries that expose `fold_len`.  Returns a tuple
    (l1, l2, l3) where lN is the list of (key, entry) pairs whose fold_len
    is N, ordered by key.  Entries with any other fold_len are dropped.
    """
    sl1, sl2, sl3 = [], [], []
    by_len = {1: sl1, 2: sl2, 3: sl3}
    # Dictionary keys are unique, so walking them in sorted order yields each
    # bucket already sorted by key.
    for k in sorted(d):
        e = d[k]
        bucket = by_len.get(e.fold_len)
        if bucket is not None:
            bucket.append((k, e))
    return (sl1, sl2, sl3)
Packit Service bd74e6
Packit Service bd74e6
def output_comment(f, s):
    """Write `s` to `f` as a C comment with a leading space and no newline."""
    f.write(" /* %s */" % s)
Packit Service bd74e6
Packit Service bd74e6
def output_data_n1(f, n, fn, c, out_comment):
    """Write one array row per entry in `fn` and assign each entry its index.

    f           -- output file object
    n           -- fold length shared by every entry in `fn`
    fn          -- iterable of (key, entry) pairs
    c           -- array offset of the first row
    out_comment -- when true, emit each entry's comment

    A row holds the `n` folded code points, the number of unfolds, then the
    unfolds themselves.  Returns the offset just past the last row written.
    """
    for _, e in fn:
        e.index = c
        with_comment = out_comment and e.comment is not None

        # Multi-code-point folds get their comment on a line of its own.
        if with_comment and n > 1:
            output_comment(f, e.comment)
            f.write("\n")

        f.write(' ')
        f.write("/*%4d*/ " % c)
        for i in range(n):
            f.write(" %s," % form16(e.fold[i], 8))

        usize = len(e.unfolds)
        f.write("  %d," % usize)
        for u in e.unfolds:
            f.write(" %s," % form16(u, 8))

        # Single-code-point folds carry the comment at the end of the row,
        # shortened to 35 characters at most.
        if with_comment and n == 1:
            if len(e.comment) < 35:
                s = e.comment
            else:
                s = e.comment[0:33] + '..'

            output_comment(f, s)

        f.write("\n")
        # n folds + 1 count slot + usize unfolds occupy this row.
        c += n + 1 + usize

    return c
Packit Service bd74e6
Packit Service bd74e6
def output_data_n(f, name, n, fn, lfn, out_comment):
    """Write the C array `<name><n>` holding the folds of length `n`.

    The normal entries `fn` come first, then the locale entries `lfn`.  A
    FOLDS<n>_NORMAL_END_INDEX define is emitted after the normal entries and
    a FOLDS<n>_END_INDEX define after the locale entries; both are written
    inside the array initializer, before the closing brace.
    """
    f.write("OnigCodePoint %s%d[] = {\n" % (name, n))
    c = output_data_n1(f, n, fn, 0, out_comment)
    f.write("#define FOLDS%d_NORMAL_END_INDEX   %d\n" % (n, c))
    f.write(" /* ----- LOCALE ----- */\n")
    c = output_data_n1(f, n, lfn, c, out_comment)
    f.write("#define FOLDS%d_END_INDEX   %d\n" % (n, c))
    f.write("};\n")
Packit Service bd74e6
Packit Service bd74e6
def output_fold_data(f, name, out_comment):
    """Write the three fold arrays (fold lengths 1, 2 and 3) to `f`.

    Each array combines the matching entries of FOLDS and LOCALE_FOLDS and
    is followed by a blank line.
    """
    normal = divide_by_fold_len(FOLDS)
    locale = divide_by_fold_len(LOCALE_FOLDS)

    for n in (1, 2, 3):
        output_data_n(f, name, n, normal[n - 1], locale[n - 1], out_comment)
        f.write("\n")
    print >> f, ''
Packit Service bd74e6
Packit Service bd74e6
def output_macros(f, name):
    """Write the C accessor macros for the three fold arrays to `f`.

    `name` is the array name prefix; the macros refer to `<name>1`,
    `<name>2` and `<name>3`.
    """
    templates = (
        "#define FOLDS1_FOLD(i)         (%s1 + (i))",
        "#define FOLDS2_FOLD(i)         (%s2 + (i))",
        "#define FOLDS3_FOLD(i)         (%s3 + (i))",

        "#define FOLDS1_UNFOLDS_NUM(i)  %s1[(i)+1]",
        "#define FOLDS2_UNFOLDS_NUM(i)  %s2[(i)+2]",
        "#define FOLDS3_UNFOLDS_NUM(i)  %s3[(i)+3]",

        "#define FOLDS1_UNFOLDS(i)      (%s1 + (i) + 2)",
        "#define FOLDS2_UNFOLDS(i)      (%s2 + (i) + 3)",
        "#define FOLDS3_UNFOLDS(i)      (%s3 + (i) + 4)",

        # Each NEXT_INDEX macro must read the unfold count from its own
        # array; FOLDS2/FOLDS3 previously read it from array 1.
        "#define FOLDS1_NEXT_INDEX(i)   ((i) + 2 + %s1[(i)+1])",
        "#define FOLDS2_NEXT_INDEX(i)   ((i) + 3 + %s2[(i)+2])",
        "#define FOLDS3_NEXT_INDEX(i)   ((i) + 4 + %s3[(i)+3])",
    )
    for template in templates:
        f.write((template % name) + "\n")
Packit Service bd74e6
Packit Service bd74e6
def output_fold_source(f, out_comment):
    """Write the generated C source (header lines plus fold arrays) to `f`.

    When a version was captured in VERSION_INFO, a CASEFOLD_VERSION define
    is emitted with '.' and '-' replaced by '_'.
    """
    f.write("/* This file was generated by make_unicode_fold_data.py. */\n")
    f.write('#include "regenc.h"\n')
    f.write("\n")
    if VERSION_INFO is not None:
        # Written to `f` like everything else; this used to go to stdout
        # regardless of the file object passed in.
        f.write("#define CASEFOLD_VERSION  %s\n" % re.sub(r'[\.-]', '_', VERSION_INFO))
        f.write("\n")
    # The calls to output_macros(f, DataName) and output_typedef(f) are
    # disabled in this script; only the blank line between them is emitted.
    f.write("\n")
    output_fold_data(f, DataName, out_comment)
Packit Service bd74e6
Packit Service bd74e6
def output_gperf_unfold_key(f):
    """Write the gperf source that maps an unfolded code point to its entry.

    LOCALE_UNFOLDS is merged into the module-level UNFOLDS first.  One
    keyword line is written per code point: the 3-byte key, then the
    entry's array index and fold length.  Lines are ordered by
    (fold_len, index), so entry indexes must already have been assigned.
    """
    head = """\
%{
/* This gperf source file was generated by make_unicode_fold_data.py */
#include <string.h>
#include "regenc.h"
%}
struct ByUnfoldKey {
  OnigCodePoint code;
  short int   index;
  short int   fold_len;
};
%%
"""
    f.write(head)
    UNFOLDS.update(LOCALE_UNFOLDS)
    ordered = sorted(UNFOLDS.items(),
                     key=lambda item: (item[1].fold_len, item[1].index))
    for k, e in ordered:
        f.write('"%s", /*0x%04x*/ %4d, %d\n' %
                (form3bytes(k), k, e.index, e.fold_len))

    # Closing gperf section delimiter.
    f.write('%%' + "\n")
Packit Service bd74e6
Packit Service bd74e6
def output_gperf_fold_key(f, key_len):
    """Write the gperf source keyed by fold sequences of length `key_len`.

    Every entry of FOLDS whose fold_len equals `key_len` produces one
    keyword line: the concatenated 3-byte keys of its fold code points,
    then the entry's array index.  Lines are ordered by index.
    """
    head = """\
%{
/* This gperf source file was generated by make_unicode_fold_data.py */
#include <string.h>
#include "regenc.h"
%}
short int
%%
"""
    f.write(head)
    selected = [item for item in FOLDS.items() if item[1].fold_len == key_len]
    selected.sort(key=lambda item: item[1].index)
    for _, e in selected:
        skey = ''.join(form3bytes(cp) for cp in e.fold)
        f.write('"%s", %4d\n' % (skey, e.index))

    # Closing gperf section delimiter.
    f.write('%%' + "\n")
Packit Service bd74e6
Packit Service bd74e6
def output_gperf_source():
    """Write the unfold-key gperf file and the three fold-key gperf files.

    LOCALE_FOLDS is merged into the module-level FOLDS before the fold-key
    files are written, so they cover normal and locale entries alike.
    """
    with open(GPERF_UNFOLD_KEY_FILE, 'w') as f:
        output_gperf_unfold_key(f)

    FOLDS.update(LOCALE_FOLDS)

    # GPERF_FOLD_KEY_FILES[0..2] hold the keys of fold length 1..3.
    for key_len, path in enumerate(GPERF_FOLD_KEY_FILES[:3], 1):
        with open(path, 'w') as f:
            output_gperf_fold_key(f, key_len)
Packit Service bd74e6
Packit Service bd74e6
Packit Service bd74e6
## main ##
# Read the case-folding table into FOLDS/UNFOLDS and the TURKISH_* tables.
with open(SOURCE_FILE, 'r') as f:
    parse_file(f)

# Separate the mappings that also have a Turkish variant.
make_locale()

# The C source goes to stdout.  It must be written before the gperf files,
# because writing it assigns the entry indexes those files reference.
out_comment = True
output_fold_source(sys.stdout, out_comment)

output_gperf_source()