#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_egcb_data.py
# Copyright (c) 2017 K.Kosako
import sys
import re
MAX_CODE_POINT = 0x10ffff
PR_TOTAL_REG = re.compile("#\s*Total\s+(?:code\s+points|elements):")
PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt")
VERSION_INFO = None
DIC = { }
PROPS = []
PropIndex = { }
def check_version_info(s):
global VERSION_INFO
m = VERSION_REG.match(s)
if m is not None:
VERSION_INFO = m.group(1)
def print_ranges(ranges):
for (start, end) in ranges:
print "0x%06x, 0x%06x" % (start, end)
def print_prop_and_index(prop, i):
print "%-35s %3d" % (prop + ',', i)
PropIndex[prop] = i
def dic_find_by_value(dic, v):
for key, val in dic.items():
if val == v:
return key
return None
def normalize_ranges(in_ranges, sort=False):
if sort:
ranges = sorted(in_ranges)
else:
ranges = in_ranges
r = []
prev = None
for (start, end) in ranges:
if prev >= start - 1:
(pstart, pend) = r.pop()
end = max(pend, end)
start = pstart
r.append((start, end))
prev = end
return r
def inverse_ranges(in_ranges):
r = []
prev = 0x000000
for (start, end) in in_ranges:
if prev < start:
r.append((prev, start - 1))
prev = end + 1
if prev < MAX_CODE_POINT:
r.append((prev, MAX_CODE_POINT))
return r
def add_ranges(r1, r2):
r = r1 + r2
return normalize_ranges(r, True)
def sub_one_range(one_range, rs):
r = []
(s1, e1) = one_range
n = len(rs)
for i in range(0, n):
(s2, e2) = rs[i]
if s2 >= s1 and s2 <= e1:
if s2 > s1:
r.append((s1, s2 - 1))
if e2 >= e1:
return r
s1 = e2 + 1
elif s2 < s1 and e2 >= s1:
if e2 < e1:
s1 = e2 + 1
else:
return r
r.append((s1, e1))
return r
def sub_ranges(r1, r2):
r = []
for one_range in r1:
rs = sub_one_range(one_range, r2)
r.extend(rs)
return r
def add_ranges_in_dic(dic):
r = []
for k, v in dic.items():
r = r + v
return normalize_ranges(r, True)
def normalize_ranges_in_dic(dic, sort=False):
for k, v in dic.items():
r = normalize_ranges(v, sort)
dic[k] = r
def merge_dic(to_dic, from_dic):
to_keys = to_dic.keys()
from_keys = from_dic.keys()
common = list(set(to_keys) & set(from_keys))
if len(common) != 0:
print >> sys.stderr, "merge_dic: collision: %s" % sorted(common)
to_dic.update(from_dic)
def merge_props(to_props, from_props):
common = list(set(to_props) & set(from_props))
if len(common) != 0:
print >> sys.stderr, "merge_props: collision: %s" % sorted(common)
to_props.extend(from_props)
def add_range_into_dic(dic, name, start, end):
d = dic.get(name, None)
if d is None:
d = [(start, end)]
dic[name] = d
else:
d.append((start, end))
def list_sub(a, b):
x = set(a) - set(b)
return list(x)
def parse_properties(path):
with open(path, 'r') as f:
dic = { }
prop = None
props = []
for line in f:
s = line.strip()
if len(s) == 0:
continue
if s[0] == '#':
if VERSION_INFO is None:
check_version_info(s)
m = PR_LINE_REG.match(s)
if m:
prop = m.group(3)
if m.group(2):
start = int(m.group(1), 16)
end = int(m.group(2), 16)
add_range_into_dic(dic, prop, start, end)
else:
start = int(m.group(1), 16)
add_range_into_dic(dic, prop, start, start)
elif PR_TOTAL_REG.match(s) is not None:
props.append(prop)
normalize_ranges_in_dic(dic)
return (dic, props)
### main ###
argv = sys.argv
argc = len(argv)
dic, props = parse_properties('GraphemeBreakProperty.txt')
merge_dic(DIC, dic)
merge_props(PROPS, props)
PROPS = sorted(PROPS)
print '/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */'
COPYRIGHT = '''
/*-
* Copyright (c) 2017 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
'''.strip()
print COPYRIGHT
print ''
if VERSION_INFO is not None:
print "#define GRAPHEME_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO)
print ''
ranges = []
for prop in PROPS:
rs = DIC[prop]
for (start, end) in rs:
ranges.append((start, end, prop))
ranges = sorted(ranges, key=lambda x: x[0])
prev = -1
for (start, end, prop) in ranges:
if prev >= start:
raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))
print '/*'
for prop in PROPS:
print "%s" % prop
print '*/'
print ''
num_ranges = len(ranges)
print "static int EGCB_RANGE_NUM = %d;" % num_ranges
print 'static EGCB_RANGE_TYPE EGCB_RANGES[] = {'
for i, (start, end, prop) in enumerate(ranges):
if i == num_ranges - 1:
comma = ''
else:
comma = ','
type_name = 'EGCB_' + prop
print " {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma)
print '};'
sys.exit(0)