|
Packit |
b89d10 |
#!/usr/bin/python
|
|
Packit |
b89d10 |
# -*- coding: utf-8 -*-
|
|
Packit |
b89d10 |
# make_unicode_property_data.py
|
|
Packit |
b89d10 |
# Copyright (c) 2016-2017 K.Kosako
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
import sys
|
|
Packit |
b89d10 |
import re
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
# When True, the main script also loads GraphemeBreakProperty.txt and emits
# its properties under GRAPHEME_CLUSTER_BREAK_NAME_PREFIX.
INCLUDE_GRAPHEME_CLUSTER_DATA = False

# POSIX bracket names. The main script emits these before every other
# property, so they occupy the first len(POSIX_LIST) indexes.
POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII',
]

# Upper bound used when inverting range lists.
MAX_CODE_POINT = 0x10ffff

GRAPHEME_CLUSTER_BREAK_NAME_PREFIX = 'Grapheme_Cluster_Break_'

# All patterns are raw strings so that regex escapes such as \s and \d are
# not parsed as (invalid) string escapes; the pattern text itself is the same.

# Description fields that open / close a two-line range: "<..., First>" and
# "<..., Last>".
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG = re.compile(r"<.+,\s*Last>")
# "# Total code points:" comment line, used as the end-of-property marker.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+code\s+points:")
# "XXXX ; Name" or "XXXX..YYYY ; Name". The range separator is matched as two
# literal dots (an unescaped ".." would accept any two characters).
PR_LINE_REG = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
# "alias ; name".
PA_LINE_REG = re.compile(r"(\w+)\s*;\s*(\w+)")
# "sc|gc ; x2 ; x3 [; x4]".
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
# "XXXX..YYYY; Block Name".
BL_LINE_REG = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# "# <anything>-X.Y.Z.txt" comment line; group 1 is the version string.
VERSION_REG = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

# Version string captured by check_version_info(); None until one is found.
VERSION_INFO = None
# Property name -> list of (start, end) code point ranges.
DIC = { }
# Property name -> class description, filled in by parse_properties().
KDIC = { }
# Normalized property name -> index, filled in by print_prop_and_index().
PropIndex = { }
# Length of the longest name seen by set_max_prop_name().
PROPERTY_NAME_MAX_LEN = 0
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def normalize_prop_name(name):
    """Return *name* lower-cased, with every space and underscore removed."""
    stripped = name.replace(' ', '').replace('_', '')
    return stripped.lower()
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def fix_block_name(name):
    """Return the ``In_``-prefixed identifier for a block name.

    Every run of hyphens and/or spaces in *name* becomes one underscore.
    """
    identifier = re.sub(r'[- ]+', '_', name)
    return 'In_%s' % identifier
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def check_version_info(s):
    """Store the version found in *s* into the global VERSION_INFO.

    *s* is matched against VERSION_REG from its start; when it does not
    match, VERSION_INFO is left untouched.
    """
    global VERSION_INFO
    found = VERSION_REG.match(s)
    if found is None:
        return
    VERSION_INFO = found.group(1)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def print_ranges(ranges):
    """Print each (start, end) pair as two 6-digit hex values, then the count.

    Uses single-argument ``print(...)`` calls, which produce the same output
    as a Python 2 print statement and are also valid on Python 3.
    """
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))

    print(len(ranges))
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def print_prop_and_index(prop, i):
    """Print one ``name, index`` row and record the index in PropIndex.

    The name (with a trailing comma) is left-justified in 35 columns and the
    index right-justified in 3. Written as a single-argument ``print(...)``
    so the line runs on both Python 2 and Python 3.
    """
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
# Property name -> range list, for every property whose table was printed in
# full. print_property() consults it to alias tables with equal contents.
PRINT_CACHE = { }


def print_property(prop, data, desc):
    """Print the C definition of the code range table ``CR_<prop>``.

    prop: property name, used as the suffix of the C identifier.
    data: list of (start, end) code point pairs.
    desc: text placed in the leading comment.

    When an earlier property in PRINT_CACHE has equal ranges, only a
    ``#define`` aliasing that table is printed. Otherwise the full array is
    printed (element count first) and *data* is stored in PRINT_CACHE.

    All output uses single-argument ``print(...)`` calls, which behave the
    same as the Python 2 print statement and are also valid on Python 3.
    """
    print('')
    print("/* PROPERTY: '%s': %s */" % (prop, desc))

    prev_prop = dic_find_by_value(PRINT_CACHE, data)
    if prev_prop is not None:
        print("#define CR_%s CR_%s" % (prop, prev_prop))
        return

    PRINT_CACHE[prop] = data
    print("static const OnigCodePoint")
    print("CR_%s[] = { %d," % (prop, len(data)))
    for (start, end) in data:
        print("0x%04x, 0x%04x," % (start, end))

    print("}; /* END of CR_%s */" % prop)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def dic_find_by_value(dic, v):
    """Return the first key of *dic* whose value equals *v*, or None."""
    matches = (key for key, val in dic.items() if val == v)
    return next(matches, None)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges into a new list.

    in_ranges: sequence of (start, end) pairs. It must already be in
        ascending order unless *sort* is true.
    sort: when true, work on a sorted copy of *in_ranges*.

    Returns a new list; *in_ranges* itself is not modified.
    """
    if sort:
        ranges = sorted(in_ranges)
    else:
        ranges = in_ranges

    r = []
    prev = None  # end of the last range appended to r
    for (start, end) in ranges:
        # The explicit None test keeps the first iteration from comparing
        # None with an int, which raises TypeError on Python 3.
        if prev is not None and prev >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart

        r.append((start, end))
        prev = end

    return r
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def inverse_ranges(in_ranges):
    """Return the ranges of 0..MAX_CODE_POINT not covered by *in_ranges*.

    in_ranges: ascending list of (start, end) pairs.
    """
    r = []
    prev = 0x000000  # first code point not yet accounted for
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # "<=" so that a lone trailing MAX_CODE_POINT is still reported when the
    # input ends at MAX_CODE_POINT - 1.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def add_ranges(r1, r2):
    """Return the sorted, merged union of the range lists *r1* and *r2*."""
    return normalize_ranges(r1 + r2, True)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def sub_one_range(one_range, rs):
    """Return the parts of *one_range* left after removing the ranges in *rs*.

    one_range: a single (start, end) pair.
    rs: list of (start, end) pairs, visited in list order.

    Returns a list of (start, end) pairs; it is empty when nothing is left.
    """
    pieces = []
    lo, hi = one_range
    for cut_lo, cut_hi in rs:
        if lo <= cut_lo <= hi:
            # The cut starts inside what remains: keep the part before it.
            if cut_lo > lo:
                pieces.append((lo, cut_lo - 1))
            if cut_hi >= hi:
                return pieces
            lo = cut_hi + 1
        elif cut_lo < lo <= cut_hi:
            # The cut starts before what remains and reaches into it.
            if cut_hi >= hi:
                return pieces
            lo = cut_hi + 1

    pieces.append((lo, hi))
    return pieces
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def sub_ranges(r1, r2):
    """Return the ranges of *r1* with everything in *r2* removed.

    Each range of *r1* is reduced on its own by sub_one_range() and the
    results are concatenated in the order of *r1*.
    """
    remaining = []
    for piece in r1:
        remaining += sub_one_range(piece, r2)
    return remaining
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def add_ranges_in_dic(dic):
    """Return the sorted, merged union of every range list stored in *dic*.

    dic: mapping whose values are lists of (start, end) pairs; the keys are
        not used.
    """
    merged = []
    for ranges in dic.values():
        # extend() grows one list in place instead of rebuilding the whole
        # accumulator for every entry.
        merged.extend(ranges)

    return normalize_ranges(merged, True)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def normalize_ranges_in_dic(dic, sort=False):
    """Replace every range list in *dic* with its normalized form, in place.

    *sort* is passed through to normalize_ranges().
    """
    for key in list(dic):
        dic[key] = normalize_ranges(dic[key], sort)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def merge_dic(to_dic, from_dic):
    """Copy every entry of *from_dic* into *to_dic*.

    Keys present in both are reported on stderr as a collision; the value
    from *from_dic* still replaces the existing one.
    """
    common = set(to_dic) & set(from_dic)
    if common:
        # sys.stderr.write() replaces the Python 2-only "print >>" form and
        # emits the same text.
        sys.stderr.write("merge_dic: collision: %s\n" % (sorted(common),))

    to_dic.update(from_dic)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def merge_props(to_props, from_props):
    """Append every name in *from_props* to the list *to_props*.

    Names present in both are reported on stderr as a collision; they are
    appended anyway.
    """
    common = set(to_props) & set(from_props)
    if common:
        # sys.stderr.write() replaces the Python 2-only "print >>" form and
        # emits the same text.
        sys.stderr.write("merge_props: collision: %s\n" % (sorted(common),))

    to_props.extend(from_props)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def add_range_into_dic(dic, name, start, end):
    """Append the pair (start, end) to the list stored at ``dic[name]``.

    A new one-element list is stored when *name* has no list yet.
    """
    bucket = dic.get(name)
    if bucket is not None:
        bucket.append((start, end))
        return
    dic[name] = [(start, end)]
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def list_sub(a, b):
    """Return a list of the distinct items of *a* that are not in *b*.

    The result comes from a set difference, so its order is unspecified.
    """
    return list(set(a) - set(b))
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def parse_unicode_data_file(f):
    """Parse ';'-separated code point records from the open file *f*.

    Blank lines and lines starting with '#' are skipped. On every other
    line, field 0 is a hexadecimal code point, field 1 a description and
    field 2 a property name. Two lines whose descriptions match
    UD_FIRST_REG and UD_LAST_REG are recorded as one range.

    Returns ``(dic, assigned)``:
      dic: property name -> normalized range list. A two-character name is
          also recorded under its first character.
      assigned: every recorded (start, end) pair, in file order.
    """
    dic = {}
    assigned = []
    for line in f:
        s = line.strip()
        if not s or s[0] == '#':
            continue

        fields = s.split(';')
        code = int(fields[0], 16)
        desc, prop = fields[1], fields[2]

        if UD_FIRST_REG.match(desc) is not None:
            # Opening line of a range: nothing is recorded until the
            # closing line supplies the end.
            start, end = code, None
        elif UD_LAST_REG.match(desc) is not None:
            end = code
        else:
            start = end = code

        if end is None:
            continue

        assigned.append((start, end))
        add_range_into_dic(dic, prop, start, end)
        if len(prop) == 2:
            add_range_into_dic(dic, prop[:1], start, end)

    normalize_ranges_in_dic(dic)
    return dic, assigned
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def parse_properties(path, klass, prop_prefix = None):
    """Parse a ``code[..code] ; Property`` data file.

    path: file to read.
    klass: class description stored in KDIC for each property found.
    prop_prefix: optional string prepended to every property name.

    While VERSION_INFO is still None, each comment line is passed to
    check_version_info(). A line matching PR_TOTAL_REG marks the end of the
    current property: that property is recorded in KDIC and appended to the
    returned name list.

    Returns ``(dic, props)``: property name -> normalized range list, and
    the property names in the order their end markers were seen.
    """
    dic = {}
    props = []
    prop = None
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            if s[0] == '#' and VERSION_INFO is None:
                check_version_info(s)

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                if prop_prefix is not None:
                    prop = prop_prefix + prop

                # A line without a second code point is a one-element range.
                start = int(m.group(1), 16)
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)
            elif PR_TOTAL_REG.match(s) is not None:
                KDIC[prop] = klass
                props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def parse_property_aliases(path):
    """Read ``first ; second`` lines from *path* into a dict.

    Returns a dict mapping the first field to the second for every line
    that matches PA_LINE_REG and whose two fields differ.
    """
    aliases = {}
    with open(path, 'r') as f:
        for line in f:
            m = PA_LINE_REG.match(line.strip())
            # Blank and non-matching lines give no match object.
            if not m:
                continue

            short, full = m.group(1), m.group(2)
            if short != full:
                aliases[short] = full

    return aliases
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def parse_property_value_aliases(path):
    """Read the ``sc`` and ``gc`` value alias lines from *path* into a dict.

    Only lines matching PVA_LINE_REG are used. For an ``sc`` line the third
    field is the target name and the second (and optional fourth) fields
    are its aliases. For a ``gc`` line the second field is the target and
    the third (and optional fourth) fields are its aliases. An alias equal
    to its target is not stored.

    Returns a dict mapping alias -> target name.
    """
    aliases = {}
    with open(path, 'r') as f:
        for line in f:
            m = PVA_LINE_REG.match(line.strip())
            # Blank and non-matching lines give no match object.
            if not m:
                continue

            cat, x2, x3, x4 = m.group(1), m.group(2), m.group(3), m.group(4)
            if cat == 'sc':
                target, first_alias = x3, x2
            else:
                target, first_alias = x2, x3

            if first_alias != target:
                aliases[first_alias] = target
            if x4 and x4 != target:
                aliases[x4] = target

    return aliases
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def parse_blocks(path):
    """Read ``start..end; Block Name`` lines from *path*.

    Block names are converted with fix_block_name(). One extra entry,
    ``fix_block_name('No_Block')``, receives every code point that no
    listed block covers.

    Returns ``(dic, blocks)``: block identifier -> range list, and the
    identifiers in file order with the No_Block entry last.
    """
    dic = {}
    blocks = []
    with open(path, 'r') as f:
        for line in f:
            m = BL_LINE_REG.match(line.strip())
            # Blank and non-matching lines give no match object.
            if not m:
                continue

            first = int(m.group(1), 16)
            last = int(m.group(2), 16)
            block = fix_block_name(m.group(3))
            add_range_into_dic(dic, block, first, last)
            blocks.append(block)

    noblock = fix_block_name('No_Block')
    covered = add_ranges_in_dic(dic)
    dic[noblock] = inverse_ranges(covered)
    blocks.append(noblock)
    return dic, blocks
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def add_primitive_props(assigned):
    """Add the basic derived entries to the global DIC.

    assigned: ascending list of (start, end) pairs.

    Sets 'Assigned', 'Any', 'ASCII', 'NEWLINE', 'Cn' (the inverse of
    'Assigned') and 'LC' (union of 'Ll', 'Lt' and 'Lu'), and folds 'Cn'
    into the existing 'C' entry. DIC must already contain 'C', 'Ll', 'Lt'
    and 'Lu'.
    """
    DIC['Assigned'] = normalize_ranges(assigned)
    DIC['Any'] = [(0x000000, 0x10ffff)]
    DIC['ASCII'] = [(0x000000, 0x00007f)]
    DIC['NEWLINE'] = [(0x00000a, 0x00000a)]

    unassigned = inverse_ranges(DIC['Assigned'])
    DIC['Cn'] = unassigned
    DIC['C'].extend(unassigned)
    DIC['C'] = normalize_ranges(DIC['C'], True)

    cased = []
    for category in ('Ll', 'Lt', 'Lu'):
        cased.extend(DIC[category])
    DIC['LC'] = normalize_ranges(cased, True)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def add_posix_props(dic):
    """Add the POSIX bracket entries to *dic*, built from existing entries.

    *dic* must already contain 'Alphabetic', 'Uppercase', 'Lowercase',
    'White_Space', 'Any', 'Nd', 'Zs', 'M', 'P', 'Pc', 'Cc', 'Cs' and 'Cn'.
    Entries that are plain renames share the same list object as their
    source entry.
    """
    # Alnum = Alphabetic + Nd (Decimal_Number)
    alnum = normalize_ranges(dic['Alphabetic'] + dic['Nd'], True)

    # Blank = TAB + Zs (Space_Separator)
    blank = normalize_ranges([(0x0009, 0x0009)] + dic['Zs'], True)

    # Word = Alphabetic + M (Mark) + Nd + Pc (Connector_Punctuation)
    word_parts = []
    for source in ('Alphabetic', 'M', 'Nd', 'Pc'):
        word_parts.extend(dic[source])
    word = normalize_ranges(word_parts, True)

    # Graph = Any - White_Space - Cc - Cs (Surrogate) - Cn (Unassigned)
    graph = dic['Any']
    for excluded in ('White_Space', 'Cc', 'Cs', 'Cn'):
        graph = sub_ranges(graph, dic[excluded])
    graph = normalize_ranges(graph, True)

    # Print = Graph + Zs
    printable = normalize_ranges(graph + dic['Zs'], True)

    dic['Alpha'] = dic['Alphabetic']
    dic['Upper'] = dic['Uppercase']
    dic['Lower'] = dic['Lowercase']
    dic['Punct'] = dic['P']  # P == Punctuation
    dic['Digit'] = dic['Nd']
    dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
    dic['Alnum'] = alnum
    dic['Space'] = dic['White_Space']
    dic['Blank'] = blank
    dic['Cntrl'] = dic['Cc']
    dic['Word'] = word
    dic['Graph'] = graph
    dic['Print'] = printable
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def set_max_prop_name(name):
    """Raise the global PROPERTY_NAME_MAX_LEN to len(name) if that is larger."""
    global PROPERTY_NAME_MAX_LEN
    PROPERTY_NAME_MAX_LEN = max(PROPERTY_NAME_MAX_LEN, len(name))
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def entry_prop_name(name, index):
    """Account for *name* and, in list mode, write it to the list file.

    The name always goes through set_max_prop_name(). A ``index: name`` row
    is written to the open UPF file only when OUTPUT_LIST_MODE is set and
    *index* lies past the POSIX_LIST entries.
    """
    set_max_prop_name(name)
    if OUTPUT_LIST_MODE and index >= len(POSIX_LIST):
        # write() replaces the Python 2-only "print >> UPF" form and emits
        # the same text, including the newline.
        UPF.write("%3d: %s\n" % (index, name))
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
def entry_and_print_prop_and_index(name, index):
    """Register *name* at *index*, then print its normalized table row."""
    entry_prop_name(name, index)
    print_prop_and_index(normalize_prop_name(name), index)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
### main ###
|
|
Packit |
b89d10 |
argv = sys.argv
argc = len(argv)

# "-posix" as the first argument restricts the output to the POSIX tables.
POSIX_ONLY = argc >= 2 and argv[1] == '-posix'

# The property list file is produced only when the full output is generated.
OUTPUT_LIST_MODE = not POSIX_ONLY
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
# --- Load the data files from the current directory -----------------------

with open('UnicodeData.txt', 'r') as f:
    dic, assigned = parse_unicode_data_file(f)

DIC = dic
add_primitive_props(assigned)

# Every name known so far, minus the POSIX names (those are listed apart).
PROPS = list_sub(DIC.keys(), POSIX_LIST)

dic, props = parse_properties('DerivedCoreProperties.txt', 'Derived Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)

dic, props = parse_properties('Scripts.txt', 'Script')
merge_dic(DIC, dic)
merge_props(PROPS, props)
# 'Unknown' covers every code point that no script in Scripts.txt claims.
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

dic, props = parse_properties('PropList.txt', 'Binary Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)
PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

ALIASES = parse_property_aliases('PropertyAliases.txt')
merge_dic(ALIASES, parse_property_value_aliases('PropertyValueAliases.txt'))

dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

if INCLUDE_GRAPHEME_CLUSTER_DATA:
    dic, props = parse_properties('GraphemeBreakProperty.txt',
                                  'GraphemeBreak Property',
                                  GRAPHEME_CLUSTER_BREAK_NAME_PREFIX)
    merge_dic(DIC, dic)
    merge_props(PROPS, props)
    #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other'
    #DIC[prop] = inverse_ranges(add_ranges_in_dic(dic))
    #PROPS.append(prop)
    #KDIC[prop] = 'GrapemeBreak Property'

add_posix_props(DIC)
PROPS = sorted(PROPS)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
# --- Emit the range tables and the header of the generated file -----------
# Output goes to stdout. Single-argument print(...) calls give the same
# output as Python 2 print statements and are also valid on Python 3.

s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print(s)
for prop in POSIX_LIST:
    print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print('')

# NOTE(review): the source's indentation was lost; PROPS and BLOCKS tables
# are taken to be emitted only in full (non -posix) mode -- confirm.
if not(POSIX_ONLY):
    for prop in PROPS:
        klass = KDIC.get(prop, None)
        if klass is None:
            # No class was recorded for this name: label it by name length.
            n = len(prop)
            if n == 1:
                klass = 'Major Category'
            elif n == 2:
                klass = 'General Category'
            else:
                klass = '-'

        print_property(prop, DIC[prop], klass)

    for block in BLOCKS:
        print_property(block, DIC[block], 'Block')


print('')
print("static const OnigCodePoint*\nconst CodeRanges[] = {")

# The order here must match the indexes handed out for the name table:
# POSIX names first, then PROPS, then BLOCKS.
for prop in POSIX_LIST:
    print(" CR_%s," % prop)

if not(POSIX_ONLY):
    for prop in PROPS:
        print(" CR_%s," % prop)

    for prop in BLOCKS:
        print(" CR_%s," % prop)

s = '''};

#define pool_offset(s) offsetof(struct unicode_prop_name_pool_t, unicode_prop_name_pool_str##s)

%}
struct PoolPropertyNameCtype {
short int name;
short int ctype;
};

%%
'''
sys.stdout.write(s)
|
|
Packit |
b89d10 |
|
|
Packit |
b89d10 |
# --- Emit the name -> index table and the trailing #defines ---------------

# In list mode a plain-text property list is written to UNICODE_PROPERTIES;
# entry_prop_name() adds the individual rows through the global UPF.
UPF = None
if OUTPUT_LIST_MODE:
    UPF = open("UNICODE_PROPERTIES", "w")
    if VERSION_INFO is not None:
        UPF.write("Unicode Properties (from Unicode Version: %s)\n" % VERSION_INFO)
        UPF.write("\n")

try:
    index = -1
    for prop in POSIX_LIST:
        index += 1
        entry_and_print_prop_and_index(prop, index)

    # NOTE(review): the source's indentation was lost; PROPS, aliases and
    # BLOCKS are taken to be emitted only in full (non -posix) mode -- confirm.
    if not(POSIX_ONLY):
        for prop in PROPS:
            index += 1
            entry_and_print_prop_and_index(prop, index)

        # (normalized alias, alias, target) triples, sorted by normalized
        # alias. A comprehension replaces the tuple-parameter lambda, which
        # is a syntax error on Python 3.
        NALIASES = sorted((normalize_prop_name(k), k, v)
                          for (k, v) in ALIASES.items())
        for (nk, k, v) in NALIASES:
            nv = normalize_prop_name(v)
            if PropIndex.get(nk, None) is not None:
                sys.stderr.write("ALIASES: already exists: %s => %s\n" % (k, v))
                continue
            aindex = PropIndex.get(nv, None)
            if aindex is None:
                # The alias target has no index; skip it without a message.
                continue

            # An alias reuses the index of its target property.
            entry_prop_name(k, aindex)
            print_prop_and_index(nk, aindex)

        for name in BLOCKS:
            index += 1
            entry_and_print_prop_and_index(name, index)

    print('%%')
    print('')
    if VERSION_INFO is not None:
        print("#define PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
        print('')

    print("#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10))
    print("#define CODE_RANGES_NUM %d" % (index + 1))
finally:
    # Close the list file even when table generation fails part-way.
    if UPF is not None:
        UPF.close()

sys.exit(0)
|