#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_egcb_data.py
# Copyright (c) 2017  K.Kosako

import sys
import re

# Highest valid Unicode code point.
MAX_CODE_POINT = 0x10ffff

# Patterns for the lines of Unicode data files.  Raw strings keep the regex
# escapes (\s, \w, \d, \.) from being treated as string escapes.
# "# Total code points: N" / "# Total elements: N" summary comment.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+(?:code\s+points|elements):")
# "XXXX ; Prop" or "XXXX..YYYY ; Prop".  The range separator is a literal
# "..", so the dots are escaped (as in BL_LINE_REG).
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# Header comment carrying the data file name, e.g. "# Name-10.0.0.txt".
VERSION_REG  = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

# Version string captured by check_version_info(); None until one is seen.
VERSION_INFO = None
# Property name -> list of (start, end) code point ranges.
DIC  = { }
# Property names collected by the main script.
PROPS = []
# Property name -> index, filled in by print_prop_and_index().
PropIndex = { }

def check_version_info(s):
  """Record the data-file version found in a comment line.

  If `s` matches VERSION_REG, the captured "X.Y.Z" string is stored in the
  module-level VERSION_INFO.  A line that does not match leaves
  VERSION_INFO unchanged.
  """
  global VERSION_INFO
  m = VERSION_REG.match(s)
  if m is not None:
    VERSION_INFO = m.group(1)

def print_ranges(ranges):
  """Print each (start, end) pair on its own line as 6-digit hex values."""
  for (start, end) in ranges:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("0x%06x, 0x%06x" % (start, end))

def print_prop_and_index(prop, i):
  """Print "prop," padded to 35 columns followed by index `i`.

  Also records the mapping prop -> i in the module-level PropIndex.
  """
  # Single-argument print(...) behaves the same on Python 2 and 3.
  print("%-35s %3d" % (prop + ',', i))
  PropIndex[prop] = i

def dic_find_by_value(dic, v):
  """Return a key of `dic` whose value equals `v`, or None if there is none.

  When several keys map to an equal value, the first one met while
  iterating over the dictionary is returned.
  """
  for key, val in dic.items():
    if val == v:
      return key

  return None

def normalize_ranges(in_ranges, sort=False):
  """Merge overlapping or adjacent (start, end) ranges.

  in_ranges: iterable of (start, end) integer pairs.  It is expected to be
             ordered by start unless `sort` is true.
  sort:      when true, work on a sorted copy of `in_ranges`.

  Returns a new list in which every range that overlaps or directly
  follows the previous one (start <= previous end + 1) has been folded
  into it.  `in_ranges` itself is not modified.
  """
  ranges = sorted(in_ranges) if sort else in_ranges

  r = []
  prev = None
  for (start, end) in ranges:
    # prev is None only before the first range.  Python 2 treats
    # "None >= int" as False, but Python 3 raises TypeError, so the
    # first-iteration case is tested explicitly.
    if prev is not None and prev >= start - 1:
      (pstart, pend) = r.pop()
      end = max(pend, end)
      start = pstart

    r.append((start, end))
    prev = end

  return r

def inverse_ranges(in_ranges, max_code_point=None):
  """Return the ranges of code points NOT covered by `in_ranges`.

  in_ranges:      (start, end) pairs, expected to be sorted and
                  non-overlapping (e.g. the output of normalize_ranges).
  max_code_point: upper bound of the code space; defaults to the
                  module-level MAX_CODE_POINT.

  The result covers 0 .. max_code_point inclusive, minus `in_ranges`.
  """
  if max_code_point is None:
    max_code_point = MAX_CODE_POINT

  r = []
  prev = 0x000000
  for (start, end) in in_ranges:
    if prev < start:
      r.append((prev, start - 1))

    prev = end + 1

  # "<=" rather than "<": when the last input range ends at
  # max_code_point - 1, the final code point alone is still uncovered.
  if prev <= max_code_point:
    r.append((prev, max_code_point))

  return r

def add_ranges(r1, r2):
  """Return the union of two range lists as a sorted, merged list.

  The inputs are concatenated into a new list (neither is modified) and
  passed to normalize_ranges with sorting enabled.
  """
  r = r1 + r2
  return normalize_ranges(r, True)

def sub_one_range(one_range, rs):
  """Subtract every range in `rs` from the single range `one_range`.

  one_range: (start, end) pair.
  rs:        list of (start, end) pairs to remove; the left-to-right scan
             assumes they are sorted and non-overlapping.

  Returns the list of (start, end) pieces of `one_range` that remain.
  """
  r = []
  (s1, e1) = one_range
  for (s2, e2) in rs:
    if s2 >= s1 and s2 <= e1:
      # The removed range starts inside what is left: keep the part
      # before it, if any.
      if s2 > s1:
        r.append((s1, s2 - 1))
      if e2 >= e1:
        # Everything from s2 to the end is removed.
        return r

      s1 = e2 + 1
    elif s2 < s1 and e2 >= s1:
      # The removed range starts before what is left and overlaps its head.
      if e2 < e1:
        s1 = e2 + 1
      else:
        return r

  r.append((s1, e1))
  return r

def sub_ranges(r1, r2):
  """Return the ranges of `r1` with every range of `r2` removed.

  Each range of `r1` is passed to sub_one_range together with `r2`, and
  the surviving pieces are collected in order.
  """
  r = []
  for one_range in r1:
    r.extend(sub_one_range(one_range, r2))

  return r

def add_ranges_in_dic(dic):
  """Return the sorted, merged union of all range lists stored in `dic`.

  dic: mapping whose values are lists of (start, end) pairs.  The values
       are not modified.
  """
  r = []
  for v in dic.values():
    # extend() avoids building a fresh list for every entry.
    r.extend(v)

  return normalize_ranges(r, True)

def normalize_ranges_in_dic(dic, sort=False):
  """Replace every range list in `dic` with its normalized form, in place.

  dic:  mapping whose values are lists of (start, end) pairs.
  sort: forwarded to normalize_ranges.
  """
  # Only the values of existing keys are reassigned, so the dictionary's
  # size does not change while it is being iterated.
  for k, v in dic.items():
    dic[k] = normalize_ranges(v, sort)

def merge_dic(to_dic, from_dic):
  """Merge `from_dic` into `to_dic` in place.

  Keys present in both dictionaries are reported on stderr; the merge
  still happens and the value from `from_dic` wins.
  """
  common = set(to_dic) & set(from_dic)
  if common:
    # sys.stderr.write works unchanged on Python 2 and 3.
    sys.stderr.write("merge_dic: collision: %s\n" % (sorted(common),))

  to_dic.update(from_dic)

def merge_props(to_props, from_props):
  """Append all of `from_props` to the list `to_props` in place.

  Names present in both lists are reported on stderr; they are still
  appended, so `to_props` can end up holding duplicates.
  """
  common = set(to_props) & set(from_props)
  if common:
    # sys.stderr.write works unchanged on Python 2 and 3.
    sys.stderr.write("merge_props: collision: %s\n" % (sorted(common),))

  to_props.extend(from_props)

def add_range_into_dic(dic, name, start, end):
  """Append the range (start, end) to the list stored under dic[name].

  The list is created if `name` is not yet a key of `dic`.
  """
  dic.setdefault(name, []).append((start, end))

def list_sub(a, b):
  """Return a list of the distinct elements of `a` that are not in `b`.

  The computation goes through sets, so duplicates are dropped and the
  order of the result is unspecified.
  """
  return list(set(a) - set(b))

def parse_properties(path):
  """Parse a Unicode property data file.

  path: file with lines of the form "XXXX ; Prop" or "XXXX..YYYY ; Prop"
        (see PR_LINE_REG), interleaved with '#' comment lines.

  Returns (dic, props):
    dic   - property name -> list of (start, end) code point ranges,
            merged with normalize_ranges_in_dic.  No sorting is requested,
            so the ranges of a property are expected to appear in
            ascending order in the file.
    props - property names in the order their "# Total ..." summary
            comment (see PR_TOTAL_REG) was encountered.

  As a side effect, the first comment line matching VERSION_REG sets the
  module-level VERSION_INFO (through check_version_info).
  """
  dic = { }
  prop = None
  props = []
  with open(path, 'r') as f:
    for line in f:
      s = line.strip()
      if len(s) == 0:
        continue

      # Comment lines are not skipped here: the "# Total ..." summary
      # handled below is itself a comment.
      if s[0] == '#' and VERSION_INFO is None:
        check_version_info(s)

      m = PR_LINE_REG.match(s)
      if m:
        prop = m.group(3)
        start = int(m.group(1), 16)
        # A line without "..YYYY" describes a single code point.
        end = int(m.group(2), 16) if m.group(2) else start
        add_range_into_dic(dic, prop, start, end)
      elif prop is not None and PR_TOTAL_REG.match(s) is not None:
        # The summary closes the section of the most recent property.
        # A summary seen before any data line has no property to record.
        props.append(prop)

  normalize_ranges_in_dic(dic)
  return (dic, props)


### main ###
# Reads GraphemeBreakProperty.txt from the current directory and writes the
# generated C source (unicode_egcb_data.c) to stdout.
argv = sys.argv
argc = len(argv)

dic, props = parse_properties('GraphemeBreakProperty.txt')
merge_dic(DIC, dic)
merge_props(PROPS, props)

PROPS = sorted(PROPS)

# Single-argument print(...) behaves the same on Python 2 and 3.
print('/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */')
COPYRIGHT = '''
/*-
 * Copyright (c) 2017  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
'''.strip()

print(COPYRIGHT)
print('')
if VERSION_INFO is not None:
  # "10.0.0" becomes "10_0_0" in the emitted #define.
  print("#define GRAPHEME_BREAK_PROPERTY_VERSION  %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
  print('')

# Flatten all properties into one list of (start, end, prop) ordered by start.
ranges = []
for prop in PROPS:
  rs = DIC[prop]
  for (start, end) in rs:
    ranges.append((start, end, prop))

ranges = sorted(ranges, key=lambda x: x[0])

# The table emitted below must not contain overlapping ranges.
prev = -1
for (start, end, prop) in ranges:
  if prev >= start:
    raise ValueError("{2}:{0} - {1} range overlap prev value {3}".format(start, end, prop, prev))
  # Remember where this range ends; without this update the check above
  # could never fire.
  prev = end


print('/*')
for prop in PROPS:
  print("%s" % prop)
print('*/')
print('')

num_ranges = len(ranges)
print("static int EGCB_RANGE_NUM = %d;" % num_ranges)

print('static EGCB_RANGE_TYPE EGCB_RANGES[] = {')
for i, (start, end, prop) in enumerate(ranges):
  # No trailing comma after the last initializer.
  if i == num_ranges - 1:
    comma = ''
  else:
    comma = ','

  type_name = 'EGCB_' + prop
  print(" {0x%06x, 0x%06x, %s }%s" % (start, end, type_name, comma))

print('};')

sys.exit(0)