Blame src/gen-indic-table.py

Packit Service 5bcba8
#!/usr/bin/python
Packit Service 5bcba8
Packit Service 5bcba8
import sys
Packit Service 5bcba8
Packit Service 5bcba8
# The script takes exactly three input files.  Passing a string to sys.exit()
# prints it on stderr and terminates with exit status 1.
if len (sys.argv) - 1 != 3:
	sys.exit ("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
Packit Service 5bcba8
Packit Service 5bcba8
# Code points that are kept no matter which block they belong to:
# U+00A0 NO-BREAK SPACE and U+25CC DOTTED CIRCLE.
ALLOWED_SINGLES = [0x00A0, 0x25CC]

# Block names (spelled as in the third input file).  Code points whose block
# is not listed here are dropped from the generated table.
ALLOWED_BLOCKS = [
	'Basic Latin', 'Latin-1 Supplement',
	'Devanagari', 'Bengali', 'Gurmukhi', 'Gujarati', 'Oriya',
	'Tamil', 'Telugu', 'Kannada', 'Malayalam', 'Sinhala',
	'Myanmar', 'Khmer',
	'Vedic Extensions', 'General Punctuation', 'Superscripts and Subscripts',
	'Devanagari Extended', 'Myanmar Extended-B', 'Myanmar Extended-A',
]
Packit Service 5bcba8
Packit Service 5bcba8
def _parse_property_file (f):
	"""Parse the remaining lines of an open property file.

	Data lines have the form "START[..END] ; VALUE", with START and END in
	hexadecimal.  Everything from '#' to the end of a line is ignored, and
	lines that contain no ';' are skipped.

	Returns a pair (table, counts): `table` maps every code point of every
	range to its value, `counts` maps each value to the summed size of the
	ranges carrying it.
	"""
	table = {}
	counts = {}
	for line in f:
		fields = [x.strip () for x in line.split ('#', 1)[0].split (';')]
		if len (fields) == 1:
			continue

		bounds = fields[0].split ('..')
		start = int (bounds[0], 16)
		end = int (bounds[1], 16) if len (bounds) > 1 else start
		value = fields[1]

		for u in range (start, end + 1):
			table[u] = value
		counts[value] = counts.get (value, 0) + end - start + 1
	return table, counts

# One entry per input file, in command-line order:
#   headers[i] -- the first two lines of the file, echoed into the output banner
#   data[i]    -- code point -> value
#   values[i]  -- value -> number of code points carrying it
headers = []
data = []
values = []
for path in sys.argv[1:]:
	# open() instead of the Python-2-only file() builtin; `with` makes sure
	# each input is closed as soon as it has been read.
	with open (path) as f:
		headers.append ([f.readline () for _ in range (2)])
		table, counts = _parse_property_file (f)
	data.append (table)
	values.append (counts)
Packit Service 5bcba8
Packit Service 5bcba8
# Fold the per-file tables into a single dict that maps a code point to a
# three-item list, one item per input file in command-line order.
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Each column's default is counted once so that it appears among the values.
for column, name in enumerate (defaults):
	values[column][name] = 1 + values[column].get (name, 0)

merged = {}
for column, table in enumerate (data):
	for u, value in table.items ():
		row = merged.get (u)
		if row is None:
			# The third table only annotates code points that one of
			# the first two tables already introduced.
			if column == 2:
				continue
			row = merged[u] = list (defaults)
		row[column] = value

# Keep only the whitelisted singles and the code points of whitelisted blocks.
data = dict ((u, row) for u, row in merged.items ()
	if u in ALLOWED_SINGLES or row[2] in ALLOWED_BLOCKS)
del merged
num = len (data)

# Five code points in the Khmer range are forced to Vowel_Dependent whenever
# the input leaves their first column at 'Other'.
for u in (0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D3):
	row = data[u]
	if row[0] == 'Other':
		row[0] = "Vowel_Dependent"

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main table;
# they are looked up individually instead.
singles = dict ((u, data.pop (u)) for u in ALLOWED_SINGLES)
Packit Service 5bcba8
Packit Service 5bcba8
# Opening banner of the generated file: how it was produced, the header lines
# captured from each input file, and the include the table depends on.
banner = [
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
]
banner.extend (" * %s" % l.strip () for h in headers for l in h)
banner.extend ([
	" */",
	"",
	'#include "hb-ot-shape-complex-indic-private.hh"',
	"",
])
sys.stdout.write ("\n".join (banner) + "\n")
del banner
Packit Service 5bcba8
Packit Service 5bcba8
# Short aliases for category values.  short[0] covers the first input file's
# values, short[1] the second's; values without an entry here get an alias
# derived from their capital letters further down.
short = [
	{
		"Bindu": 'Bi',
		"Cantillation_Mark": 'Ca',
		"Joiner": 'ZWJ',
		"Non_Joiner": 'ZWNJ',
		"Number": 'Nd',
		"Visarga": 'Vs',
		"Vowel": 'Vo',
		"Vowel_Dependent": 'M',
		"Consonant_Prefixed": 'CPrf',
		"Other": 'x',
	},
	{
		"Not_Applicable": 'x',
	},
]
# Reverse maps (alias -> value name), used to detect colliding aliases.
all_shorts = [dict ((alias, name) for name, alias in table.items ()) for table in short]

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range (2):
	sys.stdout.write ("\n")
	for v in sorted (values[i]):
		s = short[i].get (v)
		if s is None:
			# Alias = the capital letters of the name, with "_And_" collapsed first.
			s = ''.join (c for c in v.replace ('_And_', '_') if 'A' <= c <= 'Z')
			if s in all_shorts[i]:
				raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
			all_shorts[i][s] = v
			short[i][v] = s
		# Tabs that line up the trailing comment; floor division, so the
		# count is an integer (zero tabs once the name is long enough).
		padding = '	' * ((48-1 - len (what[i]) - 1 - len (v)) // 8)
		sys.stdout.write ("#define %s_%s	%s_%s	%s/* %3d chars; %s */" % \
			(what_short[i], s, what[i], v.upper (), padding, values[i][v], v) + "\n")
sys.stdout.write ("\n".join ([
	"",
	"#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)",
	"",
	"",
]) + "\n")
Packit Service 5bcba8
Packit Service 5bcba8
# Running statistics shared by print_block() and the table loop below.
total = 0		# table slots written so far
used = 0		# slots whose code point is present in `data`
last_block = None	# last block name handed to print_block()

def print_block (block, start, end, data):
	"""Write one "_(S,M)," table entry per code point in start..end.

	block -- block name to announce in a comment before the entries, or a
	         false value (None) when writing filler between two nearby
	         runs.  The comment is written only when the name differs
	         from the one announced last.
	start, end -- inclusive code point range; it must consist of whole
	         rows of eight (start and end+1 are multiples of 8).
	data -- dict mapping a code point to a sequence whose first two items
	         are category names; absent code points use `defaults`.

	Updates the module-level counters `total`, `used` and `last_block`.
	"""
	global total, used, last_block
	emit = sys.stdout.write
	if block and block != last_block:
		emit ("\n\n" + "  /* %s */" % block + "\n")
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	present = 0
	for u in range (start, end+1):
		if u % 8 == 0:
			# New row: line break, then the row's first code point.
			emit ("\n" + "  /* %04X */" % u)
		if u in data:
			present += 1
		d = data.get (u, defaults)
		emit ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])))

	total += end - start + 1
	used += present
	if block:
		last_block = block

# Walk the code points in ascending order and write them as runs of rows of
# eight.  `starts`/`ends` record each run as [start, end) and `offset` is the
# number of table items written before the current run.
last = -100000		# last code point written; the sentinel means "nothing yet"
offset = 0
starts = []
ends = []
sys.stdout.write ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {" + "\n")
for u in sorted (data):
	if u <= last:
		continue
	block = data[u][2]

	start = u//8*8
	end = start+1
	# Membership is tested against the dict, not a list of its keys, so
	# each step is a constant-time lookup.
	while end in data and block == data[end][2]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# A gap of at most 48 slots is padded with default
			# entries rather than starting a new run.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			sys.stdout.write ("\n\n" + "#define indic_offset_0x%04xu %d" % (start, offset) + "\n")
			starts.append (start)

	print_block (block, start, end, data)
	last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
sys.stdout.write ("\n\n")
occupancy = used * 100. / total
# Width of the shift that the generated lookup function switches on.
page_bits = 12
sys.stdout.write ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy) + "\n")
Packit Service 5bcba8
# Emit hb_indic_get_categories(): a switch on u >> page_bits with one case for
# every page touched by a run boundary or by one of the singles.
sys.stdout.write ("\n".join ([
	"",
	"INDIC_TABLE_ELEMENT_TYPE",
	"hb_indic_get_categories (hb_codepoint_t u)",
	"{",
	"  switch (u >> %d)" % page_bits,
	"  {",
]) + "\n")
pages = set (u >> page_bits for u in starts + ends + list (singles))
for p in sorted (pages):
	sys.stdout.write ("    case 0x%0Xu:" % p + "\n")
	for start, end in zip (starts, ends):
		if p != start >> page_bits and p != end >> page_bits:
			continue
		# `end` is exclusive, hence end-1 as the inclusive upper bound.
		macro = "indic_offset_0x%04xu" % start
		sys.stdout.write ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, macro) + "\n")
	for u, d in singles.items ():
		if p != u >> page_bits:
			continue
		sys.stdout.write ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]) + "\n")
	sys.stdout.write ("      break;" + "\n" + "" + "\n")
sys.stdout.write ("\n".join ([
	"    default:",
	"      break;",
	"  }",
	"  return _(x,x);",
	"}",
	"",
	"#undef _",
]) + "\n")

# Undefine every alias macro that was defined earlier in the output.
for i in range (2):
	sys.stdout.write ("\n")
	for v in sorted (values[i]):
		sys.stdout.write ("#undef %s_%s" % (what_short[i], short[i][v]) + "\n")
sys.stdout.write ("\n" + "/* == End of generated table == */" + "\n")

# Maintain at least 30% occupancy in the table.  The check runs after the
# output has been written, so a sparse table still fails the run.
if occupancy < 30:
	raise Exception ("Table too sparse, please investigate: ", occupancy)