Blame src/gen-indic-table.py

Packit 874993
#!/usr/bin/python
Packit 874993
Packit 874993
import sys
Packit 874993
Packit 874993
# The three data files are mandatory and positional; refuse to run otherwise.
# sys.stderr.write is used instead of the Python 2-only "print >>" form.
if len (sys.argv) != 4:
	sys.stderr.write ("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt\n")
	sys.exit (1)
Packit 874993
Packit 874993
# Codepoints that are kept whatever block they belong to
# (NO-BREAK SPACE and DOTTED CIRCLE).
ALLOWED_SINGLES = [0x00A0, 0x25CC]

# Block names, spelled as in the third input file (Blocks.txt).  A codepoint
# that is not in ALLOWED_SINGLES is kept only if its block is listed here.
ALLOWED_BLOCKS = [
	'Basic Latin',
	'Latin-1 Supplement',
	'Devanagari',
	'Bengali',
	'Gurmukhi',
	'Gujarati',
	'Oriya',
	'Tamil',
	'Telugu',
	'Kannada',
	'Malayalam',
	'Sinhala',
	'Myanmar',
	'Khmer',
	'Vedic Extensions',
	'General Punctuation',
	'Superscripts and Subscripts',
	'Devanagari Extended',
	'Myanmar Extended-B',
	'Myanmar Extended-A',
]
Packit 874993
Packit 874993
# Open the inputs in command-line order (syllabic categories, positional
# categories, blocks).  open() replaces the Python 2-only file() builtin.
files = [open (x) for x in sys.argv[1:]]

# Keep the first two lines of every file; they are echoed later into the
# leading comment of the generated output.
headers = [[f.readline () for i in range (2)] for f in files]

# data[i]   : codepoint      -> property value found in file i
# values[i] : property value -> number of codepoints listed with it in file i
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
	for line in f:

		# Anything from '#' to the end of the line is a comment.
		fields = [x.strip () for x in line.partition ('#')[0].split (';')]
		if len (fields) == 1:
			# No ';' separator: blank or comment-only line.
			continue

		# First field is a single hex codepoint or an inclusive "XXXX..YYYY" range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[1]

		for u in range (start, end + 1):
			data[i][u] = t
		values[i][t] = values[i].get (t, 0) + end - start + 1

	# The file has been read to the end; release the handle.
	f.close ()
Packit 874993
Packit 874993
# Merge data into one dict:
#   codepoint -> [syllabic category, positional category, block name]
# A slot that none of the files fills keeps the matching entry of `defaults`.
defaults = ('Other', 'Not_Applicable', 'No_Block')
for i,v in enumerate (defaults):
	# Count each default once so that it is present in values[i] even when
	# the corresponding file never lists it explicitly.
	values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
	for u,v in d.items ():
		if u not in combined:
			if i == 2:
				# The block file only annotates codepoints that one of
				# the two category files already introduced.
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Keep only the explicitly allowed codepoints and blocks.
combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
data = combined
del combined
# NOTE(review): `num` is reassigned further down before anything visible in
# this file reads it; kept only so the module-level name stays defined.
num = len (data)

# Treat these five codepoints as dependent vowels when the syllabic data
# leaves them at the default category.
for u in [0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D3]:
	if data[u][0] == 'Other':
		data[u][0] = "Vowel_Dependent"

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main table;
# a codepoint missing from the data still raises KeyError, as before.
singles = {}
for u in ALLOWED_SINGLES:
	singles[u] = data.pop (u)
Packit 874993
Packit 874993
# Leading comment of the generated file: how it was produced, followed by the
# two header lines captured from each input file, then the #include.
# Written with a single sys.stdout.write so the block does not depend on the
# Python 2-only print statement; the bytes emitted are unchanged.
banner = [
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
]
for h in headers:
	for header_line in h:
		banner.append (" * %s" % (header_line.strip ()))
banner += [
	" */",
	"",
	'#include "hb-ot-shape-complex-indic-private.hh"',
	"",
]
sys.stdout.write ("\n".join (banner) + "\n")
Packit 874993
Packit 874993
# Shorten values: hand-picked abbreviations, one dict per property
# (index 0: syllabic category, index 1: positional/matra category).
# Values not listed here get an abbreviation derived from their capital letters.
short = [{
	"Bindu": 'Bi',
	"Cantillation_Mark": 'Ca',
	"Joiner": 'ZWJ',
	"Non_Joiner": 'ZWNJ',
	"Number": 'Nd',
	"Visarga": 'Vs',
	"Vowel": 'Vo',
	"Vowel_Dependent": 'M',
	"Consonant_Prefixed": 'CPrf',
	"Other": 'x',
},{
	"Not_Applicable": 'x',
}]
# Reverse maps (abbreviation -> full value) used to detect collisions.
all_shorts = [{},{}]

# Register the hand-picked abbreviations first so that a derived abbreviation
# clashing with one of them is reported instead of silently reused.
for i in range (2):
	for v,s in short[i].items ():
		all_shorts[i][s] = v

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range (2):
	sys.stdout.write ("\n")
	for v in sorted (values[i]):
		if v not in short[i]:
			# Derived abbreviation: the capital letters of the value name,
			# with the "_And_" connective dropped first.
			v_no_and = v.replace ('_And_', '_')
			s = ''.join ([c for c in v_no_and if 'A' <= c <= 'Z'])
			if s in all_shorts[i]:
				raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
			all_shorts[i][s] = v
			short[i][v] = s
		s = short[i][v]
		# Tab padding that lines up the trailing comment.  Floor division
		# keeps the repeat count an int (plain "/" would yield a float under
		# true division); a negative count simply produces no padding.
		pad = '\t' * ((48-1 - len (what[i]) - 1 - len (v)) // 8)
		# The separators are tab characters, written as \t escapes so they
		# are visible in the source.
		sys.stdout.write ("#define %s_%s\t%s_%s\t%s/* %3d chars; %s */\n" % \
			(what_short[i], s, what[i], v.upper (), pad, values[i][v], v))
sys.stdout.write ("\n")
sys.stdout.write ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)\n")
sys.stdout.write ("\n\n")
Packit 874993
Packit 874993
# Running statistics updated by print_block:
#   total      - number of table cells written so far
#   used       - how many of those cells correspond to a codepoint present in the data
#   last_block - name of the most recent block heading written
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
	"""Write the table cells for codepoints start..end (both inclusive).

	block -- block name to announce in a comment heading, or a false value
	         for filler rows; a heading is only written when the name
	         differs from the previous one.
	start -- first codepoint; must be a multiple of 8.
	end   -- last codepoint; end+1 must be a multiple of 8.
	data  -- dict mapping codepoint to a sequence whose first two items are
	         the syllabic and positional values; codepoints missing from it
	         are written using `defaults`.

	Output goes to sys.stdout, eight cells per row, each row prefixed by a
	comment giving its first codepoint.  The globals total, used and
	last_block are updated.
	"""
	global total, used, last_block
	if block and block != last_block:
		sys.stdout.write ("\n\n  /* %s */\n" % block)
	num = 0
	# Rows are eight cells wide, so only whole rows can be written.
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 8 == 0:
			# New row: end the previous line, then label this one.  The
			# label is not newline-terminated; the cells follow on the same line.
			sys.stdout.write ("\n  /* %04X */" % u)
		if u in data:
			num += 1
		d = data.get (u, defaults)
		sys.stdout.write ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])))

	total += end - start + 1
	used += num
	if block:
		last_block = block
Packit 874993
Packit 874993
# Lay out the table: walk the codepoints in ascending order, group them into
# 8-aligned runs, and start a new contiguous range (with its own
# indic_offset_0x.... macro) whenever the gap to the previous run is large.
uu = sorted (data)

last = -100000	# last codepoint already written; the sentinel forces a new range first
# NOTE(review): `num` is not read by any code visible in this file.
num = 0
offset = 0	# index in indic_table at which the current range starts
starts = []	# first codepoint of every range
ends = []	# one past the last codepoint of every range
sys.stdout.write ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {\n")
for u in uu:
	if u <= last:
		# Already covered by a run written earlier.
		continue
	block = data[u][2]

	# Extend the run from the start of u's row while consecutive codepoints
	# exist in the same block, then round up to the end of that row.
	# Membership is tested against the dict: same keys as `uu`, but without
	# a linear scan of the list on every step.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][2]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap (at most 48 codepoints): pad it with filler rows
			# so the range stays contiguous.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the previous range, if any, and open a new one.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			sys.stdout.write ("\n\n")
			sys.stdout.write ("#define indic_offset_0x%04xu %d\n" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final range; afterwards `offset` is the total number of cells.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
Packit 874993
# Close the table, then emit hb_indic_get_categories(): a switch on the high
# bits of the codepoint whose cases test the ranges and single codepoints
# belonging to that page.  Finally undefine every helper macro.
sys.stdout.write ("\n\n")
occupancy = used * 100. / total
page_bits = 12
sys.stdout.write ("\n".join ([
	"}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy),
	"",
	"INDIC_TABLE_ELEMENT_TYPE",
	"hb_indic_get_categories (hb_codepoint_t u)",
	"{",
	"  switch (u >> %d)" % page_bits,
	"  {",
]) + "\n")
# list (singles) yields the keys on both Python 2 and 3; adding the result of
# .keys() to a list only works where .keys() returns a list.
pages = set (u >> page_bits for u in starts + ends + list (singles))
for p in sorted (pages):
	sys.stdout.write ("    case 0x%0Xu:\n" % p)
	for (start,end) in zip (starts, ends):
		# A range is listed under the page of its first codepoint and the
		# page of its exclusive end.
		# NOTE(review): a range spanning more than two pages would not be
		# listed under the pages in between.
		if p not in [start>>page_bits, end>>page_bits]: continue
		# Separate name: `offset` keeps holding the integer item count.
		offset_macro = "indic_offset_0x%04xu" % start
		sys.stdout.write ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];\n" % (start, end-1, start, offset_macro))
	for u,d in singles.items ():
		if p != u>>page_bits: continue
		sys.stdout.write ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);\n" % (u, short[0][d[0]], short[1][d[1]]))
	sys.stdout.write ("      break;\n")
	sys.stdout.write ("\n")
sys.stdout.write ("\n".join ([
	"    default:",
	"      break;",
	"  }",
	"  return _(x,x);",
	"}",
	"",
	"#undef _",
]) + "\n")
for i in range (2):
	sys.stdout.write ("\n")
	for v in sorted (values[i]):
		sys.stdout.write ("#undef %s_%s\n" % \
			(what_short[i], short[i][v]))
sys.stdout.write ("\n")
sys.stdout.write ("/* == End of generated table == */\n")
Packit 874993
Packit 874993
# Maintain at least 30% occupancy in the table: fail loudly if fewer than 30%
# of the emitted cells correspond to codepoints present in the data.
if occupancy < 30:
	raise Exception ("Table too sparse, please investigate: ", occupancy)