#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
           -p PropList.txt --unicode_version <version>

It writes the generated charmap to a file named “UTF-8” in the
current directory.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
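        #
        # Each syllable name is derived arithmetically from the code
        # point. For example U+AC01 HANGUL SYLLABLE GAG:
        # 0xAC01 - 0xAC00 = 1; divmod(1, 28) = (0, 1) and
        # divmod(0, 21) = (0, 0), which selects
        # JAMO_INITIAL_SHORT_NAME[0] = 'G',
        # JAMO_MEDIAL_SHORT_NAME[0] = 'A' and
        # JAMO_FINAL_SHORT_NAME[1] = 'G'.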
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                    unicode_utils.ucs_symbol(i),
                    unicode_utils.ucs_symbol(int(end, 16)),
                    convert_to_hex(i),
                    name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(i+63),
                convert_to_hex(i),
                name))

def process_charmap(flines, outfile):
    '''This function takes a list which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
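            # fields[1][:-7] strips the trailing “, Last>” (7 characters)
            # so that e.g. “<CJK Ideograph Extension A, Last>” becomes
            # “<CJK Ideograph Extension A>”.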
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(int(fields[0], 16)),
                convert_to_hex(int(fields[0], 16)),
                fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting the UTF-8 encoding of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
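
# Illustrative sanity check for convert_to_hex() (comments only, not
# executed by this script):
#
#     convert_to_hex(0x00E9)  returns  '/xc3/xa9'      (two UTF-8 bytes)
#     convert_to_hex(0xD800)  returns  '/xed/xa0/x80'  (lone surrogate,
#                                                       from the table above)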

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, already covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2
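            # e.g. an EastAsianWidth.txt line such as “1100..115F;W”
            # marks the Hangul leading consonants U+1100…U+115F as
            # double width here.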

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0
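            # e.g. “0300;COMBINING GRAVE ACCENT;Mn;230;NSM;…” matches
            # both conditions and gets width 0.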

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            if key in width_dict:
                del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
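    # Conjoining Hangul jamo medial vowels and final consonants
    # (U+1160…U+11FF) are rendered inside the cell of the preceding
    # leading consonant, so they are given width 0 here (the usual
    # wcwidth-style convention):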
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
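    # same_width_lists now holds runs of consecutive code points which
    # all share one width, e.g. the combining diacritical marks
    # U+0300…U+036F form a single run of width 0.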

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
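    # A run is written either as a single symbol or as a range, e.g.
    # the zero-width jamo range set above comes out as
    # “<U1160>...<U11FF>” followed by a tab and “0”.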

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt,
        and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # to the UTF-8 file:
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section
        # to the UTF-8 file:
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
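
# Example invocation (the version number is illustrative):
#
#     python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
#         -p PropList.txt --unicode_version 11.0.0
#
# This writes a file named “UTF-8” into the current directory,
# containing a CHARMAP section followed by a WIDTH section.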