#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
           -p PropList.txt --unicode_version <version>

It writes the generated charmap to a file named “UTF-8” in the
current directory.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
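        #
        # Each syllable name is derived arithmetically from the code
        # point. For example U+AC01 HANGUL SYLLABLE GAG:
        # 0xAC01 - 0xAC00 = 1; divmod(1, 28) = (0, 1) and
        # divmod(0, 21) = (0, 0), which selects
        # JAMO_INITIAL_SHORT_NAME[0] = 'G',
        # JAMO_MEDIAL_SHORT_NAME[0] = 'A' and
        # JAMO_FINAL_SHORT_NAME[1] = 'G'.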
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                    unicode_utils.ucs_symbol(i),
                    unicode_utils.ucs_symbol(int(end, 16)),
                    convert_to_hex(i),
                    name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(i+63),
                convert_to_hex(i),
                name))

def process_charmap(flines, outfile):
    '''This function takes a list which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
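            # fields[1][:-7] strips the trailing “, Last>” (7 characters)
            # so that e.g. “<CJK Ideograph Extension A, Last>” becomes
            # “<CJK Ideograph Extension A>”.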
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(int(fields[0], 16)),
                convert_to_hex(int(fields[0], 16)),
                fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting the UTF-8 encoding of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
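
# Illustrative sanity check for convert_to_hex() (comments only, not
# executed by this script):
#
#     convert_to_hex(0x00E9)  returns  '/xc3/xa9'      (two UTF-8 bytes)
#     convert_to_hex(0xD800)  returns  '/xed/xa0/x80'  (lone surrogate,
#                                                       from the table above)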

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, already covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2
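            # e.g. an EastAsianWidth.txt line such as “1100..115F;W”
            # marks the Hangul leading consonants U+1100…U+115F as
            # double width here.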

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0
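            # e.g. “0300;COMBINING GRAVE ACCENT;Mn;230;NSM;…” matches
            # both conditions and gets width 0.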

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            if key in width_dict:
                del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
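    # Conjoining Hangul jamo medial vowels and final consonants
    # (U+1160…U+11FF) are rendered inside the cell of the preceding
    # leading consonant, so they are given width 0 here (the usual
    # wcwidth-style convention):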
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
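    # same_width_lists now holds runs of consecutive code points which
    # all share one width, e.g. the combining diacritical marks
    # U+0300…U+036F form a single run of width 0.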

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
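    # A run is written either as a single symbol or as a range, e.g.
    # the zero-width jamo range set above comes out as
    # “<U1160>...<U11FF>” followed by a tab and “0”.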

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt,
        and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # to the UTF-8 file:
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section
        # to the UTF-8 file:
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
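
# Example invocation (the version number is illustrative):
#
#     python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
#         -p PropList.txt --unicode_version 11.0.0
#
# This writes a file named “UTF-8” into the current directory,
# containing a CHARMAP section followed by a WIDTH section.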