Blame localedata/unicode-gen/gen_unicode_ctype.py

#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
Packit 6c4009
Packit 6c4009
'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''
Packit 6c4009
Packit 6c4009
import argparse
Packit 6c4009
import time
Packit 6c4009
import re
Packit 6c4009
import unicode_utils
Packit 6c4009
Packit 6c4009
def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    The candidates are the entries of unicode_utils.UNICODE_ATTRIBUTES,
    visited in sorted order.  Each element of the result is a list:
    [code_point] for an isolated match, or [first, last] for a run of
    matches whose values are consecutive integers.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    cp_ranges = []
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if not is_class_function(code_point):
            continue
        if cp_ranges and cp_ranges[-1][-1] == code_point - 1:
            # Directly follows the previous match: grow that range,
            # turning a one-element entry into a [first, last] pair
            # the first time this happens.
            if len(cp_ranges[-1]) == 1:
                cp_ranges[-1].append(code_point)
            else:
                cp_ranges[-1][-1] = code_point
        else:
            # Gap since the previous match (or first match): new entry.
            cp_ranges.append([code_point])
    return cp_ranges
Packit 6c4009
Packit 6c4009
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Writes a “class_name /” header line to i18n_file, followed by the
    ranges returned by code_point_ranges(is_class_function), separated
    by “;” and wrapped onto “/”-terminated continuation lines, and
    finally an empty line.  Nothing at all is written when no code
    point belongs to the class.

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if not cp_ranges:
        return
    i18n_file.write('%s /\n' % class_name)
    max_column = 75
    prefix = '   '
    line = prefix
    for code_point_range in cp_ranges:
        if line.strip():
            # Something is already on this line: separate the entries.
            line += ';'
        if len(code_point_range) == 1:
            range_string = unicode_utils.ucs_symbol(code_point_range[0])
        else:
            range_string = unicode_utils.ucs_symbol_range(
                code_point_range[0], code_point_range[-1])
        if len(line + range_string) > max_column:
            # The entry does not fit any more: flush the current line
            # (including its trailing “;”) with a continuation mark.
            i18n_file.write(line + '/\n')
            line = prefix
        line += range_string
    if line.strip():
        i18n_file.write(line + '\n')
    i18n_file.write('\n')
Packit 6c4009
Packit 6c4009
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    For every entry of unicode_utils.UNICODE_ATTRIBUTES (in sorted
    order) which map_function maps to a different code point, a pair
    “(<from>,<to>)” is written.  The pairs are separated by “;” and
    wrapped onto “/”-terminated continuation lines; the section is
    preceded by a “map_name /” header line and followed by an empty line.

    If map_function changes no code point at all, nothing is written,
    so that no dangling “map_name /” continuation line ends up in the
    output (same convention as output_charclass).

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = '   '
    line = prefix
    header_written = False
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if code_point == mapped:
            continue
        if not header_written:
            # Delay the header until the first real mapping is known.
            i18n_file.write('%s /\n' % map_name)
            header_written = True
        if line.strip():
            line += ';'
        map_string = '(' \
                     + unicode_utils.ucs_symbol(code_point) \
                     + ',' \
                     + unicode_utils.ucs_symbol(mapped) \
                     + ')'
        if len(line + map_string) > max_column:
            # Pair does not fit: flush the current line with a
            # continuation mark and start a fresh one.
            i18n_file.write(line + '/\n')
            line = prefix
        line += map_string
    if not header_written:
        return
    # At least one pair was appended, so “line” is never blank here.
    i18n_file.write(line + '\n')
    i18n_file.write('\n')
Packit 6c4009
Packit 6c4009
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns a tuple (head, tail) of strings:

    head: everything up to and including the first line starting with
          “LC_CTYPE”; a line of the form ‘date "YYYY-MM-DD"’ found in
          that part gets its date replaced by the current date.
    tail: everything from the first following line starting with
          “translit_start” up to the end of the file, or the empty
          string if there is no such line.
    '''
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    # Collect the lines in lists and join once at the end instead of
    # growing a string line by line.
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = date_pattern.match(line)
            if match:
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old character classes; the file object keeps its
        # position, so each loop continues where the previous stopped.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        # Whatever is left after “translit_start” belongs to the tail.
        tail_lines.extend(i18n_file)
    return (''.join(head_lines), ''.join(tail_lines))
Packit 6c4009
Packit 6c4009
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file:       file object opened for writing
    unicode_version: Unicode version string put into the generated header
    head:            header text taken from an existing i18n file; if it
                     is not empty it is written unchanged, otherwise a
                     default header (including the LC_IDENTIFICATION
                     section and the “LC_CTYPE” line) is generated.

    Only the “head” argument decides which variant is written; the
    function does not depend on the command line arguments and can
    therefore also be used when this file is imported as a module.
    '''
    if head:
        i18n_file.write(head)
        return
    i18n_file.write('escape_char /\n')
    i18n_file.write('comment_char %\n')
    i18n_file.write('\n')
    i18n_file.write('% Generated automatically by '
                    + 'gen_unicode_ctype.py '
                    + 'for Unicode {:s}.\n'.format(unicode_version))
    i18n_file.write('\n')
    i18n_file.write('LC_IDENTIFICATION\n')
    i18n_file.write('title     "Unicode {:s} FDCC-set"\n'.format(
        unicode_version))
    i18n_file.write('source    "UnicodeData.txt, '
                    + 'DerivedCoreProperties.txt"\n')
    i18n_file.write('address   ""\n')
    i18n_file.write('contact   ""\n')
    i18n_file.write('email     "bug-glibc-locales@gnu.org"\n')
    i18n_file.write('tel       ""\n')
    i18n_file.write('fax       ""\n')
    i18n_file.write('language  ""\n')
    i18n_file.write('territory "Earth"\n')
    i18n_file.write('revision  "{:s}"\n'.format(unicode_version))
    i18n_file.write('date      "{:s}"\n'.format(
        time.strftime('%Y-%m-%d')))
    i18n_file.write('category  "i18n:2012";LC_CTYPE\n')
    i18n_file.write('END LC_IDENTIFICATION\n')
    i18n_file.write('\n')
    i18n_file.write('LC_CTYPE\n')
Packit 6c4009
Packit 6c4009
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    i18n_file: file object opened for writing
    tail:      tail text taken from an existing i18n file; if it is not
               empty it is written unchanged, otherwise only the
               “END LC_CTYPE” line is written.

    Only the “tail” argument decides which variant is written; the
    function does not depend on the command line arguments and can
    therefore also be used when this file is imported as a module.
    '''
    if tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')
Packit 6c4009
Packit 6c4009
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file

    i18n_file:       file object opened for writing
    unicode_version: Unicode version string mentioned in the leading
                     comment
    turkish:         if true, the “toupper” and “tolower” maps are
                     generated with the Turkish variants of the case
                     conversion functions from unicode_utils

    The sections are written in a fixed order: the character classes,
    then the “toupper”, “tolower” and “totitle” maps, and finally the
    “combining” and “combining_level3” classes.  Lines starting with
    “%” are comments in the generated file.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # The “outdigit” class is deliberately emitted only as a comment in
    # the generated file (the text written below explains why), so no
    # output_charclass() call is made for it.
    i18n_file.write('% The "outdigit" information is by default '
                    '"0" to "9".  We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('%    <U0030>..<U0039>\n\n')
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
Packit 6c4009
Packit 6c4009
# Script entry point: parse the command line, load the Unicode data
# through unicode_utils and write the complete output file.
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s.  If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file.  ''')
    # No nargs='?' here: the version is mandatory, and with nargs='?' a
    # bare “--unicode_version” would be accepted with the value None,
    # which later fails when it is formatted with '{:s}'.
    PARSER.add_argument(
        '--unicode_version',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    # ARGS is kept as a module level name on purpose.
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    # Without an input file, HEAD and TAIL stay empty and the output
    # functions fall back to their generated defaults.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)