Blame localedata/unicode-gen/gen_translit_combining.py

Packit 6c4009
#!/usr/bin/python3
Packit 6c4009
# -*- coding: utf-8 -*-
Packit 6c4009
#
Packit 6c4009
# Generate a translit_combining file from a UnicodeData file.
Packit 6c4009
# Copyright (C) 2015-2018 Free Software Foundation, Inc.
Packit 6c4009
# This file is part of the GNU C Library.
Packit 6c4009
#
Packit 6c4009
# The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
# modify it under the terms of the GNU Lesser General Public
Packit 6c4009
# License as published by the Free Software Foundation; either
Packit 6c4009
# version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
#
Packit 6c4009
# The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit 6c4009
# Lesser General Public License for more details.
Packit 6c4009
#
Packit 6c4009
# You should have received a copy of the GNU Lesser General Public
Packit 6c4009
# License along with the GNU C Library; if not, see
Packit 6c4009
# <http://www.gnu.org/licenses/>.
Packit 6c4009
Packit 6c4009
'''
Packit 6c4009
Generate a translit_combining file from UnicodeData.txt
Packit 6c4009
Packit 6c4009
To see how this script is used, call it with the “-h” option:
Packit 6c4009
Packit 6c4009
    $ ./gen_translit_combining.py -h
Packit 6c4009
    … prints usage message …
Packit 6c4009
'''
Packit 6c4009
Packit 6c4009
import argparse
Packit 6c4009
import time
Packit 6c4009
import unicode_utils
Packit 6c4009
Packit 6c4009
def read_input_file(filename):
    '''Read the original glibc translit_combining file and return its
    head and tail.

    Only the part of the file between “translit_start” and
    “translit_end” is meant to be replaced, the text around it is
    returned so that it can be copied unchanged:

    head: every line up to and including the first line starting
          with “translit_start” (the whole file if no such line
          exists).
    tail: the first later line starting with “translit_end” and
          every line after it (empty if no such line exists).

    Returns the tuple (head, tail), both are strings.
    '''
    head_lines = []
    tail_lines = []
    # Decode explicitly instead of relying on the preferred encoding
    # of the current locale, otherwise the result depends on the
    # environment the script happens to run in.
    # NOTE(review): assumes the input file is UTF-8 (or plain ASCII)
    # -- confirm.
    with open(filename, mode='r', encoding='utf-8') as translit_file:
        for line in translit_file:
            head_lines.append(line)
            if line.startswith('translit_start'):
                break
        # Skip the old transliteration rules.
        for line in translit_file:
            if line.startswith('translit_end'):
                tail_lines.append(line)
                break
        # Whatever is left after “translit_end” belongs to the tail.
        tail_lines.extend(translit_file)
    return (''.join(head_lines), ''.join(tail_lines))
Packit 6c4009
Packit 6c4009
def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before and including the “translit_start” line.

    translit_file:   writable text file object receiving the header.
    unicode_version: Unicode version string mentioned in the
                     generated header, only used when “head” is empty.
    head:            header taken from an existing translit_combining
                     file. If it is not empty it is written unchanged,
                     otherwise a default header is generated.
    '''
    # The script only passes a non-empty head when an input file was
    # given on the command line, so checking “head” alone is enough.
    # Not looking at the global ARGS keeps this function usable when
    # the module is imported instead of being run as a script.
    if head:
        translit_file.write(head)
        return
    # NOTE(review): “pronounciation” is misspelled but it is emitted
    # into the generated file, therefore it is left unchanged here --
    # confirm before correcting.
    translit_file.writelines((
        'escape_char /\n',
        'comment_char %\n',
        unicode_utils.COMMENT_HEADER,
        '\n',
        '% Transliterations that remove all ',
        'combining characters (accents,\n',
        '% pronounciation marks, etc.).\n',
        '% Generated automatically from UnicodeData.txt '
        + 'by gen_translit_combining.py '
        + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
        + 'for Unicode {:s}.\n'.format(unicode_version),
        '\n',
        'LC_CTYPE\n',
        '\n',
        'translit_start\n',
    ))
Packit 6c4009
Packit 6c4009
def output_tail(translit_file, tail=''):
    '''Write the tail of the output file, i.e. the “translit_end” line
    and everything after it.

    translit_file: writable text file object receiving the tail.
    tail:          tail taken from an existing translit_combining
                   file. If it is not empty it is written unchanged,
                   otherwise a default tail is generated.
    '''
    # The script only passes a non-empty tail when an input file was
    # given on the command line, so checking “tail” alone is enough.
    # Not looking at the global ARGS keeps this function usable when
    # the module is imported instead of being run as a script.
    if tail:
        translit_file.write(tail)
        return
    translit_file.writelines((
        'translit_end\n',
        '\n',
        'END LC_CTYPE\n',
    ))
Packit 6c4009
Packit 6c4009
def is_combining_remove(code_point):
    '''Tell whether a code point belongs into the section of the
    translit_combining file where combining characters are replaced
    by empty strings.

    Returns True only if unicode_utils.is_combining() accepts the
    code point and its name contains none of the substrings listed
    below.

    The combining characters of many scripts are left out because
    the original translit_combining file did not handle them either
    and it is not clear yet whether handling all combining characters
    would be useful. For the moment staying close to the spirit of
    the original file seems better.
    '''
    excluded_name_parts = (
        'DEVANAGARI', 'BENGALI', 'CYRILLIC', 'SYRIAC', 'THAANA', 'NKO',
        'GURMUKHI', 'TAMIL', 'GUJARATI', 'ORIYA', 'TELUGU', 'KANNADA',
        'MALAYALAM', 'SINHALA', 'THAI', 'LAO', 'TIBETAN', 'MYANMAR',
        'ETHIOPIC', 'TAGALOG', 'HANUNOO', 'BUHID', 'TAGBANWA', 'KHMER',
        'MONGOLIAN', 'LIMBU', 'NEW TAI LUE', 'BUGINESE', 'BALINESE',
        'SUNDANESE', 'LEPCHA', 'IDEOGRAPHIC', 'HANGUL', 'SYLOTI',
        'SAURASHTRA', 'KAYAH', 'REJANG', 'CHAM', 'VARIATION SELECTOR',
        'KHAROSHTHI', 'MUSICAL SYMBOL', 'SAMARITAN', 'MANDAIC',
        'TAI THAM', 'BATAK', 'VEDIC', 'COPTIC', 'TIFINAGH', 'BAMUM',
        'JAVANESE', 'TAI VIET', 'MEETEI', 'MANICHAEAN', 'BRAHMI',
        'KAITHI', 'CHAKMA', 'MAHAJANI', 'SHARADA', 'KHOJKI',
        'KHUDAWADI', 'GRANTHA', 'TIRHUTA', 'SIDDHAM', 'MODI VOWEL',
        'MODI SIGN', 'TAKRI', 'BASSA VAH', 'PAHAWH HMONG', 'MIAO',
        'DUPLOYAN', 'MENDE KIKAKUI', 'AHOM', 'SIGNWRITING',
    )
    if not unicode_utils.is_combining(code_point):
        return False
    char_name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
    return not any(part in char_name for part in excluded_name_parts)
Packit 6c4009
Packit 6c4009
def canonical_decompose(code_point):
    '''Return the canonical decomposition of a code point as a list of
    code points, or an empty list if there is none.

    See http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    In some instances a canonical mapping or a compatibility mapping
    may consist of a single character. For a canonical mapping, this
    indicates that the character is a canonical equivalent of another
    single character. For a compatibility mapping, this indicates that
    the character is a compatibility equivalent of another single
    character.

    A canonical mapping may also consist of a pair of characters, but
    is never longer than two characters. When a canonical mapping
    consists of a pair of characters, the first character may itself
    be a character with a decomposition mapping, but the second
    character never has a decomposition mapping.

    That is why only the first code point of a mapping is decomposed
    again recursively. Mappings starting with “<” (tagged,
    i.e. not canonical) are ignored.

    Code points whose names contain one of the substrings listed
    below are not decomposed at all because the original
    translit_combining file did not include these types of characters
    either; their usefulness is unclear and the result should stay
    close to the spirit of the original file for the moment.
    '''
    skipped_name_parts = (
        'MUSICAL SYMBOL',
        'CJK COMPATIBILITY IDEOGRAPH',
        'BALINESE',
        'KAITHI LETTER',
        'CHAKMA VOWEL',
        'GRANTHA VOWEL',
        'TIRHUTA VOWEL',
        'SIDDHAM VOWEL',
    )
    attributes = unicode_utils.UNICODE_ATTRIBUTES[code_point]
    char_name = attributes['name']
    if any(part in char_name for part in skipped_name_parts):
        return []
    mapping = attributes['decomposition']
    if not mapping or mapping.startswith('<'):
        return []
    # split() of a non-empty string yields at least one field, so
    # there always is a first code point to look at.
    result = [int(field, 16) for field in mapping.split(' ')]
    leading = canonical_decompose(result[0])
    if leading:
        result = leading + result[1:]
    return result
Packit 6c4009
Packit 6c4009
# Special decomposition rules used by special_decompose(). The table
# is built once at import time instead of on every call because
# special_decompose() is called many times for each code point.
# Keys are tuples of code points, values are tuples of code points.
_SPECIAL_DECOMPOSITIONS = {
    # Ø U+00D8 is already handled in translit_neutral. But
    # translit_combining is usually included after translit_neutral
    # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
    # has a canonical decomposition to Ø U+00D8 and we want to
    # further decompose this to U+004F.
    (0x00D8,): (0x004F,), # Ø → O
    # ø U+00F8 is already handled in translit_neutral. But
    # translit_combining is usually included after translit_neutral
    # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
    # has a canonical decomposition to ø U+00F8 and we want to
    # further decompose this to U+006F.
    (0x00F8,): (0x006F,), # ø → o
    # æ U+00E6 is already in translit_compat because ligatures
    # are handled in translit_compat. But ǣ U+01E3 has a
    # canonical decomposition to U+00E6, U+0304 and we want to
    # further decompose this to “ae”.
    (0x00E6,): (0x0061, 0x0065), # æ → ae
    # Æ U+00C6 is already in translit_compat because ligatures
    # are handled in translit_compat. But Ǣ U+01E2 has a
    # canonical decomposition to U+00C6, U+0304 and we want to
    # further decompose this to “AE”.
    (0x00C6,): (0x0041, 0x0045), # Æ → AE
    # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
    # translit_compat because ligatures are handled in translit_compat.
    # But U+FB1F has a canonical decomposition to U+05F2 and
    # we want to further decompose this to U+05D9, U+05D9.
    (0x05F2,): (0x05D9, 0x05D9), # ײ → יי
    # U+2002 has a <compat> decomposition to U+0020 in UnicodeData.txt.
    # But U+2000 EN QUAD has a canonical decomposition to U+2002
    # and we want to further decompose this to U+0020.
    (0x2002,): (0x0020,), # EN SPACE → SPACE
    # U+2003 has a <compat> decomposition to U+0020 in UnicodeData.txt.
    # But U+2001 EM QUAD has a canonical decomposition to U+2003
    # and we want to further decompose this to U+0020.
    (0x2003,): (0x0020,), # EM SPACE → SPACE
    # U+2260 ≠ has the canonical decomposition U+003D U+0338
    # (= followed by ̸). After stripping the combining characters,
    # the result is only = which reverses the meaning.
    # Therefore, we add special rules here for such mathematical
    # negations:
    (0x21AE,): (0x0021, 0x003C, 0x002D, 0x003E), # ↮ → !<->
    (0x21CD,): (0x0021, 0x003C, 0x003D), # ⇍ → !<=
    (0x21CE,): (0x0021, 0x003C, 0x003D, 0x003E), # ⇎ → !<=>
    (0x21CF,): (0x0021, 0x003D, 0x003E), # ⇏ → !=>
    (0x2204,): (0x0021, 0x2203), # ∄ → !∃
    (0x2209,): (0x0021, 0x2208), # ∉ → !∈
    (0x220C,): (0x0021, 0x220B), # ∌ → !∋
    (0x2224,): (0x0021, 0x2223), # ∤ → !∣
    (0x2226,): (0x0021, 0x2225), # ∦ → !∥
    (0x2241,): (0x0021, 0x007E), # ≁ → !~
    (0x2244,): (0x0021, 0x007E, 0x002D), # ≄ → !~-
    (0x2247,): (0x0021, 0x007E, 0x003D), # ≇ → !~=
    (0x2249,): (0x0021, 0x007E, 0x007E), # ≉ → !~~
    (0x2260,): (0x0021, 0x003D), # ≠ → !=
    (0x2262,): (0x0021, 0x003D, 0x003D), # ≢ → !==
    (0x226D,): (0x0021, 0x224D), # ≭ → !≍
    (0x226E,): (0x0021, 0x003C), # ≮ → !<
    (0x226F,): (0x0021, 0x003E), # ≯ → !>
    (0x2270,): (0x0021, 0x003C, 0x003D), # ≰ → !<=
    (0x2271,): (0x0021, 0x003E, 0x003D), # ≱ → !>=
    (0x2274,): (0x0021, 0x003C, 0x007E), # ≴ → !<~
    (0x2275,): (0x0021, 0x003E, 0x007E), # ≵ → !>~
    (0x2278,): (0x0021, 0x003C, 0x003E), # ≸ → !<>
    (0x2279,): (0x0021, 0x003E, 0x003C), # ≹ → !><
    (0x2280,): (0x0021, 0x227A), # ⊀ → !≺
    (0x2281,): (0x0021, 0x227B), # ⊁ → !≻
    (0x2284,): (0x0021, 0x2282), # ⊄ → !⊂
    (0x2285,): (0x0021, 0x2283), # ⊅ → !⊃
    (0x2288,): (0x0021, 0x2282, 0x003D), # ⊈ → !⊂=
    (0x2289,): (0x0021, 0x2283, 0x003D), # ⊉ → !⊃=
    (0x22AC,): (0x0021, 0x22A2), # ⊬ → !⊢
    (0x22AD,): (0x0021, 0x22A8), # ⊭ → !⊨
    (0x22AE,): (0x0021, 0x22A9), # ⊮ → !⊩
    (0x22AF,): (0x0021, 0x22AB), # ⊯ → !⊫
    (0x22E0,): (0x0021, 0x227C), # ⋠ → !≼
    (0x22E1,): (0x0021, 0x227D), # ⋡ → !≽
    (0x22E2,): (0x0021, 0x2291), # ⋢ → !⊑
    (0x22E3,): (0x0021, 0x2292), # ⋣ → !⊒
    (0x22EA,): (0x0021, 0x22B2), # ⋪ → !⊲
    (0x22EB,): (0x0021, 0x22B3), # ⋫ → !⊳
    (0x22EC,): (0x0021, 0x22B4), # ⋬ → !⊴
    (0x22ED,): (0x0021, 0x22B5), # ⋭ → !⊵
    (0x2ADC,): (0x0021, 0x2ADD), # ⫝̸ → !⫝
    # Special rule for 〈 U+3008 is added
    # because 〈 U+2329 has the canonical decomposition U+3008
    # and we want to further decompose this to < U+003C.
    (0x3008,): (0x003C,), # 〈 → <
    # Special rule for 〉 U+3009 is added
    # because 〉 U+232A has the canonical decomposition U+3009
    # and we want to further decompose this to > U+003E.
    (0x3009,): (0x003E,), # 〉 → >
}

def special_decompose(code_point_list):
    '''Apply the special decomposition rules to a list of code points.

    These are decompositions which are not canonical or which are not
    in UnicodeData.txt at all but some of these were used in the
    original translit_combining file in glibc and they seemed to make
    sense. The update of translit_combining should stay close to the
    spirit of the original file, therefore these special decomposition
    rules were added.

    code_point_list: list of code points, it is looked up as a whole,
                     not element by element.

    Returns a new list with the replacement code points if there is
    a special rule for exactly this sequence, otherwise the unchanged
    “code_point_list” object itself.
    '''
    replacement = _SPECIAL_DECOMPOSITIONS.get(tuple(code_point_list))
    if replacement is None:
        return code_point_list
    # Hand out a fresh list so that callers cannot modify the table.
    return list(replacement)
Packit 6c4009
Packit 6c4009
def output_combining_remove(translit_file):
    '''Write the section of the translit_combining file which maps
    combining characters to empty strings.

    For every code point accepted by is_combining_remove(), in
    ascending order, a comment line with the character name and a
    rule replacing the character by "" are written. The section is
    preceded and followed by an empty line.
    '''
    translit_file.write('\n')
    attributes = unicode_utils.UNICODE_ATTRIBUTES
    for code_point in sorted(attributes):
        char_name = attributes[code_point]['name']
        if not is_combining_remove(code_point):
            continue
        translit_file.write('% {:s}\n'.format(char_name))
        symbol = unicode_utils.ucs_symbol(code_point)
        translit_file.write('{:s} ""\n'.format(symbol))
    translit_file.write('\n')
Packit 6c4009
Packit 6c4009
def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where
    characters are decomposed and combining characters stripped from
    the decompositions.

    For each code point, in ascending order, a list of alternative
    replacements is built:

    - The first alternative is the special decomposition of the code
      point if there is one, otherwise its canonical decomposition.
      Code points having neither are skipped.
    - As long as the special rules still change the latest
      alternative, either applied to the whole sequence or to each
      of its code points, the changed sequence is appended as a
      further alternative.
    - Finally the code points accepted by is_combining_remove() are
      dropped from all alternatives.

    If the first alternative is still not empty, a comment line with
    the character name is written followed by a rule listing the
    alternatives separated by “;”. Alternatives consisting of more
    than one code point are enclosed in double quotes.
    '''
    attributes = unicode_utils.UNICODE_ATTRIBUTES
    for code_point in sorted(attributes):
        first = special_decompose([code_point])
        if first == [code_point]:
            # No special rule, fall back to the canonical mapping.
            first = canonical_decompose(code_point)
        alternatives = [first]
        if alternatives[0]:
            while True:
                latest = alternatives[-1]
                # First try a rule matching the whole sequence ...
                as_whole = special_decompose(latest)
                if as_whole != latest:
                    alternatives.append(as_whole)
                    continue
                # ... then rules matching the single code points.
                piecewise = []
                for member in latest:
                    piecewise.extend(special_decompose([member]))
                if piecewise == latest:
                    # Nothing changes anymore.
                    break
                alternatives.append(piecewise)
            alternatives = [
                [member for member in alternative
                 if not is_combining_remove(member)]
                for alternative in alternatives]
        if not alternatives[0]:
            continue
        translit_file.write('% {:s}\n'.format(attributes[code_point]['name']))
        translit_file.write('{:s} '.format(
            unicode_utils.ucs_symbol(code_point)))
        rendered = []
        for alternative in alternatives:
            text = ''.join(
                '{:s}'.format(unicode_utils.ucs_symbol(member))
                for member in alternative)
            if len(alternative) > 1:
                text = '"' + text + '"'
            rendered.append(text)
        translit_file.write(';'.join(rendered))
        translit_file.write('\n')
    translit_file.write('\n')
Packit 6c4009
Packit 6c4009
def output_transliteration(translit_file):
    '''Write the new transliteration rules to the output file: first
    the section removing combining characters, then the section with
    the decompositions.
    '''
    for write_section in (output_combining_remove, output_decompositions):
        write_section(translit_file)
Packit 6c4009
Packit 6c4009
if __name__ == "__main__":
    # Command line interface. PARSER, ARGS, HEAD, TAIL and
    # TRANSLIT_FILE are module level names; ARGS is also read by
    # output_head() and output_tail().
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_combining file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_combining
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_combining.new',
        help='''The new translit_combining file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_combining file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the character database used by all output functions.
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    # Without an input file HEAD and TAIL stay empty and the default
    # header and tail are generated.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    # Encode explicitly as UTF-8 instead of relying on the preferred
    # encoding of the current locale, so that writing a header or tail
    # containing non-ASCII characters does not depend on the
    # environment the script runs in.
    with open(ARGS.output_file, mode='w', encoding='utf-8') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)