#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_compat file from a UnicodeData file.
# Copyright (C) 2015-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
Generate a translit_compat file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_compat.py -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_compat file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        for line in translit_file:
            tail = tail + line
    return (head, tail)
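
# For illustration of the split performed above: “head” holds every line up
# to and including the “translit_start” line, and “tail” holds the
# “translit_end” line plus everything after it, so the regenerated rules are
# spliced in between the two.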

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of compatibility characters ')
        translit_file.write('and ligatures.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_compat.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file'''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')
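
# Taken together, output_head(), output_transliteration() and output_tail()
# produce a file of roughly this shape when no original file is passed in
# (a sketch; the comment header and the rule lines are abbreviated):
#
#     escape_char /
#     comment_char %
#     % ... comment header and generation note ...
#
#     LC_CTYPE
#
#     translit_start
#     % <character name>
#     ... one rule line per decomposable code point ...
#     translit_end
#
#     END LC_CTYPE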

def compatibility_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>:   translit_font
    <circle>: translit_circle
    <wide>:   translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they do not seem to be useful for transliteration.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    compatibility_tags = (
        '<compat>', '<super>', '<sub>', '<vertical>')
    for compatibility_tag in compatibility_tags:
        if decomposition.startswith(compatibility_tag):
            decomposition = decomposition[len(compatibility_tag)+1:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if (len(decomposed_code_points) > 1
                    and decomposed_code_points[0] == 0x0020
                    and decomposed_code_points[1] >= 0x0300
                    and decomposed_code_points[1] <= 0x03FF):
                # Decomposes into a space followed by a combining character.
                # This is not useful for transliteration.
                return []
            else:
                return_value = []
                for index in range(0, len(decomposed_code_points)):
                    cd_code_points = compatibility_decompose(
                        decomposed_code_points[index])
                    if cd_code_points:
                        return_value += cd_code_points
                    else:
                        return_value += [decomposed_code_points[index]]
                return return_value
    return []
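
# For illustration, two well-known UnicodeData.txt entries and what the
# function above makes of them (example values, not computed here):
#
#   U+00B2 SUPERSCRIPT TWO has the decomposition field “<super> 0032”,
#   so compatibility_decompose(0x00B2) returns [0x0032] (“2”).
#
#   U+FB00 LATIN SMALL LIGATURE FF has the decomposition field
#   “<compat> 0066 0066”, so compatibility_decompose(0xFB00) returns
#   [0x0066, 0x0066] (“ff”).
#
# A code point whose decomposition carries none of the tags listed in
# compatibility_tags (or no decomposition at all) yields the empty list.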

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_compat file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special decomposition rules here.
    '''
    special_decompose_dict = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    if tuple(code_point_list) in special_decompose_dict:
        return special_decompose_dict[tuple(code_point_list)]
    else:
        return code_point_list
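
# For illustration, with the two rules above:
#
#   special_decompose([0x03BC]) returns [0x0075]   (μ → u)
#   special_decompose([0x02BC]) returns [0x0027]   (ʼ → ')
#   special_decompose([0x0041]) returns [0x0041]   (no rule, unchanged)
#
# Note that the whole list is the lookup key, so a rule only fires when the
# list matches a key of special_decompose_dict exactly.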

def special_ligature_decompose(code_point):
    '''
    Decompositions for ligatures which are not in UnicodeData.txt at
    all but which were used in the original translit_compat file in
    glibc and which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special ligature decomposition rules here.

    '''
    special_ligature_decompose_dict = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # The following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A,
        #   U+0415 CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # I am not sure what to do with the following ligatures;
        # maybe it makes no sense to decompose them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+fe20 COMBINING LIGATURE LEFT HALF
        # U+fe21 COMBINING LIGATURE RIGHT HALF
        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1f672 LIGATURE OPEN ET ORNAMENT
        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    if code_point in special_ligature_decompose_dict:
        return special_ligature_decompose_dict[code_point]
    else:
        return [code_point]
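
# For illustration, with the table above:
#
#   special_ligature_decompose(0x0152) returns [0x004F, 0x0045]  (Œ → OE)
#   special_ligature_decompose(0x1F670) returns [0x1F670]        (no rule)
#
# Unlisted code points are handed back unchanged as a one-element list, which
# output_transliteration() below uses to detect unhandled ligatures.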

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
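
# For illustration, two entries this loop is expected to produce, assuming
# unicode_utils.ucs_symbol() renders code points in the usual <Uxxxx> form:
#
#     % MICRO SIGN
#     <U00B5> "<U03BC>";"<U0075>"
#     % LATIN SMALL LIGATURE FF
#     <UFB00> "<U0066><U0066>"
#
# The first quoted alternative comes from compatibility_decompose(); each
# further “;”-separated alternative is the result of applying
# special_decompose() to the previous one.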

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
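
# A typical invocation might look like this (the file names and the version
# number are placeholders used only for illustration):
#
#     ./gen_translit_compat.py --unicode_version 11.0.0 \
#         -u UnicodeData.txt -i translit_compat -o translit_compat.new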