#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_compat file from a UnicodeData file.
# Copyright (C) 2015-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
Generate a translit_compat file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_compat.py -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_compat file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        for line in translit_file:
            tail = tail + line
    return (head, tail)
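
# For illustration of the split performed above: “head” holds every line up
# to and including the “translit_start” line, and “tail” holds the
# “translit_end” line plus everything after it, so the regenerated rules are
# spliced in between the two.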

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations of compatibility characters ')
        translit_file.write('and ligatures.\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_compat.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file'''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')
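
# Taken together, output_head(), output_transliteration() and output_tail()
# produce a file of roughly this shape when no original file is passed in
# (a sketch; the comment header and the rule lines are abbreviated):
#
#     escape_char /
#     comment_char %
#     % ... comment header and generation note ...
#
#     LC_CTYPE
#
#     translit_start
#     % <character name>
#     ... one rule line per decomposable code point ...
#     translit_end
#
#     END LC_CTYPE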

def compatibility_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>:   translit_font
    <circle>: translit_circle
    <wide>:   translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they do not seem to be useful for transliteration.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    compatibility_tags = (
        '<compat>', '<super>', '<sub>', '<vertical>')
    for compatibility_tag in compatibility_tags:
        if decomposition.startswith(compatibility_tag):
            decomposition = decomposition[len(compatibility_tag)+1:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if (len(decomposed_code_points) > 1
                    and decomposed_code_points[0] == 0x0020
                    and decomposed_code_points[1] >= 0x0300
                    and decomposed_code_points[1] <= 0x03FF):
                # Decomposes into a space followed by a combining character.
                # This is not useful for transliteration.
                return []
            else:
                return_value = []
                for index in range(0, len(decomposed_code_points)):
                    cd_code_points = compatibility_decompose(
                        decomposed_code_points[index])
                    if cd_code_points:
                        return_value += cd_code_points
                    else:
                        return_value += [decomposed_code_points[index]]
                return return_value
    return []
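
# For illustration, two well-known UnicodeData.txt entries and what the
# function above makes of them (example values, not computed here):
#
#   U+00B2 SUPERSCRIPT TWO has the decomposition field “<super> 0032”,
#   so compatibility_decompose(0x00B2) returns [0x0032] (“2”).
#
#   U+FB00 LATIN SMALL LIGATURE FF has the decomposition field
#   “<compat> 0066 0066”, so compatibility_decompose(0xFB00) returns
#   [0x0066, 0x0066] (“ff”).
#
# A code point whose decomposition carries none of the tags listed in
# compatibility_tags (or no decomposition at all) yields the empty list.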

def special_decompose(code_point_list):
    '''
    Decompositions which are not in UnicodeData.txt at all but which
    were used in the original translit_compat file in glibc and
    which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special decomposition rules here.
    '''
    special_decompose_dict = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    if tuple(code_point_list) in special_decompose_dict:
        return special_decompose_dict[tuple(code_point_list)]
    else:
        return code_point_list
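
# For illustration, with the two rules above:
#
#   special_decompose([0x03BC]) returns [0x0075]   (μ → u)
#   special_decompose([0x02BC]) returns [0x0027]   (ʼ → ')
#   special_decompose([0x0041]) returns [0x0041]   (no rule, unchanged)
#
# Note that the whole list is the lookup key, so a rule only fires when the
# list matches a key of special_decompose_dict exactly.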

def special_ligature_decompose(code_point):
    '''
    Decompositions for ligatures which are not in UnicodeData.txt at
    all but which were used in the original translit_compat file in
    glibc and which seem to make sense.  I want to keep the update of
    translit_compat close to the spirit of the original file,
    therefore I added these special ligature decomposition rules here.

    '''
    special_ligature_decompose_dict = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # The following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A,
        #   U+0415 CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # I am not sure what to do with the following ligatures;
        # maybe it makes no sense to decompose them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+fe20 COMBINING LIGATURE LEFT HALF
        # U+fe21 COMBINING LIGATURE RIGHT HALF
        # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
        # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1f670 SCRIPT LIGATURE ET ORNAMENT
        # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1f672 LIGATURE OPEN ET ORNAMENT
        # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    if code_point in special_ligature_decompose_dict:
        return special_ligature_decompose_dict[code_point]
    else:
        return [code_point]
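
# For illustration, with the table above:
#
#   special_ligature_decompose(0x0152) returns [0x004F, 0x0045]  (Œ → OE)
#   special_ligature_decompose(0x1F670) returns [0x1F670]        (no rule)
#
# Unlisted code points are handed back unchanged as a one-element list, which
# output_transliteration() below uses to detect unhandled ligatures.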

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
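
# For illustration, two entries this loop is expected to produce, assuming
# unicode_utils.ucs_symbol() renders code points in the usual <Uxxxx> form:
#
#     % MICRO SIGN
#     <U00B5> "<U03BC>";"<U0075>"
#     % LATIN SMALL LIGATURE FF
#     <UFB00> "<U0066><U0066>"
#
# The first quoted alternative comes from compatibility_decompose(); each
# further “;”-separated alternative is the result of applying
# special_decompose() to the previous one.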

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
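
# A typical invocation might look like this (the file names and the version
# number are placeholders used only for illustration):
#
#     ./gen_translit_compat.py --unicode_version 11.0.0 \
#         -u UnicodeData.txt -i translit_compat -o translit_compat.new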