Tree - source-git/glibc - CentOS Git server

source-git / glibc

Blame localedata/unicode-gen/utf8_compatibility.py

Blob History Raw

Packit	6c4009	`#!/usr/bin/python3`
Packit	6c4009	`# -*- coding: utf-8 -*-`
Packit	6c4009	`# Copyright (C) 2014-2018 Free Software Foundation, Inc.`
Packit	6c4009	`# This file is part of the GNU C Library.`
Packit	6c4009	`#`
Packit	6c4009	`# The GNU C Library is free software; you can redistribute it and/or`
Packit	6c4009	`# modify it under the terms of the GNU Lesser General Public`
Packit	6c4009	`# License as published by the Free Software Foundation; either`
Packit	6c4009	`# version 2.1 of the License, or (at your option) any later version.`
Packit	6c4009	`#`
Packit	6c4009	`# The GNU C Library is distributed in the hope that it will be useful,`
Packit	6c4009	`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
Packit	6c4009	`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
Packit	6c4009	`# Lesser General Public License for more details.`
Packit	6c4009	`#`
Packit	6c4009	`# You should have received a copy of the GNU Lesser General Public`
Packit	6c4009	`# License along with the GNU C Library; if not, see`
Packit	6c4009	`# <http://www.gnu.org/licenses/>.`
Packit	6c4009
Packit	6c4009	`'''`
Packit	6c4009	`This script is useful for checking the backward compatibility of a newly`
Packit	6c4009	`generated UTF-8 file produced by the utf8_gen.py script.`
Packit	6c4009
Packit	6c4009	`To see how this script is used, call it with the “-h” option:`
Packit	6c4009
Packit	6c4009	`$ ./utf8_compatibility.py -h`
Packit	6c4009	`… prints usage message …`
Packit	6c4009	`'''`
Packit	6c4009
Packit	6c4009	`import sys`
Packit	6c4009	`import re`
Packit	6c4009	`import argparse`
Packit	6c4009	`import unicode_utils`
Packit	6c4009
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Lines before the first line starting with “CHARMAP” are skipped.
    Inside the section, lines starting with “%” and lines which do not
    look like “<Uxxxx> /xNN…” or “<Uxxxx>..<Uyyyy> /xNN…” are ignored.

    Returns a dictionary mapping each code point (as an integer) to the
    “/xNN/xNN…” byte string found on its line.  For a range line, every
    code point of the range (both ends included) gets the byte string
    given on that line.

    If the file has no “CHARMAP” line or no “END CHARMAP” line, a
    message is written to stderr and the script exits with status 1.
    '''
    # “(?:…)” is a non-capturing group; both code points consist of
    # upper case hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    # Read as UTF-8 explicitly so the result does not depend on the
    # locale the script happens to run in.
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # The same file iterator continues right after that line.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    sys.exit(1)
Packit	6c4009
def check_charmap(original_file_name, new_file_name):
    '''Print a report comparing the CHARMAP sections of two files

    Three totals are always printed: code points only in the old file
    (“removed”), code points in both files whose byte string differs
    (“changed”) and code points only in the new file (“added”).  The
    individual code points are listed only when the corresponding
    ARGS.show_missing_characters, ARGS.show_changed_characters or
    ARGS.show_added_characters option is set.
    '''
    rule = '------------------------------------------------------------'

    def name_of(code_point):
        '''Character name from UNICODE_ATTRIBUTES, or “None” if unknown'''
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        if code_point in attributes:
            return attributes[code_point]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    old_map = create_charmap_dictionary(original_file_name)
    new_map = create_charmap_dictionary(new_file_name)

    # Code points which disappeared from the new file.
    print(rule)
    removed = sorted(old_map.keys() - new_map.keys())
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(removed))
    if ARGS.show_missing_characters:
        for code_point in removed:
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_map[code_point],
                name_of(code_point)))

    # Code points present in both files but with a different value.
    print(rule)
    changed = {
        code_point: (old_map[code_point], new_map[code_point])
        for code_point in old_map.keys() & new_map.keys()
        if old_map[code_point] != new_map[code_point]
    }
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            before, after = changed[code_point]
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                before,
                after,
                name_of(code_point)))

    # Code points which exist only in the new file.
    print(rule)
    added = sorted(new_map.keys() - old_map.keys())
    print('Total added characters in newly generated CHARMAP: %d'
          %len(added))
    if ARGS.show_added_characters:
        for code_point in added:
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                new_map[code_point],
                name_of(code_point)))
Packit	6c4009
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Lines before the first line starting with “WIDTH” are skipped.
    Inside the section, only lines of the form “<Uxxxx> W” or
    “<Uxxxx>...<Uyyyy> W” with W being 0 or 2 are used, all other
    lines are ignored.

    Returns a dictionary mapping each code point (as an integer) to its
    width (as an integer).  For a range line, every code point of the
    range (both ends included) gets the width given on that line.

    If the file has no “WIDTH” line or no “END WIDTH” line, a message
    is written to stderr and the script exits with status 1, like
    create_charmap_dictionary() does for the CHARMAP section.
    '''
    # “(?:…)” is a non-capturing group; both code points consist of
    # upper case hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    # Read as UTF-8 explicitly so the result does not depend on the
    # locale the script happens to run in.
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # The same file iterator continues right after that line.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    sys.exit(1)
Packit	6c4009
def check_width(original_file_name, new_file_name):
    '''Print a report comparing the WIDTH sections of two files

    Three totals are always printed: code points only in the old file
    (“removed”), code points in both files whose width differs
    (“changed”) and code points only in the new file (“added”).  The
    individual code points, together with their East Asian width,
    category, bidi class and name, are listed only when the
    corresponding ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    rule = '------------------------------------------------------------'

    def describe(code_point):
        '''Build the “eaw=… category=… bidi=… name=…” part of a line

        Every value which cannot be looked up is shown as “None”.
        '''
        east_asian_widths = unicode_utils.EAST_ASIAN_WIDTHS
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        if code_point in east_asian_widths:
            eaw = east_asian_widths[code_point]
        else:
            eaw = 'None'
        known = code_point in attributes
        return (
            'eaw={:s} '.format(eaw)
            + 'category={:2s} '.format(
                attributes[code_point]['category'] if known else 'None')
            + 'bidi={:3s} '.format(
                attributes[code_point]['bidi'] if known else 'None')
            + 'name={:s}'.format(
                attributes[code_point]['name'] if known else 'None'))

    print('************************************************************')
    print('Report on WIDTH:')
    old_widths = create_width_dictionary(original_file_name)
    new_widths = create_width_dictionary(new_file_name)

    # Code points which disappeared from the new file.
    print(rule)
    removed = sorted(old_widths.keys() - new_widths.keys())
    print('Total removed characters in newly generated WIDTH: %d'
          %len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for code_point in removed:
            print('removed: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(old_widths[code_point])
                  + describe(code_point))

    # Code points present in both files but with a different width.
    print(rule)
    changed = {
        code_point: (old_widths[code_point], new_widths[code_point])
        for code_point in old_widths.keys() & new_widths.keys()
        if old_widths[code_point] != new_widths[code_point]
    }
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            before, after = changed[code_point]
            print('changed width: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d}->{:d} : '.format(before, after)
                  + describe(code_point))

    # Code points which exist only in the new file.
    print(rule)
    added = sorted(new_widths.keys() - old_widths.keys())
    print('Total added characters in newly generated WIDTH: %d'
          %len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for code_point in added:
            print('added: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(new_widths[code_point])
                  + describe(code_point))
Packit	6c4009
if __name__ == "__main__":
    # Command line entry point: parse the options, optionally load the
    # Unicode data used to annotate the report, then print the CHARMAP
    # and the WIDTH report.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap files
        and report the differences.
        ''')
    # The two files to compare are mandatory and each option needs a
    # value; without “nargs='?'” argparse itself rejects a bare “-o” or
    # “-n” instead of passing None on to open().
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    # The Unicode data files are optional; they only add names and
    # properties to the detailed listings.
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters whose width was changed in detail.')
    # ARGS is read as a global by check_charmap() and check_width().
    ARGS = PARSER.parse_args()

    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)

source-git / glibc

Source Code

Blame localedata/unicode-gen/utf8_compatibility.py