Blame localedata/unicode-gen/utf8_compatibility.py

Packit 6c4009
#!/usr/bin/python3
Packit 6c4009
# -*- coding: utf-8 -*-
Packit 6c4009
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
Packit 6c4009
# This file is part of the GNU C Library.
Packit 6c4009
#
Packit 6c4009
# The GNU C Library is free software; you can redistribute it and/or
Packit 6c4009
# modify it under the terms of the GNU Lesser General Public
Packit 6c4009
# License as published by the Free Software Foundation; either
Packit 6c4009
# version 2.1 of the License, or (at your option) any later version.
Packit 6c4009
#
Packit 6c4009
# The GNU C Library is distributed in the hope that it will be useful,
Packit 6c4009
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 6c4009
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Packit 6c4009
# Lesser General Public License for more details.
Packit 6c4009
#
Packit 6c4009
# You should have received a copy of the GNU Lesser General Public
Packit 6c4009
# License along with the GNU C Library; if not, see
Packit 6c4009
# <http://www.gnu.org/licenses/>.
Packit 6c4009
Packit 6c4009
'''
Packit 6c4009
This script is useful for checking backward compatibility of a newly
Packit 6c4009
generated UTF-8 file from the utf8_gen.py script.
Packit 6c4009
Packit 6c4009
To see how this script is used, call it with the “-h” option:
Packit 6c4009
Packit 6c4009
    $ ./utf8_compatibility.py -h
Packit 6c4009
    … prints usage message …
Packit 6c4009
'''
Packit 6c4009
Packit 6c4009
import sys
Packit 6c4009
import re
Packit 6c4009
import argparse
Packit 6c4009
import unicode_utils
Packit 6c4009
Packit 6c4009
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    The result maps each code point (as an integer) to the “/xNN…”
    byte sequence string listed for it.  A range entry of the form
    “<Uxxxx>..<Uyyyy>” assigns the same string to every code point of
    the range.  Lines starting with “%” and lines which do not look
    like a CHARMAP entry are skipped.

    If the file has no “CHARMAP” line or no “END CHARMAP” line, a
    message is written to stderr and the script exits with status 1.
    '''
    # Compiled once instead of on every line.  The optional range part
    # is a non-capturing group “(?:…)”; the second code point accepts
    # hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        + r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        + r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # Continue with the same iterator, i.e. right after “CHARMAP”.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            hexutf8 = match.group('hexutf8')
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = hexutf8
    # Only reached if the end of the file was hit before “END CHARMAP”.
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    # sys.exit() rather than the exit() helper, which is only available
    # when the site module has been loaded.
    sys.exit(1)
Packit 6c4009
Packit 6c4009
def check_charmap(original_file_name, new_file_name):
    '''Print a report on how the CHARMAP section of the new file
    differs from the one of the old file

    Three totals are always printed: code points which were removed,
    code points whose byte sequence changed and code points which were
    added.  The individual code points of each group are listed only
    when the matching ARGS.show_*_characters option is set.
    '''
    rule = '------------------------------------------------------------'

    def name_of(code_point):
        # Character name from the Unicode data, or the text “None” if
        # the code point has no entry there.
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        if code_point in attributes:
            return attributes[code_point]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    old_map = create_charmap_dictionary(original_file_name)
    new_map = create_charmap_dictionary(new_file_name)
    old_keys = set(old_map)
    new_keys = set(new_map)

    # Code points present only in the old file.
    print(rule)
    removed = old_keys - new_keys
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed))
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_map[code_point],
                name_of(code_point)))

    # Code points present in both files but with different values.
    print(rule)
    changed = {
        code_point: (old_map[code_point], new_map[code_point])
        for code_point in old_keys & new_keys
        if old_map[code_point] != new_map[code_point]
    }
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            before, after = changed[code_point]
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                before,
                after,
                name_of(code_point)))

    # Code points present only in the new file.
    print(rule)
    added = new_keys - old_keys
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added))
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                new_map[code_point],
                name_of(code_point)))
Packit 6c4009
Packit 6c4009
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    The result maps each code point (as an integer) to its width (also
    an integer).  Only entries with width 0 or 2 are recognized.  A
    range entry of the form “<Uxxxx>...<Uyyyy>” assigns the same width
    to every code point of the range.  Lines which do not look like a
    WIDTH entry are skipped.

    If the file has no “WIDTH” line or no “END WIDTH” line, a message
    is written to stderr and the script exits with status 1, in the
    same way as create_charmap_dictionary() does for CHARMAP.
    '''
    # Compiled once instead of on every line.  The optional range part
    # is a non-capturing group “(?:…)”; the second code point accepts
    # hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        + r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        + r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # Continue with the same iterator, i.e. right after “WIDTH”.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            width = int(match.group('width'))
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = width
    # Only reached if the end of the file was hit before “END WIDTH”.
    # The message has to use file_name; there is no name “file” here.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    # Exit instead of returning None, which the caller cannot handle.
    sys.exit(1)
Packit 6c4009
Packit 6c4009
def check_width(original_file_name, new_file_name):
    '''Print a report on how the WIDTH section of the new file differs
    from the one of the old file

    Three totals are always printed: code points which were removed,
    code points whose width changed and code points which were added.
    The individual code points of each group are listed only when the
    matching ARGS.show_*_characters option is set.
    '''
    rule = '------------------------------------------------------------'

    def details(code_point):
        # Part shared by all detail lines: East Asian width, general
        # category, bidi class and name of the code point.  The text
        # “None” is used for whatever has no entry in the Unicode data.
        pieces = []
        widths = unicode_utils.EAST_ASIAN_WIDTHS
        pieces.append('eaw={:s} '.format(
            widths[code_point] if code_point in widths else 'None'))
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        for template, field in (('category={:2s} ', 'category'),
                                ('bidi={:3s} ', 'bidi'),
                                ('name={:s}', 'name')):
            pieces.append(template.format(
                attributes[code_point][field]
                if code_point in attributes else 'None'))
        return ''.join(pieces)

    print('************************************************************')
    print('Report on WIDTH:')
    old_widths = create_width_dictionary(original_file_name)
    new_widths = create_width_dictionary(new_file_name)
    old_keys = set(old_widths)
    new_keys = set(new_widths)

    # Code points present only in the old file.
    print(rule)
    removed = old_keys - new_keys
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(old_widths[code_point])
                  + details(code_point))

    # Code points present in both files but with different widths.
    print(rule)
    changed = {
        code_point: (old_widths[code_point], new_widths[code_point])
        for code_point in old_keys & new_keys
        if old_widths[code_point] != new_widths[code_point]
    }
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            print('changed width: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d}->{:d} : '.format(changed[code_point][0],
                                          changed[code_point][1])
                  + details(code_point))

    # Code points present only in the new file.
    print(rule)
    added = new_keys - old_keys
    print('Total added characters in newly generated WIDTH: %d'
          % len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s} '.format(
                      unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(new_widths[code_point])
                  + details(code_point))
Packit 6c4009
Packit 6c4009
if __name__ == "__main__":
    # Command line interface.  The parsed options are stored in the
    # module level name ARGS because check_charmap() and check_width()
    # read the show_*_characters flags from there.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 files and
        report the differences.
        ''')
    # The two files to compare are mandatory and need a value; without
    # a value the option would be None and opening the file would fail.
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    # The Unicode data files are optional; if they are not given, the
    # detail lines of the reports show “None” for the attributes.
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    # Flags selecting which groups of code points are listed in detail
    # in addition to the totals.
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    ARGS = PARSER.parse_args()

    # Load the optional Unicode data before the reports are printed.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)