#!/usr/bin/python3

# Usage: gen-scripts.py .../supplementalData.xml .../PropertyValueAliases.txt
# (positional arguments: the CLDR supplemental data file first, then the
# UCD property value aliases file)

import io
import re
import xml.etree.ElementTree as et

# ISO 15924 codes that this script rewrites to the code(s) it looks up in
# the UCD alias table instead (see get_language_data).
ISO_ALIASES = {
    'Hans': ['Hani'],
    'Hant': ['Hani'],
    'Jpan': ['Hrkt'],
    'Kore': ['Hang'],
}

# Unicode script names that expand to several concrete script names.
UCD_ALIASES = {'Katakana_Or_Hiragana': ['Katakana', 'Hiragana']}

# Entries seeded into the result; CLDR-derived data overrides them.
DEFAULT_ALIASES = {'en': ['Latin']}

def get_language_data(infile, aliases):
    """Map language codes to the set of Unicode script names they use.

    Parses the CLDR supplementalData.xml ``languageData`` elements.

    infile: file object (or path) for supplementalData.xml.
    aliases: dict mapping ISO 15924 script codes to Unicode script names,
             as built by get_aliases.

    Returns a dict whose keys are language codes (or
    ``language_TERRITORY`` when the element lists territories) and whose
    values are non-empty sets of Unicode script names; DEFAULT_ALIASES
    entries are included unless overridden by the parsed data.
    """
    result = {}
    tree = et.parse(infile)
    for data in tree.findall('languageData'):
        for lang in data.findall('language'):
            # 'type' renamed to avoid shadowing the builtin.
            lang_type = lang.get('type')
            scripts = lang.get('scripts')
            if not scripts:
                continue
            territories = lang.get('territories')
            # One result key per territory variant, or just the bare code.
            if territories:
                keys = ['{0}_{1}'.format(lang_type, territory)
                        for territory in territories.split(' ')]
            else:
                keys = [lang_type]

            scripts = scripts.split(' ')

            # Resolve aliases of the ISO 15924 codes.
            scripts = [ISO_ALIASES.get(script, [script]) for script in scripts]
            scripts = [script for elements in scripts for script in elements]

            # Resolve ISO 15924 to Unicode mapping; codes with no UCD
            # alias are silently dropped.
            scripts = [aliases[script] for script in scripts
                       if script in aliases]

            # Resolve aliases of Unicode script names.
            scripts = [UCD_ALIASES.get(script, [script]) for script in scripts]
            scripts = [script for elements in scripts for script in elements]

            scripts = set(scripts)

            if not scripts:
                continue
            for key in keys:
                result[key] = scripts
    # Seed with the defaults, letting parsed entries win on conflict.
    temp = dict(DEFAULT_ALIASES)
    temp.update(result)
    return temp

def get_aliases(infile):
    """Extract script property aliases from UCD PropertyValueAliases.txt.

    infile: iterable of text lines.

    Returns a dict mapping the second field (ISO 15924 code) to the third
    field (Unicode script name) of every line starting with 'sc'.
    """
    result = {}
    for line in infile:
        # Only the Script ('sc') property lines are of interest.
        if not line.startswith('sc'):
            continue
        # Fields are ';'-separated with optional surrounding whitespace;
        # any fields past the third are ignored.  Raw string avoids the
        # invalid '\s' escape in a plain string literal.
        (sc, iso, ucd, *rest) = re.split(r'\s*;\s*', line.strip())
        result[iso] = ucd
    return result

def build_header(data):
    """Print a C header fragment mapping languages to their scripts.

    data: dict mapping language keys to iterables (sets or lists) of
          Unicode script names.

    Writes to stdout: an NLANGUAGES define, a struct declaration sized
    for the longest script list plus a NULL terminator, and an
    initializer table sorted by language key.
    """
    print('#define NLANGUAGES {0}'.format(len(data)))
    # Array size: longest script list plus the trailing NULL sentinel.
    print('''\
struct LanguageScripts
{{
  const gchar *language;
  const gchar *scripts[{0}];
}};'''.format(max(len(v) for v in data.values()) + 1))
    print('''\
struct LanguageScripts language_scripts[NLANGUAGES] =
  {''')
    for index, (lang, scripts) in enumerate(sorted(data.items(), key=lambda x: x[0])):
        # Sort the script names: the values may be sets, whose iteration
        # order varies between runs, and generated source should be
        # reproducible.
        entries = ['N_("{0}")'.format(script) for script in sorted(scripts)]
        entries.append('NULL')
        print('    {{ "{0}", {{ {1} }} }}'.format(lang, ', '.join(entries)), end='')
        # Comma after every initializer except the last.
        print(',' if index + 1 < len(data) else '')
    print('};')

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='build')
    parser.add_argument('cldrinfile', type=argparse.FileType('r'),
                        help='CLDR input file')
    parser.add_argument('ucdinfile', type=argparse.FileType('r'),
                        help='UCD input file')
    args = parser.parse_args()

    # FIXME: argparse.FileType(encoding=...) is available since Python 3.4;
    # until then, close the handle argparse opened and reopen the file
    # with a BOM-tolerant UTF-8 codec.  The 'with' block ensures the
    # reopened handle is closed too (neither was closed before).
    args.ucdinfile.close()
    with io.open(args.ucdinfile.name, encoding='utf_8_sig') as ucdinfile:
        aliases = get_aliases(ucdinfile)
    data = get_language_data(args.cldrinfile, aliases)
    build_header(data)