#!/usr/bin/python
# vim:set fileencoding=utf-8 et sts=4 sw=4:
#
# ibus - Intelligent Input Bus for Linux / Unix OS
#
# Copyright © 2016 Takao Fujiwara <takao.fujiwara1@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see <http://www.gnu.org/licenses/>.


# This script converts the three-letter ISO 639-2 language codes in
# simple.xml to their two-letter ISO 639-1 equivalents.
# E.g. "eng" to "en"


from xml.sax import make_parser as sax_make_parser
from xml.sax.handler import feature_namespaces as sax_feature_namespaces
from xml.sax.saxutils import XMLFilterBase, XMLGenerator
from xml.sax._exceptions import SAXParseException

import codecs
import getopt
import io
import os
import sys

# Default SIMPLE_XML input path, used when -i/--input is not given.
INSTALLED_SIMPLE_XML = '/usr/share/ibus/component/simple.xml'
# True when the running interpreter is Python 3 or newer.
PY3K = sys.version_info >= (3, 0)

# Choose the StringIO implementation that XMLGenerator output is collected in.
if PY3K:
    from io import StringIO
else:
    # io.StringIO does not work with XMLGenerator
    from cStringIO import StringIO
    # iso_639.xml includes UTF-8
    # Python 2 only: sys is reloaded so that setdefaultencoding can be
    # called, making UTF-8 the default for implicit str/unicode conversion.
    reload(sys)
    sys.setdefaultencoding('utf-8')


def usage(prgname):
    """Print the command-line help text to stdout.

    prgname is the program name shown on the "Usage:" line; the default
    input path shown in the help is INSTALLED_SIMPLE_XML.
    """
    help_template = '''\
Usage:
  %s [OPTION...]

Options:
  -h, --help                         Show this message
  -i, --input=SIMPLE_XML             Load SIMPLE_XML file (default is:
                                         %s)
  -o, --output=FILE                  Output FILE (default is stdout)
'''
    substitutions = (prgname, INSTALLED_SIMPLE_XML)
    print(help_template % substitutions)


class ISO639XML(XMLFilterBase):
    """SAX handler that builds an ISO 639-2 -> ISO 639-1 lookup table.

    Every 'iso_639_entry' element that carries an 'iso_639_1_code'
    attribute is recorded; all other elements are ignored.  After parsing,
    code2to1() translates a three-letter code to its two-letter equivalent.
    """

    def __init__(self, parser=None):
        # ISO 639-2 code (B or T variant) -> ISO 639-1 code.
        self.__code2to1 = {}
        # Any recorded code (639-1, 639-2B, 639-2T) -> entry 'name' attribute.
        self.__codetoname = {}
        XMLFilterBase.__init__(self, parser)

    def startElement(self, name, attrs):
        """Record the codes found on one 'iso_639_entry' element.

        name is the element name and attrs its attribute mapping.  Entries
        without an ISO 639-1 code are skipped because there is nothing to
        map their three-letter codes to.
        """
        if name != 'iso_639_entry':
            return
        iso639_1 = attrs.get('iso_639_1_code')
        if iso639_1 is None:
            return
        n = attrs.get('name')
        iso639_2b = attrs.get('iso_639_2B_code')
        iso639_2t = attrs.get('iso_639_2T_code')
        self.__codetoname[iso639_1] = n
        if iso639_2b is not None:
            self.__code2to1[iso639_2b] = iso639_1
            self.__codetoname[iso639_2b] = n
        # The T code is only recorded separately when it differs from B.
        if iso639_2t is not None and iso639_2b != iso639_2t:
            self.__code2to1[iso639_2t] = iso639_1
            self.__codetoname[iso639_2t] = n

    def code2to1(self, iso639_2):
        """Return the ISO 639-1 code for iso639_2, or None if unknown."""
        return self.__code2to1.get(iso639_2)


class IBusComponentXML(XMLFilterBase):
    """SAX filter that rewrites the text of <language> elements.

    All events are forwarded to the downstream handler unchanged, except
    that the text inside a 'language' element is replaced by the value
    returned from iso639.code2to1() whenever that lookup returns a result.

    parser     -- parent reader passed to XMLFilterBase.
    downstream -- handler receiving the (possibly rewritten) events; when
                  it is None nothing is forwarded.
    iso639     -- object providing code2to1(text); when it is None the
                  text is passed through untouched.
    """

    def __init__(self, parser=None, downstream=None, iso639=None):
        XMLFilterBase.__init__(self, parser)
        self.__downstream = downstream
        self.__iso639 = iso639
        self.__is_language = False
        # Text received so far inside the current <language> element.
        # A SAX parser may deliver one text node in several characters()
        # calls, so the code must be reassembled before it is looked up.
        self.__language_chunks = []

    def __flush_language(self):
        """Convert and forward the buffered <language> text, if any."""
        if not self.__language_chunks:
            return
        text = ''.join(self.__language_chunks)
        self.__language_chunks = []
        if self.__iso639:
            iso639_1 = self.__iso639.code2to1(text)
            if iso639_1 is not None:
                text = iso639_1
        if self.__downstream:
            self.__downstream.characters(text)

    def startDocument(self):
        """Forward the start-of-document event downstream."""
        if self.__downstream:
            self.__downstream.startDocument()

    def endDocument(self):
        """Forward the end-of-document event downstream."""
        if self.__downstream:
            self.__downstream.endDocument()

    def startElement(self, name, attrs):
        """Forward an element start, noting when a <language> begins."""
        # Emit pending text first so that event order is preserved.
        self.__flush_language()
        if name == 'language':
            self.__is_language = True
        if self.__downstream:
            self.__downstream.startElement(name, attrs)

    def endElement(self, name):
        """Forward an element end, emitting buffered language text first."""
        self.__flush_language()
        if name == 'language':
            self.__is_language = False
        if self.__downstream:
            self.__downstream.endElement(name)

    def characters(self, text):
        """Buffer text inside <language>; forward any other text as is."""
        if self.__is_language:
            self.__language_chunks.append(text)
            return
        if self.__downstream:
            self.__downstream.characters(text)


class ConvertEngineXML(object):
    """Parse a component XML file and keep the converted document in memory.

    The file at path is parsed on construction through IBusComponentXML,
    whose output is serialized by an XMLGenerator into an in-memory buffer.
    write() then emits that buffer to a file or to stdout.

    path   -- XML file to read (opened as UTF-8).
    iso639 -- code converter handed to IBusComponentXML; may be None.
    """

    def __init__(self, path, iso639=None):
        self.__path = path
        self.__iso639 = iso639

        self.__result = StringIO()
        downstream = XMLGenerator(self.__result, 'utf-8')
        self.__load(downstream)

    def __load(self, downstream=None):
        """Parse self.__path, sending the filtered events to downstream.

        A SAXParseException is reported on stdout and otherwise ignored,
        so whatever was generated before the error stays in the buffer.
        """
        parser = sax_make_parser()
        parser.setFeature(sax_feature_namespaces, 0)
        self.__handler = IBusComponentXML(parser, downstream, self.__iso639)
        parser.setContentHandler(self.__handler)
        f = codecs.open(self.__path, 'r', encoding='utf-8')
        try:
            parser.parse(f)
        except SAXParseException:
            # Use the stored path: no local named 'path' exists here.
            print('Error: Invalid file format: %s' % self.__path)
        finally:
            f.close()

    def write(self, output=None):
        """Write the converted XML as UTF-8.

        output -- destination file name; when None the document is written
                  to stdout instead.
        """
        contents = self.__result.getvalue()
        if output is not None:
            od = codecs.open(output, 'w', encoding='utf-8')
            try:
                od.write(contents)
            finally:
                od.close()
            return

        # Anything already printed must reach the terminal before the
        # document is written through a separate wrapper.
        sys.stdout.flush()
        if PY3K:
            od = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
            try:
                od.write(contents)
                od.flush()
            finally:
                # Detach so that discarding the wrapper does not close
                # the underlying stdout buffer.
                od.detach()
        else:
            od = codecs.getwriter('utf-8')(sys.stdout)
            od.write(contents)
            od.flush()


def parse_iso639(path):
    """Parse the ISO 639 XML file at path and return its ISO639XML handler.

    The file is read as UTF-8.  A SAXParseException is reported on stdout
    and the handler is still returned with whatever was collected so far.
    """
    stream = codecs.open(path, 'r', encoding='utf-8')
    reader = sax_make_parser()
    reader.setFeature(sax_feature_namespaces, 0)
    iso639 = ISO639XML(reader)
    reader.setContentHandler(iso639)
    # The with-block closes the stream whether or not parsing succeeds.
    with stream:
        try:
            reader.parse(stream)
        except SAXParseException:
            print('Error: Invalid file format: %s' % path)
    return iso639


def main():
    """Command-line entry point.

    Parses -h/--help, -i/--input and -o/--output, loads the ISO 639 table,
    converts the input component XML and writes the result.  Exits with
    status 2 on an unknown option or on any positional argument.
    """
    prgname = os.path.basename(sys.argv[0])
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hi:o:',
                                   ['help', 'input=', 'output='])
    except getopt.GetoptError as err:
        print(err)
        usage(prgname)
        sys.exit(2)
    # Positional arguments are not accepted.
    if args:
        usage(prgname)
        sys.exit(2)

    # Named *_path so the builtin input() is not shadowed.
    input_path = INSTALLED_SIMPLE_XML
    output_path = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(prgname)
            sys.exit()
        elif opt in ('-i', '--input'):
            input_path = arg
        elif opt in ('-o', '--output'):
            output_path = arg

    iso639 = parse_iso639('/usr/share/xml/iso-codes/iso_639.xml')
    converter = ConvertEngineXML(input_path, iso639)
    converter.write(output_path)


if __name__ == '__main__':
    main()