Tree - source-git/malaga-suomi-voikko

source-git / malaga-suomi-voikko

Files

Commit: b5e1074b2167462157791e7c3a1f54095cd15ee1
Blob Blame History Raw
# -*- coding: utf-8 -*-

# Copyright 2009 - 2015 Harri Pitkänen (hatapitk@iki.fi)

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

# This program converts an XML representation of autocorrect data
# into lexc format for the autocorrect transducer.
#
# Usage: python triecompiler.py input.xml output.lexc

from __future__ import unicode_literals
import xml.dom.minidom
import sys

# Parse the XML file. It is read in binary mode so that the XML parser
# decodes it according to the document's own encoding declaration rather
# than the platform's default text encoding. The "with" block closes the
# file even if reading or parsing fails.
with open(sys.argv[1], "rb") as xmlFile:
	autoCorrect = xml.dom.minidom.parseString(xmlFile.read())

# Open the lexc file in binary mode: the entries written to it are
# explicitly encoded as UTF-8 bytes, which a text-mode file rejects on
# Python 3. The header is encoded the same way so all writes are bytes.
# The header declares @_SPACE_@ as a multi-character symbol and starts
# the Root lexicon.
outputFile = open(sys.argv[2], "wb")
outputFile.write("Multichar_Symbols\n@_SPACE_@\n\nLEXICON Root\n".encode("UTF-8"))

def formatForLexc(s):
	"""Return s prepared for use on one side of a lexc entry.

	Every "=" character is removed and every space is replaced with the
	multi-character symbol @_SPACE_@.
	"""
	withoutEquals = s.replace("=", "")
	return withoutEquals.replace(" ", "@_SPACE_@")

def _lexcText(parent, tagName):
	"""Return the lexc-formatted text of the first tagName element under parent.

	The text is taken from the element's first child node.
	"""
	element = parent.getElementsByTagName(tagName)[0]
	return formatForLexc(element.firstChild.wholeText)

# Emit one lexc entry per <replacement> element, in document order
for replacementNode in autoCorrect.getElementsByTagName("replacement"):
	incorrectText = _lexcText(replacementNode, "incorrect")
	correctText = _lexcText(replacementNode, "correct")
	# Entry layout: incorrect:correct, TAB, the "#" continuation, TAB, ";"
	entry = u"".join((incorrectText, u":", correctText, u"\t#\t;\n"))
	outputFile.write(entry.encode("UTF-8"))

outputFile.close()
source-git / malaga-suomi-voikko

Source Code

Files