# -*- coding: utf8 -*-
# Copyright 2009 - 2015 Harri Pitkänen (hatapitk@iki.fi)
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
# This program converts an XML representation of autocorrect data
# into a lookup trie in C++. The trie consist of an array of TrieNode
# objects (NODES). NODES[0] is the trie root. If x is a node, the first
# child of x is NODES[x.subtreeStart], the second is NODES[x.subtreeStart + 1]
# etc. The end of child list is represented by a null node (node.label == 0).
# If node has no children, node.subtreeStart == 0.
#
# All nodes representing an input string have node.replacementIndex != 0.
# The replacement index is an index of array REPLACEMENTS that contains
# the suggested replacement for the input string.
#
# Usage: python triecompiler.py input.xml output.hpp
import xml.dom.minidom
import sys
class Node:
def __init__(self, value = None):
self.value = value
self.valueIndex = 0
self.children = { }
def appendToTrie(trie, key, value):
if len(key) == 1:
if key in trie.children:
# Duplicate key will replace the old value
trie.children[key].value = value
else:
trie.children[key] = Node(value)
else:
first = key[0]
rest = key[1:]
if first in trie.children:
appendToTrie(trie.children[first], rest, value)
else:
node = Node()
appendToTrie(node, rest, value)
trie.children[first] = node
def indexTrie(trie, nextNodeIndex, nextValueIndex):
for node in trie.children:
trie.children[node].nodeIndex = nextNodeIndex
nextNodeIndex += 1
if trie.children[node].value != None:
trie.children[node].valueIndex = nextValueIndex
nextValueIndex += 1
nextNodeIndex += 1 # skip terminating null node
for node in trie.children:
(nextNodeIndex, nextValueIndex) = indexTrie(trie.children[node], nextNodeIndex, nextValueIndex)
return (nextNodeIndex, nextValueIndex)
def cHexCodeForChar(unicodeChar):
ordinal = ord(unicodeChar)
if unicodeChar == "\\":
return "\\\\"
if 0x20 <= ordinal and ordinal <= 0x7f:
# These characters cannot be represented as unicode literals in C++
return unicodeChar
hexCode = hex(ordinal)[2:]
return "\\u" + hexCode.rjust(4, "0")
def writeTrieNodes(trie, outputFile):
for node in trie.children:
outputFile.write(",{")
outputFile.write("L'%s'" % cHexCodeForChar(node))
outputFile.write(",")
outputFile.write(str(trie.children[node].valueIndex))
outputFile.write(",")
if len(trie.children[node].children) != 0:
outputFile.write(str(next(iter(trie.children[node].children.values())).nodeIndex))
else:
outputFile.write("0")
outputFile.write("}")
outputFile.write(",{L'\\0',0,0}") # terminating null node
for node in trie.children:
writeTrieNodes(trie.children[node], outputFile)
def writeTrieValues(trie, outputFile):
for node in trie.children.values():
if node.value != None:
outputFile.write(',L"')
for char in node.value:
outputFile.write(cHexCodeForChar(char))
outputFile.write('"')
for node in trie.children.values():
writeTrieValues(node, outputFile)
# Open the XML file
xmlFile = open(sys.argv[1], "r")
autoCorrect = xml.dom.minidom.parseString(xmlFile.read())
xmlFile.close()
# Read entries to a trie
trieRoot = Node()
for replacement in autoCorrect.getElementsByTagName("replacement"):
incorrect = replacement.getElementsByTagName("incorrect")[0].firstChild.wholeText
correct = replacement.getElementsByTagName("correct")[0].firstChild.wholeText
appendToTrie(trieRoot, incorrect.replace("=", ""), correct.replace("=", ""))
autoCorrect = None
# Index trie nodes
indexTrie(trieRoot, 1, 1)
# Write trie to a file as a C++ structure
outputFile = open(sys.argv[2], "w")
outputFile.write("static const TrieNode NODES[] = {\n")
outputFile.write("{L'\\0',0,1}")
writeTrieNodes(trieRoot, outputFile)
outputFile.write('};\n')
outputFile.write("static const wchar_t * const REPLACEMENTS[] = {\n")
outputFile.write("0")
writeTrieValues(trieRoot, outputFile)
outputFile.write('};\n')
outputFile.close()