|
Packit |
1f3717 |
# -*- coding: utf-8 -*-
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Copyright 2007 - 2011 Harri Pitkänen (hatapitk@iki.fi)
|
|
Packit |
1f3717 |
# 2007 Hannu Väisänen (Etunimi.Sukunimi@joensuu.fi)
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# Functions and variables that are common to Sukija and Voikko versions.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# This program is free software; you can redistribute it and/or modify
|
|
Packit |
1f3717 |
# it under the terms of the GNU General Public License as published by
|
|
Packit |
1f3717 |
# the Free Software Foundation; either version 2 of the License, or
|
|
Packit |
1f3717 |
# (at your option) any later version.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# This program is distributed in the hope that it will be useful,
|
|
Packit |
1f3717 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
1f3717 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit |
1f3717 |
# GNU General Public License for more details.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# You should have received a copy of the GNU General Public License
|
|
Packit |
1f3717 |
# along with this program; if not, write to the Free Software
|
|
Packit |
1f3717 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
import hfconv
|
|
Packit |
1f3717 |
import voikkoutils
|
|
Packit |
1f3717 |
import codecs
|
|
Packit |
1f3717 |
import getopt
|
|
Packit |
1f3717 |
import sys
|
|
Packit |
1f3717 |
from xml.dom import Node
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Path to source data directory
|
|
Packit |
1f3717 |
VOCABULARY_DATA = u"vocabulary"

# Vocabulary entries that should be saved to different files.
# Each item is (group, name, file): a word carrying flag `name` inside the
# element `group` is written to `file` instead of the main vocabulary.
SPECIAL_VOCABULARY = [
    ('usage', 'it', 'atk.lex'),
    ('usage', 'medicine', 'laaketiede.lex'),
    ('usage', 'science', 'matluonnontiede.lex'),
    ('usage', 'education', 'kasvatustiede.lex'),
    ('style', 'foreign', 'vieraskieliset.lex')]
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def open_lex(path, filename):
    """Create a lexicon file and write the standard generated-file header.

    The file ``path + "/" + filename`` is opened for writing as UTF-8 and
    the licence notice is written to it.

    Returns the still-open file object; the caller must close it.
    """
    header = (
        u"# This is automatically generated intermediate lexicon file for\n"
        u"# Suomi-malaga Voikko edition. The original source data is\n"
        u"# distributed under the GNU General Public License, version 2 or\n"
        u"# later, as published by the Free Software Foundation. You should\n"
        u"# have received the original data, tools and instructions to\n"
        u"# generate this file (or instructions to obtain them) wherever\n"
        u"# you got this file from.\n\n"
    )
    lex_file = codecs.open(path + u"/" + filename, 'w', 'UTF-8')
    try:
        lex_file.write(header)
    except Exception:
        # Do not leak the handle when the header cannot be written.
        lex_file.close()
        raise
    return lex_file
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def tValue(element):
    """Return the whole text of the first child node of DOM `element`.

    Raises AttributeError when the element has no children, or when its
    first child has no ``wholeText`` attribute.
    """
    return element.firstChild.wholeText
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns a list of text values with given element name under DOM element "group"
|
|
Packit |
1f3717 |
def tValues(group, element_name):
    """Return the text values of all `element_name` elements under `group`.

    Elements are collected with ``getElementsByTagName`` and the values
    are returned as a list in that order.
    """
    return [tValue(element) for element in group.getElementsByTagName(element_name)]
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns malaga word class for given word in Joukahainen
|
|
Packit |
1f3717 |
def get_malaga_word_class(j_wordclasses):
    """Return the malaga word class for a set of Joukahainen word classes.

    `j_wordclasses` is any container supporting ``in``. The first matching
    rule wins; None is returned when no rule matches.
    """
    # Order is significant: proper nouns and verbs take precedence, and the
    # combined adjective+noun rule must come before the single-class rules.
    rules = (
        (("pnoun_place",), u"paikannimi"),
        (("pnoun_firstname",), u"etunimi"),
        (("pnoun_lastname",), u"sukunimi"),
        (("pnoun_misc",), u"nimi"),
        (("verb",), u"teonsana"),
        (("adjective", "noun"), u"nimi_laatusana"),
        (("adjective",), u"laatusana"),
        (("noun",), u"nimisana"),
        (("interjection",), u"huudahdussana"),
        (("prefix",), u"etuliite"),
        (("abbreviation",), u"lyhenne"),
        (("adverb",), "seikkasana"),
        (("conjunction",), "sidesana"),
    )
    for required, malaga_class in rules:
        if all(name in j_wordclasses for name in required):
            return malaga_class
    return None
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns flag names from given group for word in Joukahainen
|
|
Packit |
1f3717 |
def get_flags_from_group(word, groupName):
    """Return the flag names found in the `groupName` groups of `word`.

    Only direct child elements of `word` whose tag is `groupName` are
    examined, and within them only direct child elements tagged "flag".
    The text of each such flag is returned, in document order.
    """
    flags = []
    for group in word.childNodes:
        if group.nodeType != Node.ELEMENT_NODE or group.tagName != groupName:
            continue
        for flag in group.childNodes:
            # Skip text/comment nodes and elements that are not flags.
            if flag.nodeType != Node.ELEMENT_NODE or flag.tagName != "flag":
                continue
            flags.append(flag.firstChild.wholeText)
    return flags
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns malaga flags for given word in Joukahainen
|
|
Packit |
1f3717 |
def get_malaga_flags(word):
    """Return the malaga flag string for a word in Joukahainen.

    Every "flag" element found directly inside a child element of `word`
    is looked up in the module-level ``flag_attributes`` table under the
    key "<group tag>/<flag text>". Entries whose ``malagaFlag`` is not None
    are collected.

    Returns u"" when nothing was collected, otherwise a string of the form
    u", tiedot: <flag1,flag2>". Raises KeyError for a flag that is missing
    from ``flag_attributes``.
    """
    malaga_flags = []
    for group in word.childNodes:
        if group.nodeType != Node.ELEMENT_NODE:
            continue
        for flag in group.childNodes:
            if flag.nodeType != Node.ELEMENT_NODE or flag.tagName != "flag":
                continue
            attribute = flag_attributes[group.tagName + u"/" + tValue(flag)]
            if attribute.malagaFlag is not None:
                malaga_flags.append(attribute.malagaFlag)
    if not malaga_flags:
        return u""
    return u", tiedot: <" + u",".join(malaga_flags) + u">"
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Flag attribute table, read from flags.txt in the vocabulary data directory
# when this module is imported. get_malaga_flags looks entries up in it
# under the key "<group tag>/<flag text>".
flag_attributes = voikkoutils.readFlagAttributes(VOCABULARY_DATA + u"/flags.txt")
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def vowel_type(group):
    """Return the vowel type constant declared by the "vtype" element of `group`.

    Returns voikkoutils.VOWEL_DEFAULT unless exactly one "vtype" element is
    present. Otherwise its text selects the result: u'a' gives VOWEL_BACK,
    u'\\u00e4' (a with diaeresis) gives VOWEL_FRONT, anything else VOWEL_BOTH.
    """
    vtypes = group.getElementsByTagName("vtype")
    if len(vtypes) != 1:
        return voikkoutils.VOWEL_DEFAULT
    vtype = tValue(vtypes[0])
    if vtype == u'a':
        return voikkoutils.VOWEL_BACK
    # Written as an escape so the comparison survives re-encoding of this
    # file; the literal had been corrupted into an unrelated character.
    if vtype == u'\u00e4':
        return voikkoutils.VOWEL_FRONT
    return voikkoutils.VOWEL_BOTH
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def has_flag(word, flag):
    """Return True if any "flag" element under `word` has the text `flag`."""
    return flag in tValues(word, "flag")
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns tuple (alku, jatko) for given word in Joukahainen
|
|
Packit |
1f3717 |
def get_malaga_inflection_class(wordform, j_infclass, j_wordclasses, j_classmap):
    """Return the tuple (alku, jatko) for a word in Joukahainen.

    wordform      -- the word to classify
    j_infclass    -- Joukahainen inflection class, "<class>" or
                     "<class>-<gradation>", or None
    j_wordclasses -- container of Joukahainen word class names
    j_classmap    -- iterable of (inflection class, gradation, smclasses)
                     triples; each smclass is (gradtype, pattern, jatko) with
                     an optional fourth item restricting the word classes

    Returns (wordform, u"loppu") when `j_infclass` is None and (None, None)
    when the word class is unsupported or no map entry matches.
    """
    if j_infclass is None:
        return (wordform, u"loppu")

    # Only the first two dash-separated parts are used; a missing gradation
    # class becomes None.
    (infclass, gradclass) = (j_infclass.split(u'-') + [None])[:2]

    if gradclass is None:
        gradtypes = [None]
    else:
        gradtypes = [grad[1] for grad in hfconv.grads if grad[2] == gradclass]

    # Determine the word class for the given word
    noun_classes = ("noun", "pnoun_firstname", "pnoun_lastname",
                    "pnoun_place", "pnoun_misc")
    if "adjective" in j_wordclasses:
        wclass = hfconv.ADJ
    elif any(name in j_wordclasses for name in noun_classes):
        wclass = hfconv.SUBST
    elif "verb" in j_wordclasses:
        wclass = hfconv.VERB
    else:
        return (None, None)

    for (m_infclass, _m_infclass_gradation, m_smclasses) in j_classmap:
        if m_infclass != infclass:
            continue
        for m_smclass in m_smclasses:
            # The word class restriction is optional; pad it with None.
            (m_gradtype, pattern, jatko, wclasses) = (list(m_smclass) + [None])[:4]
            if wclasses is not None and wclass not in wclasses:
                continue
            if m_gradtype not in gradtypes:
                continue
            alku = hfconv.match_re(wordform, pattern)
            if alku is not None:
                return (alku, jatko)

    return (None, None)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns a string describing the structure of a word, if necessary for the spellchecker
|
|
Packit |
1f3717 |
# or hyphenator
|
|
Packit |
1f3717 |
def get_structure(wordform, malaga_word_class):
    """Return the structure attribute string for a word, or u"" if not needed.

    Each character of `wordform` is mapped to one structure symbol:
    "-" becomes "-=", "=" and ":" are copied, "|" is dropped, upper case
    letters become "i" and everything else "p" ("j" and "q" respectively
    for the word class u'lyhenne').

    The structure is reported only when the word contains "-", "=" or ":",
    or when its capitalisation differs from the default: an upper case
    letter anywhere except at the start of a proper noun, or a proper noun
    that does not start with an upper case letter.

    Returns a string of the form u', rakenne: "=ppp"' or u"".
    """
    ispropernoun = malaga_word_class in (u'nimi', u'etunimi', u'sukunimi', 'paikannimi')
    if malaga_word_class == u'lyhenne':
        upper_mark, other_mark = u"j", u"q"
    else:
        upper_mark, other_mark = u"i", u"p"

    parts = [u', rakenne: "=']
    needstructure = False
    for idx, c in enumerate(wordform):
        if c == u'|':
            # Contributes nothing to the structure string.
            continue
        proper_initial = ispropernoun and idx == 0
        if c == u'-':
            parts.append(u"-=")
            needstructure = True
        elif c in (u'=', u':'):
            parts.append(c)
            needstructure = True
        elif c.isupper():
            parts.append(upper_mark)
            if not proper_initial:
                needstructure = True
        else:
            parts.append(other_mark)
            if proper_initial:
                needstructure = True

    if not needstructure:
        return u""
    parts.append(u'"')
    return u"".join(parts)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Writes the vocabulary entry to a suitable file
|
|
Packit |
1f3717 |
def write_entry(main_vocabulary,vocabulary_files,word, entry):
    """Write the vocabulary entry to a suitable file.

    main_vocabulary  -- writable file object for ordinary entries
    vocabulary_files -- mapping from special file name to writable file object
    word             -- DOM element of the word
    entry            -- entry text, written followed by a newline

    The entry is written to every file in SPECIAL_VOCABULARY whose flag the
    word carries (only the first matching group element is checked). If
    none matches, it is written to `main_vocabulary` instead.
    """
    special = False
    for (group_name, flag_name, file_name) in SPECIAL_VOCABULARY:
        groups = word.getElementsByTagName(group_name)
        if len(groups) == 0:
            continue
        if has_flag(groups[0], flag_name):
            vocabulary_files[file_name].write(entry + u"\n")
            special = True
    if not special:
        main_vocabulary.write(entry + u"\n")
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Parse command line options and return them in a dictionary
|
|
Packit |
1f3717 |
def get_options():
    """Parse command line options from sys.argv and return them in a dictionary.

    Recognised long options: --min-frequency=N, --extra-usage=a,b,
    --style=a,b, --destdir=DIR, --sourceid, --vanhat, --sukija, --sukija-ys.

    Returns a dict with the keys "frequency", "extra-usage", "style",
    "sourceid", "vanhat", "destdir", "sukija" and "sukija-ys". On an
    unrecognised option a message is written to stderr and the process
    exits with status 1. A non-integer --min-frequency raises ValueError.
    """
    optlist = ["min-frequency=", "extra-usage=", "style=", "destdir=",
               "sourceid", "vanhat", "sukija", "sukija-ys"]
    try:
        (opts, _) = getopt.getopt(sys.argv[1:], "", optlist)
    except getopt.GetoptError:
        sys.stderr.write("Invalid option list for %s\n" % sys.argv[0])
        sys.exit(1)

    options = {"frequency": 9,
               "extra-usage": [],
               "style": ["old", "international", "inappropriate"],
               "sourceid": False,
               "vanhat": False,
               "destdir": None,
               "sukija": False,
               "sukija-ys": False}
    list_options = ("extra-usage", "style")
    switches = ("sourceid", "vanhat", "sukija", "sukija-ys")
    for (name, value) in opts:
        key = name[2:]  # drop the leading "--"
        if key == "min-frequency":
            options["frequency"] = int(value)
        elif key in list_options:
            options[key] = value.split(",")
        elif key == "destdir":
            options["destdir"] = value
        elif key in switches:
            options[key] = True
    return options
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Strip whitespace and comments from LEXC input file
|
|
Packit |
1f3717 |
def stripWhitespaceAndComments(line):
    """Strip whitespace and comments from a LEXC input line.

    Everything from the first "!" onwards is discarded, then leading and
    trailing whitespace is removed from what remains.
    """
    return line.partition(u"!")[0].strip()
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Filter LEXC input according to options
|
|
Packit |
1f3717 |
def filterVfstInput(line_orig, OPTIONS):
    """Filter a LEXC input line according to the options.

    A line may start with the markers "?Sukija", "?Murre" and "?Vanha",
    checked in that order. A marker is removed when its condition holds;
    otherwise the whole line is dropped:

      ?Sukija -- kept when OPTIONS["sukija"] is set
      ?Murre  -- kept when "dialect" is in OPTIONS["style"] or
                 OPTIONS["sukija"] is set
      ?Vanha  -- kept when OPTIONS["vanhat"] or OPTIONS["sukija"] is set

    Returns None for a dropped line, otherwise the line with comments and
    surrounding whitespace stripped.
    """
    prefix = u'?Sukija'
    if line_orig.startswith(prefix):
        if not OPTIONS["sukija"]:
            return None
        line_orig = line_orig[len(prefix):]

    prefix = u'?Murre'
    if line_orig.startswith(prefix):
        if not ("dialect" in OPTIONS["style"] or OPTIONS["sukija"]):
            return None
        line_orig = line_orig[len(prefix):]

    prefix = u'?Vanha'
    if line_orig.startswith(prefix):
        if not (OPTIONS["vanhat"] or OPTIONS["sukija"]):
            return None
        line_orig = line_orig[len(prefix):]

    return stripWhitespaceAndComments(line_orig)
|