|
Packit |
1f3717 |
# -*- coding: utf-8 -*-
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Copyright 2007 - 2012 Harri Pitkänen (hatapitk@iki.fi)
|
|
Packit |
1f3717 |
# Program to generate lexicon files for Suomi-malaga Voikko edition
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# This program is free software; you can redistribute it and/or modify
|
|
Packit |
1f3717 |
# it under the terms of the GNU General Public License as published by
|
|
Packit |
1f3717 |
# the Free Software Foundation; either version 2 of the License, or
|
|
Packit |
1f3717 |
# (at your option) any later version.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# This program is distributed in the hope that it will be useful,
|
|
Packit |
1f3717 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
1f3717 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit |
1f3717 |
# GNU General Public License for more details.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# You should have received a copy of the GNU General Public License
|
|
Packit |
1f3717 |
# along with this program; if not, write to the Free Software
|
|
Packit |
1f3717 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
import sys
|
|
Packit |
1f3717 |
sys.path.append("common")
|
|
Packit |
1f3717 |
import hfconv
|
|
Packit |
1f3717 |
import generate_lex_common
|
|
Packit |
1f3717 |
import voikkoutils
|
|
Packit |
1f3717 |
import xml.dom.minidom
|
|
Packit |
1f3717 |
import codecs
|
|
Packit |
1f3717 |
from string import rfind
|
|
Packit |
1f3717 |
from xml.dom import Node
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Flag attribute definitions for the Joukahainen XML reader.
flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# One intermediate .lexc output file is opened per word-class suffix; the
# suffixes correspond to the [L..] tags produced by get_vfst_word_class
# (e.g. "ep" for [Lep], "nl" for [Lnl]).
vocabularyFileSuffixes = [u"ep", u"ee", u"es", u"em", u"t", u"nl", u"l", u"n", u"h", u"p", u"a", u"s", u"c"]
vocabularyFiles = {}
for fileSuffix in vocabularyFileSuffixes:
    # codecs.open provides a UTF-8 encoding text stream.
    vocFile = codecs.open(OPTIONS["destdir"] + u"/joukahainen-" + fileSuffix + u".lexc", 'w', 'UTF-8')
    # Provenance/license header for the generated intermediate file.
    vocFile.write(u"! This is automatically generated intermediate lexicon file for\n")
    vocFile.write(u"! VVFST morphology. The original source data is\n")
    vocFile.write(u"! distributed under the GNU General Public License, version 2 or\n")
    vocFile.write(u"! later, as published by the Free Software Foundation. You should\n")
    vocFile.write(u"! have received the original data, tools and instructions to\n")
    vocFile.write(u"! generate this file (or instructions to obtain them) wherever\n")
    vocFile.write(u"! you got this file from.\n\n")
    # All entries written later for this word class go into this LEXICON.
    vocFile.write(u"LEXICON Joukahainen_" + fileSuffix + u"\n")
    vocabularyFiles[fileSuffix] = vocFile
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def frequency(word):
    """Return the frequency class of a word element (7 when none is given)."""
    fclassElements = word.getElementsByTagName("fclass")
    if not fclassElements:
        return 7
    return int(generate_lex_common.tValue(fclassElements[0]))
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Check the style flags of the word according to current options.
|
|
Packit |
1f3717 |
# Returns True if the word is acceptable, otherwise returns false.
|
|
Packit |
1f3717 |
# Check the style flags of the word according to current options.
# Returns True if the word is acceptable, otherwise returns false.
def check_style(word):
    """Return True when every style flag of the word is enabled in OPTIONS.

    The "foreignloan" flag is informational only and never rejects a word.
    """
    global OPTIONS
    for styleElement in word.getElementsByTagName("style"):
        for styleFlag in generate_lex_common.tValues(styleElement, "flag"):
            if styleFlag != "foreignloan" and styleFlag not in OPTIONS["style"]:
                return False
    return True
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns True if the word is acceptable according to its usage flags.
|
|
Packit |
1f3717 |
# Returns True if the word is acceptable according to its usage flags.
def check_usage(word):
    """Accept a word with no usage flags, or with at least one usage flag
    listed in OPTIONS["extra-usage"]."""
    global OPTIONS
    usageElements = word.getElementsByTagName("usage")
    if not usageElements:
        return True
    for usageElement in usageElements:
        for usageFlag in generate_lex_common.tValues(usageElement, "flag"):
            if usageFlag in OPTIONS["extra-usage"]:
                return True
    return False
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns VFST word class for given word in Joukahainen
|
|
Packit |
1f3717 |
# Returns VFST word class for given word in Joukahainen
def get_vfst_word_class(j_wordclasses):
    """Map a list of Joukahainen word classes to a VFST class tag.

    Proper noun subtypes and verbs take precedence; a word that is both an
    adjective and a noun gets the combined [Lnl] tag. Returns None when no
    known word class is present.
    """
    classes = set(j_wordclasses)
    for className, tag in [
            ("pnoun_place", u"[Lep]"),
            ("pnoun_firstname", u"[Lee]"),
            ("pnoun_lastname", u"[Les]"),
            ("pnoun_misc", u"[Lem]"),
            ("verb", u"[Lt]")]:
        if className in classes:
            return tag
    if "adjective" in classes:
        return u"[Lnl]" if "noun" in classes else u"[Ll]"
    for className, tag in [
            ("noun", u"[Ln]"),
            ("interjection", u"[Lh]"),
            ("prefix", u"[Lp]"),
            ("abbreviation", u"[La]"),
            ("adverb", u"[Ls]"),
            ("conjunction", u"[Lc]")]:
        if className in classes:
            return tag
    return None
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Returns a string describing the structure of a word, if necessary for the spellchecker
|
|
Packit |
1f3717 |
# or hyphenator
|
|
Packit |
1f3717 |
# Returns a string describing the structure of a word, if necessary for the spellchecker
# or hyphenator
def get_structure(wordform, vfst_word_class, alku):
    """Return (structure string, stem) for a word form.

    The structure string is [Xr]...[X] where each letter of the word form is
    marked upper case ("i") or lower case ("p"); abbreviations ([La]) use
    "j"/"q" instead. It is emitted only when needed -- i.e. when the form
    contains a colon, an unexpected upper case letter, or a lower case first
    letter in a proper noun. Otherwise an empty structure is returned
    together with the stem annotated with [Bm] morpheme-boundary tags.
    """
    proper_noun = vfst_word_class.startswith(u'[Le')
    if vfst_word_class == u'[La]':
        upper_char, lower_char = u"j", u"q"
    else:
        upper_char, lower_char = u"i", u"p"
    structure = u'[Xr]'
    remaining = alku     # unconsumed stem characters
    tagged = u""         # stem rebuilt with [Bm] tags inserted
    need_structure = False
    for position, char in enumerate(wordform):
        if char == u'-':
            structure += u"-="
            if remaining:
                tagged += u'-[Bm]'
                remaining = remaining[1:]
        elif char == u'|':
            # '|' only separates morphemes in the baseform; nothing to emit.
            pass
        elif char == u'=':
            structure += u"="
            tagged += u"[Bm]"
        elif char == u':':
            structure += u":"
            need_structure = True
            if remaining:
                tagged += u':'
                remaining = remaining[1:]
        elif char.isupper():
            structure += upper_char
            # Capitalised first letter of a proper noun is expected.
            if not (proper_noun and position == 0):
                need_structure = True
            if remaining:
                tagged += remaining[0]
                remaining = remaining[1:]
        else:
            structure += lower_char
            # Lower case first letter of a proper noun is unexpected.
            if proper_noun and position == 0:
                need_structure = True
            if remaining:
                tagged += remaining[0]
                remaining = remaining[1:]
    if not need_structure:
        return (u"", tagged)
    # Trailing lower case markers carry no information; trim them.
    end = len(structure)
    while structure[end - 1] == lower_char:
        end -= 1
    return (structure[:end] + u'[X]', alku)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_diacritics(word, altforms, vfst_word_class):
    """Collect VFST flag diacritics for a word element.

    Scans every <flag> child of the word's flag groups and translates known
    flags to their diacritic strings; "ei_ys" expands to two diacritics.
    Nouns/noun-adjectives whose first form ends in lainen/läinen get an
    extra combined diacritic appended last.
    """
    flagDiacritics = {
        u"ei_yks": u"@P.EI_YKS.ON@",
        u"ysj": u"@R.YS_ALKANUT@",
        u"inen": u"@P.INEN_SALLITTU.ON@",
        u"ei_inen": u"@P.INEN_KIELLETTY.ON@",
        u"ei_mainen": u"@P.EI_MAINEN.ON@",
        u"ei_lainen": u"@P.EI_LAINEN.ON@",
        u"ei_vertm": u"@P.EI_VERTM.ON@",
        u"ym3": u"@P.VAIN_YM3.ON@",
        u"yt": u"@P.YKSITEKIJÄINEN.ON@",
        u"geo_suffix": u"@C.PAIKANNIMEN_JL@",
    }
    diacritics = []
    for group in word.childNodes:
        if group.nodeType != Node.ELEMENT_NODE:
            continue
        for flag in group.childNodes:
            if flag.nodeType != Node.ELEMENT_NODE:
                continue
            if flag.tagName != "flag":
                continue
            flagName = flag.firstChild.wholeText
            if flagName in flagDiacritics:
                diacritics.append(flagDiacritics[flagName])
            if flagName in (u"ei_ys", u"ei_ysa"):
                diacritics.append(u"@P.YS_EI_JATKOA.ON@")
            if flagName in (u"ei_ys", u"ei_ysj"):
                diacritics.append(u"@D.YS_ALKANUT@")
    if vfst_word_class in (u"[Ln]", u"[Lnl]") and \
            (altforms[0].endswith(u"lainen") or altforms[0].endswith(u"läinen")):
        diacritics.append(u"@P.LAINEN.ON@@C.LAINEN_VAADITTU@@C.VAIN_NIMISANA@")
    return diacritics
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_info_flags(word):
    """Concatenate the information flag tags ([I..] / [De]) for a word.

    Iterates every <flag> child of the word's flag groups and appends the
    tag of each known flag, in document order; unknown flags are ignored.
    """
    infoTags = {
        u"paikannimi_ulkopaikallissijat": u"[Ipu]",
        u"paikannimi_sisäpaikallissijat": u"[Ips]",
        u"foreignloan": u"[Isf]",
        u"el_altark": u"[De]",
        u"geo_suffix": u"[Ica]",
        u"org_suffix": u"[Ion]",
        u"free_suffix": u"[Ivj]",
        u"require_following_a": u"[Ira]",
        u"require_following_ma": u"[Irm]",
    }
    flags = u""
    for group in word.childNodes:
        if group.nodeType != Node.ELEMENT_NODE:
            continue
        for flag in group.childNodes:
            if flag.nodeType != Node.ELEMENT_NODE:
                continue
            if flag.tagName != "flag":
                continue
            flags += infoTags.get(flag.firstChild.wholeText, u"")
    return flags
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_vfst_class_prefix(vfst_class):
    """Return the continuation-lexicon name prefix for a VFST class tag,
    or an empty string for classes without one."""
    prefixes = {
        u"[Ln]": u"Nimisana",
        u"[Lee]": u"Etunimi",
        u"[Lep]": u"Paikannimi",
        u"[Les]": u"Sukunimi",
        u"[Lem]": u"Nimi",
        u"[Ll]": u"Laatusana",
        u"[Lnl]": u"NimiLaatusana",
    }
    return prefixes.get(vfst_class, u"")
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def vowel_type_for_derived_verb(wordform):
|
|
Packit |
1f3717 |
for char in reversed(wordform):
|
|
Packit |
1f3717 |
if char in u"yäö":
|
|
Packit |
1f3717 |
return u"@P.V_SALLITTU.E@"
|
|
Packit |
1f3717 |
if char in u"uao":
|
|
Packit |
1f3717 |
return u"@P.V_SALLITTU.T@"
|
|
Packit |
1f3717 |
if char in u"]":
|
|
Packit |
1f3717 |
break
|
|
Packit |
1f3717 |
return u"@P.V_SALLITTU.T@"
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_prefix_jatko(word, altform):
    """Build the continuation lexicon name suffix for a prefix ([Lp]) entry.

    The name concatenates the word's recognised compounding flags in sorted
    order, plus "H" when the form itself ends in a hyphen.
    """
    compoundingFlags = generate_lex_common.get_flags_from_group(word, u"compounding")
    recognised = [u"eln", u"ell", u"elt", u"eltj"]
    jatko = u"".join(flag for flag in sorted(compoundingFlags) if flag in recognised)
    if altform.endswith(u"-"):
        jatko += u"H"
    return jatko
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_adverb_jatko(word, altform):
    """Build the continuation lexicon name for an adverb ([Ls]) entry.

    Inflection flags are processed in sorted order; "required" means the
    bare form alone is not acceptable, so the "Loppu" prefix is omitted.
    """
    inflectionFlags = generate_lex_common.get_flags_from_group(word, u"inflection")
    jatko = u""
    mayEndHere = True
    for flag in sorted(inflectionFlags):
        if flag in [u"liitesana", u"ulkopaikallissijat_yks"]:
            jatko += flag.title()
        elif flag == u"omistusliite":
            # A single (non-doubled) final a/ä/e takes the short
            # possessive-suffix continuation.
            if altform[-1] in u"aäe" and altform[-1] != altform[-2]:
                jatko += u"OlV"
            else:
                jatko += u"Omistusliite"
        elif flag == u"required":
            mayEndHere = False
    if mayEndHere:
        jatko = "Loppu" + jatko
    return jatko
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_abbreviation_jatko(word, wordform):
    """Select the continuation lexicon for an abbreviation ([La]) entry.

    Forms that end in a period, or that are marked non-inflecting ("none"),
    use the dotted-abbreviation continuation; everything else inflects as a
    plain abbreviation.
    """
    inflectionFlags = generate_lex_common.get_flags_from_group(word, u"inflection")
    if u"none" in inflectionFlags or wordform.endswith(u"."):
        return u"PisteellisenLyhenteenJatko"
    return u"Lyhenne"
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def handle_word(word):
    """Convert one Joukahainen <word> element into VVFST lexc entries.

    Filters the word against the current OPTIONS (style, usage, frequency),
    resolves its inflection class and VFST word class, and writes one entry
    per alternative spelling into the matching per-class vocabulary file.
    """
    global OPTIONS
    global CLASSMAP
    # Drop words that are not needed in the Voikko lexicon
    # but only if not generating Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]: return
    if not check_style(word): return
    if not check_usage(word): return
    if frequency(word) >= OPTIONS["frequency"] + 1: return
    if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"): return

    # Get the inflection class. Exactly one inflection class is needed
    voikko_infclass = None
    if OPTIONS["sukija"]:
        # Sukija accepts a small whitelist of historical inflection classes
        # and maps two obsolete names to their modern equivalents.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") == "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                if voikko_infclass == u"banaali": # "banaali" inflects like "paperi"
                    voikko_infclass = u"paperi"
                elif voikko_infclass == u"pasuuna":
                    voikko_infclass = u"peruna"
                if voikko_infclass not in [u"aavistaa-av1", u"arvelu", u"arvelu-av1", u"haravoida-av2", u"karahka", u"matala",
                                           u"paperi", u"paperi-av1", u"peruna"]:
                    voikko_infclass = None
                break
    if voikko_infclass == None:
        # Default: first non-historical inflection class.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    # Words with exceptional ("poikkeava") inflection cannot be converted.
    if voikko_infclass == u"poikkeava": return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    # Only these word classes may legitimately lack an inflection class.
    if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass == None:
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class == None: return

    # Get diacritics
    altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
    # reduce is the Python 2 builtin; concatenates all diacritic strings.
    diacritics = reduce(lambda x, y: x + y, get_diacritics(word, altforms, vfst_word_class), u"")

    # Get forced vowel type
    if voikko_infclass == None and vfst_word_class != u"[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information
    debug_info = u""
    if OPTIONS["sourceid"]:
        # Word id without its leading character; zeros escaped for lexc.
        debug_info = u'[Xs]%s[X]' % word.getAttribute("id")[1:].replace(u"0", u"%0")

    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        outputBaseform = altform.replace(u'|', u'')
        wordform = outputBaseform.replace(u'=', u'')
        # A form is "single part" when it contains no |, = or - separators.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku == None:
            errorstr = u"ERROR: VFST class not found for (%s, %s)\n" % (wordform, voikko_infclass)
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)
        # Continuation lexicon ("jatko") selection depends on word class.
        if vfst_word_class == u"[La]":
            jatko = get_abbreviation_jatko(word, altform)
        elif vfst_word_class == u"[Ls]":
            jatko = get_adverb_jatko(word, altform)
        else:
            jatko = jatko.title()
        # Adverbs, conjunctions and interjections are written out under the
        # <baseform> element instead of the alternative form.
        if vfst_word_class in [u"[Ls]", u"[Lc]", u"[Lh]"]:
            for element in word.getElementsByTagName(u"baseform"):
                wordform = generate_lex_common.tValue(element)
                outputBaseform = wordform.replace(u'|', u'')
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else: vtype = forced_inflection_vtype
        # NOTE(review): vfst_vtype stays unbound if vtype matches none of
        # the three constants below -- presumably cannot happen; confirm.
        if vtype == voikkoutils.VOWEL_FRONT: vfst_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK: vfst_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH: vfst_vtype = u'aä'
        # "[Lep]" -> vocabularyFiles["ep"], etc.
        vocabularyFile = vocabularyFiles[vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
        # NOTE(review): dead code -- alku == None already exited above.
        if alku == None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(vocabularyFile, {}, word, errorstr)
            sys.stderr.write(errorstr.encode(u"UTF-8"))
            sys.exit(1)
        alku = alku.lower()
        (rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

        # Interjections are written directly with the end-of-word marker '#'.
        if vfst_word_class == u"[Lh]":
            entry = u'%s[Xp]%s[X]%s%s%s:%s # ;' % (vfst_word_class, outputBaseform, debug_info, rakenne, alkuWithTags, alku)
            vocabularyFile.write(entry + u"\n")
            continue
        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        # Vowel type in derived verbs
        if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa", u"Nousta", u"Loistaa", u"Jättää", u"Kihistä"]:
            diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)
            # Front-vowel "Kihistä" stems whose only front vowel is "e"
            # continue with "Helistä" instead.
            if jatko == u"Kihistä" and vtype == voikkoutils.VOWEL_FRONT and u"y" not in alku and u"ä" not in alku and u"ö" not in alku and u"e" in alku:
                jatko = u"Helistä"

        if jatko == u"Nainen" and vfst_class_prefix in [u"Laatusana", u"NimiLaatusana"] and altform.endswith(u"inen"):
            jatko = u"NainenInen"

        if vfst_word_class == u"[Lp]":
            entry = u'[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' \
                % (debug_info, rakenne, alkuWithTags, diacritics, infoFlags, alku, diacritics, get_prefix_jatko(word, altform))
        else:
            entry = u'%s[Xp]%s[X]%s%s%s%s%s:%s%s %s%s_%s ;' \
                % (vfst_word_class, outputBaseform, debug_info, rakenne, infoFlags,
                   alkuWithTags, diacritics, alku, diacritics, vfst_class_prefix, jatko, vfst_vtype)
        vocabularyFile.write(entry + u"\n")

    # Sanity check for alternative forms: if there are both multi part forms and single part forms
    # then all multi part forms must end with a part contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            # string.rfind is the Python 2 function equivalent of str.rfind.
            lastPart = multiPartForm[max(rfind(multiPartForm, u"="), rfind(multiPartForm, u"|"), rfind(multiPartForm, u"-")) + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Read the Joukahainen word list and run handle_word on every <word>
# element. NOTE(review): the meaning of the third (True) argument is not
# visible here -- check voikkoutils.process_wordlist before changing it.
voikkoutils.process_wordlist(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml', \
    handle_word, True)

# Finalise and close all per-class output files.
for fileSuffix in vocabularyFileSuffixes:
    vocabularyFiles[fileSuffix].write(u"\n\n") # Extra line feeds needed to avoid mixed lines in concatenated lexc file
    vocabularyFiles[fileSuffix].close()
|