# -*- coding: utf-8 -*-

# Copyright 2007 - 2011 Harri Pitkänen (hatapitk@iki.fi)
# Program to generate lexicon files for Suomi-malaga Voikko edition

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
import sys
|
|
Packit |
1f3717 |
sys.path.append("common")
|
|
Packit |
1f3717 |
import hfconv
|
|
Packit |
1f3717 |
import generate_lex_common
|
|
Packit |
1f3717 |
import voikkoutils
|
|
Packit |
1f3717 |
import xml.dom.minidom
|
|
Packit |
1f3717 |
import codecs
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Flag attributes, read from flags.txt in the vocabulary data directory.
flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Command line options.
OPTIONS = generate_lex_common.get_options()

# Inflection class map, compiled from hfconv.modern_classmap.
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# The Voikko edition builds no special vocabularies.
generate_lex_common.SPECIAL_VOCABULARY = []

# Lexicon file that handle_word() writes its entries to.
main_vocabulary = generate_lex_common.open_lex(OPTIONS["destdir"], "joukahainen.lex")
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def frequency(word):
    """Return the frequency class of a word element as an integer.

    word is a DOM element. The value is taken from its first <fclass>
    descendant; when the word has no <fclass> element the class is 7.
    """
    fclass_elements = word.getElementsByTagName("fclass")
    if not fclass_elements:
        return 7
    return int(generate_lex_common.tValue(fclass_elements[0]))
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def check_style(word):
    """Check the style flags of the word according to current options.

    Returns True if the word is acceptable, otherwise returns False.
    A word is rejected as soon as one of its style flags, other than
    "foreignloan", is missing from OPTIONS["style"]. A word without
    style flags is always acceptable.
    """
    for style_element in word.getElementsByTagName("style"):
        for style in generate_lex_common.tValues(style_element, "flag"):
            # "foreignloan" never disqualifies a word, whatever the options are.
            if style == "foreignloan":
                continue
            if style not in OPTIONS["style"]:
                return False
    return True
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def check_usage(word):
    """Return True if the word is acceptable according to its usage flags.

    A word without any <usage> element is always acceptable. Otherwise it
    is acceptable only if at least one of its usage flags is listed in
    OPTIONS["extra-usage"].
    """
    usage_elements = word.getElementsByTagName("usage")
    if not usage_elements:
        return True
    for usage_element in usage_elements:
        for usage in generate_lex_common.tValues(usage_element, "flag"):
            if usage in OPTIONS["extra-usage"]:
                return True
    return False
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_prefix_jatko(word):
    """Build the "jatko" expression for a prefix word.

    Every compounding flag of the word that is one of "eln", "ell", "elt"
    or "eltj" contributes "@<flag>"; the pieces are joined with " + " in
    the order the flags are returned. The result is an empty string when
    the word has none of these flags.
    """
    accepted_flags = (u"eln", u"ell", u"elt", u"eltj")
    compounding_flags = generate_lex_common.get_flags_from_group(word, u"compounding")
    return u" + ".join([u"@" + flag for flag in compounding_flags if flag in accepted_flags])
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_adverb_jatko(word):
    """Build the comma separated "jatko" list for an adverb.

    The inflection flags "liitesana" and "omistusliite" are copied as they
    are and "ulkopaikallissijat_yks" becomes "ulkopaikallissija_llA".
    "loppu" is appended last unless the word has the inflection flag
    "required".
    """
    parts = []
    allow_bare_form = True
    for flag in generate_lex_common.get_flags_from_group(word, u"inflection"):
        if flag == u"required":
            allow_bare_form = False
        elif flag == u"ulkopaikallissijat_yks":
            parts.append(u"ulkopaikallissija_llA")
        elif flag in (u"liitesana", u"omistusliite"):
            parts.append(flag)
    if allow_bare_form:
        parts.append(u"loppu")
    return u", ".join(parts)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_abbreviation_jatko(word, wordform):
    """Return the "jatko" list for one form of an abbreviation.

    The result is just "loppu" when the form ends with a full stop or the
    word has the inflection flag "none"; every other form gets
    "tavuviiva, kaksoispiste, loppu".
    """
    inflection_flags = generate_lex_common.get_flags_from_group(word, u"inflection")
    only_bare_form = wordform.endswith(u".") or u"none" in inflection_flags
    if only_bare_form:
        return u"loppu"
    return u"tavuviiva, kaksoispiste, loppu"
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_additional_attributes(word):
    """Return extra attribute text for the lexicon entry of a word.

    Each recognised flag in the "compounding" and "grammar" flag groups
    adds one ", name: value" fragment. Fragments appear in the fixed order
    of the tables below, compounding flags first. The result is an empty
    string when no listed flag is set.
    """
    attribute_tables = (
        (u"compounding", (
            (u"el_altark", u", aluetta_tarkentava_etuliite: yes"),
            (u"geo_suffix", u", paikannimen_jälkiliite: yes"),
            (u"org_suffix", u", erisnimen_pääte: yes"),
            (u"free_suffix", u", vapaa_jälkiosa: yes"),
        )),
        (u"grammar", (
            (u"require_following_a", u", vaatii_tapaluokan: nimitapa_1"),
            (u"require_following_ma", u", vaatii_tapaluokan: nimitapa_3"),
        )),
    )
    fragments = []
    for group_name, flag_table in attribute_tables:
        group_flags = generate_lex_common.get_flags_from_group(word, group_name)
        for flag_name, fragment in flag_table:
            if flag_name in group_flags:
                fragments.append(fragment)
    return u"".join(fragments)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def _write_error(message):
    """Write a unicode error message to standard error.

    On Python 2 the message is encoded as UTF-8 first, so that word forms
    with non-ASCII letters cannot fail with UnicodeEncodeError; on Python 3
    the text is written as it is.
    """
    if sys.version_info[0] < 3:
        message = message.encode(u"UTF-8")
    sys.stderr.write(message)


def handle_word(word):
    """Write the lexicon entries for one word element to main_vocabulary.

    word is a DOM element; it is read with getElementsByTagName() and
    getAttribute() and passed on to the generate_lex_common helpers.

    Nothing is written and the function simply returns when the word
      - has the flag "not_voikko" while "sukija" is not in
        OPTIONS["extra-usage"],
      - is rejected by check_style() or check_usage(),
      - has a frequency class above OPTIONS["frequency"], or equal to it
        while the word has the flag "confusing",
      - has the inflection class "poikkeava",
      - has no inflection class and its first word class is not one of
        those listed below, or
      - has no Malaga word class.

    Otherwise one entry is written for every alternative form of the word.
    The process exits with status 1 when no Malaga inflection class is
    found for a form (the error text is also written to the lexicon), or
    when the alternative spellings fail the sanity check at the end.
    """
    # Drop words that are not needed in the Voikko lexicon
    if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    # Look the frequency class up once; both limits below use it.
    word_frequency = frequency(word)
    if word_frequency >= OPTIONS["frequency"] + 1:
        return
    if word_frequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
        return

    # Get the inflection class: the first one that is not marked historical.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") != "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # An explicit base form, when present, is used for every alternative form.
    baseformTags = word.getElementsByTagName("baseform")
    if len(baseformTags) > 0:
        baseform = generate_lex_common.tValue(baseformTags[0])
    else:
        baseform = None

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type
    if voikko_infclass is None and malaga_word_class != u"lyhenne":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information and additional attributes
    additional_attributes = get_additional_attributes(word)
    if OPTIONS["sourceid"]:
        additional_attributes = additional_attributes + u', sourceid: "%s"' % word.getAttribute("id")

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # A form is single part when it contains none of '|', '=' and '-'.
        if len(altform) == len(wordform.replace(u'-', u'')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
                % (wordform, voikko_infclass)
            generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
            _write_error(errorstr)
            sys.exit(1)
        if malaga_word_class == u"lyhenne":
            jatko = get_abbreviation_jatko(word, altform)
        elif malaga_word_class == u"seikkasana":
            jatko = get_adverb_jatko(word)
        if malaga_word_class == u"etuliite":
            vtype = voikkoutils.VOWEL_BOTH
            malaga_jatko = get_prefix_jatko(word)
        else:
            if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
                vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
            else:
                vtype = forced_inflection_vtype
            malaga_jatko = u"<" + jatko + u">"
        # NOTE(review): malaga_vtype is not assigned when vtype is none of the
        # three values below; the first form would then fail with a NameError
        # and later forms would reuse the previous value. Confirm that the
        # vowel type helpers can only return FRONT, BACK or BOTH here.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
        altBaseform = altform if baseform is None else baseform
        # Entries of class "lyhenne" are written without a base form.
        if malaga_word_class == u"lyhenne":
            perusmuotoEntry = u""
        else:
            perusmuotoEntry = u'perusmuoto: "%s", ' % altBaseform
        entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
            % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko, malaga_vtype, malaga_flags,
               rakenne, additional_attributes)
        generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

    # Sanity check for alternative forms: if there are both multi part forms and single part forms
    # then all multi part forms must end with a part contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-")) + 1:]
            if lastPart not in singlePartForms:
                _write_error(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
                sys.exit(1)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Run handle_word over joukahainen.xml through voikkoutils.process_wordlist.
try:
    voikkoutils.process_wordlist(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml',
                                 handle_word, True)
finally:
    # handle_word may leave through sys.exit(); close the lexicon file on
    # that path as well.
    main_vocabulary.close()
|