# -*- coding: utf-8 -*-

# Copyright 2007 - 2011 Harri Pitkänen (hatapitk@iki.fi)
# Program to generate lexicon files for Suomi-malaga Voikko edition

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import codecs
import sys
import xml.dom.minidom

# Make modules in the "common" directory importable before the imports below.
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils

# Flag attributes read from flags.txt in the vocabulary data directory.
flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# Output lexicon that handle_word() writes its entries to.
main_vocabulary = generate_lex_common.open_lex(OPTIONS["destdir"], "joukahainen.lex")

def frequency(word):
	"""Return the frequency class of a word element as an integer.

	The value is read from the first <fclass> child element of `word`.
	Words without any <fclass> element get the frequency class 7.
	"""
	fclass_elements = word.getElementsByTagName("fclass")
	if not fclass_elements:
		return 7
	return int(generate_lex_common.tValue(fclass_elements[0]))

def check_style(word):
	"""Check the style flags of the word according to current options.

	Every flag found under the <style> elements of `word` must be listed in
	OPTIONS["style"]; the flag "foreignloan" is always accepted.

	Returns True if the word is acceptable, otherwise returns False.
	A word without style flags is acceptable.
	"""
	for style_element in word.getElementsByTagName("style"):
		for style in generate_lex_common.tValues(style_element, "flag"):
			if style == "foreignloan":
				continue
			if style not in OPTIONS["style"]:
				return False
	return True

def check_usage(word):
	"""Return True if the word is acceptable according to its usage flags.

	A word without any <usage> element is always acceptable. Otherwise at
	least one of its usage flags must be listed in OPTIONS["extra-usage"].
	"""
	usage_elements = word.getElementsByTagName("usage")
	if not usage_elements:
		return True
	return any(
		usage in OPTIONS["extra-usage"]
		for usage_element in usage_elements
		for usage in generate_lex_common.tValues(usage_element, "flag")
	)

def get_prefix_jatko(word):
	"""Build the "jatko" (continuation) value used for prefix entries.

	The compounding flags of `word` that are one of "eln", "ell", "elt" or
	"eltj" are each prefixed with "@" and joined with " + ", in the order
	the flags are returned. The result is an empty string when the word has
	none of these flags.
	"""
	prefix_flags = (u"eln", u"ell", u"elt", u"eltj")
	flags = generate_lex_common.get_flags_from_group(word, u"compounding")
	return u" + ".join(u"@" + flag for flag in flags if flag in prefix_flags)

def get_adverb_jatko(word):
	"""Build the comma separated "jatko" (continuation) list for adverbs.

	The list is derived from the inflection flags of `word`:
	  - "liitesana" and "omistusliite" are copied as such,
	  - "ulkopaikallissijat_yks" is mapped to "ulkopaikallissija_llA",
	  - "required" suppresses the final "loppu" item that is otherwise
	    always appended.
	The result may be an empty string.
	"""
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	continuations = []
	allow_end = True
	for flag in flags:
		if flag in (u"liitesana", u"omistusliite"):
			continuations.append(flag)
		elif flag == u"ulkopaikallissijat_yks":
			continuations.append(u"ulkopaikallissija_llA")
		elif flag == u"required":
			allow_end = False
	if allow_end:
		continuations.append(u"loppu")
	return u", ".join(continuations)

def get_abbreviation_jatko(word, wordform):
	"""Return the "jatko" (continuation) list for an abbreviation.

	Only "loppu" is allowed when `wordform` ends with a full stop or when
	the word has the inflection flag "none". All other abbreviations get
	"tavuviiva, kaksoispiste, loppu".
	"""
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	if wordform.endswith(u".") or u"none" in flags:
		return u"loppu"
	return u"tavuviiva, kaksoispiste, loppu"

def get_additional_attributes(word):
	"""Return extra Malaga attributes implied by the flags of `word`.

	Each recognised flag in the "compounding" and "grammar" flag groups
	contributes one ", name: value" fragment. The fragments are concatenated
	in a fixed order (compounding flags first) so the result can be appended
	directly to a lexicon entry. Returns an empty string when no flag matches.
	"""
	# (flag group, ((flag, attribute fragment), ...)) in output order.
	attribute_tables = (
		(u"compounding", (
			(u"el_altark", u", aluetta_tarkentava_etuliite: yes"),
			(u"geo_suffix", u", paikannimen_jälkiliite: yes"),
			(u"org_suffix", u", erisnimen_pääte: yes"),
			(u"free_suffix", u", vapaa_jälkiosa: yes"),
		)),
		(u"grammar", (
			(u"require_following_a", u", vaatii_tapaluokan: nimitapa_1"),
			(u"require_following_ma", u", vaatii_tapaluokan: nimitapa_3"),
		)),
	)
	fragments = []
	for group, table in attribute_tables:
		flags = generate_lex_common.get_flags_from_group(word, group)
		fragments.extend(fragment for flag, fragment in table if flag in flags)
	return u"".join(fragments)

def _write_error(message):
	"""Write a (unicode) error message to standard error.

	On Python 2 the message is encoded to UTF-8 first so that non-ASCII
	characters do not depend on the encoding of sys.stderr. On Python 3
	sys.stderr is a text stream and the string is written as such.
	"""
	if sys.version_info[0] < 3:
		message = message.encode("UTF-8")
	sys.stderr.write(message)


def handle_word(word):
	"""Write the Malaga lexicon entries of one word element.

	The word is silently skipped when
	  - it has the flag "not_voikko" and "sukija" is not listed in
	    OPTIONS["extra-usage"],
	  - check_style() or check_usage() rejects it,
	  - its frequency class is above OPTIONS["frequency"], or equal to it
	    while the word has the flag "confusing",
	  - its first non-historical inflection class is "poikkeava",
	  - it has no such inflection class and its first word class is not one
	    of interjection, prefix, abbreviation, conjunction or adverb,
	  - no Malaga word class is found for its word classes.
	Otherwise one entry per alternative form is written to main_vocabulary.

	The program is terminated with exit status 1 if no Malaga inflection
	class is found for a form, if the vowel type of a form is unknown or if
	a multi part alternative form does not end with one of the single part
	alternative forms of the same word.
	"""
	# Drop words that are not needed in the Voikko lexicon
	if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
		return
	if not check_style(word):
		return
	if not check_usage(word):
		return
	word_frequency = frequency(word)
	if word_frequency >= OPTIONS["frequency"] + 1:
		return
	if word_frequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Get the inflection class: the first one that is not historical is used.
	voikko_infclass = None
	for infclass in word.getElementsByTagName("infclass"):
		if infclass.getAttribute("type") != "historical":
			voikko_infclass = generate_lex_common.tValue(infclass)
			break
	if voikko_infclass == u"poikkeava":
		return

	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	classes_without_infclass = [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"]
	if wordclasses[0] not in classes_without_infclass and voikko_infclass is None:
		return
	malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
	if malaga_word_class is None:
		return

	# An explicit <baseform> overrides the alternative form as the base form.
	baseformTags = word.getElementsByTagName("baseform")
	if len(baseformTags) > 0:
		baseform = generate_lex_common.tValue(baseformTags[0])
	else:
		baseform = None

	# Get malaga flags
	malaga_flags = generate_lex_common.get_malaga_flags(word)

	# Get forced vowel type. It is only looked up for words that have an
	# inflection class and for abbreviations.
	forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	if voikko_infclass is not None or malaga_word_class == u"lyhenne":
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])

	# Construct debug information and additional attributes
	additional_attributes = get_additional_attributes(word)
	if OPTIONS["sourceid"]:
		additional_attributes = additional_attributes + u', sourceid: "%s"' % word.getAttribute("id")

	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
		wordform = altform.replace(u'|', u'').replace(u'=', u'')
		# A form that contains none of the separators '|', '=' and '-' has a single part.
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku is None:
			errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
				% (wordform, voikko_infclass)
			# The error is also written to the lexicon before giving up.
			generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
			_write_error(errorstr)
			sys.exit(1)
		if malaga_word_class == u"lyhenne":
			jatko = get_abbreviation_jatko(word, altform)
		elif malaga_word_class == u"seikkasana":
			jatko = get_adverb_jatko(word)
		if malaga_word_class == u"etuliite":
			vtype = voikkoutils.VOWEL_BOTH
			malaga_jatko = get_prefix_jatko(word)
		else:
			if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
				vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
			else:
				vtype = forced_inflection_vtype
			malaga_jatko = u"<" + jatko + u">"
		if vtype == voikkoutils.VOWEL_FRONT:
			malaga_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			malaga_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			malaga_vtype = u'aä'
		else:
			# Without this branch malaga_vtype would be undefined or silently
			# reuse the value of the previous alternative form.
			_write_error(u"ERROR: unknown vowel type for %s\n" % altform)
			sys.exit(1)
		rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
		if baseform is None:
			altBaseform = altform
		else:
			altBaseform = baseform
		# Abbreviation entries are written without the base form ("perusmuoto").
		if malaga_word_class == u"lyhenne":
			perusmuotoEntry = u""
		else:
			perusmuotoEntry = u'perusmuoto: "%s", ' % altBaseform
		entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
			% (perusmuotoEntry, alku, malaga_word_class, malaga_jatko, malaga_vtype, malaga_flags,
			   rakenne, additional_attributes)
		generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastSeparator = max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-"))
			lastPart = multiPartForm[lastSeparator + 1:]
			if lastPart not in singlePartForms:
				_write_error(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)

# Feed the words of joukahainen.xml to handle_word(). The output lexicon is
# closed also when processing is aborted (handle_word() calls sys.exit() on
# errors).
# NOTE(review): the meaning of the third argument (True) is defined by
# voikkoutils.process_wordlist - confirm there.
try:
	voikkoutils.process_wordlist(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml',
	                             handle_word, True)
finally:
	main_vocabulary.close()