Blob Blame History Raw
# -*- coding: utf-8 -*-

# Copyright 2007 - 2011 Harri Pitkänen (hatapitk@iki.fi)
# Program to generate lexicon files for Suomi-malaga Voikko edition

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import sys
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs

flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

main_vocabulary = generate_lex_common.open_lex(OPTIONS["destdir"], "joukahainen.lex")

def frequency(word):
	fclass = word.getElementsByTagName("fclass")
	if len(fclass) == 0: return 7
	return int(generate_lex_common.tValue(fclass[0]))

# Check the style flags of the word according to current options.
# Returns True if the word is acceptable, otherwise returns false.
def check_style(word):
	global OPTIONS
	for styleE in word.getElementsByTagName("style"):
		for style in generate_lex_common.tValues(styleE, "flag"):
			if style == "foreignloan":
				continue
			if not style in OPTIONS["style"]: return False
	return True

# Returns True if the word is acceptable according to its usage flags.
def check_usage(word):
	global OPTIONS
	wordUsage = word.getElementsByTagName("usage")
	if len(wordUsage) == 0: return True
	for usageE in wordUsage:
		for usage in generate_lex_common.tValues(usageE, "flag"):
			if usage in OPTIONS["extra-usage"]: return True
	return False

def get_prefix_jatko(word):
	flags = generate_lex_common.get_flags_from_group(word, u"compounding")
	prefixJatko = u""
	for flag in flags:
		if flag in [u"eln", u"ell", u"elt", u"eltj"]:
			if (len(prefixJatko) > 0):
				prefixJatko = prefixJatko + u" + "
			prefixJatko = prefixJatko + u"@" + flag
	return prefixJatko

def get_adverb_jatko(word):
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	prefixJatko = u""
	loppu = True
	for flag in flags:
		if flag in [u"liitesana", u"omistusliite"]:
			prefixJatko = prefixJatko + u", " + flag
		elif flag == u"ulkopaikallissijat_yks":
			prefixJatko = prefixJatko + u", ulkopaikallissija_llA"
		elif flag == u"required":
			loppu = False;
	if loppu:
		prefixJatko = prefixJatko + u", loppu"
	if prefixJatko.startswith(u", "):
		prefixJatko = prefixJatko[2:]
	return prefixJatko

def get_abbreviation_jatko(word, wordform):
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	if wordform.endswith(u".") or u"none" in flags:
		return u"loppu"
	else:
		return u"tavuviiva, kaksoispiste, loppu"

def get_additional_attributes(word):
	flags = generate_lex_common.get_flags_from_group(word, u"compounding")
	result = u""
	if u"el_altark" in flags:
		result = result + u", aluetta_tarkentava_etuliite: yes"
	if u"geo_suffix" in flags:
		result = result + u", paikannimen_jälkiliite: yes"
	if u"org_suffix" in flags:
		result = result + u", erisnimen_pääte: yes"
	if u"free_suffix" in flags:
		result = result + u", vapaa_jälkiosa: yes"
	flags = generate_lex_common.get_flags_from_group(word, u"grammar")
	if u"require_following_a" in flags:
		result = result + u", vaatii_tapaluokan: nimitapa_1"
	if u"require_following_ma" in flags:
		result = result + u", vaatii_tapaluokan: nimitapa_3"
	return result

def handle_word(word):
	global OPTIONS
	global CLASSMAP
	# Drop words that are not needed in the Voikko lexicon
	if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
		return
	if not check_style(word): return
	if not check_usage(word): return
	if frequency(word) >= OPTIONS["frequency"] + 1: return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"): return
	
	# Get the inflection class. Exactly one inflection class is needed
	voikko_infclass = None
	for infclass in word.getElementsByTagName("infclass"):
		if infclass.getAttribute("type") != "historical":
			voikko_infclass = generate_lex_common.tValue(infclass)
			break
	if voikko_infclass == u"poikkeava": return
	
	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass == None:
		return
	malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
	if malaga_word_class == None: return
	
	baseformTags = word.getElementsByTagName("baseform")
	if len(baseformTags) > 0:
		baseform = generate_lex_common.tValue(baseformTags[0])
	else:
		baseform = None
	
	# Get malaga flags
	malaga_flags = generate_lex_common.get_malaga_flags(word)
	
	# Get forced vowel type
	if voikko_infclass == None and malaga_word_class != u"lyhenne":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	
	# Construct debug information and additional attributes
	additional_attributes = get_additional_attributes(word)
	if OPTIONS["sourceid"]:
		additional_attributes = additional_attributes + u', sourceid: "%s"' % word.getAttribute("id")
	
	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
		wordform = altform.replace(u'|', u'').replace(u'=', u'')
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku == None:
			errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
				% (wordform, voikko_infclass)
			generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)
		if malaga_word_class == u"lyhenne":
			jatko = get_abbreviation_jatko(word, altform)
		elif malaga_word_class == u"seikkasana":
			jatko = get_adverb_jatko(word)
		if malaga_word_class == u"etuliite":
			vtype = voikkoutils.VOWEL_BOTH
			malaga_jatko = get_prefix_jatko(word)
		else:
			if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
				vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
			else:
				vtype = forced_inflection_vtype
			malaga_jatko = u"<" + jatko + u">"
		if vtype == voikkoutils.VOWEL_FRONT: malaga_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK: malaga_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH: malaga_vtype = u'aä'
		rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
		if baseform is None:
			altBaseform = altform
		else:
			altBaseform = baseform
		if malaga_word_class == u"lyhenne":
			perusmuotoEntry = u""
		else:
			perusmuotoEntry = u'perusmuoto: "%s", ' % altBaseform
		entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' \
		          % (perusmuotoEntry, alku, malaga_word_class, malaga_jatko, malaga_vtype, malaga_flags,
			   generate_lex_common.get_structure(altform, malaga_word_class),
			   additional_attributes)
		generate_lex_common.write_entry(main_vocabulary, {}, word, entry)
	
	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-")) + 1:]
			if lastPart not in singlePartForms:
				sys.stderr.write(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)


voikkoutils.process_wordlist(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml', \
                             handle_word, True)

main_vocabulary.close()