# -*- coding: utf-8 -*-

# Copyright 2007 - 2012 Harri Pitkänen (hatapitk@iki.fi)
# Program to generate lexicon files for Suomi-malaga Voikko edition

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import sys
# "common" is added to the module search path before the project imports
# below; presumably that is where hfconv, generate_lex_common and
# voikkoutils live (verify against the build layout).
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs
from string import rfind
from xml.dom import Node

flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# Comment header written at the top of every generated lexc file
# ("!" starts a comment line in the generated files).
LEXC_FILE_HEADER = (
	u"! This is automatically generated intermediate lexicon file for\n"
	u"! VVFST morphology. The original source data is\n"
	u"! distributed under the GNU General Public License, version 2 or\n"
	u"! later, as published by the Free Software Foundation. You should\n"
	u"! have received the original data, tools and instructions to\n"
	u"! generate this file (or instructions to obtain them) wherever\n"
	u"! you got this file from.\n\n"
)

# One output file per word class. Each suffix is the VFST word class tag
# without its "[L" prefix and "]" suffix (handle_word derives the key that way).
vocabularyFileSuffixes = [u"ep", u"ee", u"es", u"em", u"t", u"nl", u"l", u"n", u"h", u"p", u"a", u"s", u"c"]
vocabularyFiles = {}
for fileSuffix in vocabularyFileSuffixes:
	# The files stay open for the whole run; they are closed at the end of the script.
	vocFile = codecs.open(OPTIONS["destdir"] + u"/joukahainen-" + fileSuffix + u".lexc", 'w', 'UTF-8')
	vocFile.write(LEXC_FILE_HEADER)
	vocFile.write(u"LEXICON Joukahainen_" + fileSuffix + u"\n")
	vocabularyFiles[fileSuffix] = vocFile

# Returns the frequency class of the word as an integer. The value is read
# from the first <fclass> element below the word; words without an <fclass>
# element get the class 7.
def frequency(word):
	fclass = word.getElementsByTagName("fclass")
	if len(fclass) == 0:
		return 7
	return int(generate_lex_common.tValue(fclass[0]))

# Check the style flags of the word according to current options.
# Returns True if the word is acceptable, otherwise returns False.
# The word is rejected as soon as one of its style flags is not listed in
# OPTIONS["style"]; the flag "foreignloan" is ignored by this check.
def check_style(word):
	for styleE in word.getElementsByTagName("style"):
		for style in generate_lex_common.tValues(styleE, "flag"):
			if style == "foreignloan":
				continue
			if style not in OPTIONS["style"]:
				return False
	return True

# Returns True if the word is acceptable according to its usage flags.
# A word without any <usage> element is always acceptable. Otherwise at
# least one of its usage flags must be listed in OPTIONS["extra-usage"].
def check_usage(word):
	wordUsage = word.getElementsByTagName("usage")
	if len(wordUsage) == 0:
		return True
	for usageE in wordUsage:
		for usage in generate_lex_common.tValues(usageE, "flag"):
			if usage in OPTIONS["extra-usage"]:
				return True
	return False

# Returns VFST word class for given word in Joukahainen.
# j_wordclasses is the collection of Joukahainen word class names of the
# word. The checks are made in a fixed order, so the first matching rule
# wins: proper noun classes come first, then verb, then the combination
# adjective + noun, and so on. Returns None if no rule matches.
def get_vfst_word_class(j_wordclasses):
	if "pnoun_place" in j_wordclasses:
		return u"[Lep]"
	if "pnoun_firstname" in j_wordclasses:
		return u"[Lee]"
	if "pnoun_lastname" in j_wordclasses:
		return u"[Les]"
	if "pnoun_misc" in j_wordclasses:
		return u"[Lem]"
	if "verb" in j_wordclasses:
		return u"[Lt]"
	# The combined class must be tested before the plain adjective and noun classes.
	if "adjective" in j_wordclasses and "noun" in j_wordclasses:
		return u"[Lnl]"
	if "adjective" in j_wordclasses:
		return u"[Ll]"
	if "noun" in j_wordclasses:
		return u"[Ln]"
	if "interjection" in j_wordclasses:
		return u"[Lh]"
	if "prefix" in j_wordclasses:
		return u"[Lp]"
	if "abbreviation" in j_wordclasses:
		return u"[La]"
	if "adverb" in j_wordclasses:
		return u"[Ls]"
	if "conjunction" in j_wordclasses:
		return u"[Lc]"
	return None

# Returns a string describing the structure of a word, if necessary for the spellchecker
# or hyphenator.
#
# wordform is the word as written in the vocabulary (it may contain the
# markers "=", "|", "-" and ":"), vfst_word_class is the word class tag
# (e.g. u"[Ln]") and alku is the lower case stem of the word.
#
# The return value is a tuple (structure, stem):
#  - If an explicit structure is needed, structure is u"[Xr]...[X]" with the
#    trailing run of "lower case" markers removed, and stem is alku unchanged.
#  - Otherwise structure is u"" and stem is alku with u"[Bm]" inserted at
#    the positions of "=" and after "-".
def get_structure(wordform, vfst_word_class, alku):
	needstructure = False
	ispropernoun = vfst_word_class[0:3] == u'[Le'
	structstr = u'[Xr]'
	oldAlku = alku
	newAlku = u""
	# Abbreviations use their own pair of markers for upper case (i) and
	# other (p) characters.
	if vfst_word_class == u'[La]':
		i = u"j"
		p = u"q"
	else:
		i = u"i"
		p = u"p"
	for idx, c in enumerate(wordform):
		if c == u'-':
			structstr = structstr + u"-="
			if len(oldAlku) > 0:
				newAlku = newAlku + u'-[Bm]'
				oldAlku = oldAlku[1:]
		elif c == u'|':
			# This marker contributes nothing to the structure or to the stem.
			pass
		elif c == u'=':
			# "=" is not consumed from the stem; it only adds a boundary tag.
			structstr = structstr + u"="
			newAlku = newAlku + u"[Bm]"
		elif c == u':':
			structstr = structstr + u":"
			needstructure = True
			if len(oldAlku) > 0:
				newAlku = newAlku + u':'
				oldAlku = oldAlku[1:]
		elif c.isupper():
			structstr = structstr + i
			# An initial capital is the expected case for proper nouns only.
			if not (ispropernoun and idx == 0):
				needstructure = True
			if len(oldAlku) > 0:
				newAlku = newAlku + oldAlku[0]
				oldAlku = oldAlku[1:]
		else:
			structstr = structstr + p
			# A proper noun that does not start with a capital needs explicit structure.
			if ispropernoun and idx == 0:
				needstructure = True
			if len(oldAlku) > 0:
				newAlku = newAlku + oldAlku[0]
				oldAlku = oldAlku[1:]
	if needstructure:
		# Drop the trailing run of p markers. The loop always terminates
		# because structstr starts with u"[Xr]", which does not end in p.
		returnedLength = len(structstr)
		while structstr[returnedLength - 1] == p:
			returnedLength = returnedLength - 1
		return (structstr[0:returnedLength] + u'[X]', alku)
	else:
		return (u"", newAlku)

# Flag diacritic emitted for each vocabulary flag that maps to exactly one diacritic.
_DIACRITIC_FOR_FLAG = {
	u"ei_yks": u"@P.EI_YKS.ON@",
	u"ysj": u"@R.YS_ALKANUT@",
	u"inen": u"@P.INEN_SALLITTU.ON@",
	u"ei_inen": u"@P.INEN_KIELLETTY.ON@",
	u"ei_mainen": u"@P.EI_MAINEN.ON@",
	u"ei_lainen": u"@P.EI_LAINEN.ON@",
	u"ei_vertm": u"@P.EI_VERTM.ON@",
	u"ym3": u"@P.VAIN_YM3.ON@",
	u"yt": u"@P.YKSITEKIJÄINEN.ON@",
	u"geo_suffix": u"@C.PAIKANNIMEN_JL@",
}

# Returns the list of flag diacritics for the word.
# Every <flag> element that is a grandchild of word (i.e. a child of any
# child element) is inspected, in document order. altforms is the list of
# alternative forms of the word; only the first one is examined, and only
# for nouns ([Ln], [Lnl]) ending with "lainen" or "läinen", which get one
# additional combined diacritic at the end of the list.
def get_diacritics(word, altforms, vfst_word_class):
	diacritics = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for flag in group.childNodes:
			if flag.nodeType != Node.ELEMENT_NODE:
				continue
			if flag.tagName != "flag":
				continue
			flagName = flag.firstChild.wholeText
			if flagName in _DIACRITIC_FOR_FLAG:
				diacritics.append(_DIACRITIC_FOR_FLAG[flagName])
			# u"ei_ys" produces both of the diacritics below.
			if flagName in [u"ei_ys", u"ei_ysa"]:
				diacritics.append(u"@P.YS_EI_JATKOA.ON@")
			if flagName in [u"ei_ys", u"ei_ysj"]:
				diacritics.append(u"@D.YS_ALKANUT@")
	if vfst_word_class in [u"[Ln]", u"[Lnl]"] and altforms[0].endswith((u"lainen", u"läinen")):
		diacritics.append(u"@P.LAINEN.ON@@C.LAINEN_VAADITTU@@C.VAIN_NIMISANA@")
	return diacritics

# Tag appended to the entry for each vocabulary flag that carries extra information.
_INFO_TAG_FOR_FLAG = {
	u"paikannimi_ulkopaikallissijat": u"[Ipu]",
	u"paikannimi_sisäpaikallissijat": u"[Ips]",
	u"foreignloan": u"[Isf]",
	u"el_altark": u"[De]",
	u"geo_suffix": u"[Ica]",
	u"org_suffix": u"[Ion]",
	u"free_suffix": u"[Ivj]",
	u"require_following_a": u"[Ira]",
	u"require_following_ma": u"[Irm]",
}

# Returns the information tags of the word as one concatenated string.
# Every <flag> element that is a grandchild of word (i.e. a child of any
# child element) is inspected in document order; flags without a known
# tag are ignored. Returns u"" if no flag matches.
def get_info_flags(word):
	flags = u""
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for flag in group.childNodes:
			if flag.nodeType != Node.ELEMENT_NODE:
				continue
			if flag.tagName != "flag":
				continue
			flagName = flag.firstChild.wholeText
			flags = flags + _INFO_TAG_FOR_FLAG.get(flagName, u"")
	return flags

# Returns the prefix that handle_word puts in front of the continuation
# class name for the given VFST word class tag. Word classes without a
# prefix of their own yield u"".
def get_vfst_class_prefix(vfst_class):
	prefixes = {
		u"[Ln]": u"Nimisana",
		u"[Lee]": u"Etunimi",
		u"[Lep]": u"Paikannimi",
		u"[Les]": u"Sukunimi",
		u"[Lem]": u"Nimi",
		u"[Ll]": u"Laatusana",
		u"[Lnl]": u"NimiLaatusana",
	}
	return prefixes.get(vfst_class, u"")

# Returns the flag diacritic that sets the vowel type allowed in verbs
# derived from the given stem. The stem is scanned from its end:
# the first of "y", "ä", "ö" found selects u"@P.V_SALLITTU.E@" and the first
# of "u", "a", "o" selects u"@P.V_SALLITTU.T@". The scan stops at the first
# "]" (the end of a tag such as [Bm]), so only the part after the last tag
# is considered. If nothing decisive is found, u"@P.V_SALLITTU.T@" is returned.
def vowel_type_for_derived_verb(wordform):
	for char in reversed(wordform):
		if char in u"yäö":
			return u"@P.V_SALLITTU.E@"
		if char in u"uao":
			return u"@P.V_SALLITTU.T@"
		if char == u"]":
			break
	return u"@P.V_SALLITTU.T@"

# Returns the suffix of the continuation class name for a prefix entry
# (handle_word appends it to "EtuliitteenJatko_").
# The suffix consists of those "compounding" flags of the word that are among
# eln, ell, elt and eltj, concatenated in sorted order, followed by "H"
# when the alternative form ends with a hyphen.
def get_prefix_jatko(word, altform):
	flags = generate_lex_common.get_flags_from_group(word, u"compounding")
	prefixJatko = u""
	# Sorting keeps the generated name independent of the flag order in the data.
	for flag in sorted(flags):
		if flag in [u"eln", u"ell", u"elt", u"eltj"]:
			prefixJatko = prefixJatko + flag
	if altform.endswith(u"-"):
		prefixJatko = prefixJatko + u"H"
	return prefixJatko

# Returns the continuation class name for an adverb entry.
# The name is assembled from the "inflection" flags of the word, processed
# in sorted order:
#  - liitesana and ulkopaikallissijat_yks add their own title-cased name
#  - omistusliite adds "OlV" when the form ends with a single (not doubled)
#    a, ä or e, and "Omistusliite" otherwise
#  - required suppresses the "Loppu" prefix that is otherwise put in front
#    of the name
# NOTE(review): the omistusliite branch reads altform[-2], so a one-letter
# form carrying that flag would raise IndexError -- confirm such data cannot occur.
def get_adverb_jatko(word, altform):
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	loppu = True
	adverbJatko = u""
	for flag in sorted(flags):
		if flag in [u"liitesana", u"ulkopaikallissijat_yks"]:
			adverbJatko = adverbJatko + flag.title()
		elif flag == u"omistusliite":
			if altform[-1] in u"aäe" and altform[-1] != altform[-2]:
				adverbJatko = adverbJatko + u"OlV"
			else:
				adverbJatko = adverbJatko + u"Omistusliite"
		elif flag == u"required":
			loppu = False
	if loppu:
		adverbJatko = u"Loppu" + adverbJatko
	return adverbJatko

# Returns the continuation class name for an abbreviation entry:
# "PisteellisenLyhenteenJatko" when the form ends with a full stop or the
# word has the "inflection" flag none, and "Lyhenne" otherwise.
def get_abbreviation_jatko(word, wordform):
	flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	if wordform.endswith(u".") or u"none" in flags:
		return u"PisteellisenLyhenteenJatko"
	else:
		return u"Lyhenne"

# Writes the lexc entries of one vocabulary word.
# word is a <word> DOM element of the Joukahainen vocabulary. Words that are
# filtered out by the current options, or that have no usable inflection or
# word class, are silently skipped. For every alternative form of an accepted
# word one entry line is written to the vocabulary file of its word class.
# The program is terminated with exit status 1 if no inflection class can be
# resolved for a form or if the alternative forms look inconsistent.
def handle_word(word):
	# Drop words that are not needed in the Voikko lexicon
	# but only if not generating Sukija lexicon.
	if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]: return
	if not check_style(word): return
	if not check_usage(word): return
	wordFrequency = frequency(word)
	if wordFrequency >= OPTIONS["frequency"] + 1: return
	if wordFrequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"): return
	
	# Get the inflection class. Exactly one inflection class is needed
	voikko_infclass = None
	if OPTIONS["sukija"]:
		# For Sukija the first historical inflection class is preferred, but
		# only if it is one of the classes listed below.
		for infclass in word.getElementsByTagName("infclass"):
			if infclass.getAttribute("type") == "historical":
				voikko_infclass = generate_lex_common.tValue(infclass)
				if voikko_infclass == u"banaali":   # "banaali" inflects like "paperi".
					voikko_infclass = u"paperi"
				elif voikko_infclass == u"pasuuna":
					voikko_infclass = u"peruna"
				if voikko_infclass not in [u"aavistaa-av1", u"arvelu", u"arvelu-av1", u"haravoida-av2", u"karahka", u"matala",
				                           u"paperi", u"paperi-av1", u"peruna"]:
					voikko_infclass = None
				break
	if voikko_infclass is None:
		# Fall back to the first inflection class that is not historical.
		for infclass in word.getElementsByTagName("infclass"):
			if infclass.getAttribute("type") != "historical":
				voikko_infclass = generate_lex_common.tValue(infclass)
				break
	if voikko_infclass == u"poikkeava": return
	
	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	# Only these word classes are accepted without an inflection class.
	if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass is None:
		return
	vfst_word_class = get_vfst_word_class(wordclasses)
	if vfst_word_class is None: return
	
	# Get diacritics
	altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
	diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))
	
	# Get forced vowel type
	if voikko_infclass is None and vfst_word_class != u"[La]":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	
	# Construct debug information
	debug_info = u""
	if OPTIONS["sourceid"]:
		# The first character of the id is dropped and every "0" is written as
		# "%0" (presumably because a bare 0 has a special meaning in lexc -- confirm).
		debug_info = u'[Xs]%s[X]' % word.getAttribute("id")[1:].replace(u"0", u"%0")
	
	infoFlags = get_info_flags(word)
	
	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in altforms:
		outputBaseform = altform.replace(u'|', u'')
		wordform = outputBaseform.replace(u'=', u'')
		# A form is single part if it contains none of the markers "|", "=" and "-".
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku is None:
			errorstr = u"ERROR: VFST class not found for (%s, %s)\n" % (wordform, voikko_infclass)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)
		if vfst_word_class == u"[La]":
			jatko = get_abbreviation_jatko(word, altform)
		elif vfst_word_class == u"[Ls]":
			jatko = get_adverb_jatko(word, altform)
		else:
			jatko = jatko.title()
		if vfst_word_class in [u"[Ls]", u"[Lc]", u"[Lh]"]:
			# An explicit <baseform> overrides the base form shown in the output;
			# if there are several, the last one wins.
			for element in word.getElementsByTagName(u"baseform"):
				wordform = generate_lex_common.tValue(element)
				outputBaseform = wordform.replace(u'|', u'')
		if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
			vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
		else: vtype = forced_inflection_vtype
		# NOTE(review): there is no fallback branch here. An unexpected vtype
		# would reuse vfst_vtype from the previous form, or raise NameError on
		# the first form -- confirm that only these three values can occur.
		if vtype == voikkoutils.VOWEL_FRONT: vfst_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK: vfst_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH: vfst_vtype = u'aä'
		# The file key is the word class tag without "[L" and "]".
		vocabularyFile = vocabularyFiles[vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
		alku = alku.lower()
		(rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)
		
		if vfst_word_class == u"[Lh]":
			# Interjections get a terminal entry without a continuation class.
			entry = u'%s[Xp]%s[X]%s%s%s:%s # ;' % (vfst_word_class, outputBaseform, debug_info, rakenne, alkuWithTags, alku)
			vocabularyFile.write(entry + u"\n")
			continue
		vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)
		
		# Vowel type in derived verbs
		if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa", u"Nousta", u"Loistaa", u"Jättää", u"Kihistä"]:
			# NOTE(review): diacritics is shared by all alternative forms, so the
			# value appended here is also carried over to the following forms of
			# the same word -- confirm whether that is intended.
			diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)
			if jatko == u"Kihistä" and vtype == voikkoutils.VOWEL_FRONT and u"y" not in alku and u"ä" not in alku and u"ö" not in alku and u"e" in alku:
				jatko = u"Helistä"
		
		if jatko == u"Nainen" and vfst_class_prefix in [u"Laatusana", u"NimiLaatusana"] and altform.endswith(u"inen"):
			jatko = u"NainenInen"
		
		if vfst_word_class == u"[Lp]":
			entry = u'[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' \
			        % (debug_info, rakenne, alkuWithTags, diacritics, infoFlags, alku, diacritics, get_prefix_jatko(word, altform))
		else:
			entry = u'%s[Xp]%s[X]%s%s%s%s%s:%s%s %s%s_%s ;' \
			        % (vfst_word_class, outputBaseform, debug_info, rakenne, infoFlags,
			        alkuWithTags, diacritics, alku, diacritics, vfst_class_prefix, jatko, vfst_vtype)
		vocabularyFile.write(entry + u"\n")
	
	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-")) + 1:]
			if lastPart not in singlePartForms:
				# Encoded explicitly, like the other error message above, so that
				# non-ASCII words cannot make the write itself fail.
				errorstr = u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm
				sys.stderr.write(errorstr.encode(u"UTF-8"))
				sys.exit(1)

# Run handle_word for the Joukahainen vocabulary and finish the output files.
# The files are closed even if processing is aborted (handle_word may call
# sys.exit); the trailing line feeds are written only after a complete run.
try:
	voikkoutils.process_wordlist(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml', handle_word, True)
	for fileSuffix in vocabularyFileSuffixes:
		vocabularyFiles[fileSuffix].write(u"\n\n") # Extra line feeds needed to avoid mixed lines in concatenated lexc file
finally:
	for fileSuffix in vocabularyFileSuffixes:
		vocabularyFiles[fileSuffix].close()