Tree - source-git/malaga-suomi-voikko

source-git / malaga-suomi-voikko

Files

Commit: 1f37173b019941788e5603446756c1628970c40f
Blob Blame History Raw
# -*- coding: utf-8 -*-

# Copyright 2007 - 2012 Harri Pitkänen (hatapitk@iki.fi)
# Program to generate lexicon files for Suomi-malaga Voikko edition

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import sys
sys.path.append("common")
import hfconv
import generate_lex_common
import voikkoutils
import xml.dom.minidom
import codecs
from string import rfind
from xml.dom import Node

# Flag attributes, read from flags.txt in the vocabulary data directory
flag_attributes = voikkoutils.readFlagAttributes(generate_lex_common.VOCABULARY_DATA + u"/flags.txt")

# Get command line options
OPTIONS = generate_lex_common.get_options()

# Inflection class map
CLASSMAP = hfconv.compileClassmapREs(hfconv.modern_classmap)

# No special vocabularies are built for Voikko
generate_lex_common.SPECIAL_VOCABULARY = []

# One intermediate lexc file is opened per suffix below. The suffix is the
# VFST word class tag without its "[L" prefix and "]" suffix, which is how
# handle_word looks the files up.
vocabularyFileSuffixes = [u"ep", u"ee", u"es", u"em", u"t", u"nl", u"l", u"n", u"h", u"p", u"a", u"s", u"c"]
vocabularyFiles = {}
for fileSuffix in vocabularyFileSuffixes:
	vocFile = codecs.open(OPTIONS["destdir"] + u"/joukahainen-" + fileSuffix + u".lexc", 'w', 'UTF-8')
	# Licence notice followed by the LEXICON header, written in one go
	vocFile.write(
		u"! This is automatically generated intermediate lexicon file for\n"
		u"! VVFST morphology. The original source data is\n"
		u"! distributed under the GNU General Public License, version 2 or\n"
		u"! later, as published by the Free Software Foundation. You should\n"
		u"! have received the original data, tools and instructions to\n"
		u"! generate this file (or instructions to obtain them) wherever\n"
		u"! you got this file from.\n\n"
		u"LEXICON Joukahainen_" + fileSuffix + u"\n")
	vocabularyFiles[fileSuffix] = vocFile


def frequency(word):
	"""Return the frequency class of a word as an integer.

	The value is taken from the first <fclass> element below the word.
	Words without an <fclass> element get the value 7.
	"""
	fclass_elements = word.getElementsByTagName("fclass")
	if len(fclass_elements) > 0:
		return int(generate_lex_common.tValue(fclass_elements[0]))
	return 7

def check_style(word):
	"""Check the style flags of the word according to current options.

	Returns True if the word is acceptable, otherwise False. A word is
	rejected when any of its style flags, other than "foreignloan", is
	missing from OPTIONS["style"].
	"""
	for style_element in word.getElementsByTagName("style"):
		for flag in generate_lex_common.tValues(style_element, "flag"):
			if flag != "foreignloan" and flag not in OPTIONS["style"]:
				return False
	return True

def check_usage(word):
	"""Return True if the word is acceptable according to its usage flags.

	Words without any <usage> element are always accepted. Otherwise at
	least one usage flag must be listed in OPTIONS["extra-usage"].
	"""
	usage_elements = word.getElementsByTagName("usage")
	if len(usage_elements) == 0:
		return True
	return any(flag in OPTIONS["extra-usage"]
		for usage_element in usage_elements
		for flag in generate_lex_common.tValues(usage_element, "flag"))

def get_vfst_word_class(j_wordclasses):
	"""Return the VFST word class tag for the given Joukahainen word classes.

	The rules are tried from top to bottom and the first rule whose
	required classes are all present wins. Returns None when no rule
	matches.
	"""
	rules = (
		(("pnoun_place",), u"[Lep]"),
		(("pnoun_firstname",), u"[Lee]"),
		(("pnoun_lastname",), u"[Les]"),
		(("pnoun_misc",), u"[Lem]"),
		(("verb",), u"[Lt]"),
		# The combined adjective/noun class must be tested before the plain ones
		(("adjective", "noun"), u"[Lnl]"),
		(("adjective",), u"[Ll]"),
		(("noun",), u"[Ln]"),
		(("interjection",), u"[Lh]"),
		(("prefix",), u"[Lp]"),
		(("abbreviation",), u"[La]"),
		(("adverb",), u"[Ls]"),
		(("conjunction",), u"[Lc]"),
	)
	for required_classes, vfst_class in rules:
		if all(name in j_wordclasses for name in required_classes):
			return vfst_class
	return None

# Returns a string describing the structure of a word, if necessary for the spellchecker
# or hyphenator
def get_structure(wordform, vfst_word_class, alku):
	needstructure = False
	ispropernoun = vfst_word_class[0:3] == u'[Le'
	structstr = u'[Xr]'
	oldAlku = alku
	newAlku = u""
	if vfst_word_class == u'[La]':
		i = u"j"
		p = u"q"
	else:
		i = u"i"
		p = u"p"
	for idx in range(len(wordform)):
		c = wordform[idx]
		if c == u'-':
			structstr = structstr + u"-="
			if (len(oldAlku) > 0):
				newAlku = newAlku + u'-[Bm]'
				oldAlku = oldAlku[1:]
		elif c == u'|':
			structstr = structstr
		elif c == u'=':
			structstr = structstr + u"="
			newAlku = newAlku + u"[Bm]"
		elif c == u':':
			structstr = structstr + u":"
			needstructure = True
			if (len(oldAlku) > 0):
				newAlku = newAlku + u':'
				oldAlku = oldAlku[1:]
		elif c.isupper():
			structstr = structstr + i
			if not (ispropernoun and idx == 0):
				needstructure = True
			if (len(oldAlku) > 0):
				newAlku = newAlku + oldAlku[0]
				oldAlku = oldAlku[1:]
		else:
			structstr = structstr + p
			if ispropernoun and idx == 0:
				needstructure = True
			if (len(oldAlku) > 0):
				newAlku = newAlku + oldAlku[0]
				oldAlku = oldAlku[1:]
	if needstructure:
		returnedLength = len(structstr)
		while structstr[returnedLength - 1] == p:
			returnedLength = returnedLength - 1
		return (structstr[0:returnedLength] + u'[X]', alku)
	else:
		return (u"", newAlku)

def get_diacritics(word, altforms, vfst_word_class):
	"""Collect the flag diacritics implied by the flags of a word.

	Every <flag> element that is a grandchild of word is inspected, in
	document order. Returns a list of diacritic strings. For the classes
	[Ln] and [Lnl] an extra entry is appended when the first alternative
	form ends with "lainen" or "läinen".
	"""
	# Flags that map to exactly one diacritic
	diacritic_for_flag = {
		u"ei_yks": u"@P.EI_YKS.ON@",
		u"ysj": u"@R.YS_ALKANUT@",
		u"inen": u"@P.INEN_SALLITTU.ON@",
		u"ei_inen": u"@P.INEN_KIELLETTY.ON@",
		u"ei_mainen": u"@P.EI_MAINEN.ON@",
		u"ei_lainen": u"@P.EI_LAINEN.ON@",
		u"ei_vertm": u"@P.EI_VERTM.ON@",
		u"ym3": u"@P.VAIN_YM3.ON@",
		u"yt": u"@P.YKSITEKIJÄINEN.ON@",
		u"geo_suffix": u"@C.PAIKANNIMEN_JL@",
	}
	collected = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for child in group.childNodes:
			if child.nodeType != Node.ELEMENT_NODE or child.tagName != "flag":
				continue
			name = child.firstChild.wholeText
			if name in diacritic_for_flag:
				collected.append(diacritic_for_flag[name])
			# "ei_ys" contributes to both of the following
			if name in (u"ei_ys", u"ei_ysa"):
				collected.append(u"@P.YS_EI_JATKOA.ON@")
			if name in (u"ei_ys", u"ei_ysj"):
				collected.append(u"@D.YS_ALKANUT@")
	if vfst_word_class in (u"[Ln]", u"[Lnl]") and altforms[0].endswith((u"lainen", u"läinen")):
		collected.append(u"@P.LAINEN.ON@@C.LAINEN_VAADITTU@@C.VAIN_NIMISANA@")
	return collected

def get_info_flags(word):
	"""Return the information tags implied by the flags of a word.

	Every <flag> element that is a grandchild of word is inspected, in
	document order. The tags of the recognised flags are concatenated
	into one string, and unknown flags are ignored.
	"""
	tag_for_flag = {
		u"paikannimi_ulkopaikallissijat": u"[Ipu]",
		u"paikannimi_sisäpaikallissijat": u"[Ips]",
		u"foreignloan": u"[Isf]",
		u"el_altark": u"[De]",
		u"geo_suffix": u"[Ica]",
		u"org_suffix": u"[Ion]",
		u"free_suffix": u"[Ivj]",
		u"require_following_a": u"[Ira]",
		u"require_following_ma": u"[Irm]",
	}
	tags = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for child in group.childNodes:
			if child.nodeType != Node.ELEMENT_NODE or child.tagName != "flag":
				continue
			name = child.firstChild.wholeText
			if name in tag_for_flag:
				tags.append(tag_for_flag[name])
	return u"".join(tags)

def get_vfst_class_prefix(vfst_class):
	"""Return the continuation class name prefix for a VFST word class tag.

	Classes without a prefix of their own give an empty string.
	"""
	prefixes = {
		u"[Ln]": u"Nimisana",
		u"[Lee]": u"Etunimi",
		u"[Lep]": u"Paikannimi",
		u"[Les]": u"Sukunimi",
		u"[Lem]": u"Nimi",
		u"[Ll]": u"Laatusana",
		u"[Lnl]": u"NimiLaatusana",
	}
	return prefixes.get(vfst_class, u"")

def vowel_type_for_derived_verb(wordform):
	"""Return the vowel type diacritic for verbs derived from wordform.

	The word form is scanned backwards. The first front vowel (y, ä, ö)
	found gives "@P.V_SALLITTU.E@". A back vowel (u, a, o), a "]" or the
	start of the string gives "@P.V_SALLITTU.T@".
	"""
	front_vowel_result = u"@P.V_SALLITTU.E@"
	default_result = u"@P.V_SALLITTU.T@"
	for letter in reversed(wordform):
		if letter in u"yäö":
			return front_vowel_result
		if letter in u"uao]":
			# A back vowel or the end of a tag stops the search
			break
	return default_result

def get_prefix_jatko(word, altform):
	"""Build the continuation class suffix for a prefix entry.

	The result consists of the word's "compounding" flags that are one of
	eln, ell, elt or eltj, in sorted order, followed by "H" when altform
	ends with a hyphen.
	"""
	compounding_flags = generate_lex_common.get_flags_from_group(word, u"compounding")
	recognised = (u"eln", u"ell", u"elt", u"eltj")
	parts = [flag for flag in sorted(compounding_flags) if flag in recognised]
	if altform.endswith(u"-"):
		parts.append(u"H")
	return u"".join(parts)

def get_adverb_jatko(word, altform):
	"""Build the continuation class name for an adverb entry.

	The name is assembled from the word's "inflection" flags in sorted
	order. It is prefixed with "Loppu" unless the flag "required" is set.
	"""
	inflection_flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	pieces = []
	allow_plain_end = True
	for name in sorted(inflection_flags):
		if name == u"required":
			allow_plain_end = False
		elif name == u"omistusliite":
			# "OlV" is chosen when the form ends in a single a, ä or e
			if altform[-1] in u"aäe" and altform[-1] != altform[-2]:
				pieces.append(u"OlV")
			else:
				pieces.append(u"Omistusliite")
		elif name in (u"liitesana", u"ulkopaikallissijat_yks"):
			pieces.append(name.title())
	if allow_plain_end:
		pieces.insert(0, "Loppu")
	return u"".join(pieces)

def get_abbreviation_jatko(word, wordform):
	"""Return the continuation class name for an abbreviation entry.

	"PisteellisenLyhenteenJatko" is used when the word form ends with a
	full stop or the word has the inflection flag "none". All other
	abbreviations get "Lyhenne".
	"""
	inflection_flags = generate_lex_common.get_flags_from_group(word, u"inflection")
	if wordform.endswith(u".") or u"none" in inflection_flags:
		return u"PisteellisenLyhenteenJatko"
	return u"Lyhenne"

def _get_inflection_class(word):
	"""Select the inflection class name to use for a word element.

	When OPTIONS["sukija"] is set, the first <infclass> element with
	type="historical" is considered first. Its value is used only if,
	after mapping "banaali" to "paperi" and "pasuuna" to "peruna", it is
	one of the historical classes listed below. In every other case the
	value of the first <infclass> element that is not historical is
	returned. Returns None when there is no such element.
	"""
	infclasses = word.getElementsByTagName("infclass")
	if OPTIONS["sukija"]:
		for infclass in infclasses:
			if infclass.getAttribute("type") != "historical":
				continue
			historical = generate_lex_common.tValue(infclass)
			if historical == u"banaali":   # Banaali inflects like paperi.
				historical = u"paperi"
			elif historical == u"pasuuna":
				historical = u"peruna"
			if historical in [u"aavistaa-av1", u"arvelu", u"arvelu-av1", u"haravoida-av2", u"karahka", u"matala",
					u"paperi", u"paperi-av1", u"peruna"]:
				return historical
			# Only the first historical class is ever considered
			break
	for infclass in infclasses:
		if infclass.getAttribute("type") != "historical":
			return generate_lex_common.tValue(infclass)
	return None

def _check_alternative_forms(singlePartForms, multiPartForms):
	"""Sanity check for alternative forms.

	If there are both multi part forms and single part forms, then all
	multi part forms must end with a part contained in the single part
	set. On violation an error is written to stderr and the program exits
	with status 1.
	"""
	if not singlePartForms:
		return
	for multiPartForm in multiPartForms:
		# The last part starts after the last "=", "|" or "-"
		# (rfind gives -1 for a separator that does not occur).
		lastSeparator = max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-"))
		lastPart = multiPartForm[lastSeparator + 1:]
		if lastPart not in singlePartForms:
			errorstr = u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm
			# Encoded like the other error message of handle_word, so that
			# non-ASCII word forms cannot fail with UnicodeEncodeError.
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)

def handle_word(word):
	"""Write the lexc entries of one Joukahainen word element.

	The word is skipped silently when it is filtered out by its flags,
	style, usage or frequency, when its inflection class is "poikkeava",
	or when no VFST word class can be determined for it. Otherwise one
	entry per alternative form is written to the vocabulary file of the
	word's VFST class.

	Exits the program with status 1 if no inflection class is found for a
	form, or if the alternative forms fail the sanity check.
	"""
	# Drop words that are not needed in the Voikko lexicon
	# but only if not generating Sukija lexicon.
	if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]: return
	if not check_style(word): return
	if not check_usage(word): return
	wordFrequency = frequency(word)
	if wordFrequency >= OPTIONS["frequency"] + 1: return
	if wordFrequency == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"): return
	
	# Get the inflection class. Exactly one inflection class is needed
	voikko_infclass = _get_inflection_class(word)
	if voikko_infclass == u"poikkeava": return
	
	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass is None:
		return
	vfst_word_class = get_vfst_word_class(wordclasses)
	if vfst_word_class is None: return
	
	# Get diacritics
	altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
	diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))
	
	# Get forced vowel type
	if voikko_infclass is None and vfst_word_class != u"[La]":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	
	# Construct debug information
	debug_info = u""
	if OPTIONS["sourceid"]:
		debug_info = u'[Xs]%s[X]' % word.getAttribute("id")[1:].replace(u"0", u"%0")
	
	infoFlags = get_info_flags(word)
	
	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in altforms:
		outputBaseform = altform.replace(u'|', u'')
		wordform = outputBaseform.replace(u'=', u'')
		# A form is single part when it has no "|", "=" or "-" in it
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku is None:
			errorstr = u"ERROR: VFST class not found for (%s, %s)\n" % (wordform, voikko_infclass)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)
		if vfst_word_class == u"[La]":
			jatko = get_abbreviation_jatko(word, altform)
		elif vfst_word_class == u"[Ls]":
			jatko = get_adverb_jatko(word, altform)
		else:
			jatko = jatko.title()
		if vfst_word_class in [u"[Ls]", u"[Lc]", u"[Lh]"]:
			# An explicit <baseform> overrides the base form that is written
			# out; with several such elements the last one wins.
			for element in word.getElementsByTagName(u"baseform"):
				wordform = generate_lex_common.tValue(element)
				outputBaseform = wordform.replace(u'|', u'')
		if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
			vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
		else: vtype = forced_inflection_vtype
		# NOTE(review): vfst_vtype is only assigned for these three vowel
		# types; confirm that no other value can reach this point.
		if vtype == voikkoutils.VOWEL_FRONT: vfst_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK: vfst_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH: vfst_vtype = u'aä'
		vocabularyFile = vocabularyFiles[vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
		alku = alku.lower()
		(rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)
		
		if vfst_word_class == u"[Lh]":
			entry = u'%s[Xp]%s[X]%s%s%s:%s # ;' % (vfst_word_class, outputBaseform, debug_info, rakenne, alkuWithTags, alku)
			vocabularyFile.write(entry + u"\n")
			continue
		vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)
		
		# Vowel type in derived verbs
		# NOTE(review): diacritics is shared by all alternative forms, so the
		# value appended here is carried over to the entries of later forms.
		# Confirm that this is intended.
		if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa", u"Nousta", u"Loistaa", u"Jättää", u"Kihistä"]:
			diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)
			if jatko == u"Kihistä" and vtype == voikkoutils.VOWEL_FRONT and u"y" not in alku and u"ä" not in alku and u"ö" not in alku and u"e" in alku:
				jatko = u"Helistä"
		
		if jatko == u"Nainen" and vfst_class_prefix in [u"Laatusana", u"NimiLaatusana"] and altform.endswith(u"inen"):
			jatko = u"NainenInen"
		
		if vfst_word_class == u"[Lp]":
			entry = u'[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' \
				% (debug_info, rakenne, alkuWithTags, diacritics, infoFlags, alku, diacritics, get_prefix_jatko(word, altform))
		else:
			entry = u'%s[Xp]%s[X]%s%s%s%s%s:%s%s %s%s_%s ;' \
				% (vfst_word_class, outputBaseform, debug_info, rakenne, infoFlags,
				alkuWithTags, diacritics, alku, diacritics, vfst_class_prefix, jatko, vfst_vtype)
		vocabularyFile.write(entry + u"\n")
	
	_check_alternative_forms(singlePartForms, multiPartForms)


# Run the conversion: handle_word is handed to voikkoutils.process_wordlist
# together with the path of the Joukahainen vocabulary file.
voikkoutils.process_wordlist(
	generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml',
	handle_word,
	True)

# Finish and close every vocabulary file
for fileSuffix in vocabularyFileSuffixes:
	vocFile = vocabularyFiles[fileSuffix]
	# Extra line feeds needed to avoid mixed lines in concatenated lexc file
	vocFile.write(u"\n\n")
	vocFile.close()
source-git / malaga-suomi-voikko

Source Code

Files