Blame sukija/sukija.py

Packit 1f3717
# -*- coding: utf-8 -*-
Packit 1f3717
Packit 1f3717
# Copyright 2007-2009, 2013 Hannu Väisänen (Etunimi.Sukunimi@uef.fi)
Packit 1f3717
# Program to generate lexicon files for Suomi-malaga Sukija edition.
Packit 1f3717
Packit 1f3717
# This program is free software; you can redistribute it and/or modify
Packit 1f3717
# it under the terms of the GNU General Public License as published by
Packit 1f3717
# the Free Software Foundation; either version 2 of the License, or
Packit 1f3717
# (at your option) any later version.
Packit 1f3717
#
Packit 1f3717
# This program is distributed in the hope that it will be useful,
Packit 1f3717
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 1f3717
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 1f3717
# GNU General Public License for more details.
Packit 1f3717
#
Packit 1f3717
# You should have received a copy of the GNU General Public License
Packit 1f3717
# along with this program; if not, write to the Free Software
Packit 1f3717
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
Packit 1f3717
Packit 1f3717
Packit 1f3717
# This code is heavily based on code written by Harri Pitkänen.
Packit 1f3717
Packit 1f3717
import codecs
Packit 1f3717
import generate_lex_common
Packit 1f3717
import hfconv
Packit 1f3717
import voikkoutils
Packit 1f3717
import re
Packit 1f3717
import sys
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Historical inflections in alphabetical order.
Packit 1f3717
# Table of historical inflection classes, passed to
# hfconv.compileClassmapREs() below.
#
# Each entry is a 3-tuple:
#   (class name, gradation marker string, list of rules)
# and each rule is a 3-tuple:
#   (gradation letters or None, pattern, target class name)
# NOTE(review): the upper-case letters A, O, U, C and V inside the patterns
# look like placeholders that hfconv expands into character classes --
# confirm against hfconv.compileClassmapREs.
# The order of entries and of rules within an entry is kept exactly as it
# was; it may be significant to the consumer.
historical = [
    (u'aavistaa', u'sw', [
        (u'tt', u'(.*O)ittAA', u'kirjoittaa'),
        (u'tt', u'(.*O)ttAA', u'ammottaa'),
        (None, u'(.*t)AA', u'aavistaa'),
        (u'tt', u'(.*eUt)tAA', u'kuluttaa'),
        (u'tt', u'(.*[AeiU]t)tAA', u'alittaa'),
        (u't', u'(.*h)tAA', u'astahtaa'),
    ]),
    (u'ahven', u'ws', [(None, u'(.*CVC)', u'ahven')]),
    (u'altis', u'ws', [(None, u'(.*t)is', u'altis')]),
    (u'antautua', u'sw', [(u't', u'(.*)tUA', u'antautua')]),
    # The pattern used to read u'(.*[AOU]])illA'; the stray ']' made it
    # require a literal ']' in the word.  Same rule as in 'katsella'.
    (u'arvailla', u'-', [(None, u'(.*[AOU])illA', u'arvailla')]),
    (u'arvelu', u'sw', [(None, u'(.*e)istO', u'aarteisto')]),
    (u'autio', u'-', [
        (None, u'(..*C)aatio', u'obligaatio'),
        (None, u'(..*C)uutio', u'resoluutio'),
        (None, u'(..*C)uusio', u'illuusio'),
        (None, u'(..*C)itio', u'traditio'),
        (None, u'(.*)ktio', u'funktio'),
        (None, u'(.*)', u'autio'),
    ]),
    (u'banaali', u'sw', [
        (None, u'(..*[^aeouyäö]o)di', u'symboli_di'),
        (None, u'(..*[^aeouyäö]o)fi', u'symboli_fi'),
        (None, u'(..*[^aeouyäö]o)gi', u'symboli_gi'),
        (None, u'(..*[^aeouyäö]o)li', u'symboli_li'),
        (None, u'(..*[^aeouyäö]o)mi', u'symboli_mi'),
        (None, u'(..*[^aeouyäö]o)ni', u'symboli_ni'),
        (None, u'(..*[^aeouyäö]o)ri', u'symboli_ri'),
        (None, u'(..*[^aeouyäö]o)vi', u'symboli_vi'),
        (None, u'(..*a)di', u'balladi'),
        (None, u'(pisto)oli', u'pistooli'),
        (None, u'(poli)isi', u'poliisi'),
        (None, u'(.*)i', u'banaali'),
        (u'nt', u'(.*n)ti', u'hollanti'),
        (u'nk', u'(.*n)ki', u'killinki'),
        (u'kk', u'(.*k)ki', u'kajakki'),
        (u'tt', u'(.*t)ti', u'salaatti'),
        (u'pp', u'(.*p)pi', u'sinappi'),
        (u't', u'(.*)ti', u'konvehti'),
    ]),
    (u'bébé', u'-', [(None, u'(.*V)', u'bébé')]),
    (u'haastaa', u'sw', [(None, u'(.*Ct)AA', u'haastaa')]),
    (u'hame', u'ws', [
        (u't', u'(..*CO)ite', u'osoite'),
        (u't', u'(..*CO)te', u'tiedote'),
    ]),
    (u'herttua', u'-', [(None, u'(.*tU)A', u'herttua')]),
    (u'hohtaa', u'sw', [(u'tt', u'(.*t)tAA', u'heittää')]),
    (u'huutaa', u'sw', [
        (u'nt', u'(.*Vn)tAA', u'alentaa'),
        (u't', u'(.*V)tAA', u'huutaa'),
    ]),
    (u'iäkäs', u'ws', [
        (u'k', u'(.*[mntv]e)ikAs', u'maineikas'),
        (u'k', u'(.*k)As', u'iäkäs'),
    ]),
    (u'kaihtaa', u'sw', [(u't', u'(.*)tAA', u'kaihtaa')]),
    (u'kaivaa', u'sw', [(None, u'(.*aj)AA', u'ajaa')]),
    (u'kantaja', u'-', [(None, u'(.*)jA', u'kantaja')]),
    (u'katsella', u'ws', [(None, u'(.*[AOU])illA', u'arvailla')]),
    (u'kirjoitella', u'ws', [
        (None, u'(...*O)itellA', u'kilvoitella'),
        (None, u'(.*O)tellA', u'ilotella'),
    ]),
    (u'kirjoittaa', u'sw', [
        (u'tt', u'(.*O)ittAA', u'kirjoittaa'),
        (u'tt', u'(.*O)ttAA', u'ammottaa'),
        (u'tt', u'(.*[AeiU]t)tAA', u'asettaa'),
    ]),
    (u'karahka', u'-', [
        (None, u'(.*lo)gia', u'analogia'),
        (None, u'(.*so)fia', u'filosofia'),
        (None, u'(.*gra)fia', u'topografia'),
    ]),
    (u'koiras', u'ws', [(None, u'(.*A)s', u'koiras')]),
    (u'kohota', u'ws', [
        (u'k', u'(.*ik)OtA', u'laota'),
        (u'k', u'(.*Vk)OtA', u'saota'),
        (u'>k', u'(hi|la)OtA', u'laota'),
        (u'>k', u'(C[AiU])OtA', u'saota'),
    ]),
    (u'kulkija', u'-', [
        (None, u'(.*lo)gia', u'analogia'),
        (None, u'(.*so)fia', u'filosofia'),
        (None, u'(.*gra)fia', u'topografia'),
    ]),
    (u'kuollut', u'-', [
        (None, u'(.*neits)yt', u'neitsyt'),
        (None, u'(.*C)lUt', u'kuollut'),
        (None, u'(.*)nUt', u'punonut'),
        (None, u'(.*C)rUt', u'purrut'),
        (None, u'(.*C)sUt', u'juossut'),
    ]),
    (u'kutiaa', u'-', [(None, u'(.*Cia)a', u'kutiaa')]),
    (u'laittaa', u'sw', [(u'tt', u'(.*t)tAA', u'laittaa')]),
    (u'lampi', u'-', [(None, u'(.*kam)pi', u'lampi')]),
    (u'lovi', u'sw', [(None, u'(rips|sin)i', u'kiiski')]),
    (u'nainen', u'-', [
        (None, u'(hevo)nen', u'hevoinen'),
        (None, u'(.*Co)rinen', u'allegorinen'),
        (None, u'(.*Co)finen', u'filosofinen'),
        (None, u'(.*Co)ginen', u'psykologinen'),
        (None, u'(.*Co)ninen', u'ironinen'),
        (None, u'(.*gra)finen', u'topografinen'),
        (None, u'(.*(?:aa|ee|ii|oo|uu|yy|ää|öö)p)pinen', u'eeppinen'),
        (None, u'(.*(?:aa|ee|ii|oo|uu|yy|ää|öö)t)tinen', u'kriittinen'),
        (None, u'(.*(?:aa|ee|ii|oo|uu|yy|ää|öö)k)kinen', u'psyykkinen'),
        # Also massi(i)vinen and the like.
        (None, u'(.*[ts]i)ivinen', u'relatiivinen'),
        (None, u'(.*)nen', u'nainen'),
    ]),
    (u'neiti', u'sw', [(u't', u'(.*)ti', u'neiti')]),
    (u'nuori', u'-', [(None, u'(.*C)i', u'nuori')]),
    (u'onneton', u'ws', [
        (None, u'(.*)tOn', u'alaston'),
        (u't', u'(.*)tOn', u'onneton'),
    ]),
    (u'paahtaa', u'sw', [(u't', u'(.*)tAA', u'paahtaa')]),
    (u'paistaa', u'sw', [(None, u'(.*C)AA', u'paistaa')]),
    (u'palata', u'ws', [(None, u'(.*)AtA', u'palata')]),
    (u'palaa', u'ws', [(None, u'(.*C)AA', u'palaa')]),
    (u'paperi', u'sw', [
        (None, u'(..*[^aeouyäö]o)di', u'symboli_di'),
        (None, u'(..*[^aeouyäö]o)fi', u'symboli_fi'),
        (None, u'(..*[^aeouyäö]o)gi', u'symboli_gi'),
        (None, u'(..*[^aeouyäö]o)li', u'symboli_li'),
        (None, u'(..*[^aeouyäö]o)mi', u'symboli_mi'),
        (None, u'(..*[^aeouyäö]o)ni', u'symboli_ni'),
        (None, u'(..*[^aeouyäö]o)ri', u'symboli_ri'),
        (None, u'(..*[^aeouyäö]o)vi', u'symboli_vi'),
        (None, u'(kam)ari', u'kamari'),
        (None, u'(pisto)oli', u'pistooli'),
        (None, u'(poli)isi', u'poliisi'),
        (None, u'(..*a)di', u'balladi'),
    ]),
    (u'pasuuna', u'sw', [(None, u'(.*)A', u'pasuuna')]),
    (u'punoa', u'sw', [(u't', u'(...*AU)tUA', u'antautua')]),
    (u'rakentaa', u'-', [(None, u'(.*n)tAA', u'rakentaa')]),
    (u'risti', u'sw', [
        (None, u'(..*[^aeouyäö]o)di', u'telefoni_di'),
        (None, u'(..*[^aeouyäö]o)fi', u'telefoni_fi'),
        (None, u'(..*[^aeouyäö]o)gi', u'telefoni_gi'),
        (None, u'(..*[^aeouyäö]o)li', u'telefoni_li'),
        (None, u'(..*[^aeouyäö]o)mi', u'telefoni_mi'),
        (None, u'(..*[^aeouyäö]o)ni', u'telefoni_ni'),
        (None, u'(..*[^aeouyäö]o)ri', u'telefoni_ri'),
        (None, u'(..*[^aeouyäö]o)vi', u'telefoni_vi'),
        (None, u'(..*gr)afi', u'biografi'),
        (None, u'(..*)adi', u'marinadi'),
        (None, u'(..*)idi', u'pyramidi'),
        (u't', u'(tä|äi)ti', u'äiti'),
    ]),
    (u'siivota', u'ws', [(None, u'(.*O)tA', u'siivota')]),
    (u'sydän', u'-', [(None, u'(.*A)n', u'sydän')]),
    (u'taittaa', u'sw', [(u'tt', u'(.*t)tAA', u'taittaa')]),
    (u'tulla', u'ws', [(None, u'(.*Vl)lA', u'tulla')]),
    (u'tuomi', u'-', [(None, u'(.*V)mi', u'tuomi')]),
    (u'uros', u'-', [(None, u'(.*)s', u'uros')]),
    (u'terve', u'-', [(None, u'(.*)', u'terve')]),
    (u'valmis', u'ws', [(None, u'(.*)is', u'valmis')]),
    (u'vastaus', u'-', [
        (None, u'(lootu)s', u'vastaus'),
        (None, u'(..*CO)itUs', u'aivoitus'),
        (None, u'(...*O)tUs', u'jaotus'),
        (None, u'(.*V)s', u'vastaus'),
    ]),
    (u'veranta', u'sw', [(u'nt', u'(.*n)tA', u'veranta')]),
    (u'vieras', u'ws', [
        (None, u'(.*[lr]iA)s', u'utelias'),
        (u'k', u'(.*mek)As', u'iäkäs'),
        (u'k', u'(.*k)As', u'varas'),
    ]),
    (u'vihanta', u'sw', [(u'nt', u'(.*n)tA', u'vihanta')]),
    (u'virkkaa', u'sw', [(u'kk', u'(.*k)kAA', u'virkkaa')]),
]
Packit 1f3717
Packit 1f3717
# Compiled class map: the historical rules come first, followed by the
# modern rules supplied by hfconv.
classmap = hfconv.compileClassmapREs(historical)
classmap.extend(hfconv.compileClassmapREs(hfconv.modern_classmap))
Packit 1f3717
Packit 1f3717
# Regular expression that splits a word into a beginning ("alku") and one of
# three recognised endings.  The upper-case letters A, O, U and C are
# placeholders that are expanded into character classes just below.
pattern = (
    u"^(?P<alku>.*)(?:"
    u"(?P<keltainen>C[aouyäö]i?nen)|"
    u"(?P<symboli_ym>[^aeouyäö]o[dfglmnrv]i)|"
    u"(?P<maineikas>[mntv]eikAs)"
    u")$"
)

pattern = pattern.replace(u"A", u"[aä]")
pattern = pattern.replace(u"O", u"[oö]")
pattern = pattern.replace(u"U", u"[uy]")
pattern = pattern.replace(u"C", u"[bcdfghjklmnpqrstvwxzšžçðñþß]")
rx = re.compile(pattern, re.IGNORECASE)

# Words that start with one of these prefixes, followed by at least one more
# character, are skipped by handle_word().
begin = u"(amerikan|jälleen|tiibetin|uudelleen).+"
rx_begin = re.compile(begin, re.IGNORECASE)

# Words that end with one of these suffixes, preceded by at least one
# character, are skipped by handle_word().  The trailing "$" is required
# because the pattern is applied with match(): without it any word that
# merely contains one of the suffixes after its first character would be
# skipped as well.
end = u".+(herkkä|pöllö|valmis)$"
rx_end = re.compile(end, re.IGNORECASE)
Packit 1f3717
Packit 1f3717
Packit 1f3717
#print pattern
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Sanat, jotka tunnistetaan Sukija-versiossa automaagisesti toisten
Packit 1f3717
# sanojen johdoksina. Tällaiset sanat pitäisi merkitä Joukahaisen
Packit 1f3717
# sanastoon lipulla ei kuulu indeksointisanastoon.
Packit 1f3717
#
Packit 1f3717
# Niiden lisäksi Sukijassa ei tarvita erisnimiä, jotka ovat myös
Packit 1f3717
# yleisnimiä. Kuitenkin mukaan pitää ottaa sellaiset sanat, jotka
Packit 1f3717
# taipuvat eri tavalla yleis- ja erisniminä. Esim. Lempi, Lempin;
Packit 1f3717
# lempi, lemmen.
Packit 1f3717
#
Packit 1f3717
# Sanaluettelon saa näin:
Packit 1f3717
# grep '<form>' ../*/*xml | sed -e "s@</\?form>@@g" | sort
Packit 1f3717
#
Packit 1f3717
# Load the list of word forms that handle_word() must skip.  Lines that
# start with '#' are comments; every other line (including an empty one)
# becomes one entry with its line terminator removed.
words = []
with codecs.open('sukija/ei-sukija.txt', 'r', 'UTF-8') as inputfile:
    for line in inputfile:
        if line.startswith(u'#'):
            continue
        # Strip only the line terminator, so a final line without a trailing
        # newline keeps its last character.
        words.append(line.rstrip(u'\r\n'))
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Accented letters in the Unicode range 0000-017F, excluding the
# characters š and ž.
#
# C0 Controls and Basic Latin.        Range: 0000-007F
# C1 Controls and Latin-1 Supplement  Range: 0080-00FF
# Latin Extended-A                    Range: 0100-017F
#
# C0 is the same as ASCII, C0+C1 is the same as ISO-8859-1.
#
# The letters å, ä and ö are not accented letters in Finnish.
#
# 'accents' and 'replace' are parallel strings: the character at index i of
# 'accents' is replaced by the character at index i of 'replace', so the two
# must always have the same length.
#
# The ligatures U+0132/U+0133 and the letter U+0149 are written as escapes
# because they are easily confused with the ASCII sequences "IJ", "ij" and
# "'n".  The ligatures map to themselves, since one character cannot be
# replaced by two in this table.
accents = (
    u"ÀÁÂÃÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕØÙÚÛÜÝÞßàáâãæçèéêëìíîïðñòóôõøùúûüýþÿ"
    u"ĀāĂăĄąĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİı\u0132\u0133ĴĵĶķĸ"
    u"ĹĺĻļĽľĿŀŁłŃńŅņŇň\u0149ŊŋŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżſ"
)

replace = (
    u"AAAAÆCEEEEIIIIDNOOOÖÖUUUUYÞßaaaaæceeeeiiiidnoooööuuuuyþy"
    u"AaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIi\u0132\u0133JjKkk"
    u"LlLlLlLlLlNnNnNnnNnOoOoÖöRrRrRrSsSsSsTtTtTtUuUuUuUuYyUuWwYyYZzZzs"
)

# Matches any single character listed in 'accents'.
rx_accents = re.compile(u"[" + accents + u"]")
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Jaetaan sana tavuihin. Esim.
Packit 1f3717
# hyphenate(u"valkoinen") = val-koi-nen.
Packit 1f3717
#
Packit 1f3717
#
Packit 1f3717
# Algoritmi: Facta-tietosanakirja (1970), osa 9, palsta 50.
Packit 1f3717
#
Packit 1f3717
# "(1) kaksi peräkkäistä vokaalikirjainta kuuluvat samaan tavuun
Packit 1f3717
# jos ja vain jos ne ääntyvät pitkänä vokaalina tai diftongina.
Packit 1f3717
#
Packit 1f3717
# (2) jos konsonanttia seuraa vokaali, ne kuuluvat samaan tavuun,
Packit 1f3717
# muutoin konsonantti kuuluu edellisen kirjaimen tavuun (kuitenkin
Packit 1f3717
# vierasperäisen sanan kaikki alkukonsonantit kuuluvat samaan tavuun)."
Packit 1f3717
Packit 1f3717
# Combinations of two vowels that can occur within a single syllable.
A0 = [u"ei", u"ai", u"äi", u"ui", u"yi", u"oi", u"öi"]
A1 = [u"au", u"äy", u"ou", u"öy", u"iu", u"iy", u"eu", u"ey", u"uo", u"yö", u"ie"]
A1.extend(A0)
A2 = [u"aa", u"ee", u"ii", u"oo", u"uu", u"yy", u"ää", u"öö"]

# NOTE: V2 is the very same list object as A1 (not a copy), so extending V2
# also appends the long vowels of A2 to A1.  This aliasing is kept as it was.
V2 = A1
V2.extend(A2)

# Vowel and consonant characters, upper case first and then lower case.
# The upper-case half of V contains U+0178 (capital Y with diaeresis); it
# used to repeat the lower-case U+00FF there instead.
V = u"AÀÁÂÃEÈÉÊËŒÆIÌÍÎÏOÒÓÔUÙÚÛYÝ\u0178ÜÅÄÖØÕaàáâãeèéêëœæiìíîïoòóôuùúûyýÿüåäöøõ"
C = u"BCDFGHJKLMNŃPQRSTVWXZŠŽÇÐÑÞßbcdfghjklmnńpqrstvwxzšžçðñþß"
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Return 1 if the string 's' contains at least one character listed in V
# (the vowels), otherwise 0.
#
def has_vowel(s):
    for ch in s:
        if ch in V:
            return 1
    return 0
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Replace the accented characters of 'word' with unaccented ones between
# the positions 'start' (inclusive) and 'end' (exclusive).
#
# Characters before 'start' are dropped from the result; the tail
# word[end:] is appended unchanged.  The positions are walked with
# range(start, end), so a negative 'end' yields no converted characters
# at all (it is NOT treated like a slice bound).
#
def deaccent(word, start, end):
    converted = []
    for i in range(start, end):
        ch = word[i]
        j = accents.find(ch)
        # 'replace' is index-aligned with 'accents'.
        converted.append(replace[j] if j >= 0 else ch)
    return u"".join(converted) + word[end:]
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Write the word to the Malaga database a second time with its accented
# characters replaced by unaccented ones (e.g. á == a).  The characters
# š and ž are not replaced by s and z (they are not listed in 'accents').
#
# Nothing is written when 'wordform' contains no accented character or is
# u"šakki".  Only the part of 'entry' that precedes u" luokka: " is
# de-accented; the rest of the entry is copied as it is.
#
def write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform):
    if rx_accents.search(wordform) is None or wordform == u"šakki":
        return
    n = entry.find(u" luokka: ")
    if n == -1:
        print("write_word_without_accents: Virhe Malaga-koodissa: " + entry + u"\n")
        # Without the marker there is nothing sensible to de-accent; the
        # old code went on and wrote only the last character of 'entry'.
        return
    entry2 = deaccent(entry, 0, n)
    generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry2)
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Matches words that end in "geeni" or "geeninen" and have at least one
# character before that ending.
word_end = re.compile(u".+geeni(nen)?$")


# Accept both vowel types for such words, e.g. karsinogeenia and
# karsinogeeniä; for every other word return 'malaga_vtype' unchanged.
#
def new_vtype(malaga_vtype, wordform):
    if word_end.match(wordform):
        return u"aä"
    return malaga_vtype
Packit 1f3717
Packit 1f3717
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Malaga lexicon entries for one vocabulary word.

    'word' is a DOM element; 'main_vocabulary' and 'vocabulary_files' are
    passed through unchanged to generate_lex_common.write_entry().

    Nothing is written when the word has the "not_sukija" flag, when its
    inflection class is u"poikkeava", when it has no inflection class and
    its first word class is not u"interjection", or when no Malaga word
    class is found for it.

    For each alternative form one entry is written, followed by a
    de-accented copy when write_word_without_accents() decides one is
    needed.  A form for which no Malaga inflection class is found gets a
    "#Malaga class not found" line instead.  Forms listed in 'words',
    matching 'rx_begin' or 'rx_end', or covered by a merged inflection
    paradigm are skipped silently.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Pick the inflection class: a historical one wins; otherwise the first
    # non-historical one is used.
    infclass_elements = word.getElementsByTagName("infclass")
    voikko_infclass = None
    for infclass in infclass_elements:
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    if voikko_infclass in [u"antautua", u"kaihtaa", u"laittaa", u"paahtaa",
                           u"taittaa", u"veranta", u"vihanta", u"virkkaa"]:
        voikko_infclass = voikko_infclass + u"-av1"

    if voikko_infclass is None:
        for infclass in infclass_elements:
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break

    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes.
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags.
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get the forced vowel type.
    if voikko_infclass is None:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(word.getElementsByTagName("inflection")[0])

    # Process all alternative forms.
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # Once changed here, the class stays changed for the remaining
        # alternative forms of this word as well.
        if voikko_infclass == u"nuolaista-av2" and wordform in [u"häväistä", u"vavista"]:
            voikko_infclass = u"nuolaista"
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap)

        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

        if alku is None:
            generate_lex_common.write_entry(
                main_vocabulary, vocabulary_files, word,
                u"#Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass))
            continue

        # Forms that are not needed in the Sukija lexicon.
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None:
            continue
        if rx_end.match(wordform) is not None:
            continue
        # Some words have two inflection paradigms in the vocabulary; in
        # Sukija the paradigms have been merged, so one of them can be
        # dropped.
        if wordform in [u'ori', u'ripsi', u'sini', u'täti', u'äiti'] and jatko == u'risti':
            continue
        if wordform == u'kampi' and jatko == u'sampi':
            continue

        # Adjust the value of the alku field.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Write the entry and, where needed, its de-accented copy.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' \
            % (wordform, alku, malaga_word_class, jatko, malaga_vtype, malaga_flags, rakenne)
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry)
        write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform)