Blame common/generate_lex_common.py

Packit 1f3717
# -*- coding: utf-8 -*-
Packit 1f3717
Packit 1f3717
# Copyright 2007 - 2011 Harri Pitkänen (hatapitk@iki.fi)
Packit 1f3717
#           2007        Hannu Väisänen (Etunimi.Sukunimi@joensuu.fi)
Packit 1f3717
#
Packit 1f3717
# Functions and variables that are common to Sukija and Voikko versions.
Packit 1f3717
#
Packit 1f3717
# This program is free software; you can redistribute it and/or modify
Packit 1f3717
# it under the terms of the GNU General Public License as published by
Packit 1f3717
# the Free Software Foundation; either version 2 of the License, or
Packit 1f3717
# (at your option) any later version.
Packit 1f3717
#
Packit 1f3717
# This program is distributed in the hope that it will be useful,
Packit 1f3717
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 1f3717
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 1f3717
# GNU General Public License for more details.
Packit 1f3717
#
Packit 1f3717
# You should have received a copy of the GNU General Public License
Packit 1f3717
# along with this program; if not, write to the Free Software
Packit 1f3717
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
Packit 1f3717
Packit 1f3717
import hfconv
Packit 1f3717
import voikkoutils
Packit 1f3717
import codecs
Packit 1f3717
import getopt
Packit 1f3717
import sys
Packit 1f3717
from xml.dom import Node
Packit 1f3717
Packit 1f3717
# Relative path of the directory that holds the vocabulary source data
VOCABULARY_DATA = u"vocabulary"

# Words carrying one of these flags are written to a dedicated lexicon file.
# Each item is (flag group, flag name, output file name).
SPECIAL_VOCABULARY = [
	('usage', 'it', 'atk.lex'),
	('usage', 'medicine', 'laaketiede.lex'),
	('usage', 'science', 'matluonnontiede.lex'),
	('usage', 'education', 'kasvatustiede.lex'),
	('style', 'foreign', 'vieraskieliset.lex'),
]
Packit 1f3717
Packit 1f3717
def open_lex(path, filename):
	"""Create a lexicon file and write the "automatically generated" header.

	path -- directory in which the file is created
	filename -- name of the file inside that directory

	The file is opened for writing with UTF-8 encoding. Returns the open
	file object, positioned after the header; closing it is left to the
	caller.
	"""
	# Named "lexfile" because "file" would shadow the Python 2 builtin.
	lexfile = codecs.open(path + u"/" + filename, 'w', 'UTF-8')
	try:
		lexfile.write(
			u"# This is automatically generated intermediate lexicon file for\n"
			u"# Suomi-malaga Voikko edition. The original source data is\n"
			u"# distributed under the GNU General Public License, version 2 or\n"
			u"# later, as published by the Free Software Foundation. You should\n"
			u"# have received the original data, tools and instructions to\n"
			u"# generate this file (or instructions to obtain them) wherever\n"
			u"# you got this file from.\n\n")
	except Exception:
		# Do not leak the handle when the header cannot be written.
		lexfile.close()
		raise
	return lexfile
Packit 1f3717
Packit 1f3717
def tValue(element):
	"""Return the text value of a DOM element.

	The value is the "wholeText" of the element's first child node, so the
	element must have a text node as its first child.
	"""
	text_node = element.firstChild
	return text_node.wholeText
Packit 1f3717
Packit 1f3717
# Returns a list of text values with given element name under DOM element "group"
Packit 1f3717
def tValues(group, element_name):
	"""Collect the text values of all "element_name" elements under "group".

	group -- DOM element that is searched with getElementsByTagName
	element_name -- tag name of the elements whose text is wanted

	Returns a list with one text value per matching element, in the order
	in which getElementsByTagName yields them.
	"""
	return [tValue(node) for node in group.getElementsByTagName(element_name)]
Packit 1f3717
Packit 1f3717
# Returns malaga word class for given word in Joukahainen
Packit 1f3717
def get_malaga_word_class(j_wordclasses):
	"""Map the Joukahainen word classes of a word to one malaga word class.

	j_wordclasses -- container of Joukahainen word class names (only
	membership is tested)

	The rules are tried in priority order and the first match wins: the
	proper noun classes, then verb, then adjective and noun (a word that is
	both gets a combined class), then the remaining classes. Returns the
	malaga word class as a unicode string, or None when no rule matches.
	"""
	# (Joukahainen classes that must all be present, malaga word class),
	# listed in priority order. Every result is a unicode literal so that the
	# return type does not depend on which rule matched.
	rules = (
		(("pnoun_place",), u"paikannimi"),
		(("pnoun_firstname",), u"etunimi"),
		(("pnoun_lastname",), u"sukunimi"),
		(("pnoun_misc",), u"nimi"),
		(("verb",), u"teonsana"),
		(("adjective", "noun"), u"nimi_laatusana"),
		(("adjective",), u"laatusana"),
		(("noun",), u"nimisana"),
		(("interjection",), u"huudahdussana"),
		(("prefix",), u"etuliite"),
		(("abbreviation",), u"lyhenne"),
		(("adverb",), u"seikkasana"),
		(("conjunction",), u"sidesana"),
	)
	for (required, malaga_class) in rules:
		if all(name in j_wordclasses for name in required):
			return malaga_class
	return None
Packit 1f3717
Packit 1f3717
# Returns flag names from given group for word in Joukahainen
Packit 1f3717
def get_flags_from_group(word, groupName):
	"""Return the flag names that a word has in the given flag group.

	word -- DOM element of the word
	groupName -- tag name of the group element(s) to read

	Only direct child elements of "word" named "groupName" are considered,
	and within them only direct child elements named "flag". The result is
	the list of their text values in document order.
	"""
	def is_element(node, tag):
		# The node type is checked first because only elements have a tagName.
		return node.nodeType == Node.ELEMENT_NODE and node.tagName == tag

	return [
		flag.firstChild.wholeText
		for group in word.childNodes if is_element(group, groupName)
		for flag in group.childNodes if is_element(flag, "flag")
	]
Packit 1f3717
Packit 1f3717
# Returns malaga flags for given word in Joukahainen
Packit 1f3717
def get_malaga_flags(word):
	"""Return the malaga flag string for a word in Joukahainen.

	word -- DOM element of the word

	Every "flag" element found directly under a child element of "word" is
	looked up in the module-level flag_attributes using the key
	"<group tag name>/<flag text>"; a flag missing from flag_attributes makes
	that lookup fail. Flags whose attribute has malagaFlag set to None are
	skipped.

	Returns u"" when no malaga flags were found, otherwise a string of the
	form u", tiedot: <flag1,flag2>".
	"""
	malagaFlags = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for flag in group.childNodes:
			if flag.nodeType != Node.ELEMENT_NODE or flag.tagName != "flag":
				continue
			flagAttribute = flag_attributes[group.tagName + u"/" + tValue(flag)]
			if flagAttribute.malagaFlag is not None:
				malagaFlags.append(flagAttribute.malagaFlag)
	if not malagaFlags:
		return u""
	return u", tiedot: <" + u",".join(malagaFlags) + u">"
Packit 1f3717
Packit 1f3717
# Flag attributes, read once at import time from flags.txt in the vocabulary
# data directory. get_malaga_flags() looks entries up with keys of the form
# "<group>/<flag>".
flag_attributes = voikkoutils.readFlagAttributes(VOCABULARY_DATA + u"/flags.txt")
Packit 1f3717
Packit 1f3717
def vowel_type(group):
	"""Return the vowel type declared for a word.

	group -- DOM element that is searched for "vtype" elements

	Returns voikkoutils.VOWEL_DEFAULT unless exactly one "vtype" element is
	found. Otherwise the text of that element decides: "a" gives VOWEL_BACK,
	a with diaeresis (U+00E4) gives VOWEL_FRONT and anything else gives
	VOWEL_BOTH.
	"""
	vtypes = group.getElementsByTagName("vtype")
	if len(vtypes) != 1:
		return voikkoutils.VOWEL_DEFAULT
	vtype = tValue(vtypes[0])
	if vtype == u'a':
		return voikkoutils.VOWEL_BACK
	# Written as an escape so that the comparison does not depend on how this
	# source file happens to be encoded or transcoded.
	if vtype == u'\u00e4':
		return voikkoutils.VOWEL_FRONT
	return voikkoutils.VOWEL_BOTH
Packit 1f3717
Packit 1f3717
def has_flag(word, flag):
	"""Tell whether "flag" is the text of some "flag" element under "word".

	word -- DOM element to search
	flag -- flag name to look for

	Returns True or False.
	"""
	return flag in tValues(word, "flag")
Packit 1f3717
Packit 1f3717
# Returns tuple (alku, jatko) for given word in Joukahainen
Packit 1f3717
def get_malaga_inflection_class(wordform, j_infclass, j_wordclasses, j_classmap):
	"""Return the tuple (alku, jatko) for a word in Joukahainen.

	wordform -- the word form that is matched against the class patterns
	j_infclass -- Joukahainen inflection class, either "<class>" or
	"<class>-<gradation class>", or None
	j_wordclasses -- container of the Joukahainen word classes of the word
	j_classmap -- iterable of (inflection class, gradation, smclasses)
	tuples, where each item of smclasses is (gradation type, pattern,
	jatko) with an optional fourth item listing the allowed word classes

	Returns (wordform, u"loppu") when j_infclass is None. Otherwise returns
	(alku, jatko) for the first entry of j_classmap whose inflection class,
	gradation type and word class restriction fit and whose pattern matches
	wordform. Returns (None, None) when the word is not an adjective, noun,
	proper noun or verb, or when nothing matches.
	"""
	if j_infclass is None:
		return (wordform, u"loppu")
	# The padding makes gradclass None when there is no "-<gradation>" part.
	(infclass, gradclass) = (j_infclass.split(u'-') + [None])[:2]

	if gradclass is None:
		gradtypes = [None]
	else:
		gradtypes = [grad[1] for grad in hfconv.grads if grad[2] == gradclass]

	# Determine the word class for the given word. Adjective is tested first,
	# so a word that is both an adjective and a noun is handled as an adjective.
	substantive_classes = ("noun", "pnoun_firstname", "pnoun_lastname",
		"pnoun_place", "pnoun_misc")
	if "adjective" in j_wordclasses:
		wclass = hfconv.ADJ
	elif any(name in j_wordclasses for name in substantive_classes):
		wclass = hfconv.SUBST
	elif "verb" in j_wordclasses:
		wclass = hfconv.VERB
	else:
		return (None, None)

	for (m_infclass, m_infclass_gradation, m_smclasses) in j_classmap:
		if m_infclass != infclass:
			continue
		for m_smclass in m_smclasses:
			# The word class restriction is optional; pad it with None.
			(m_gradtype, pattern, jatko, wclasses) = (list(m_smclass) + [None])[:4]
			if wclasses is not None and wclass not in wclasses:
				continue
			if m_gradtype not in gradtypes:
				continue
			alku = hfconv.match_re(wordform, pattern)
			if alku is not None:
				return (alku, jatko)

	return (None, None)
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Returns a string describing the structure of a word, if necessary for the spellchecker
Packit 1f3717
# or hyphenator
Packit 1f3717
def get_structure(wordform, malaga_word_class):
	"""Return the structure description of a word, or u"" if none is needed.

	wordform -- the word, possibly containing the markers "-", "|", "=", ":"
	malaga_word_class -- malaga word class of the word

	The description has the form u', rakenne: "=..."' with one mark per
	character: "-" becomes "-=", "=" and ":" are copied, "|" is dropped,
	an upper case character becomes "i" and any other character "p" ("j" and
	"q" respectively for the word class u"lyhenne").

	The description is returned only when the word contains "-", "=" or ":",
	or when its capitalisation differs from the default. The default is a
	leading capital for proper nouns and no capitals anywhere else.
	"""
	ispropernoun = malaga_word_class in (u'nimi', u'etunimi', u'sukunimi', u'paikannimi')
	if malaga_word_class == u'lyhenne':
		(upper_mark, other_mark) = (u"j", u"q")
	else:
		(upper_mark, other_mark) = (u"i", u"p")

	# Marker characters and what they contribute to the structure string.
	separators = {u'-': u"-=", u'=': u"=", u':': u":"}

	parts = [u', rakenne: "=']
	needstructure = False
	for (idx, c) in enumerate(wordform):
		if c == u'|':
			# Dropped from the output; does not make the structure necessary.
			continue
		if c in separators:
			parts.append(separators[c])
			needstructure = True
			continue
		starts_proper_noun = ispropernoun and idx == 0
		if c.isupper():
			parts.append(upper_mark)
			# A capital is the default only at the start of a proper noun.
			if not starts_proper_noun:
				needstructure = True
		else:
			parts.append(other_mark)
			# A proper noun that does not start with a capital is exceptional.
			if starts_proper_noun:
				needstructure = True
	if not needstructure:
		return u""
	parts.append(u'"')
	return u"".join(parts)
Packit 1f3717
Packit 1f3717
# Writes the vocabulary entry to a suitable file
Packit 1f3717
def write_entry(main_vocabulary, vocabulary_files, word, entry):
	"""Write a vocabulary entry, followed by a newline, to the right file(s).

	main_vocabulary -- file object of the main vocabulary
	vocabulary_files -- mapping from special lexicon file name to file object
	word -- DOM element of the word
	entry -- the text to write

	For each item of SPECIAL_VOCABULARY, the first element of "word" with
	the item's group name is checked for the item's flag; on a hit the entry
	is written to that item's file. The entry may therefore end up in several
	special files. It goes to main_vocabulary only when no item matched.
	"""
	written_to_special = False
	for (group_name, flag_name, lex_name) in SPECIAL_VOCABULARY:
		groups = word.getElementsByTagName(group_name)
		if len(groups) != 0 and has_flag(groups[0], flag_name):
			vocabulary_files[lex_name].write(entry + u"\n")
			written_to_special = True
	if written_to_special:
		return
	main_vocabulary.write(entry + u"\n")
Packit 1f3717
Packit 1f3717
# Parse command line options and return them in a dictionary
Packit 1f3717
def get_options():
	"""Parse the command line options in sys.argv and return them in a dictionary.

	Recognised long options and the dictionary keys they set:
	--min-frequency=N sets "frequency" to the integer N (default 9);
	--extra-usage=a,b sets "extra-usage" to a list (default []);
	--style=a,b sets "style" to a list (default
	["old", "international", "inappropriate"]);
	--destdir=DIR sets "destdir" (default None);
	--sourceid, --vanhat, --sukija and --sukija-ys set the key of the same
	name to True (default False).

	Positional arguments are ignored. On an unknown option, a missing option
	value or a non-integer --min-frequency a message is written to stderr
	and the program exits with status 1.
	"""
	optlist = ["min-frequency=", "extra-usage=", "style=", "destdir=",
		"sourceid", "vanhat", "sukija", "sukija-ys"]
	try:
		(opts, args) = getopt.getopt(sys.argv[1:], "", optlist)
	except getopt.GetoptError:
		sys.stderr.write("Invalid option list for %s\n" % sys.argv[0])
		sys.exit(1)
	options = {
		"frequency": 9,
		"extra-usage": [],
		"style": ["old", "international", "inappropriate"],
		"sourceid": False,
		"vanhat": False,
		"destdir": None,
		"sukija": False,
		"sukija-ys": False,
	}
	for (name, value) in opts:
		# Apart from --min-frequency, the key is the option name without "--".
		key = name[2:]
		if name == "--min-frequency":
			try:
				options["frequency"] = int(value)
			except ValueError:
				# Report a bad number like any other invalid option instead
				# of letting the ValueError escape as a traceback.
				sys.stderr.write("Invalid value for --min-frequency: %s\n" % value)
				sys.exit(1)
		elif name in ("--extra-usage", "--style"):
			options[key] = value.split(",")
		elif name == "--destdir":
			options[key] = value
		elif name in ("--sourceid", "--vanhat", "--sukija", "--sukija-ys"):
			options[key] = True
	return options
Packit 1f3717
Packit 1f3717
# Strip whitespace and comments from LEXC input file
Packit 1f3717
def stripWhitespaceAndComments(line):
	"""Remove the comment and the surrounding whitespace from a LEXC line.

	Everything from the first "!" to the end of the line is treated as a
	comment and discarded; the remainder is returned with leading and
	trailing whitespace stripped.
	"""
	(content, _marker, _comment) = line.partition(u"!")
	return content.strip()
Packit 1f3717
Packit 1f3717
# Filter LEXC input according to options
Packit 1f3717
def filterVfstInput(line_orig, OPTIONS):
	"""Filter one LEXC input line according to the options.

	line_orig -- the input line
	OPTIONS -- options dictionary; the keys "sukija", "style" and "vanhat"
	are read when the corresponding prefix is present

	A line may start with the conditional prefixes "?Sukija", "?Murre" and
	"?Vanha", checked in that order. A prefix whose condition holds is
	removed; if the condition does not hold, None is returned and the line
	is to be skipped. "?Sukija" needs OPTIONS["sukija"]; "?Murre" needs
	"dialect" in OPTIONS["style"] or OPTIONS["sukija"]; "?Vanha" needs
	OPTIONS["vanhat"] or OPTIONS["sukija"].

	Returns the remaining line with comments and surrounding whitespace
	stripped, or None.
	"""
	line = line_orig
	if line.startswith(u'?Sukija'):
		if not OPTIONS["sukija"]:
			return None
		line = line[len(u'?Sukija'):]
	if line.startswith(u'?Murre'):
		if not ("dialect" in OPTIONS["style"] or OPTIONS["sukija"]):
			return None
		line = line[len(u'?Murre'):]
	if line.startswith(u'?Vanha'):
		if not (OPTIONS["vanhat"] or OPTIONS["sukija"]):
			return None
		line = line[len(u'?Vanha'):]
	return stripWhitespaceAndComments(line)