Tree - source-git/malaga-suomi-voikko

source-git / malaga-suomi-voikko

Files

Commit: 1f37173b019941788e5603446756c1628970c40f
Blob Blame History Raw
# -*- coding: utf-8 -*-

# Copyright 2007 - 2011 Harri Pitk채nen (hatapitk@iki.fi)
#           2007        Hannu V채is채nen (Etunimi.Sukunimi@joensuu.fi)
#
# Functions and variables that are common to Sukija and Voikko versions.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import hfconv
import voikkoutils
import codecs
import getopt
import sys
from xml.dom import Node

# Path to source data directory
VOCABULARY_DATA = u"vocabulary"

# Vocabulary entries that should be saved to different files
# (group, name, file)
SPECIAL_VOCABULARY = [
	('usage', 'it', 'atk.lex'),
	('usage', 'medicine', 'laaketiede.lex'),
	('usage', 'science', 'matluonnontiede.lex'),
	('usage', 'education', 'kasvatustiede.lex'),
	('style', 'foreign', 'vieraskieliset.lex')]

def open_lex(path, filename):
	file = codecs.open(path + u"/" + filename, 'w', 'UTF-8')
	file.write(u"# This is automatically generated intermediate lexicon file for\n")
	file.write(u"# Suomi-malaga Voikko edition. The original source data is\n")
	file.write(u"# distributed under the GNU General Public License, version 2 or\n")
	file.write(u"# later, as published by the Free Software Foundation. You should\n")
	file.write(u"# have received the original data, tools and instructions to\n")
	file.write(u"# generate this file (or instructions to obtain them) wherever\n")
	file.write(u"# you got this file from.\n\n")
	return file

def tValue(element):
	return element.firstChild.wholeText

# Returns a list of text values with given element name under DOM element "group"
def tValues(group, element_name):
	values = []
	for element in group.getElementsByTagName(element_name):
		values.append(tValue(element))
	return values

# Returns malaga word class for given word in Joukahainen
def get_malaga_word_class(j_wordclasses):
	if "pnoun_place" in j_wordclasses: return u"paikannimi"
	if "pnoun_firstname" in j_wordclasses: return u"etunimi"
	if "pnoun_lastname" in j_wordclasses: return u"sukunimi"
	if "pnoun_misc" in j_wordclasses: return u"nimi"
	if "verb" in j_wordclasses: return u"teonsana"
	if "adjective" in j_wordclasses and "noun" in j_wordclasses: return u"nimi_laatusana"
	if "adjective" in j_wordclasses: return u"laatusana"
	if "noun" in j_wordclasses: return u"nimisana"
	if "interjection" in j_wordclasses: return u"huudahdussana"
	if "prefix" in j_wordclasses: return u"etuliite"
	if "abbreviation" in j_wordclasses: return u"lyhenne"
	if "adverb" in j_wordclasses: return "seikkasana"
	if "conjunction" in j_wordclasses: return "sidesana"
	return None

# Returns flag names from given group for word in Joukahainen
def get_flags_from_group(word, groupName):
	flags = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE or group.tagName != groupName:
			continue
		for flag in group.childNodes:
			if flag.nodeType != Node.ELEMENT_NODE:
				continue
			if flag.tagName != "flag":
				continue
			flags.append(flag.firstChild.wholeText)
	return flags

# Returns malaga flags for given word in Joukahainen
def get_malaga_flags(word):
	global flag_attributes
	malagaFlags = []
	for group in word.childNodes:
		if group.nodeType != Node.ELEMENT_NODE:
			continue
		for flag in group.childNodes:
			if flag.nodeType != Node.ELEMENT_NODE:
				continue
			if flag.tagName != "flag":
				continue
			flagAttribute = flag_attributes[group.tagName + u"/" + tValue(flag)]
			if flagAttribute.malagaFlag != None:
				malagaFlags.append(flagAttribute.malagaFlag)
	if len(malagaFlags) == 0: return u""
	flag_string = u", tiedot: <"
	for flag in malagaFlags:
		flag_string = flag_string + flag + u","
	flag_string = flag_string[:-1] + u">"
	return flag_string

flag_attributes = voikkoutils.readFlagAttributes(VOCABULARY_DATA + u"/flags.txt")

def vowel_type(group):
	vtypes = group.getElementsByTagName("vtype")
	if len(vtypes) != 1: return voikkoutils.VOWEL_DEFAULT
	else:
		vtypes = tValue(vtypes[0])
		if vtypes == u'a': return voikkoutils.VOWEL_BACK
		elif vtypes == u'채': return voikkoutils.VOWEL_FRONT
		else: return voikkoutils.VOWEL_BOTH

def has_flag(word, flag):
	if flag in tValues(word, "flag"): return True
	return False

# Returns tuple (alku, jatko) for given word in Joukahainen
def get_malaga_inflection_class(wordform, j_infclass, j_wordclasses, j_classmap):
	if j_infclass is None:
		return (wordform, u"loppu")
	(infclass, gradclass) = (list(j_infclass.split(u'-')) + [None])[:2]
	
	if gradclass == None: gradtypes = [None]
	else: gradtypes = [grad[1] for grad in hfconv.grads if grad[2] == gradclass]
	
	# Determine the word class for the given word
	if "adjective" in j_wordclasses: wclass = hfconv.ADJ
	elif "noun" in j_wordclasses or "pnoun_firstname" in j_wordclasses or \
	     "pnoun_lastname" in j_wordclasses or "pnoun_place" in j_wordclasses or \
	     "pnoun_misc" in j_wordclasses: wclass = hfconv.SUBST
	elif "verb" in j_wordclasses: wclass = hfconv.VERB 
	else: return (None, None)
	
	for (m_infclass, m_infclass_gradation, m_smclasses) in j_classmap:
		if m_infclass != infclass: continue
		for m_smclass in m_smclasses:
			(m_gradtype, pattern, jatko, wclasses) = (list(m_smclass) + [None])[:4]
			if wclasses != None and not wclass in wclasses: continue
			if not m_gradtype in gradtypes: continue
			alku = hfconv.match_re(wordform, pattern)
			if alku != None: return (alku, jatko)
	
	return (None, None)


# Returns a string describing the structure of a word, if necessary for the spellchecker
# or hyphenator
def get_structure(wordform, malaga_word_class):
	needstructure = False
	if malaga_word_class in [u'nimi', u'etunimi', u'sukunimi', 'paikannimi']: ispropernoun = True
	else: ispropernoun = False
	if malaga_word_class == u'lyhenne':
		i = u"j"
		p = u"q"
	else:
		i = u"i"
		p = u"p"
	structstr = u', rakenne: "='
	for idx in range(len(wordform)):
		c = wordform[idx]
		if c == u'-':
			structstr = structstr + u"-="
			needstructure = True
		elif c == u'|': structstr = structstr
		elif c == u'=':
			structstr = structstr + u"="
			needstructure = True
		elif c == u':':
			structstr = structstr + u":"
			needstructure = True
		elif c.isupper():
			structstr = structstr + i
			if not (ispropernoun and idx == 0):
				needstructure = True
		else:
			structstr = structstr + p
			if ispropernoun and idx == 0:
				needstructure = True
	if needstructure: return structstr + u'"'
	else: return u""

# Writes the vocabulary entry to a suitable file
def write_entry(main_vocabulary,vocabulary_files,word, entry):
	special = False
	for voc in SPECIAL_VOCABULARY:
		group = word.getElementsByTagName(voc[0])
		if len(group) == 0: continue
		if has_flag(group[0], voc[1]):
			vocabulary_files[voc[2]].write(entry + u"\n")
			special = True
	if not special:
		main_vocabulary.write(entry + u"\n")

# Parse command line options and return them in a dictionary
def get_options():
	try:
		optlist = ["min-frequency=", "extra-usage=", "style=", "destdir=", "sourceid", "vanhat", "sukija", "sukija-ys"]
		(opts, args) = getopt.getopt(sys.argv[1:], "", optlist)
	except getopt.GetoptError:
		sys.stderr.write("Invalid option list for %s\n" % sys.argv[0])
		sys.exit(1)
	options = {"frequency": 9,
	           "extra-usage": [],
	           "style": ["old", "international", "inappropriate"],
	           "sourceid": False,
	           "vanhat": False,
	           "destdir": None,
		   "sukija": False,
		   "sukija-ys": False}
	for (name, value) in opts:
		if name == "--min-frequency":
			options["frequency"] = int(value)
		elif name == "--extra-usage":
			options["extra-usage"] = value.split(",")
		elif name == "--style":
			options["style"] = value.split(",")
		elif name == "--destdir":
			options["destdir"] = value
		elif name == "--sourceid":
			options["sourceid"] = True
		elif name == "--vanhat":
			options["vanhat"] = True
		elif name == "--sukija":
			options["sukija"] = True
		elif name == "--sukija-ys":
			options["sukija-ys"] = True
	return options

# Strip whitespace and comments from LEXC input file
def stripWhitespaceAndComments(line):
	if u"!" in line:
		line = line[0:line.find(u"!")]
	return line.strip()

# Filter LEXC input according to options
def filterVfstInput(line_orig, OPTIONS):
	if line_orig.startswith(u'?Sukija'):
		if OPTIONS["sukija"]:
			line_orig = line_orig[7:]
		else:
			return None
	if line_orig.startswith(u'?Murre'):
		if "dialect" in OPTIONS["style"] or OPTIONS["sukija"]:
			line_orig = line_orig[6:]
		else:
			return None
	if line_orig.startswith(u'?Vanha'):
		if OPTIONS["vanhat"] or OPTIONS["sukija"]:
			line_orig = line_orig[6:]
		else:
			return None
	return stripWhitespaceAndComments(line_orig)
source-git / malaga-suomi-voikko

Source Code

Files