Blame common/voikkoutils.py

Packit 1f3717
# -*- coding: utf-8 -*-
Packit 1f3717
Packit 1f3717
# Copyright 2007 Harri Pitkänen (hatapitk@iki.fi)
Packit 1f3717
Packit 1f3717
# This program is free software; you can redistribute it and/or modify
Packit 1f3717
# it under the terms of the GNU General Public License as published by
Packit 1f3717
# the Free Software Foundation; either version 2 of the License, or
Packit 1f3717
# (at your option) any later version.
Packit 1f3717
#
Packit 1f3717
# This program is distributed in the hope that it will be useful,
Packit 1f3717
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 1f3717
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 1f3717
# GNU General Public License for more details.
Packit 1f3717
#
Packit 1f3717
# You should have received a copy of the GNU General Public License
Packit 1f3717
# along with this program; if not, write to the Free Software
Packit 1f3717
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
Packit 1f3717
Packit 1f3717
# This module contains general helper functions and classes for use
Packit 1f3717
# with Python and Voikko.
Packit 1f3717
Packit 1f3717
import codecs
Packit 1f3717
import os
Packit 1f3717
import locale
Packit 1f3717
import sys
Packit 1f3717
import xml.dom.minidom
Packit 1f3717
import gzip
Packit 1f3717
Packit 1f3717
# Word classes
NOUN, ADJECTIVE, VERB = 1, 2, 3

# Vowel types (FRONT, BACK and BOTH are the values returned by the
# vowel type detection functions of this module)
VOWEL_DEFAULT, VOWEL_FRONT, VOWEL_BACK, VOWEL_BOTH = 0, 1, 2, 3

# Gradation types
GRAD_NONE, GRAD_SW, GRAD_WS = 0, 1, 2

GRAD_WEAK, GRAD_STRONG = 3, 4
Packit 1f3717
Packit 1f3717
class FlagAttribute:
	"""Vocabulary flag attribute.

	The class-level values below are the defaults seen by every instance
	until an attribute is assigned on that instance.
	"""
	# Numeric identifier of the flag (integer)
	joukahainen = 0
	# Group and flag name; together they form the "xmlGroup/xmlFlag" key
	xmlGroup = None
	xmlFlag = None
	# Malaga flag name, or None if the flag has none
	malagaFlag = None
	# Free-text description, or None if not given
	description = None

	def __repr__(self):
		"""Returns a readable representation listing all five fields."""
		return 'FlagAttribute(joukahainen=%r, xmlGroup=%r, xmlFlag=%r, malagaFlag=%r, description=%r)' % (
			self.joukahainen, self.xmlGroup, self.xmlFlag,
			self.malagaFlag, self.description)
Packit 1f3717
Packit 1f3717
def removeComments(line):
	"""Returns the given line with its comment removed.

	A comment starts at the first '#' character and runs to the end of
	the line. Text before the '#' is returned unchanged (trailing
	whitespace included); a line without '#' is returned as is.
	"""
	hash_pos = line.find(u'#')
	if hash_pos > 0:
		return line[:hash_pos]
	# Either the whole line is a comment, or there is no comment at all
	return u'' if hash_pos == 0 else line
Packit 1f3717
Packit 1f3717
def readFlagAttributes(filename):
	"""Returns a map of flag attributes from given file. The keys in the
	map are in form xmlGroup/xmlFlag, such as 'compounding/ei_ys'.

	The file is read as UTF-8. Comments (from '#' to end of line) and
	blank lines are ignored. Every remaining line consists of whitespace
	separated columns:

	    joukahainen xmlGroup/xmlFlag [malagaFlag [description...]]

	A malagaFlag of '-' or a missing malagaFlag column leaves
	FlagAttribute.malagaFlag as None; a missing description leaves
	FlagAttribute.description as None.

	Raises ValueError if a line has fewer than two columns, if its first
	column is not an integer or if its second column contains no '/'.
	"""
	flags = {}
	inputfile = codecs.open(filename, 'r', 'UTF-8')
	try:
		for rawline in inputfile:
			line = removeComments(rawline).strip()
			if len(line) == 0:
				continue
			# At most four columns; the description keeps its inner spaces
			fields = line.split(None, 3)
			if len(fields) < 2 or u'/' not in fields[1]:
				raise ValueError('Malformed flag attribute line in %s: %r' % (filename, line))
			f = FlagAttribute()
			f.joukahainen = int(fields[0])
			# Split at the first '/' only
			f.xmlGroup, _slash, f.xmlFlag = fields[1].partition(u'/')
			if len(fields) > 2 and fields[2] != u'-':
				f.malagaFlag = fields[2]
			if len(fields) > 3:
				f.description = fields[3]
			flags[f.xmlGroup + u'/' + f.xmlFlag] = f
	finally:
		# Close the file even if a line fails to parse
		inputfile.close()
	return flags
Packit 1f3717
Packit 1f3717
def _simple_vowel_type(word):
	"""Returns the type of vowels allowed in the suffixes for given simple word.

	The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
	The word is lowercased first. Back vowels are a, o, å and u; ordinary
	front vowels are ä and ö; y is tracked separately:

	- no ä, ö or y: VOWEL_BACK if a back vowel exists, else VOWEL_FRONT
	- no back vowel (but ä, ö or y present): VOWEL_FRONT
	- both kinds present: if y occurs after every back and ordinary front
	  vowel the result is VOWEL_BOTH, otherwise the later of the last back
	  vowel and the last ordinary front vowel decides.
	"""
	lowered = word.lower()
	back_pos = max(lowered.rfind(vowel) for vowel in u'aoåu')
	front_pos = max(lowered.rfind(vowel) for vowel in u'äö')
	y_pos = lowered.rfind(u'y')

	if max(front_pos, y_pos) == -1:
		# No front vowels at all; a word without any vowels counts as front
		return VOWEL_BACK if back_pos > -1 else VOWEL_FRONT
	if back_pos == -1:
		return VOWEL_FRONT
	# Both back and front vowels are present
	if y_pos >= max(back_pos, front_pos):
		return VOWEL_BOTH
	if back_pos > front_pos:
		return VOWEL_BACK
	return VOWEL_FRONT
Packit 1f3717
Packit 1f3717
def get_wordform_infl_vowel_type(wordform):
	"""Returns autodetected vowel type of inflection suffixes for a word.

	If the word contains character '=' or '-', automatic detection is
	only performed on the part after the last such character. If the
	word contains character '|', automatic detection is performed on the
	part after the first '|' and on the whole word, and the union of
	accepted vowel types is returned.

	A word ending in '=', '-' or '|' is not allowed and gives VOWEL_BOTH.
	"""
	last_index = len(wordform) - 1

	# Hard boundary: only the part after the last '=' or '-' matters.
	# An empty wordform also takes the first return (-1 == -1).
	boundary = max(wordform.rfind(u'='), wordform.rfind(u'-'))
	if boundary == last_index:
		return VOWEL_BOTH
	if boundary != -1:
		return get_wordform_infl_vowel_type(wordform[boundary + 1:])

	# Soft boundary: combine the whole word with the part after the first '|'
	bar = wordform.find(u'|')
	if bar == last_index:
		return VOWEL_BOTH
	whole_type = _simple_vowel_type(wordform)
	if bar == -1:
		return whole_type
	tail_type = get_wordform_infl_vowel_type(wordform[bar + 1:])
	return whole_type if whole_type == tail_type else VOWEL_BOTH
Packit 1f3717
Packit 1f3717
def get_preference(prefname):
	"""Returns the value of given preference.

	Known preferences are 'svnroot', 'voikkotest_dir',
	'voikkotest_build_options', 'voikko_data_dir', 'encoding',
	'libvoikko_bin' and 'diffviewcmd'. If the optional module
	voikko_dev_prefs can be imported and defines an attribute with the
	preference name, that value is returned. Otherwise a built-in
	default is returned. Unknown preference names give None.
	"""
	known_prefs = ('svnroot', 'voikkotest_dir', 'voikkotest_build_options',
		'voikko_data_dir', 'encoding', 'libvoikko_bin', 'diffviewcmd')
	try:
		import voikko_dev_prefs
	except ImportError:
		# The developer preference module is optional
		voikko_dev_prefs = None
	if prefname not in known_prefs:
		return None
	if voikko_dev_prefs is not None and hasattr(voikko_dev_prefs, prefname):
		return getattr(voikko_dev_prefs, prefname)

	# Defaults located under the home directory of the user
	home_relative = {
		'svnroot': '/svn/voikko',
		'voikkotest_dir': '/tmp/voikkotest',
		'voikko_data_dir': '/svn/voikko/trunk/data',
	}
	if prefname in home_relative:
		try:
			home = os.environ['HOME']
		except KeyError:
			# HOME is not set: ask the platform instead of failing
			home = os.path.expanduser('~')
		return home + home_relative[prefname]
	if prefname == 'voikkotest_build_options': return ''
	if prefname == 'encoding': return locale.getpreferredencoding()
	if prefname == 'libvoikko_bin': return '/usr/bin'
	# Only 'diffviewcmd' is left at this point
	return 'diff -U 0 "%s" "%s" | grep ^.C: 2>/dev/null | less'
Packit 1f3717
Packit 1f3717
def is_consonant(letter):
	"""Returns True, if given character is a consonant, otherwise returns False.

	The test is case insensitive. Only the letters
	b, c, d, f, g, h, j, k, l, m, n, p, q, r, s, t, v, w, x and z count
	as consonants. Anything that is not exactly one character long
	(including the empty string) gives False.
	"""
	# A bare `in` test against a string is a substring test, so the length
	# must be checked: '' and sequences such as 'gh' would otherwise match.
	# NOTE(review): capital_char_regexp also treats š and ž as consonants;
	# confirm whether they should be accepted here as well.
	return len(letter) == 1 and letter.lower() in u'qwrtpsdfghjklzxcvbnm'
Packit 1f3717
Packit 1f3717
def vowel_type(word):
	"""Returns the type of vowels that are allowed in the affixes for given word.

	The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
	"""
	# The detection rules are the same as for _simple_vowel_type, so the
	# work is delegated there instead of keeping a second copy of the code.
	return _simple_vowel_type(word)
Packit 1f3717
Packit 1f3717
Packit 1f3717
def capital_char_regexp(pattern):
	"""Expands capital letters to useful character classes for regular expressions.

	Every occurrence of the capitals below is replaced by a non-capturing
	group of alternatives:

	    V  any vowel              C  any consonant
	    A  a or ä                 O  o or ö
	    U  u or y

	All other characters of the pattern are left untouched.
	"""
	expansions = (
		('V', u'(?:a|e|i|o|u|y|ä|ö|é|è|á|ó|â)'),
		('C', u'(?:b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z|š|ž)'),
		('A', u'(?:a|ä)'),
		('O', u'(?:o|ö)'),
		('U', u'(?:u|y)'),
	)
	expanded = pattern
	# The replacement texts contain no capitals, so the passes do not
	# interfere with each other.
	for capital, char_class in expansions:
		expanded = expanded.replace(capital, char_class)
	return expanded
Packit 1f3717
Packit 1f3717
def process_wordlist(filename, word_handler, show_progress = False):
	"""Reads the word list in XML format specified by filename.

	If the name ends with .gz, the file is assumed to be gzip compressed.
	Lines up to and including the '<wordlist xml:lang="fi">' line are
	skipped. After that every group of lines ending with a '</word>' line
	is parsed as one XML document, and word_handler is called with the
	document element of each word as its only parameter. Reading stops
	at the '</wordlist>' line.

	If show_progress == True, prints progress information to stdout
	(one '#' per 1000 words and a final newline).

	If the file ends before the opening or closing wordlist tag is seen,
	"Malformed file <filename>" is written to stderr and the function
	returns. Words completed before that point have already been passed
	to word_handler. The file is always closed before returning.
	"""
	# Both kinds of files are read as bytes, so that the marker lines
	# compare the same way for plain and gzip compressed input.
	if filename.endswith(".gz"):
		listfile = gzip.GzipFile(filename, 'rb')
	else:
		listfile = open(filename, 'rb')
	try:
		# Skip the header of the file
		while True:
			line = listfile.readline()
			if line == b'':
				sys.stderr.write("Malformed file " + filename + "\n")
				return
			if line.rstrip(b'\r\n') == b'<wordlist xml:lang="fi">':
				break
		
		wcount = 0
		while True:
			line = listfile.readline()
			if line.rstrip(b'\r\n') == b'</wordlist>':
				break
			wordlines = []
			while line.rstrip(b'\r\n') != b'</word>':
				if line == b'':
					# End of file inside the list: without this check
					# the loop would never terminate
					sys.stderr.write("Malformed file " + filename + "\n")
					return
				wordlines.append(line)
				line = listfile.readline()
			wordlines.append(line)
			word = xml.dom.minidom.parseString(b''.join(wordlines))
			word_handler(word.documentElement)
			wcount = wcount + 1
			if show_progress and wcount % 1000 == 0:
				sys.stdout.write("#")
				sys.stdout.flush()
		
		if show_progress: sys.stdout.write("\n")
	finally:
		listfile.close()