Tree - source-git/malaga-suomi-voikko

source-git / malaga-suomi-voikko

Files

Commit: 1f37173b019941788e5603446756c1628970c40f
Blob Blame History Raw
# -*- coding: utf-8 -*-

# Copyright 2007 Harri Pitkänen (hatapitk@iki.fi)

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

# This module contains general helper functions and classes for use
# with Python and Voikko.

import codecs
import os
import locale
import sys
import xml.dom.minidom
import gzip

# Word classes
# (Not referenced elsewhere in this file; presumably used by importers of
# this module.)
NOUN=1
ADJECTIVE=2
VERB=3

# Vowel types
# VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH are the values returned by the
# vowel type detection functions in this module. VOWEL_DEFAULT is not used
# in this file.
VOWEL_DEFAULT=0
VOWEL_FRONT=1
VOWEL_BACK=2
VOWEL_BOTH=3

# Gradation types
# (Not referenced elsewhere in this file. NOTE(review): SW/WS presumably
# stand for strong->weak and weak->strong gradation -- confirm.)
GRAD_NONE = 0
GRAD_SW = 1
GRAD_WS = 2

GRAD_WEAK = 3
GRAD_STRONG = 4

class FlagAttribute:
	"""Vocabulary flag attribute.

	Plain record type. The class-level values below are the defaults seen
	on a fresh instance; readFlagAttributes() overrides them per instance
	with the fields parsed from one line of the flag attribute file.
	"""
	# Integer taken from the first field of a flag attribute line.
	joukahainen = 0
	# Group and flag name (the 'xmlGroup/xmlFlag' field), the Malaga flag
	# and the free-text description. All stay None until assigned.
	xmlGroup = xmlFlag = malagaFlag = description = None

## Strip the comment part from a line of text. Everything from the first
# '#' character to the end of the line is dropped; a line without '#' is
# returned unchanged.
def removeComments(line):
	hash_pos = line.find(u'#')
	# find() yields -1 when there is no comment; slicing at 0 gives the
	# empty string for a line that is nothing but a comment.
	return line if hash_pos < 0 else line[:hash_pos]

def readFlagAttributes(filename):
	"""Returns a map of flag attributes from given file. The keys in the
	map are in form xmlGroup/xmlFlag, such as 'compounding/ei_ys'.

	The file is read as UTF-8. Comments (from '#' to the end of the line)
	and empty lines are ignored. Every other line consists of whitespace
	separated fields:

	    <integer> <xmlGroup>/<xmlFlag> [<malagaFlag> [<description...>]]

	A malagaFlag of '-' means "no Malaga flag" and leaves the attribute as
	None. The description is the rest of the line and may contain spaces.
	Missing trailing fields leave the corresponding attributes as None.

	Raises ValueError if a line lacks the 'xmlGroup/xmlFlag' field or its
	first field is not an integer."""
	flags = {}
	inputfile = codecs.open(filename, 'r', 'UTF-8')
	try:
		# Iterating the stream visits every line, including a final line
		# without a trailing newline.
		for rawline in inputfile:
			line = removeComments(rawline).strip()
			if len(line) == 0:
				continue
			# At most four fields; the fourth keeps its inner whitespace.
			fields = line.split(None, 3)
			if len(fields) < 2 or u'/' not in fields[1]:
				raise ValueError(u'Malformed flag attribute line in %s: %r'
				                 % (filename, line))
			f = FlagAttribute()
			f.joukahainen = int(fields[0])
			# Split at the first '/' only.
			f.xmlGroup, _, f.xmlFlag = fields[1].partition(u'/')
			if len(fields) > 2 and fields[2] != u'-':
				f.malagaFlag = fields[2]
			if len(fields) > 3:
				f.description = fields[3]
			flags[f.xmlGroup + u'/' + f.xmlFlag] = f
	finally:
		# Close the file even when a malformed line raises.
		inputfile.close()
	return flags

## Determines which vowels are allowed in the suffixes of a simple word.
# Returns VOWEL_FRONT, VOWEL_BACK or VOWEL_BOTH.
#
# Only the letters a, o, å, u (back), ä, ö (front) and y are examined, case
# insensitively; all other characters are ignored.
#  - no back vowel at all (including no examined letter at all): VOWEL_FRONT
#  - back vowels only: VOWEL_BACK
#  - mixed, and 'y' is the last examined letter: VOWEL_BOTH
#  - mixed otherwise: decided by whichever of the last back vowel and the
#    last ä/ö comes later in the word
def _simple_vowel_type(word):
	lowered = word.lower()
	back_pos = max(lowered.rfind(ch) for ch in (u'a', u'o', u'å', u'u'))
	front_pos = max(lowered.rfind(ch) for ch in (u'ä', u'ö'))
	y_pos = lowered.rfind(u'y')

	if back_pos == -1:
		# Front vowels only, or none of the examined letters at all.
		return VOWEL_FRONT
	if front_pos == -1 and y_pos == -1:
		return VOWEL_BACK
	# From here on the word has a back vowel and at least one of ä, ö, y.
	if y_pos >= max(back_pos, front_pos):
		return VOWEL_BOTH
	if back_pos > front_pos:
		return VOWEL_BACK
	return VOWEL_FRONT

## Returns the autodetected vowel type of inflection suffixes for a word form.
# If the word form contains '=' or '-', only the part after the last such
# character is examined. If that part contains '|', detection is performed
# both on the part as a whole and (recursively) on what follows its first '|';
# the common type is returned when the two agree, VOWEL_BOTH otherwise.
# A word form that ends in '=', '-' or '|' (or is empty) yields VOWEL_BOTH.
def get_wordform_infl_vowel_type(wordform):
	boundary = max(wordform.rfind(u'='), wordform.rfind(u'-'))
	if boundary == len(wordform) - 1:
		return VOWEL_BOTH # Not allowed (also reached for an empty word form)
	if boundary != -1:
		# Continue with the trailing part only. It is non-empty and has no
		# further '=' or '-', so no second boundary search is needed.
		wordform = wordform[boundary + 1:]

	bar = wordform.find(u'|')
	if bar == len(wordform) - 1:
		return VOWEL_BOTH # Not allowed
	whole_type = _simple_vowel_type(wordform)
	if bar == -1:
		return whole_type
	tail_type = get_wordform_infl_vowel_type(wordform[bar + 1:])
	if tail_type != whole_type:
		return VOWEL_BOTH
	return whole_type

def _home_directory():
	"""Returns the HOME environment variable, or the expansion of '~' when
	HOME is not set."""
	if 'HOME' in os.environ:
		return os.environ['HOME']
	return os.path.expanduser('~')

def get_preference(prefname):
	"""Returns the value of given preference.

	If the optional module voikko_dev_prefs can be imported and defines an
	attribute named after one of the known preferences, that attribute is
	returned. Otherwise a built-in default is used. Unknown preference
	names yield None."""
	known = ('svnroot', 'voikkotest_dir', 'voikkotest_build_options',
	         'voikko_data_dir', 'encoding', 'libvoikko_bin', 'diffviewcmd')
	try:
		import voikko_dev_prefs
	except ImportError:
		# No developer overrides available; fall through to the defaults.
		pass
	else:
		# Only known names are looked up in the override module.
		if prefname in known and hasattr(voikko_dev_prefs, prefname):
			return getattr(voikko_dev_prefs, prefname)
	# The home directory is resolved lazily so that preferences which do
	# not need it never touch the environment.
	if prefname == 'svnroot': return _home_directory() + '/svn/voikko'
	if prefname == 'voikkotest_dir': return _home_directory() + '/tmp/voikkotest'
	if prefname == 'voikkotest_build_options': return ''
	if prefname == 'voikko_data_dir': return _home_directory() + '/svn/voikko/trunk/data'
	if prefname == 'encoding': return locale.getpreferredencoding()
	if prefname == 'libvoikko_bin': return '/usr/bin'
	if prefname == 'diffviewcmd': return 'diff -U 0 "%s" "%s" | grep ^.C: 2>/dev/null | less'
	return None

## Returns True if the given character is a consonant, otherwise returns False.
# The test is case insensitive and covers the letters
# b c d f g h j k l m n p q r s t v w x z. Anything that is not exactly one
# character long (the empty string, several letters) is not a consonant.
def is_consonant(letter):
	# The membership test below is a substring test, so without this guard
	# the empty string and runs such as 'gh' would be reported as consonants.
	if len(letter) != 1:
		return False
	return letter.lower() in u'qwrtpsdfghjklzxcvbnm'

## Function that returns the type of vowels that are allowed in the affixes for given word.
# The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
def vowel_type(word):
	# The rule is exactly the one implemented by _simple_vowel_type(), so
	# delegate to it instead of keeping a second copy of the same logic.
	return _simple_vowel_type(word)


## Expands the capital letters V, C, A, O and U in a pattern into
# non-capturing regular expression groups: V = vowels, C = consonants,
# A = a/ä, O = o/ö, U = u/y. All other characters are left as they are.
def capital_char_regexp(pattern):
	# The replacement texts contain no capital letters, so an earlier
	# expansion is never touched by a later one.
	expansions = (
		('V', u'(?:a|e|i|o|u|y|ä|ö|é|è|á|ó|â)'),
		('C', u'(?:b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z|š|ž)'),
		('A', u'(?:a|ä)'),
		('O', u'(?:o|ö)'),
		('U', u'(?:u|y)'),
	)
	for capital, char_class in expansions:
		pattern = pattern.replace(capital, char_class)
	return pattern

## Reads the word list in XML format specified by filename. If the name ends
# with .gz, the file is assumed to be gzip compressed. Calls function word_handler
# for each word, passing the XML document element representing the word as a
# parameter. If show_progress == True, prints progress information to stdout
# (one '#' per 1000 words and a final newline).
#
# Everything up to and including the line '<wordlist xml:lang="fi">' is skipped.
# After that every run of lines ending with a '</word>' line is parsed as one
# word, until a '</wordlist>' line is met. If the opening line is missing, or
# the file ends before '</wordlist>', "Malformed file <filename>" is written to
# stderr; words that were complete up to that point have already been handled.
def process_wordlist(filename, word_handler, show_progress = False):
	# The file is read as bytes so that plain and gzip compressed files
	# behave the same; the XML parser accepts the raw bytes directly.
	if filename.endswith(".gz"):
		listfile = gzip.GzipFile(filename, 'rb')
	else:
		listfile = open(filename, 'rb')
	try:
		# Line terminators are stripped before comparing, so CRLF files and
		# a final line without newline are recognised as well.
		for line in listfile:
			if line.rstrip(b'\r\n') == b'<wordlist xml:lang="fi">':
				break
		else:
			sys.stderr.write("Malformed file " + filename + "\n")
			return

		wcount = 0
		wordlines = []
		finished = False
		for line in listfile:
			tag = line.rstrip(b'\r\n')
			# The closing tag only counts between words, not inside one.
			if not wordlines and tag == b'</wordlist>':
				finished = True
				break
			wordlines.append(line)
			if tag != b'</word>':
				continue
			word = xml.dom.minidom.parseString(b''.join(wordlines))
			wordlines = []
			word_handler(word.documentElement)
			wcount = wcount + 1
			if show_progress and wcount % 1000 == 0:
				sys.stdout.write("#")
				sys.stdout.flush()

		if not finished:
			# End of file reached before '</wordlist>': report it instead
			# of waiting forever for a line that never comes.
			sys.stderr.write("Malformed file " + filename + "\n")
		if show_progress: sys.stdout.write("\n")
	finally:
		# Close the file on every exit path, including handler errors.
		listfile.close()
source-git / malaga-suomi-voikko

Source Code

Files