|
Packit |
1f3717 |
# -*- coding: utf-8 -*-
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Copyright 2007 Harri Pitkänen (hatapitk@iki.fi)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# This program is free software; you can redistribute it and/or modify
|
|
Packit |
1f3717 |
# it under the terms of the GNU General Public License as published by
|
|
Packit |
1f3717 |
# the Free Software Foundation; either version 2 of the License, or
|
|
Packit |
1f3717 |
# (at your option) any later version.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# This program is distributed in the hope that it will be useful,
|
|
Packit |
1f3717 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
Packit |
1f3717 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
Packit |
1f3717 |
# GNU General Public License for more details.
|
|
Packit |
1f3717 |
#
|
|
Packit |
1f3717 |
# You should have received a copy of the GNU General Public License
|
|
Packit |
1f3717 |
# along with this program; if not, write to the Free Software
|
|
Packit |
1f3717 |
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# This module contains general helper functions and classes for use
|
|
Packit |
1f3717 |
# with Python and Voikko.
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
import codecs
|
|
Packit |
1f3717 |
import os
|
|
Packit |
1f3717 |
import locale
|
|
Packit |
1f3717 |
import sys
|
|
Packit |
1f3717 |
import xml.dom.minidom
|
|
Packit |
1f3717 |
import gzip
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
# Word classes
NOUN, ADJECTIVE, VERB = 1, 2, 3

# Vowel types (VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH are the values
# returned by the vowel type detection functions in this module)
VOWEL_DEFAULT, VOWEL_FRONT, VOWEL_BACK, VOWEL_BOTH = 0, 1, 2, 3

# Gradation types
GRAD_NONE, GRAD_SW, GRAD_WS = 0, 1, 2

GRAD_WEAK, GRAD_STRONG = 3, 4
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
class FlagAttribute:
    """Vocabulary flag attribute.

    Plain record type; instances are filled in field by field by
    readFlagAttributes(). The class-level values below act as defaults
    for fields that are never assigned on an instance.
    """
    # Integer parsed from the first column of a flag attribute line
    joukahainen = 0
    # Part of the "group/flag" column before the slash
    xmlGroup = None
    # Part of the "group/flag" column after the slash
    xmlFlag = None
    # Third column of the line; stays None when the column is '-'
    malagaFlag = None
    # Remaining text of the line; stays None when there is none
    description = None
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Remove comments from a given line of text.
|
|
Packit |
1f3717 |
def removeComments(line):
    """Return line with everything from the first '#' onwards removed.

    A line without '#' is returned unchanged; a line that starts with
    '#' yields an empty string.
    """
    hash_pos = line.find(u'#')
    if hash_pos > 0:
        # Keep only the text in front of the comment marker.
        return line[:hash_pos]
    if hash_pos == 0:
        return u''
    return line
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def readFlagAttributes(filename):
    """Returns a map of flag attributes from given file. The keys in the
    map are in form xmlGroup/xmlFlag, such as 'compounding/ei_ys'.

    The file is read as UTF-8. Comments (everything after '#') and blank
    lines are ignored. Every remaining line consists of whitespace
    separated columns:

        <integer> <xmlGroup>/<xmlFlag> <malagaFlag or '-'> [description]

    A '-' in the third column leaves malagaFlag as None, and a missing
    description leaves description as None.

    Raises ValueError if a line has fewer than three columns, if its
    second column contains no '/', or if the first column is not an
    integer.
    """
    flags = {}
    inputfile = codecs.open(filename, 'r', 'UTF-8')
    try:
        # Iterating reads until the real end of file, so a last line
        # without a trailing newline is handled like any other line.
        for rawline in inputfile:
            line = removeComments(rawline).strip()
            if len(line) == 0:
                continue
            # At most four columns; the fourth keeps its inner spaces.
            fields = line.split(None, 3)
            if len(fields) < 3 or u'/' not in fields[1]:
                raise ValueError(u'Malformed flag attribute line in '
                                 + filename + u': ' + line)
            f = FlagAttribute()
            f.joukahainen = int(fields[0])
            slash = fields[1].find(u'/')
            f.xmlGroup = fields[1][:slash]
            f.xmlFlag = fields[1][slash + 1:]
            if fields[2] != u'-':
                f.malagaFlag = fields[2]
            if len(fields) > 3:
                f.description = fields[3]
            flags[f.xmlGroup + u'/' + f.xmlFlag] = f
    finally:
        # Release the file handle even when a line fails to parse.
        inputfile.close()
    return flags
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Function that returns the type of vowels that are allowed in the suffixes for
|
|
Packit |
1f3717 |
# given simple word.
|
|
Packit |
1f3717 |
# The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
|
|
Packit |
1f3717 |
def _simple_vowel_type(word):
    """Return the vowel type for a word, based on the positions of the
    last back vowel (a, o, å, u), the last ä/ö and the last y.

    Returns VOWEL_BACK, VOWEL_FRONT or VOWEL_BOTH. Words without any of
    these vowels are treated as VOWEL_FRONT.
    """
    lowered = word.lower()
    back_pos = max(lowered.rfind(u'a'), lowered.rfind(u'o'),
                   lowered.rfind(u'å'), lowered.rfind(u'u'))
    umlaut_pos = max(lowered.rfind(u'ä'), lowered.rfind(u'ö'))
    y_pos = lowered.rfind(u'y')

    if back_pos == -1:
        # No back vowels: front, whether or not front vowels exist.
        return VOWEL_FRONT
    if umlaut_pos == -1 and y_pos == -1:
        # Only back vowels are present.
        return VOWEL_BACK
    if y_pos >= max(back_pos, umlaut_pos):
        # 'y' is the last of the relevant vowels in a mixed word.
        return VOWEL_BOTH
    # Mixed word: the later of the back vowel and ä/ö decides.
    if back_pos > umlaut_pos:
        return VOWEL_BACK
    return VOWEL_FRONT
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Returns autodetected vowel type of inflection suffixes for a word.
|
|
Packit |
1f3717 |
# If word contains character '=' or '-', automatic detection is only performed on the
|
|
Packit |
1f3717 |
# trailing part. If word contains character '|', automatic detection is performed
|
|
Packit |
1f3717 |
# on the trailing part and the whole word, and the union of accepted vowel types is returned.
|
|
Packit |
1f3717 |
def get_wordform_infl_vowel_type(wordform):
    """Return the vowel type detected for the given word form.

    Only the part after the last '=' or '-' is considered. If that part
    contains '|', the result is the common vowel type of the whole part
    and of the part after the first '|', or VOWEL_BOTH when the two
    differ. A separator as the last character (and an empty word form)
    gives VOWEL_BOTH.
    """
    # Locate the last '=' or '-'; nothing before it affects the result.
    boundary = max(wordform.rfind(u'='), wordform.rfind(u'-'))
    if boundary == len(wordform) - 1:
        return VOWEL_BOTH # Not allowed
    if boundary != -1:
        # The tail holds no further '=' or '-', so continue with it
        # directly instead of recursing.
        wordform = wordform[boundary + 1:]

    # Locate the first '|'; the part after it is examined recursively.
    bar = wordform.find(u'|')
    if bar == len(wordform) - 1:
        return VOWEL_BOTH # Not allowed
    whole_type = _simple_vowel_type(wordform)
    if bar == -1:
        return whole_type
    tail_type = get_wordform_infl_vowel_type(wordform[bar + 1:])
    if whole_type != tail_type:
        return VOWEL_BOTH
    return whole_type
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
def get_preference(prefname):
    """Returns the value of given preference.

    Recognised preference names are 'svnroot', 'voikkotest_dir',
    'voikkotest_build_options', 'voikko_data_dir', 'encoding',
    'libvoikko_bin' and 'diffviewcmd'. If the optional module
    voikko_dev_prefs can be imported and defines an attribute with the
    requested name, that attribute is returned. Otherwise a built-in
    default is returned. Unknown preference names give None.

    Defaults that live under the home directory use the HOME environment
    variable; when HOME is not set, os.path.expanduser('~') is used
    instead of failing with KeyError.
    """
    known_names = ('svnroot', 'voikkotest_dir', 'voikkotest_build_options',
                   'voikko_data_dir', 'encoding', 'libvoikko_bin',
                   'diffviewcmd')
    try:
        import voikko_dev_prefs
    except ImportError:
        voikko_dev_prefs = None
    if prefname not in known_names:
        return None
    if voikko_dev_prefs is not None and hasattr(voikko_dev_prefs, prefname):
        return getattr(voikko_dev_prefs, prefname)

    if prefname == 'encoding':
        # Queried only on request: it inspects the process locale.
        return locale.getpreferredencoding()
    home = os.environ.get('HOME')
    if home is None:
        home = os.path.expanduser('~')
    defaults = {
        'svnroot': home + '/svn/voikko',
        'voikkotest_dir': home + '/tmp/voikkotest',
        'voikkotest_build_options': '',
        'voikko_data_dir': home + '/svn/voikko/trunk/data',
        'libvoikko_bin': '/usr/bin',
        'diffviewcmd': 'diff -U 0 "%s" "%s" | grep ^.C: 2>/dev/null | less',
    }
    return defaults[prefname]
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Returns True if the given character is a consonant, otherwise returns False.
|
|
Packit |
1f3717 |
def is_consonant(letter):
    """Return True if letter is a single consonant character, else False.

    The comparison is case-insensitive and covers the letters
    q, w, r, t, p, s, d, f, g, h, j, k, l, z, x, c, v, b, n and m.
    The empty string and strings longer than one character are never
    consonants.
    """
    # The length check is needed because 'in' on strings is a substring
    # test: '' and runs such as 'gh' would otherwise be accepted.
    return len(letter) == 1 and letter.lower() in u'qwrtpsdfghjklzxcvbnm'
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Function that returns the type of vowels that are allowed in the affixes for given word.
|
|
Packit |
1f3717 |
# The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
|
|
Packit |
1f3717 |
def vowel_type(word):
    """Return the type of vowels allowed in the affixes of the given word.

    The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
    """
    # The detection rules are the same as in _simple_vowel_type, so
    # delegate to it and keep them in a single place.
    return _simple_vowel_type(word)
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Expands capital letters to useful character classes for regular expressions
|
|
Packit |
1f3717 |
def capital_char_regexp(pattern):
    """Return pattern with the capital letters V, C, A, O and U replaced
    by non-capturing regular expression groups of lowercase alternatives.
    """
    # Applied in this order. The replacement texts contain no capital
    # letters, so an earlier expansion is never expanded again.
    expansions = (
        ('V', u'(?:a|e|i|o|u|y|ä|ö|é|è|á|ó|â)'),
        ('C', u'(?:b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z|š|ž)'),
        ('A', u'(?:a|ä)'),
        ('O', u'(?:o|ö)'),
        ('U', u'(?:u|y)'),
    )
    for capital, char_class in expansions:
        pattern = pattern.replace(capital, char_class)
    return pattern
|
|
Packit |
1f3717 |
|
|
Packit |
1f3717 |
## Reads the word list in XML format specified by filename. If the name ends
|
|
Packit |
1f3717 |
# with .gz, the file is assumed to be gzip compressed. Calls function word_handler
|
|
Packit |
1f3717 |
# for each word, passing the XML element representing the word as a parameter.
|
|
Packit |
1f3717 |
# If show_progress == True, prints progress information to stdout
|
|
Packit |
1f3717 |
def _read_wordlist_line(listfile):
    """Read one line from listfile as text; returns '' at end of file."""
    line = listfile.readline()
    if not isinstance(line, str):
        # gzip.GzipFile yields bytes on Python 3; the comparisons in
        # process_wordlist need text.
        line = line.decode('UTF-8')
    return line


def process_wordlist(filename, word_handler, show_progress = False):
    """Read a word list in XML format and call word_handler for each word.

    filename -- path of the word list; a name ending with ".gz" is read
        as a gzip compressed file.
    word_handler -- callable that receives the document element of each
        parsed <word>...</word> section.
    show_progress -- if true, "#" is written to stdout after every 1000
        words and a newline after the last word.

    Words are the line groups that follow the line
    '<wordlist xml:lang="fi">' and each end with a '</word>' line; the
    line '</wordlist>' ends the list. If the file ends before the opening
    line, in the middle of a word or before the closing line, a
    "Malformed file" message is written to stderr and the function
    returns. Words handled before that point stay handled. The file is
    closed in every case.
    """
    if filename.endswith(".gz"):
        listfile = gzip.GzipFile(filename, 'r')
    else:
        listfile = open(filename, 'r')
    try:
        # Skip everything up to and including the opening tag line.
        while True:
            line = _read_wordlist_line(listfile)
            if line == '':
                sys.stderr.write("Malformed file " + filename + "\n")
                return
            if line.rstrip('\r\n') == '<wordlist xml:lang="fi">':
                break

        wcount = 0
        while True:
            line = _read_wordlist_line(listfile)
            # Marker lines are compared without their line terminator so
            # that a last line without a newline is still recognised.
            if line.rstrip('\r\n') == '</wordlist>':
                break
            wordlines = []
            while line != '' and line.rstrip('\r\n') != '</word>':
                wordlines.append(line)
                line = _read_wordlist_line(listfile)
            if line == '':
                # End of file inside a word or before the closing tag.
                sys.stderr.write("Malformed file " + filename + "\n")
                return
            wordlines.append(line)
            word = xml.dom.minidom.parseString("".join(wordlines))
            word_handler(word.documentElement)
            wcount = wcount + 1
            if show_progress and wcount % 1000 == 0:
                sys.stdout.write("#")
                sys.stdout.flush()

        if show_progress:
            sys.stdout.write("\n")
    finally:
        listfile.close()
|