Tree - source-git/malaga-suomi-voikko

source-git / malaga-suomi-voikko

Blame common/voikkoutils.py

Blob History Raw

Packit	1f3717	`# -- coding: utf-8 --`
Packit	1f3717
Packit	1f3717	`# Copyright 2007 Harri Pitkänen (hatapitk@iki.fi)`
Packit	1f3717
Packit	1f3717	`# This program is free software; you can redistribute it and/or modify`
Packit	1f3717	`# it under the terms of the GNU General Public License as published by`
Packit	1f3717	`# the Free Software Foundation; either version 2 of the License, or`
Packit	1f3717	`# (at your option) any later version.`
Packit	1f3717	`#`
Packit	1f3717	`# This program is distributed in the hope that it will be useful,`
Packit	1f3717	`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
Packit	1f3717	`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
Packit	1f3717	`# GNU General Public License for more details.`
Packit	1f3717	`#`
Packit	1f3717	`# You should have received a copy of the GNU General Public License`
Packit	1f3717	`# along with this program; if not, write to the Free Software`
Packit	1f3717	`# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`
Packit	1f3717
Packit	1f3717	`# This module contains general helper functions and classes for use`
Packit	1f3717	`# with Python and Voikko.`
Packit	1f3717
Packit	1f3717	`import codecs`
Packit	1f3717	`import os`
Packit	1f3717	`import locale`
Packit	1f3717	`import sys`
Packit	1f3717	`import xml.dom.minidom`
Packit	1f3717	`import gzip`
Packit	1f3717
# Word classes
NOUN=1
ADJECTIVE=2
VERB=3

# Vowel types. VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH are the values returned
# by the vowel type detection functions of this module.
# NOTE(review): VOWEL_DEFAULT is not used within this module -- its meaning
# should be confirmed against the callers.
VOWEL_DEFAULT=0
VOWEL_FRONT=1
VOWEL_BACK=2
VOWEL_BOTH=3

# Gradation types
# NOTE(review): SW/WS presumably stand for strong->weak and weak->strong
# consonant gradation; none of the GRAD_* values is used within this module,
# so confirm against the callers.
GRAD_NONE = 0
GRAD_SW = 1
GRAD_WS = 2

GRAD_WEAK = 3
GRAD_STRONG = 4
Packit	1f3717
class FlagAttribute(object):
    """Vocabulary flag attribute.

    Plain record describing one vocabulary flag. Instances are created
    without arguments and their fields are filled in afterwards (this is
    what readFlagAttributes does). The class-level values below act as the
    defaults for fields that are never assigned.
    """
    # Integer parsed from the first column of a flag definition line.
    # NOTE(review): presumably an identifier in the Joukahainen vocabulary
    # database -- confirm.
    joukahainen = 0
    # Group part of the 'xmlGroup/xmlFlag' key, e.g. 'compounding'.
    xmlGroup = None
    # Flag part of the 'xmlGroup/xmlFlag' key, e.g. 'ei_ys'.
    xmlFlag = None
    # Name of the flag on the Malaga side; stays None when the definition
    # file has '-' in this column.
    malagaFlag = None
    # Free-text description; stays None when the definition has none.
    description = None

    def __repr__(self):
        # Show every field so that flag maps are readable when debugging.
        return ('FlagAttribute(joukahainen=%r, xmlGroup=%r, xmlFlag=%r, '
                'malagaFlag=%r, description=%r)'
                % (self.joukahainen, self.xmlGroup, self.xmlFlag,
                   self.malagaFlag, self.description))
Packit	1f3717
## Strip the comment part from a given line of text.
# Everything from the first '#' character to the end of the line is dropped.
# A line without '#' is returned unchanged, and a line that starts with '#'
# becomes the empty string.
def removeComments(line):
    # partition() yields the whole line as the head when '#' is absent.
    before_hash = line.partition(u'#')[0]
    return before_hash
Packit	1f3717
def readFlagAttributes(filename):
    """Returns a map of flag attributes from given file. The keys in the
    map are in form xmlGroup/xmlFlag, such as 'compounding/ei_ys'.

    The file is read as UTF-8. Comments (from '#' to the end of the line)
    and blank lines are ignored. Every remaining line consists of
    whitespace separated columns:

        <joukahainen> <xmlGroup>/<xmlFlag> [<malagaFlag> [<description>]]

    where <joukahainen> is an integer, a <malagaFlag> of '-' means "no
    Malaga flag" and <description> is the free-text rest of the line. The
    last two columns may be left out, in which case the corresponding
    fields keep their default value None.

    Raises ValueError if a line has no xmlGroup/xmlFlag column or if its
    first column is not an integer.
    """
    flags = {}
    # The with block closes the file even when a line fails to parse.
    with codecs.open(filename, 'r', 'UTF-8') as inputfile:
        for rawline in inputfile:
            line = removeComments(rawline).strip()
            if not line:
                continue
            # At most four columns; the description keeps its inner spaces.
            columns = line.split(None, 3)
            if len(columns) < 2 or u'/' not in columns[1]:
                raise ValueError(u'Malformed flag definition in %s: %r'
                                 % (filename, line))
            f = FlagAttribute()
            f.joukahainen = int(columns[0])
            group, _, flag = columns[1].partition(u'/')
            f.xmlGroup = group
            f.xmlFlag = flag
            if len(columns) > 2 and columns[2] != u'-':
                f.malagaFlag = columns[2]
            if len(columns) > 3:
                f.description = columns[3]
            flags[f.xmlGroup + u'/' + f.xmlFlag] = f
    return flags
Packit	1f3717
## Determines which vowels may appear in the suffixes of a given simple word.
# The result is one of VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH. The decision is
# based on the positions of the last back vowel (a, o, å, u), the last
# ordinary front vowel (ä, ö) and the last 'y' in the lowercased word.
def _simple_vowel_type(word):
    lowered = word.lower()
    back_pos = max(lowered.rfind(vowel) for vowel in u'aoåu')
    front_pos = max(lowered.rfind(vowel) for vowel in u'äö')
    y_pos = lowered.rfind(u'y')

    if back_pos == -1:
        # Only front vowels, or no harmony vowels at all.
        return VOWEL_FRONT
    if max(front_pos, y_pos) == -1:
        # Back vowels only.
        return VOWEL_BACK

    # Both kinds are present. A 'y' after every other harmony vowel leaves
    # the question open, otherwise the later of the two groups decides.
    if y_pos >= max(back_pos, front_pos):
        return VOWEL_BOTH
    if back_pos > front_pos:
        return VOWEL_BACK
    return VOWEL_FRONT
Packit	1f3717
## Returns the autodetected vowel type of inflection suffixes for a word form.
# If the word form contains '=' or '-', only the part after the last such
# character is examined. If it contains '|', both the whole word form and the
# part after the first '|' are examined, and VOWEL_BOTH is returned unless the
# two results agree. A word form ending in '=', '-' or '|' is not allowed and
# gives VOWEL_BOTH.
def get_wordform_infl_vowel_type(wordform):
    final_index = len(wordform) - 1

    # Last '=' or '-': recurse into whatever follows it.
    boundary = max(wordform.rfind(u'='), wordform.rfind(u'-'))
    if boundary == final_index:
        return VOWEL_BOTH # Not allowed
    if boundary != -1:
        return get_wordform_infl_vowel_type(wordform[boundary + 1:])

    # First '|': combine the whole word form with the trailing part.
    bar = wordform.find(u'|')
    if bar == final_index:
        return VOWEL_BOTH # Not allowed
    whole_type = _simple_vowel_type(wordform)
    if bar == -1:
        return whole_type
    tail_type = get_wordform_infl_vowel_type(wordform[bar + 1:])
    return whole_type if whole_type == tail_type else VOWEL_BOTH
Packit	1f3717
def get_preference(prefname):
    """Returns the value of given preference.

    Known preferences are 'svnroot', 'voikkotest_dir',
    'voikkotest_build_options', 'voikko_data_dir', 'encoding',
    'libvoikko_bin' and 'diffviewcmd'. If the optional module
    voikko_dev_prefs can be imported and defines an attribute with the
    name of a known preference, that attribute is returned. Otherwise a
    built-in default is used. None is returned for unknown preferences.

    The defaults of 'svnroot', 'voikkotest_dir' and 'voikko_data_dir' are
    located under the home directory of the user. The HOME environment
    variable is used when it is set; when it is not, the home directory is
    looked up with os.path.expanduser instead of failing with KeyError.
    """
    try:
        import voikko_dev_prefs
    except ImportError:
        voikko_dev_prefs = None

    known_prefs = ('svnroot', 'voikkotest_dir', 'voikkotest_build_options',
                   'voikko_data_dir', 'encoding', 'libvoikko_bin',
                   'diffviewcmd')
    if prefname not in known_prefs:
        return None

    # User supplied override, if there is one.
    if voikko_dev_prefs is not None and hasattr(voikko_dev_prefs, prefname):
        return getattr(voikko_dev_prefs, prefname)

    # Defaults that live under the home directory.
    home_relative = {
        'svnroot': '/svn/voikko',
        'voikkotest_dir': '/tmp/voikkotest',
        'voikko_data_dir': '/svn/voikko/trunk/data',
    }
    if prefname in home_relative:
        home = os.environ.get('HOME')
        if home is None:
            home = os.path.expanduser('~')
        return home + home_relative[prefname]

    if prefname == 'encoding':
        return locale.getpreferredencoding()

    fixed_defaults = {
        'voikkotest_build_options': '',
        'libvoikko_bin': '/usr/bin',
        'diffviewcmd': 'diff -U 0 "%s" "%s" | grep ^.C: 2>/dev/null | less',
    }
    return fixed_defaults[prefname]
Packit	1f3717
## Returns True, if given character is a consonant, otherwise returns False.
# The check is case insensitive and covers the letters listed below. Only a
# single character can be a consonant: the empty string and strings of several
# characters give False (a plain substring test would accept both).
def is_consonant(letter):
    return len(letter) == 1 and letter.lower() in u'qwrtpsdfghjklzxcvbnm'
Packit	1f3717
## Function that returns the type of vowels that are allowed in the affixes for given word.
# The possible values are VOWEL_FRONT, VOWEL_BACK and VOWEL_BOTH.
# The rules are exactly those of _simple_vowel_type, so this public function
# delegates to it instead of carrying a second copy of the same logic.
def vowel_type(word):
    return _simple_vowel_type(word)
Packit	1f3717
Packit	1f3717
## Expands capital letters to useful character classes for regular expressions.
# V = vowel, C = consonant, A = a/ä, O = o/ö, U = u/y. Each capital letter in
# the pattern is replaced with a non-capturing group of alternatives; all other
# characters are left as they are.
def capital_char_regexp(pattern):
    expansions = (
        ('V', u'(?:a|e|i|o|u|y|ä|ö|é|è|á|ó|â)'),
        ('C', u'(?:b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z|š|ž)'),
        ('A', u'(?:a|ä)'),
        ('O', u'(?:o|ö)'),
        ('U', u'(?:u|y)'),
    )
    expanded = pattern
    for capital, char_class in expansions:
        expanded = expanded.replace(capital, char_class)
    return expanded
Packit	1f3717
## Reads the word list in XML format specified by filename. If the name ends
# with .gz, the file is assumed to be gzip compressed. Calls function word_handler
# for each word, passing the document element of a XML Document object that
# represents the word as a parameter.
# If show_progress == True, prints progress information to stdout.
#
# The file is read as bytes, so plain and compressed files behave the same way
# on every Python version; the XML parser treats the bytes of a word as UTF-8
# because a single word carries no encoding declaration. The structural lines
# (<wordlist xml:lang="fi">, </word>, </wordlist>) must stand alone on a line;
# LF, CRLF and a missing final newline are all accepted.
#
# If the file ends before the word list is opened or closed, "Malformed file"
# is written to stderr and the function returns. Words that were complete up to
# that point have already been handed to word_handler. The file is closed in
# every case, also when word_handler raises.
def process_wordlist(filename, word_handler, show_progress = False):
    if filename.endswith(".gz"):
        listfile = gzip.GzipFile(filename, 'rb')
    else:
        listfile = open(filename, 'rb')
    try:
        # Skip everything up to and including the opening tag of the list.
        while True:
            line = listfile.readline()
            if not line:
                sys.stderr.write("Malformed file " + filename + "\n")
                return
            if line.rstrip(b'\r\n') == b'<wordlist xml:lang="fi">':
                break

        wcount = 0
        while True:
            line = listfile.readline()
            if line.rstrip(b'\r\n') == b'</wordlist>':
                break
            # Collect the lines of one word, up to its closing tag.
            wordlines = []
            while line and line.rstrip(b'\r\n') != b'</word>':
                wordlines.append(line)
                line = listfile.readline()
            if not line:
                # End of file inside the list: stop instead of looping forever.
                if show_progress:
                    sys.stdout.write("\n")
                sys.stderr.write("Malformed file " + filename + "\n")
                return
            wordlines.append(line)
            word = xml.dom.minidom.parseString(b''.join(wordlines))
            word_handler(word.documentElement)
            wcount = wcount + 1
            if show_progress and wcount % 1000 == 0:
                sys.stdout.write("#")
                sys.stdout.flush()

        if show_progress:
            sys.stdout.write("\n")
    finally:
        listfile.close()

source-git / malaga-suomi-voikko

Source Code

Blame common/voikkoutils.py