Blame sukija/generate_lex.py

Packit 1f3717
# -*- coding: utf-8 -*-
Packit 1f3717
Packit 1f3717
# Copyright 2007 Harri Pitkänen (hatapitk@iki.fi)
Packit 1f3717
#           2013 Hannu Väisänen (Hannu.Vaisanen@uef.fi)
Packit 1f3717
#
Packit 1f3717
# Program to generate lexicon files for Suomi-malaga
Packit 1f3717
Packit 1f3717
# This program is free software; you can redistribute it and/or modify
Packit 1f3717
# it under the terms of the GNU General Public License as published by
Packit 1f3717
# the Free Software Foundation; either version 2 of the License, or
Packit 1f3717
# (at your option) any later version.
Packit 1f3717
#
Packit 1f3717
# This program is distributed in the hope that it will be useful,
Packit 1f3717
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 1f3717
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 1f3717
# GNU General Public License for more details.
Packit 1f3717
#
Packit 1f3717
# You should have received a copy of the GNU General Public License
Packit 1f3717
# along with this program; if not, write to the Free Software
Packit 1f3717
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
Packit 1f3717
Packit 1f3717
# Path to target directory
Packit 1f3717
SUKIJA_LEX = u"sukija/voikonsanat"
Packit 1f3717
Packit 1f3717
import sys
Packit 1f3717
sys.path.append("common")
Packit 1f3717
sys.path.append("sukija")
Packit 1f3717
import generate_lex_common
Packit 1f3717
import voikkoutils
Packit 1f3717
import sukija
Packit 1f3717
import hfconv
Packit 1f3717
import xml.dom.minidom
Packit 1f3717
import codecs
Packit 1f3717
import getopt
Packit 1f3717
Packit 1f3717
import locale
Packit 1f3717
#print locale.getlocale(locale.LC_ALL)
Packit 1f3717
locale.setlocale(locale.LC_ALL, '')
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Directory argument handed to every open_lex() call below.
path = SUKIJA_LEX

# Flag attributes are loaded from flags.txt under the vocabulary data directory.
flag_attributes = voikkoutils.readFlagAttributes(
	generate_lex_common.VOCABULARY_DATA + u"/flags.txt"
)

# Main lexicon output, opened through open_lex() like the special ones below.
main_vocabulary = generate_lex_common.open_lex(path, "joukahainen.lex")

# One additional lexicon per SPECIAL_VOCABULARY entry, keyed by the entry's
# third element (presumably the lexicon file name, since it is passed to
# open_lex() in the same position as "joukahainen.lex" -- confirm against
# generate_lex_common).
vocabulary_files = dict(
	(entry[2], generate_lex_common.open_lex(path, entry[2]))
	for entry in generate_lex_common.SPECIAL_VOCABULARY
)
Packit 1f3717
Packit 1f3717
Packit 1f3717
# Open the Joukahainen word list and consume its preamble: read lines up to
# and including the opening <wordlist> tag, so the next readline() returns
# the first line that follows the tag.
listfile = open(generate_lex_common.VOCABULARY_DATA + u'/joukahainen.xml', 'r')

# readline() returns '' only at end of file, so iterating with '' as the
# sentinel stops there; the else branch runs only when the tag was never seen.
for line in iter(listfile.readline, ''):
	if line == '<wordlist xml:lang="fi">\n':
		break
else:
	sys.stderr.write(
		"Malformed file " + generate_lex_common.VOCABULARY_DATA
		+ "/joukahainen.xml\n"
	)
	sys.exit(1)
Packit 1f3717
Packit 1f3717
# Process the word list one <word> element at a time: gather the element's
# lines, parse them as a small XML document and pass the root element to
# sukija.handle_word() together with the open lexicon files. A '#' is printed
# after every 1000 words as a progress indicator.
wcount = 0
while True:
	line = listfile.readline()
	# Accept the closing tag with or without a trailing newline, so a file
	# that lacks a final newline still terminates cleanly.
	if line in ("</wordlist>\n", "</wordlist>"):
		break

	# Collect the lines of one element, up to and including '</word>'.
	word_lines = []
	while line != '</word>\n':
		if line == '':
			# readline() returns '' only at end of file. Without this check a
			# truncated file would keep this loop spinning forever.
			sys.stderr.write("Malformed file " + generate_lex_common.VOCABULARY_DATA +
			                 "/joukahainen.xml\n")
			sys.exit(1)
		word_lines.append(line)
		line = listfile.readline()
	word_lines.append(line)

	word = xml.dom.minidom.parseString("".join(word_lines))
	sukija.handle_word(main_vocabulary, vocabulary_files, word.documentElement)

	wcount = wcount + 1
	if wcount % 1000 == 0:
		sys.stdout.write("#")
		sys.stdout.flush()

# Finish the progress line, then release the input and all lexicon files.
sys.stdout.write("\n")
listfile.close()
main_vocabulary.close()
# values() works on both Python 2 and 3 (iteritems() is Python 2 only), and
# only the file objects are needed here.
for lex_file in vocabulary_files.values():
	lex_file.close()
Packit 1f3717