Tree - source-git/hunspell-ko - CentOS Git server

source-git / hunspell-ko

Files

Commit: 6280359e2a385e8bab8a2266dd747e18a9a0027e
Blob Blame History Raw
# aff/dic output

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Hunspell Korean spellchecking dictionary.
#
# The Initial Developer of the Original Code is
# Changwoo Ryu.
# Portions created by the Initial Developer are Copyright (C) 2009
# the Initial Developer. All Rights Reserved.
#
# Contributor(s): See CREDITS file
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

import sys
import unicodedata
import json

import config
import suffix
import josa
import encoding
from flags import *


def ENC(unistr):
    if config.internal_encoding == '2+RST':
        return encoding.encode(unistr)
    else:
        return unicodedata.normalize('NFD', unistr)


def warn(s):
    return sys.stderr.write(s + '\n')


def progress(s):
    return sys.stderr.write('Progress: ' + s + '...\n')


class Word:
    def __init__(self):
        self.word = ''
        self.pos = ''
        self.props = []
        self.stem = ''
        self.flags = []
        self.flags_alias = -1
        self.morph_alias = -1

    def __hash__(self):
        return (self.word + self.pos).__hash__()

    # to make it orderable
    def __lt__(self, other):
        if self.word < other.word:
            return True
        if self.pos < other.pos:
            return True
        # FIXME: 이렇게 하면 순서가 다를텐데. set에서 뭐가 먼저 나올지 알고...
        for prop in other.props:
            if prop not in self.props:
                return True
        return False

    def __repr__(self):
        return 'Word %s pos:%s' % (self.word, self.pos)

    def attach_flags(word):
        pos_default_flags = {
            '명사': [],
            '대명사': [],
            '수사': [],
            '특수:복수접미사': [plural_suffix_flag],
            '특수:알파벳': [alpha_flag],
            '특수:숫자': [digit_flag],
            '특수:금지어': [forbidden_flag],
            '내부:활용:-어': [conjugation_eo_flag],
            '내부:활용:-은': [conjugation_eun_flag],
            '내부:활용:-을': [conjugation_eul_flag],
        }
        try:
            word.flags = pos_default_flags[word.pos]
        except KeyError:
            pass
        if word.pos == '동사' or word.pos == '형용사':
            word.flags += suffix.find_flags(word.word, word.pos, word.props)
        word.flags += josa.find_flags(word.word, word.pos, word.props)
        prop_default_flags = {
            '단위명사': [counter_flag],
            '보조용언:-어': [auxiliary_eo_flag],
            '보조용언:-은': [auxiliary_eun_flag],
            '보조용언:-을': [auxiliary_eul_flag],
            '수:1': [number_1_flag],
            '수:10': [number_10_flag],
            '수:100': [number_100_flag],
            '수:1000': [number_1000_flag],
            '수:10000': [number_10000_flag],
            '고유수:1': [knumber_1_flag],
            '고유수:10': [knumber_10_flag],
        }
        for prop in word.props:
            try:
                word.flags += prop_default_flags[prop]
            except KeyError:
                pass
        word.flags.sort()


class Dictionary:
    def __init__(self):
        self.words = set()
        self.flag_aliases = []
        self.morph_aliases = []

    def add(self, word):
        self.words.add(word)

    def remove(self, word):
        self.words.remove(word)

    def append(self, words):
        for w in words:
            self.words.add(w)

    def load_json(self, infile):
        d = json.load(infile)
        for entry in d['entries']:
            w = Word()
            w.word = entry['word']
            w.pos = entry['pos']
            if 'props' in entry:
                w.props = entry['props']
            if 'stem' in entry:
                w.stem = entry['stem']
            self.add(w)

    def process(self):
        progress('복수형 확장')
        self.expand_plurals()
        if config.expand_auxiliary_attached:
            progress('플래그 계산')
            self.attach_flags()
            progress('보조용언 확장')
            self.expand_auxiliary()
        else:
            progress('보조용언 확장')
            self.expand_auxiliary()
            progress('플래그 계산')
            self.attach_flags()
        if config.output_word_morph:
            self.attach_morph()

    def output(self, afffile, dicfile):
        progress('dic 출력')
        self.output_dic(dicfile)
        progress('aff 출력')
        self.output_aff(afffile)

    ######################################################################

    def output_dic(self, outfile):
        outfile.write('%d\n' % len(self.words))
        for word in sorted(list(self.words)):
            line = '%s' % word.word
            if word.flags_alias > 0:
                line += ('/%d' % word.flags_alias)
            if config.output_word_stem:
                if word.stem:
                    line += (' st:%s' % word.stem)
            if config.output_word_morph:
                if word.morph_alias > 0:
                    line += (' %d' % word.morph_alias)
            line += '\n'
            outfile.write(ENC(line))

    def output_aff(self, outfile):
        from string import Template
        import aff
        template = Template(open('template.aff', encoding='utf-8').read())

        # 주의: flag alias를 변경하므로 get_AF() 앞에 와야 한다.
        suffix_str = aff.get_suffix_defines(self.flag_aliases)
        josa_str = aff.get_josa_defines(self.flag_aliases)
        af_str = self.get_AF()
        am_str = self.get_AM()

        d = {'version': config.version,
             'required_hunspell': '%d.%d.%d' % config.required_hunspell_version,
             'url': 'https://spellcheck-ko.github.io/',
             'CONV': aff.CONV_DEFINES,
             'AF': af_str,
             'AM': am_str,
             'forbidden_flag': str(forbidden_flag),
             'trychars': aff.TRYCHARS,
             'MAP': aff.MAP_DEFINES,
             'REP': aff.REP_DEFINES,
             'COMPOUNDRULE': aff.COMPOUNDRULE_DEFINES,
             'JOSA': josa_str,
             'SUFFIX': suffix_str, }
        outfile.write(template.substitute(d))

    def get_AF(self):
        aliases = self.flag_aliases
        if len(aliases) > 0:
            result = 'AF %d\n' % len(aliases)
            for flags in aliases:
                result += 'AF %s\n' % ','.join('%d' % f for f in flags)
            return result
        else:
            return ''

    def get_AM(self):
        aliases = self.morph_aliases
        if len(aliases) > 0:
            result = 'AM %d\n' % len(aliases)
            for morph in aliases:
                result += 'AM %s\n' % morph
            return result
        else:
            return ''

    ######################################################################

    def attach_flags(self):
        aliases = []
        for word in self.words:
            word.attach_flags()
            if word.flags:
                if word.flags not in aliases:
                    aliases.append(word.flags)
                word.flags_alias = aliases.index(word.flags) + 1
        self.flag_aliases = aliases

    def attach_morph(self):
        aliases = []
        for word in self.words:
            morph = ''
            if morph:
                if morph not in aliases:
                    aliases.append(morph)
                word.morph_alias = aliases.index(morph) + 1
        self.morph_aliases = aliases

    def expand_plurals(self):
        new_words = []
        for word in self.words:
            pos_base = word.pos.split(':')[0]
            if pos_base != '명사':
                continue
            elif word.pos == '대명사':
                if word.word == '너' or word.word == '나':
                    continue
            new_word = Word()
            new_word.word = word.word + '들'
            new_word.pos = word.pos
            new_word.props = [p for p in word.props if p != '가산명사']
            new_word.stem = word.word
            new_words.append(new_word)
        self.append(new_words)

    def expand_auxiliary(self):
        new_words = []
        forms = ['-어', '-은', '-을']
        verbs = [w for w in self.words if w.pos in ['동사', '형용사']]
        for form in forms:
            auxiliaries = [w for w in verbs if ('보조용언:' + form) in w.props]
            for verb in verbs:
                # 본용언이 용언+용언 합성용언이면 붙여 쓸 수 없다
                if '용언합성' in verb.props:
                    continue
                prefixes = suffix.make_conjugations(verb.word,
                                                    verb.pos, verb.props, form)
                if config.expand_auxiliary_attached:
                    for auxiliary in auxiliaries:
                        # 본용언이 해당 보조용언으로 끝나는 합성어인 경우 생략
                        # 예: 다가오다 + 오다 => 다가와오다 (x)
                        if verb.word != auxiliary.word:
                            if verb.word.endswith(auxiliary.word):
                                continue
                        new_props = [p for p in auxiliary.props
                                     if not p.startswith('보조용언:')]
                        for prefix in prefixes:
                            new_word = Word()
                            new_word.word = prefix + auxiliary.word
                            new_word.pos = auxiliary.pos
                            new_word.stem = verb.word
                            new_word.props = new_props
                            new_word.flags = auxiliary.flags
                            new_word.flags_alias = auxiliary.flags_alias
                            new_words.append(new_word)
                else:
                    for prefix in prefixes:
                        new_word = Word()
                        new_word.word = prefix
                        new_word.stem = verb.word
                        new_word.pos = '내부:활용:' + form
                        new_words.append(new_word)
        self.append(new_words)

if __name__ == '__main__':
    afffilename = sys.argv[1]
    dicfilename = sys.argv[2]
    infilenames = sys.argv[3:]
    dic = Dictionary()
    for filename in infilenames:
        if filename.endswith('.json'):
            dic.load_json(open(filename, encoding='utf-8'))
        else:
            print('ERROR: unknown file type: ' + filename)
            sys.exit(1)
    dic.process()
    dic.output(open(afffilename, 'w', encoding='utf-8'),
               open(dicfilename, 'w', encoding='utf-8'))
source-git / hunspell-ko

Source Code

Files