Tree - source-git/hunspell-ko - CentOS Git server

source-git / hunspell-ko

Files

Commit: 91e7ec5c259aa6ee15590c96dc94f5ccbb2d0ef7
Blob Blame History Raw
# Internal encoding conversion

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is Hunspell Korean spellchecking dictionary.
#
# The Initial Developer of the Original Code is
# Changwoo Ryu.
# Portions created by the Initial Developer are Copyright (C) 2008, 2009, 2010
# the Initial Developer. All Rights Reserved.
#
# Contributor(s): See CREDITS file
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

import unicodedata

# Marker character (U+E000) that Encoder inserts before a standalone jamo
# whose strokes must not be merged with the preceding strokes; Decoder
# flushes its pending syllable and restarts whenever it reads this character.
RESET_CODE = '\uE000'


class Encoder:
    """Convert Hangul text into the internal "stroke" encoding.

    Precomposed syllables are split into their jamo, and every jamo
    (including compound consonants and vowels) is spelled out as a sequence
    of simple compatibility-jamo strokes.  Standalone jamo in the input are
    encoded the same way, with RESET_CODE inserted wherever the strokes
    would otherwise be ambiguous with the strokes of a neighbouring letter.

    The encoder is stateful while a string is being processed:
    ``state`` records whether the previous output ended in a consonant or a
    vowel, and ``last`` holds the last stroke emitted.  ``encode()`` resets
    both at the start of every call.
    """

    STATE_INITIAL = 0
    STATE_C = 1
    STATE_V = 2

    # Modern conjoining jamo (the ones NFD produces for precomposed
    # syllables) -> stroke sequence.
    JAMO2STROKES = {
        '\u1100': 'ㄱ',
        '\u1101': 'ㄲ',
        '\u1102': 'ㄴ',
        '\u1103': 'ㄷ',
        '\u1104': 'ㄸ',
        '\u1105': 'ㄹ',
        '\u1106': 'ㅁ',
        '\u1107': 'ㅂ',
        '\u1108': 'ㅃ',
        '\u1109': 'ㅅ',
        '\u110A': 'ㅆ',
        '\u110B': 'ㅇ',
        '\u110C': 'ㅈ',
        '\u110D': 'ㅉ',
        '\u110E': 'ㅊ',
        '\u110F': 'ㅋ',
        '\u1110': 'ㅌ',
        '\u1111': 'ㅍ',
        '\u1112': 'ㅎ',
        '\u1161': 'ㅏ',
        '\u1162': 'ㅐ',
        '\u1163': 'ㅑ',
        '\u1164': 'ㅒ',
        '\u1165': 'ㅓ',
        '\u1166': 'ㅔ',
        '\u1167': 'ㅕ',
        '\u1168': 'ㅖ',
        '\u1169': 'ㅗ',
        '\u116A': 'ㅗㅏ',
        '\u116B': 'ㅗㅐ',
        '\u116C': 'ㅗㅣ',
        '\u116D': 'ㅛ',
        '\u116E': 'ㅜ',
        '\u116F': 'ㅜㅓ',
        '\u1170': 'ㅜㅔ',
        '\u1171': 'ㅜㅣ',
        '\u1172': 'ㅠ',
        '\u1173': 'ㅡ',
        '\u1174': 'ㅡㅣ',
        '\u1175': 'ㅣ',
        '\u11A8': 'ㄱ',
        '\u11A9': 'ㄲ',
        '\u11AA': 'ㄱㅅ',
        '\u11AB': 'ㄴ',
        '\u11AC': 'ㄴㅈ',
        '\u11AD': 'ㄴㅎ',
        '\u11AE': 'ㄷ',
        '\u11AF': 'ㄹ',
        '\u11B0': 'ㄹㄱ',
        '\u11B1': 'ㄹㅁ',
        '\u11B2': 'ㄹㅂ',
        '\u11B3': 'ㄹㅅ',
        '\u11B4': 'ㄹㅌ',
        '\u11B5': 'ㄹㅍ',
        '\u11B6': 'ㄹㅎ',
        '\u11B7': 'ㅁ',
        '\u11B8': 'ㅂ',
        '\u11B9': 'ㅂㅅ',
        '\u11BA': 'ㅅ',
        '\u11BB': 'ㅆ',
        '\u11BC': 'ㅇ',
        '\u11BD': 'ㅈ',
        '\u11BE': 'ㅊ',
        '\u11BF': 'ㅋ',
        '\u11C0': 'ㅌ',
        '\u11C1': 'ㅍ',
        '\u11C2': 'ㅎ',
    }

    # Every conjoining jamo the encoder knows about: the modern set above
    # plus the archaic letters listed here.  Built from JAMO2STROKES so the
    # shared entries cannot drift apart.
    JAMO2STROKES_ALL = dict(JAMO2STROKES)
    JAMO2STROKES_ALL.update({
        # Archaic initial consonants
        '\u1113': 'ㄴㄱ',
        '\u1114': 'ㄴㄴ',
        '\u1115': 'ㄴㄷ',
        '\u1116': 'ㄴㅂ',
        '\u1117': 'ㄷㄱ',
        '\u1118': 'ㄹㄴ',
        '\u1119': 'ㄹㄹ',
        '\u111A': 'ㄹㅎ',
        '\u111B': 'ㄹㅇ',
        '\u111C': 'ㅁㅂ',
        '\u111D': 'ㅁㅇ',
        '\u111E': 'ㅂㄱ',
        '\u111F': 'ㅂㄴ',
        '\u1120': 'ㅂㄷ',
        '\u1121': 'ㅂㅅ',
        '\u1122': 'ㅂㅅㄱ',
        '\u1123': 'ㅂㅅㄷ',
        '\u1124': 'ㅂㅅㅂ',
        '\u1125': 'ㅂㅆ',
        '\u1126': 'ㅂㅅㅈ',
        '\u1127': 'ㅂㅈ',
        '\u1128': 'ㅂㅊ',
        '\u1129': 'ㅂㅌ',
        '\u112A': 'ㅂㅍ',
        '\u112B': 'ㅂㅇ',
        '\u112C': 'ㅃㅇ',
        '\u112D': 'ㅅㄱ',
        '\u112E': 'ㅅㄴ',
        '\u112F': 'ㅅㄷ',
        '\u1130': 'ㅅㄹ',
        '\u1131': 'ㅅㅁ',
        '\u1132': 'ㅅㅂ',
        '\u1133': 'ㅅㅂㄱ',
        '\u1134': 'ㅆㅅ',
        '\u1135': 'ㅅㅇ',
        '\u1136': 'ㅅㅈ',
        '\u1137': 'ㅅㅊ',
        '\u1138': 'ㅅㅋ',
        '\u1139': 'ㅅㅌ',
        '\u113A': 'ㅅㅍ',
        '\u113B': 'ㅅㅎ',
        '\u1141': 'ㅇㄱ',
        '\u1142': 'ㅇㄷ',
        '\u1143': 'ㅇㅁ',
        '\u1144': 'ㅇㅂ',
        '\u1145': 'ㅇㅅ',
        '\u1146': 'ㅇㅿ',
        '\u1147': 'ㅇㅇ',
        '\u1148': 'ㅇㅈ',
        '\u1149': 'ㅇㅊ',
        '\u114A': 'ㅇㅌ',
        '\u114B': 'ㅇㅍ',
        '\u114D': 'ㅈㅇ',
        '\u1152': 'ㅊㅋ',
        '\u1153': 'ㅊㅎ',
        '\u1156': 'ㅍㅂ',
        '\u1157': 'ㅍㅇ',
        # U+1158 (ssanghieuh); this entry used to repeat the key U+1156 and
        # thereby overwrote the mapping two lines above.
        '\u1158': 'ㅎㅎ',
        '\u115A': 'ㄱㄷ',
        '\u115B': 'ㄴㅅ',
        '\u115C': 'ㄴㅈ',
        '\u115D': 'ㄴㅎ',
        '\u115E': 'ㄷㄹ',

        # Archaic vowels
        '\u1176': 'ㅏㅗ',
        '\u1177': 'ㅏㅜ',
        '\u1178': 'ㅑㅗ',
        '\u1179': 'ㅑㅛ',
        '\u117A': 'ㅓㅗ',
        '\u117B': 'ㅓㅜ',
        '\u117C': 'ㅓㅡ',
        # U+117D is YEO-O; it used to repeat the value of U+117A (EO-O).
        '\u117D': 'ㅕㅗ',
        '\u117E': 'ㅕㅜ',
        '\u117F': 'ㅗㅓ',
        '\u1180': 'ㅗㅔ',
        '\u1181': 'ㅗㅖ',
        '\u1182': 'ㅗㅗ',
        '\u1183': 'ㅗㅜ',
        '\u1184': 'ㅛㅑ',
        '\u1185': 'ㅛㅒ',
        '\u1186': 'ㅛㅕ',
        '\u1187': 'ㅛㅗ',
        '\u1188': 'ㅛㅣ',
        '\u1189': 'ㅜㅏ',
        '\u118A': 'ㅜㅐ',
        '\u118B': 'ㅜㅓㅡ',
        '\u118C': 'ㅜㅖ',
        '\u118D': 'ㅜㅜ',
        '\u118E': 'ㅠㅏ',
        '\u118F': 'ㅠㅓ',
        '\u1190': 'ㅠㅔ',
        '\u1191': 'ㅠㅕ',
        '\u1192': 'ㅠㅖ',
        '\u1193': 'ㅠㅜ',
        '\u1194': 'ㅠㅣ',
        '\u1195': 'ㅡㅜ',
        '\u1196': 'ㅡㅡ',
        '\u1197': 'ㅡㅣㅜ',
        '\u1198': 'ㅣㅏ',
        '\u1199': 'ㅣㅑ',
        '\u119A': 'ㅣㅗ',
        '\u119B': 'ㅣㅜ',
        '\u119C': 'ㅣㅡ',
        '\u119D': 'ㅣㆍ',
        '\u119F': 'ㆍㅓ',
        '\u11A0': 'ㆍㅜ',
        '\u11A1': 'ㆍㅣ',
        '\u11A2': 'ㆍㆍ',
        '\u11A3': 'ㅏㅡ',
        '\u11A4': 'ㅑㅜ',
        '\u11A5': 'ㅕㅑ',
        '\u11A6': 'ㅗㅑ',
        '\u11A7': 'ㅗㅒ',

        # Archaic final consonants
        '\u11C3': 'ㄱㄹ',
        '\u11C4': 'ㄱㅅㄱ',
        '\u11C5': 'ㄴㄱ',
        '\u11C6': 'ㄴㄷ',
        '\u11C7': 'ㄴㅅ',
        '\u11C8': 'ㄴㅿ',
        '\u11C9': 'ㄴㅌ',
        '\u11CA': 'ㄷㄱ',
        '\u11CB': 'ㄷㄹ',
        '\u11CC': 'ㄹㄱㅅ',
        '\u11CD': 'ㄹㄴ',
        '\u11CE': 'ㄹㄷ',
        '\u11CF': 'ㄹㄷㅎ',
        '\u11D0': 'ㄹㄹ',
        '\u11D1': 'ㄹㅁㄱ',
        '\u11D2': 'ㄹㅁㅅ',
        '\u11D3': 'ㄹㅂㅅ',
        '\u11D4': 'ㄹㅂㅎ',
        '\u11D5': 'ㄹㅂㅇ',
        '\u11D6': 'ㄹㅆ',
        '\u11D7': 'ㄹㅿ',
        '\u11D8': 'ㄹㅋ',
        '\u11D9': 'ㄹㆆ',
        '\u11DA': 'ㅁㄱ',
        '\u11DB': 'ㅁㄹ',
        '\u11DC': 'ㅁㅂ',
        '\u11DD': 'ㅁㅅ',
        '\u11DE': 'ㅁㅆ',
        '\u11DF': 'ㅁㅿ',
        '\u11E0': 'ㅁㅊ',
        '\u11E1': 'ㅁㅎ',
        '\u11E2': 'ㅁㅇ',
        '\u11E3': 'ㅂㄹ',
        '\u11E4': 'ㅂㅍ',
        '\u11E5': 'ㅂㅎ',
        '\u11E6': 'ㅂㅇ',
        '\u11E7': 'ㅅㄱ',
        '\u11E8': 'ㅅㄷ',
        '\u11E9': 'ㅅㄹ',
        '\u11EA': 'ㅅㅂ',
        '\u11EC': 'ㅇㄱ',
        '\u11ED': 'ㅇㄲ',
        '\u11EE': 'ㅇㅇ',
        '\u11EF': 'ㅇㅋ',
        '\u11F1': 'ㆁㅅ',
        '\u11F2': 'ㆁㅿ',
        '\u11F3': 'ㅍㅂ',
        '\u11F4': 'ㅍㅇ',
        '\u11F5': 'ㅎㄴ',
        '\u11F6': 'ㅎㄹ',
        '\u11F7': 'ㅎㅁ',
        '\u11F8': 'ㅎㅂ',
        '\u11FA': 'ㄱㄴ',
        '\u11FB': 'ㄱㅂ',
        '\u11FC': 'ㄱㅊ',
        '\u11FD': 'ㄱㅋ',
        '\u11FE': 'ㄱㅎ',
        '\u11FF': 'ㄴㄴ',

        # Hangul Jamo Extended-A
        '\uA960': 'ㄷㅁ',
        '\uA961': 'ㄷㅂ',
        '\uA962': 'ㄷㅅ',
        '\uA963': 'ㄷㅈ',
        '\uA964': 'ㄹㄱ',
        '\uA965': 'ㄹㄲ',
        '\uA966': 'ㄹㄷ',
        '\uA967': 'ㄹㄸ',
        '\uA968': 'ㄹㅁ',
        '\uA969': 'ㄹㅂ',
        '\uA96A': 'ㄹㅃ',
        '\uA96B': 'ㄹㅂㅇ',
        '\uA96C': 'ㄹㅅ',
        '\uA96D': 'ㄹㅈ',
        '\uA96E': 'ㄹㅋ',
        '\uA96F': 'ㅁㄱ',
        '\uA970': 'ㅁㄷ',
        '\uA971': 'ㅁㅅ',
        '\uA972': 'ㅂㅅㅌ',
        '\uA973': 'ㅂㅋ',
        '\uA974': 'ㅂㅎ',
        '\uA975': 'ㅆㅂ',
        '\uA976': 'ㅇㄹ',
        '\uA977': 'ㅇㅎ',
        '\uA978': 'ㅉㅎ',
        '\uA979': 'ㅌㅌ',
        '\uA97A': 'ㅍㅎ',
        '\uA97B': 'ㅎㅅ',
        '\uA97C': 'ㆆㆆ',

        # Hangul Jamo Extended-B
        '\uD7B0': 'ㅗㅕ',
        '\uD7B1': 'ㅗㅗㅣ',
        '\uD7B2': 'ㅛㅏ',
        '\uD7B3': 'ㅛㅐ',
        '\uD7B4': 'ㅛㅓ',
        '\uD7B5': 'ㅜㅕ',
        '\uD7B6': 'ㅜㅣㅣ',
        # U+D7B7 is YU-AE; it used to repeat the value of U+118A (U-AE).
        '\uD7B7': 'ㅠㅐ',
        '\uD7B8': 'ㅠㅗ',
        '\uD7B9': 'ㅡㅏ',
        '\uD7BA': 'ㅡㅓ',
        '\uD7BB': 'ㅡㅔ',
        '\uD7BC': 'ㅡㅗ',
        '\uD7BD': 'ㅣㅑㅗ',
        '\uD7BE': 'ㅣㅒ',
        '\uD7BF': 'ㅣㅕ',
        '\uD7C0': 'ㅣㅖ',
        '\uD7C1': 'ㅣㅗㅣ',
        '\uD7C2': 'ㅣㅛ',
        '\uD7C3': 'ㅣㅠ',
        '\uD7C4': 'ㅣㅣ',
        '\uD7C5': 'ㆍㅏ',
        '\uD7C6': 'ㆍㅔ',

        '\uD7CB': 'ㄴㄹ',
        '\uD7CC': 'ㄴㅊ',
        '\uD7CD': 'ㄸ',
        '\uD7CE': 'ㄸㅂ',
        '\uD7CF': 'ㄷㅂ',
        '\uD7D0': 'ㄷㅅ',
        '\uD7D1': 'ㄷㅅㄱ',
        '\uD7D2': 'ㄷㅈ',
        '\uD7D3': 'ㄷㅊ',
        '\uD7D4': 'ㄷㅌ',
        '\uD7D5': 'ㄹㄲ',
        '\uD7D6': 'ㄹㄱㅎ',
        '\uD7D7': 'ㄹㄹㅋ',
        '\uD7D8': 'ㄹㅁㅎ',
        '\uD7D9': 'ㄹㅂㄷ',
        '\uD7DA': 'ㄹㅂㅍ',
        '\uD7DB': 'ㄹㆁ',
        '\uD7DC': 'ㄹㆆㅎ',
        '\uD7DD': 'ㄹㅇ',
        '\uD7DE': 'ㅁㄴ',
        '\uD7DF': 'ㅁㄴㄴ',
        '\uD7E0': 'ㅁㅁ',
        '\uD7E1': 'ㅁㅂㅅ',
        '\uD7E2': 'ㅁㅈ',
        '\uD7E3': 'ㅂㄷ',
        '\uD7E4': 'ㅂㄹㅍ',
        '\uD7E5': 'ㅂㅁ',
        '\uD7E6': 'ㅃ',
        '\uD7E7': 'ㅂㅅㄷ',
        '\uD7E8': 'ㅂㅈ',
        '\uD7E9': 'ㅂㅊ',
        '\uD7EA': 'ㅅㅁ',
        '\uD7EB': 'ㅅㅂㅇ',
        '\uD7EC': 'ㅆㄱ',
        '\uD7ED': 'ㅆㄷ',
        '\uD7EE': 'ㅅㅿ',
        '\uD7EF': 'ㅅㅈ',
        '\uD7F0': 'ㅅㅊ',
        '\uD7F1': 'ㅅㅌ',
        '\uD7F2': 'ㅅㅎ',
        '\uD7F3': 'ㅿㅂ',
        '\uD7F4': 'ㅿㅂㅇ',
        '\uD7F5': 'ㆁㅁ',
        '\uD7F6': 'ㆁㅎ',
        '\uD7F7': 'ㅈㅂ',
        '\uD7F8': 'ㅈㅃ',
        '\uD7F9': 'ㅉ',
        '\uD7FA': 'ㅍㅅ',
        '\uD7FB': 'ㅍㅌ',
    })

    # Compound compatibility jamo -> stroke sequence.  encode() only routes
    # U+3131..U+3163 to encode_compjamo(), so the archaic entries in the
    # second group are not reached through encode().
    # NOTE(review): the values for 'ㅰ', 'ㅱ' and 'ㆆ' look inconsistent
    # with the Unicode names of those letters - confirm before relying on
    # them; they are kept unchanged here.
    COMP2STROKES = {
        'ㄳ': 'ㄱㅅ',
        'ㄵ': 'ㄴㅈ',
        'ㄶ': 'ㄴㅎ',
        'ㄺ': 'ㄹㄱ',
        'ㄻ': 'ㄹㅁ',
        'ㄼ': 'ㄹㅂ',
        'ㄽ': 'ㄹㅅ',
        'ㄾ': 'ㄹㅌ',
        'ㄿ': 'ㄹㅍ',
        'ㅀ': 'ㄹㅎ',
        'ㅄ': 'ㅂㅅ',
        'ㅘ': 'ㅗㅏ',
        'ㅙ': 'ㅗㅐ',
        'ㅚ': 'ㅗㅣ',
        'ㅝ': 'ㅜㅓ',
        'ㅞ': 'ㅜㅔ',
        'ㅟ': 'ㅜㅣ',
        'ㅢ': 'ㅡㅣ',

        'ㅥ': 'ㄴㄴ',
        'ㅦ': 'ㄴㄷ',
        'ㅧ': 'ㄴㅅ',
        'ㅨ': 'ㄴㅿ',
        'ㅩ': 'ㄹㄱㅅ',
        'ㅪ': 'ㄹㄷ',
        'ㅫ': 'ㄹㅂㅅ',
        'ㅬ': 'ㄹㅿ',
        'ㅭ': 'ㄹㆆ',
        'ㅮ': 'ㅁㅂ',
        'ㅯ': 'ㅁㅅ',
        'ㅰ': 'ㅇㅇ',
        'ㅱ': 'ㅁㅿ',
        'ㅲ': 'ㅂㄱ',
        'ㅳ': 'ㅂㄷ',
        'ㅴ': 'ㅂㅅㄱ',
        'ㅵ': 'ㅂㅅㄷ',
        'ㅶ': 'ㅂㅈ',
        'ㅷ': 'ㅂㅌ',
        'ㅸ': 'ㅂㅇ',
        'ㅹ': 'ㅃㅇ',
        'ㅺ': 'ㅅㄱ',
        'ㅻ': 'ㅅㄴ',
        'ㅼ': 'ㅅㄷ',
        'ㅽ': 'ㅅㅂ',
        'ㅾ': 'ㅅㅈ',
        'ㅿ': 'ㅿ',
        'ㆀ': 'ㅇㅇ',
        'ㆁ': 'ㆁ',
        'ㆂ': 'ㆁㅅ',
        'ㆃ': 'ㆁㅿ',
        'ㆄ': 'ㅍㅇ',
        'ㆅ': 'ㅎㅎ',
        'ㆆ': 'ㆆㆆ',
        'ㆇ': 'ㅛㅑ',
        'ㆈ': 'ㅛㅒ',
        'ㆉ': 'ㅛㅣ',
        'ㆊ': 'ㅠㅕ',
        'ㆋ': 'ㅠㅖ',
        'ㆌ': 'ㅠㅣ',
        'ㆍ': 'ㆍ',
        'ㆎ': 'ㆍㅣ',
    }

    # Stroke pairs that spell a modern compound consonant or vowel.  Used to
    # detect when two adjacent standalone jamo would be misread as one.
    STROKES2COMP = {
        'ㄱㅅ': 'ㄳ',
        'ㄴㅈ': 'ㄵ',
        'ㄴㅎ': 'ㄶ',
        'ㄹㄱ': 'ㄺ',
        'ㄹㅁ': 'ㄻ',
        'ㄹㅂ': 'ㄼ',
        'ㄹㅅ': 'ㄽ',
        'ㄹㅌ': 'ㄾ',
        'ㄹㅍ': 'ㄿ',
        'ㄹㅎ': 'ㅀ',
        'ㅂㅅ': 'ㅄ',
        'ㅗㅏ': 'ㅘ',
        'ㅗㅐ': 'ㅙ',
        'ㅗㅣ': 'ㅚ',
        'ㅜㅓ': 'ㅝ',
        'ㅜㅔ': 'ㅞ',
        'ㅜㅣ': 'ㅟ',
        'ㅡㅣ': 'ㅢ',
    }

    def __init__(self):
        self.state = Encoder.STATE_INITIAL
        self.last = ''
        # Not used by any method of this class; kept so that code reading
        # the attribute keeps working.
        self.result = []

    def encode_syllable(self, ch):
        """Return the strokes of one precomposed Hangul syllable.

        The syllable is decomposed with NFD and each jamo is looked up in
        JAMO2STROKES.  Afterwards ``state`` is STATE_V when the syllable has
        no final consonant and STATE_C when it has one, and ``last`` is the
        last stroke emitted.
        """
        jamos = unicodedata.normalize('NFD', ch)
        pieces = [Encoder.JAMO2STROKES[jamo] for jamo in jamos]
        if len(jamos) == 2:
            self.state = Encoder.STATE_V
        else:
            self.state = Encoder.STATE_C
        self.last = pieces[-1][-1]
        return ''.join(pieces)

    def encode_compjamo(self, ch):
        """Return the strokes of one standalone compatibility jamo.

        RESET_CODE is prepended when the strokes would otherwise be
        ambiguous with what was emitted before:
          (1) a consonant follows a vowel,
          (2) a vowel follows a consonant,
          (3) this letter's first stroke and the previous stroke together
              spell a compound consonant / compound vowel.
        """
        is_c = 0x3131 <= ord(ch) <= 0x314E
        s = Encoder.COMP2STROKES.get(ch, ch)

        needs_reset = (
            (self.state == Encoder.STATE_V and is_c) or
            (self.state == Encoder.STATE_C and not is_c) or
            (self.last + s[0]) in Encoder.STROKES2COMP)

        if is_c:
            # A reset is needed if a vowel follows.
            self.state = Encoder.STATE_C
        else:
            # After a compatibility vowel a following consonant is not
            # combined into a syllable, so no reset is needed afterwards.
            self.state = Encoder.STATE_INITIAL
        self.last = s[-1]
        return (RESET_CODE + s) if needs_reset else s

    def encode_jamo(self, ch):
        """Return the strokes of one standalone conjoining jamo.

        Letters missing from JAMO2STROKES_ALL are passed through unchanged.
        RESET_CODE is prepended when a consonant follows a vowel or a vowel
        follows a consonant.
        """
        code = ord(ch)
        # Initial-consonant and final-consonant ranges of the Hangul Jamo,
        # Extended-A and Extended-B blocks; everything else handled here is
        # treated as a vowel.
        is_c = (0x1100 <= code <= 0x115F or
                0x11A8 <= code <= 0x11FF or
                0xA960 <= code <= 0xA97F or
                0xD7CB <= code <= 0xD7FF)
        s = Encoder.JAMO2STROKES_ALL.get(ch, ch)

        needs_reset = ((self.state == Encoder.STATE_V and is_c) or
                       (self.state == Encoder.STATE_C and not is_c))

        # Remember which kind of letter was emitted last: the next
        # standalone jamo of the other kind must be preceded by a reset.
        self.state = Encoder.STATE_C if is_c else Encoder.STATE_V
        self.last = s[-1]
        return (RESET_CODE + s) if needs_reset else s

    def encode(self, s):
        """Encode the string `s` and return the stroke string.

        The input is NFC-normalized first.  Precomposed syllables,
        compatibility jamo (U+3131..U+3163) and conjoining jamo are encoded
        by the corresponding helper; any other character is copied through
        and resets the encoder state.
        """
        s = unicodedata.normalize('NFC', s)
        outlist = []
        self.state = Encoder.STATE_INITIAL
        self.last = ''
        for ch in s:
            code = ord(ch)
            if 0xAC00 <= code <= 0xD7A3:
                outlist.append(self.encode_syllable(ch))
            elif 0x3131 <= code <= 0x3163:
                outlist.append(self.encode_compjamo(ch))
            elif (0x1100 <= code <= 0x11FF or
                  0xA960 <= code <= 0xA97F or
                  0xD7B0 <= code <= 0xD7FF):
                outlist.append(self.encode_jamo(ch))
            else:
                self.state = Encoder.STATE_INITIAL
                self.last = ''
                outlist.append(ch)
        return ''.join(outlist)


# Debug switch: when true, Decoder.decode() prints its intermediate state
# after every input character.
DUMP_DECODER = False


class Decoder:
    """Convert the internal stroke encoding back into Hangul text.

    decode() walks the stroke string with a small state machine, grouping
    strokes into syllables (or standalone compound jamo) and composing each
    group with compose().
    """

    # Stroke(s) -> conjoining initial consonant.
    _L_TABLE = {
        'ㄱ': '\u1100',
        'ㄲ': '\u1101',
        'ㄴ': '\u1102',
        'ㄷ': '\u1103',
        'ㄸ': '\u1104',
        'ㄹ': '\u1105',
        'ㅁ': '\u1106',
        'ㅂ': '\u1107',
        'ㅃ': '\u1108',
        'ㅅ': '\u1109',
        'ㅆ': '\u110A',
        'ㅇ': '\u110B',
        'ㅈ': '\u110C',
        'ㅉ': '\u110D',
        'ㅊ': '\u110E',
        'ㅋ': '\u110F',
        'ㅌ': '\u1110',
        'ㅍ': '\u1111',
        'ㅎ': '\u1112',
    }
    # Stroke(s) -> conjoining vowel.
    _V_TABLE = {
        'ㅏ': '\u1161',
        'ㅐ': '\u1162',
        'ㅑ': '\u1163',
        'ㅒ': '\u1164',
        'ㅓ': '\u1165',
        'ㅔ': '\u1166',
        'ㅕ': '\u1167',
        'ㅖ': '\u1168',
        'ㅗ': '\u1169',
        'ㅗㅏ': '\u116A',
        'ㅗㅐ': '\u116B',
        'ㅗㅣ': '\u116C',
        'ㅛ': '\u116D',
        'ㅜ': '\u116E',
        'ㅜㅓ': '\u116F',
        'ㅜㅔ': '\u1170',
        'ㅜㅣ': '\u1171',
        'ㅠ': '\u1172',
        'ㅡ': '\u1173',
        'ㅡㅣ': '\u1174',
        'ㅣ': '\u1175',
    }
    # Stroke(s) -> conjoining final consonant.  Note that 'ㄸ', 'ㅃ' and
    # 'ㅉ' have no entry: they cannot end a syllable.
    _T_TABLE = {
        'ㄱ': '\u11A8',
        'ㄲ': '\u11A9',
        'ㄱㅅ': '\u11AA',
        'ㄴ': '\u11AB',
        'ㄴㅈ': '\u11AC',
        'ㄴㅎ': '\u11AD',
        'ㄷ': '\u11AE',
        'ㄹ': '\u11AF',
        'ㄹㄱ': '\u11B0',
        'ㄹㅁ': '\u11B1',
        'ㄹㅂ': '\u11B2',
        'ㄹㅅ': '\u11B3',
        'ㄹㅌ': '\u11B4',
        'ㄹㅍ': '\u11B5',
        'ㄹㅎ': '\u11B6',
        'ㅁ': '\u11B7',
        'ㅂ': '\u11B8',
        'ㅂㅅ': '\u11B9',
        'ㅅ': '\u11BA',
        'ㅆ': '\u11BB',
        'ㅇ': '\u11BC',
        'ㅈ': '\u11BD',
        'ㅊ': '\u11BE',
        'ㅋ': '\u11BF',
        'ㅌ': '\u11C0',
        'ㅍ': '\u11C1',
        'ㅎ': '\u11C2',
    }
    # Two-stroke sequences -> standalone compound compatibility jamo.
    _CLUSTER_TABLE = {
        'ㄱㅅ': 'ㄳ', 'ㄴㅈ': 'ㄵ', 'ㄴㅎ': 'ㄶ', 'ㄹㄱ': 'ㄺ',
        'ㄹㅁ': 'ㄻ', 'ㄹㅂ': 'ㄼ', 'ㄹㅅ': 'ㄽ', 'ㄹㅌ': 'ㄾ',
        'ㄹㅍ': 'ㄿ', 'ㄹㅎ': 'ㅀ', 'ㅂㅅ': 'ㅄ',
    }
    _DIPHTHONG_TABLE = {
        'ㅗㅏ': 'ㅘ', 'ㅗㅐ': 'ㅙ', 'ㅗㅣ': 'ㅚ',
        'ㅜㅓ': 'ㅝ', 'ㅜㅔ': 'ㅞ', 'ㅜㅣ': 'ㅟ',
        'ㅡㅣ': 'ㅢ',
    }
    # Every consonant stroke the encoder emits for modern syllables.  'ㄸ'
    # used to be missing, so syllables starting with it were not decoded.
    _CONSONANT_STROKES = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
    # Consonants that can only start a syllable (absent from _T_TABLE).
    _INITIAL_ONLY = 'ㄸㅃㅉ'

    def __init__(self):
        pass

    def stroke_is_c(self, ch):
        """Return True if `ch` is a compatibility consonant (U+3131..U+314E)."""
        return 0x3131 <= ord(ch) <= 0x314E

    def stroke_is_v(self, ch):
        """Return True if `ch` is a compatibility vowel (U+314F..U+3163)."""
        return 0x314F <= ord(ch) <= 0x3163

    def compose(self, s):
        """Compose the strokes of exactly one syllable into that syllable.

        `s` must be an initial consonant, one or two vowel strokes and
        optionally one or two final-consonant strokes.  The strokes are
        mapped to conjoining jamo and NFC-normalized.

        Raises AssertionError when `s` does not have that shape and
        KeyError when a stroke sequence is not a valid jamo.
        """
        assert len(s) >= 2
        nfd = Decoder._L_TABLE[s[0]]
        i = 1
        assert self.stroke_is_v(s[i])
        # Vowel: greedily take two strokes when the next one is a vowel too.
        if len(s) > (i + 1) and self.stroke_is_v(s[i+1]):
            nfd += Decoder._V_TABLE[s[i:i+2]]
            i += 2
        else:
            nfd += Decoder._V_TABLE[s[i]]
            i += 1
        # Optional final consonant, again one or two strokes.
        if len(s) >= (i + 1):
            if len(s) > (i + 1) and self.stroke_is_c(s[i+1]):
                nfd += Decoder._T_TABLE[s[i:i+2]]
                i += 2
            else:
                assert self.stroke_is_c(s[i])
                nfd += Decoder._T_TABLE[s[i]]
                i += 1
        assert len(s) == i
        return unicodedata.normalize('NFC', nfd)

    def decode(self, s):
        """Decode the stroke string `s` and return the Hangul text.

        Strokes are accumulated in `prestrokes` (with their current
        composition in `precomposed`) until the next character shows that
        the group is complete; finished groups are appended to `composed`.
        RESET_CODE and any non-stroke character end the current group.
        """
        composed = []
        strokes = []        # stroke groups, parallel to composed (debug dump)
        precomposed = ''
        prestrokes = ''

        STATE_INITIAL = 1   # nothing pending that can be extended
        STATE_L = 2         # one consonant pending
        STATE_LL = 22       # two consonants pending, shown as a cluster
        STATE_V = 3         # initial consonant + vowel pending
        STATE_VC = 33       # standalone vowel pending
        STATE_T = 4         # syllable with a one-stroke final pending
        STATE_TT = 5        # syllable with a two-stroke final pending
        state = STATE_INITIAL

        t_table = Decoder._CLUSTER_TABLE
        v_table = Decoder._DIPHTHONG_TABLE

        for ch in s:
            if ch == RESET_CODE:
                composed.append(precomposed)
                strokes.append(prestrokes)
                precomposed = ''
                prestrokes = ''
                state = STATE_INITIAL
            elif ch in Decoder._CONSONANT_STROKES:
                if ((state == STATE_INITIAL) or (state == STATE_LL) or
                        (state == STATE_VC)):
                    if precomposed:
                        composed.append(precomposed)
                        strokes.append(prestrokes)
                    precomposed = ch
                    prestrokes = ch
                    state = STATE_L
                elif state == STATE_L:
                    if (prestrokes + ch) in t_table:
                        precomposed = t_table[prestrokes + ch]
                        prestrokes += ch
                        state = STATE_LL
                    else:
                        composed.append(precomposed)
                        strokes.append(prestrokes)
                        precomposed = ch
                        prestrokes = ch
                        state = STATE_L
                elif state == STATE_V:
                    if ch in Decoder._INITIAL_ONLY:
                        # Cannot be a final consonant, so it must start the
                        # next syllable.
                        if precomposed:
                            composed.append(precomposed)
                            strokes.append(prestrokes)
                        precomposed = ch
                        prestrokes = ch
                        state = STATE_L
                    else:
                        prestrokes += ch
                        precomposed = self.compose(prestrokes)
                        state = STATE_T
                elif state == STATE_T:
                    if (prestrokes[-1] + ch) in t_table:
                        prestrokes += ch
                        precomposed = self.compose(prestrokes)
                        state = STATE_TT
                    else:
                        composed.append(precomposed)
                        strokes.append(prestrokes)
                        prestrokes = ch
                        precomposed = ch
                        state = STATE_L
                elif state == STATE_TT:
                    composed.append(precomposed)
                    strokes.append(prestrokes)
                    prestrokes = ch
                    precomposed = ch
                    state = STATE_L
                else:
                    assert False
            elif self.stroke_is_v(ch):
                if state == STATE_INITIAL:
                    composed.append(precomposed)
                    strokes.append(prestrokes)
                    precomposed = ch
                    prestrokes = ch
                    state = STATE_VC
                elif state == STATE_VC:
                    if (prestrokes + ch) in v_table:
                        precomposed = v_table[prestrokes + ch]
                        prestrokes += ch
                        state = STATE_VC
                    else:
                        composed.append(precomposed)
                        strokes.append(prestrokes)
                        precomposed = ch
                        prestrokes = ch
                        state = STATE_VC
                elif state == STATE_LL:
                    # The second consonant turns out to start a syllable;
                    # emit the first one on its own.
                    composed.append(prestrokes[0])
                    strokes.append(prestrokes[0])
                    prestrokes = prestrokes[1:] + ch
                    precomposed = self.compose(prestrokes)
                    state = STATE_V
                elif state == STATE_L:
                    prestrokes += ch
                    precomposed = self.compose(prestrokes)
                    state = STATE_V
                elif state == STATE_V:
                    if (prestrokes[-1] + ch) in v_table:
                        prestrokes += ch
                        precomposed = self.compose(prestrokes)
                        state = STATE_V
                    else:
                        composed.append(precomposed)
                        strokes.append(prestrokes)
                        prestrokes = ch
                        precomposed = ch
                        state = STATE_VC
                elif state == STATE_T or state == STATE_TT:
                    # The last final-consonant stroke actually starts the
                    # next syllable; move it over.
                    composed.append(self.compose(prestrokes[:-1]))
                    strokes.append(prestrokes[:-1])
                    prestrokes = prestrokes[-1] + ch
                    precomposed = self.compose(prestrokes)
                    state = STATE_V
                else:
                    assert False
            else:
                if precomposed:
                    composed.append(precomposed)
                    strokes.append(prestrokes)
                composed.append(ch)
                strokes.append(ch)
                precomposed = ''
                prestrokes = ''
                state = STATE_INITIAL
            if DUMP_DECODER:
                print('================')
                print('ch: %s' % ch)
                print('composed: %s' % composed)
                print('strokes: %s' % strokes)
                print('precomposed: %s' % precomposed)
                print('prestrokes: %s' % prestrokes)
        if precomposed:
            composed.append(precomposed)
            strokes.append(prestrokes)
        return ''.join(composed)


def encode(s):
    """Return `s` converted to the internal stroke encoding.

    Convenience wrapper that runs a fresh Encoder over the string.
    """
    return Encoder().encode(s)


def decode(s):
    """Return the stroke-encoded string `s` converted back to Hangul text.

    Convenience wrapper that runs a fresh Decoder over the string.
    """
    return Decoder().decode(s)


if __name__ == '__main__':
    # Self-test: run this module as a script to check the conversions.

    def assert_encode(decoded, encoded):
        """Check that encode(decoded) yields exactly `encoded`."""
        actual = encode(decoded)
        if actual != encoded:
            print('encode(%s) = %s != %s' % (decoded, actual, encoded))
        assert actual == encoded

    def assert_round_trip(decoded, encoded):
        """Check both directions: encode(decoded) and decode(encoded)."""
        assert_encode(decoded, encoded)
        actual = decode(encoded)
        if actual != decoded:
            # Report the decode mismatch itself (this used to print the
            # encode values, which hid what actually went wrong).
            print('decode(%s) = %s != %s' % (encoded, actual, decoded))
        assert actual == decoded

    assert_round_trip('바둑이', 'ㅂㅏㄷㅜㄱㅇㅣ')
    assert_round_trip('과일', 'ㄱㅗㅏㅇㅣㄹ')
    assert_round_trip('뷁이', 'ㅂㅜㅔㄹㄱㅇㅣ')
    assert_round_trip('뷀기', 'ㅂㅜㅔㄹㄱㅣ')
    assert_round_trip('쌇아', 'ㅆㅏㄹㅎㅇㅏ')
    assert_round_trip('ㄳ', 'ㄱㅅ')
    assert_round_trip('ㅙ', 'ㅗㅐ')
    assert_round_trip('ㄱ삯', 'ㄱㅅㅏㄱㅅ')
    assert_round_trip('ㄱ삭가', 'ㄱㅅㅏㄱㄱㅏ')
    assert_round_trip('ㅙㄱ', 'ㅗㅐㄱ')
    assert_round_trip('바둑이ㄱ', 'ㅂㅏㄷㅜㄱㅇㅣ' + RESET_CODE + 'ㄱ')
    assert_round_trip('ㄱㅅ', 'ㄱ' + RESET_CODE + 'ㅅ')
    assert_round_trip('ㅙㄱㅣ', 'ㅗㅐㄱ' + RESET_CODE + 'ㅣ')
    assert_round_trip('맨To맨', 'ㅁㅐㄴToㅁㅐㄴ')
    assert_round_trip('English맨', 'Englishㅁㅐㄴ')

    # one way
    assert_encode('\u1100\u1161\u11A8', 'ㄱㅏㄱ')
    assert_encode('\u1100\u1161\u11A8\u1161', 'ㄱㅏㄱ' + RESET_CODE + 'ㅏ')
    assert_encode('\u1100\u1161\u112D\u1161\u11A8', 'ㄱㅏ' + RESET_CODE +
                  'ㅅㄱ' + RESET_CODE + 'ㅏ' + RESET_CODE + 'ㄱ')
source-git / hunspell-ko

Source Code

Files