/* mtext-lbrk.c -- line break Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H15PRO112 This file is part of the m17n library. The m17n library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The m17n library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the m17n library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE) /*** @addtogroup m17nInternal @{ */ #include #include #include #include "config.h" #include "m17n.h" #include "m17n-misc.h" #include "internal.h" #include "mtext.h" enum LineBreakClass { LBC_OP, /* open */ LBC_CL, /* close */ LBC_QU, /* quotation */ LBC_GL, /* glue */ LBC_NS, /* no-start */ LBC_EX, /* exclamation/interrogation */ LBC_SY, /* Syntax (slash) */ LBC_IS, /* infix (numeric) separator */ LBC_PR, /* prefix */ LBC_PO, /* postfix */ LBC_NU, /* numeric */ LBC_AL, /* alphabetic */ LBC_ID, /* ideograph (atomic) */ LBC_IN, /* inseparable */ LBC_HY, /* hyphen */ LBC_BA, /* break after */ LBC_BB, /* break before */ LBC_B2, /* break both */ LBC_ZW, /* ZW space */ LBC_CM, /* combining mark */ LBC_WJ, /* word joiner */ /* used for 4.1 pair table */ LBC_H2, /* Hamgul 2 Jamo Syllable */ LBC_H3, /* Hangul 3 Jamo Syllable */ LBC_JL, /* Jamo leading consonant */ LBC_JV, /* Jamo vowel */ LBC_JT, /* Jamo trailing consonant */ /* These are not handled in the pair tables. */ LBC_SA, /* south (east) asian */ LBC_SP, /* space */ LBC_PS, /* paragraph and line separators */ LBC_BK, /* hard break (newline) */ LBC_CR, /* carriage return */ LBC_LF, /* line feed */ LBC_NL, /* next line */ LBC_CB, /* contingent break opportunity */ LBC_SG, /* surrogate */ LBC_AI, /* ambiguous */ LBC_XX, /* unknown */ LBC_MAX }; enum LineBreakAction { LBA_DIRECT = '_', LBA_INDIRECT = '%', LBA_COMBINING_INDIRECT = '#', LBA_COMBINING_PROHIBITED = '@', LBA_PROHIBITED = '^', LBA_MAX }; /* The pair table of line break actions. */ static char *lba_pair_table[] = /* OP GL SY PO ID BA ZW H2 JV CL NS IS NU IN BB CM H3 JT QU EX PR AL HY B2 WJ JL */ { "^^^^^^^^^^^^^^^^^^^@^^^^^^", /* OP */ "_^%%^^^^_%____%%__^#^_____", /* CL */ "^^%%%^^^%%%%%%%%%%^#^%%%%%", /* QU */ "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* GL */ "_^%%%^^^______%%__^#^_____", /* NS */ "_^%%%^^^______%%__^#^_____", /* EX */ "_^%%%^^^__%___%%__^#^_____", /* SY */ "_^%%%^^^__%%__%%__^#^_____", /* IS */ "%^%%%^^^__%%%_%%__^#^%%%%%", /* PR */ "_^%%%^^^______%%__^#^_____", /* PO */ "_^%%%^^^_%%%_%%%__^#^_____", /* NU */ "_^%%%^^^__%%_%%%__^#^_____", /* AL */ "_^%%%^^^_%___%%%__^#^_____", /* ID */ "_^%%%^^^_____%%%__^#^_____", /* IN */ "_^%%%^^^__%___%%__^#^_____", /* HY */ "_^%%%^^^______%%__^#^_____", /* BA */ "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* BB */ "_^%%%^^^______%%_^^#^_____", /* B2 */ "__________________^_______", /* ZW */ "_^%%%^^^__%%_%%%__^#^_____", /* CM */ "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* WJ */ "_^%%%^^^_%___%%%__^#^___%%", /* H2 */ "_^%%%^^^_%___%%%__^#^____%", /* H3 */ "_^%%%^^^_%___%%%__^#^%%%%_", /* JL */ "_^%%%^^^_%___%%%__^#^___%%", /* JV */ "_^%%%^^^_%___%%%__^#^____%" /* JT */ }; static MCharTable *lbc_table; /* Set LBC to enum LineBreakClass of the character at POS of MT (length is LEN) while converting LBC_AI and LBC_XX to LBC_AL, LBC_CB to LBC_B2, LBC_CR, LBC_LF, and LBC_NL to LBC_BK. If POS is out of range, set LBC to LBC_BK. */ #define GET_LBC(LBC, MT, LEN, POS, OPTION) \ do { \ if ((POS) < 0 || (POS) >= (LEN)) \ (LBC) = LBC_BK; \ else \ { \ int c = mtext_ref_char ((MT), (POS)); \ (LBC) = (enum LineBreakClass) mchartable_lookup (lbc_table, c); \ if ((LBC) == LBC_NL) \ (LBC) = LBC_BK; \ else if ((LBC) == LBC_AI) \ (LBC) = ((OPTION) & MTEXT_LBO_AI_AS_ID) ? LBC_ID : LBC_AL; \ else if (! ((OPTION) & MTEXT_LBO_KOREAN_SP) \ && (LBC) >= LBC_H2 && (LBC) <= LBC_JT) \ (LBC) = LBC_AL; \ else if ((LBC) == LBC_CB) \ (LBC) = LBC_B2; \ else if ((LBC) == LBC_XX) \ (LBC) = LBC_AL; \ } \ } while (0) /*** @} */ #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */ /* External API */ /*** @addtogroup m17nMtext */ /*** @{ */ /*=*/ /***en @brief Find a linebreak postion of an M-text. The mtext_line_break () function checks if position $POS is a proper linebreak position of an M-text $MT according to the algorithm of The Unicode Standard 4.0 UAX#14. It so, it returns $POS. Otherwise, it returns a proper linebreak position before $POS. If $OPTION is nonzero, it controls the algorithm by logical-or of the members of #MTextLineBreakOption. If $AFTER is not NULL, a proper linebreak position after $POS is stored there. */ int mtext_line_break (MText *mt, int pos, int option, int *after) { int break_before, break_after; int len = mtext_len (mt); enum LineBreakClass lbc; enum LineBreakClass Blbc, Albc; /* B(efore) and A(fter) lbcs. */ int Bpos, Apos; /* B(efore) and A(fter) positions. */ enum LineBreakAction action; if (pos >= len) { /* The end of text is an explicit break position. */ if (after) *after = pos; return pos; } if (! lbc_table) { MSymbol key = mchar_define_property ("linebreak", Minteger); lbc_table = mchar_get_prop_table (key, NULL); } GET_LBC (lbc, mt, len, pos, option); Apos = pos; Albc = lbc; if (Albc == LBC_SP) { if (option & MTEXT_LBO_SP_CM) { GET_LBC (Albc, mt, len, Apos + 1, option); Albc = (Albc == LBC_CM) ? LBC_ID : LBC_SP; } while (Albc == LBC_SP) { Apos--; GET_LBC (Albc, mt, len, Apos, option); } } if ((option & MTEXT_LBO_SP_CM) && (Albc == LBC_CM)) { Apos--; GET_LBC (Albc, mt, len, Apos, option); if (Albc == LBC_SP) Albc = LBC_ID; else Apos++, Albc = LBC_CM; } if (Albc == LBC_CR) Albc = LBC_BK; else if (Albc == LBC_LF) { GET_LBC (Albc, mt, len, Apos - 1, option); if (Albc == LBC_CR) Apos--; Albc = LBC_BK; } else if (Albc == LBC_SA) Albc = mtext__word_segment (mt, Apos, &Apos, NULL) > 0 ? LBC_BB : LBC_AL; Bpos = Apos; /* After exiting from the following loop, if Apos is positive, it is the previous (including POS) break position. */ while (Apos > 0) { int indirect; int next = -1; /* Now Bpos == Apos. */ do { Bpos--; GET_LBC (Blbc, mt, len, Bpos, option); } while (Blbc == LBC_SP); if (Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) { /* Explicit break. */ break; } indirect = Bpos + 1 < Apos; if (Blbc == LBC_CM) { do { Bpos--; GET_LBC (Blbc, mt, len, Bpos, option); } while (Blbc == LBC_CM); if ((option & MTEXT_LBO_SP_CM) && (Blbc == LBC_SP)) Blbc = LBC_ID; else if (Blbc == LBC_SP || Blbc == LBC_ZW || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) { Blbc = LBC_AL; Bpos++; } } if (Blbc == LBC_SA) { mtext__word_segment (mt, Bpos, &next, NULL); Blbc = LBC_AL; } if (Albc != LBC_BK) { action = lba_pair_table[Blbc][Albc]; if (action == LBA_DIRECT) break; else if (action == LBA_INDIRECT) { if (indirect) break; } else if (action == LBA_COMBINING_INDIRECT) { if (indirect) break; } } if (next >= 0) Apos = next, Albc = LBC_BB; else Apos = Bpos, Albc = Blbc; } break_before = Apos; if (break_before > 0) { if (! after) return break_before; if (break_before == pos) { if (after) *after = break_before; return break_before; } } /* Now find a break position after POS. */ break_after = 0; Bpos = pos; Blbc = lbc; if (Blbc == LBC_CM) { do { Bpos--; GET_LBC (Blbc, mt, len, Bpos, option); } while (Blbc == LBC_CM); if (Blbc == LBC_SP || Blbc == LBC_ZW || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR) { if ((Blbc == LBC_SP) && (option & MTEXT_LBO_SP_CM)) Blbc = LBC_ID; else Blbc = LBC_AL; } Bpos = pos; } if (Blbc == LBC_SA) { mtext__word_segment (mt, Bpos, NULL, &Bpos); Blbc = LBC_AL; } else if (Blbc == LBC_SP) { if (option & MTEXT_LBO_SP_CM) { GET_LBC (Blbc, mt, len, Bpos + 1, option); if (Blbc == LBC_CM) Blbc = LBC_ID, Bpos++; else Blbc = LBC_SP; } while (Blbc == LBC_SP) { Bpos--; GET_LBC (Blbc, mt, len, Bpos, option); } if (Bpos < 0) Bpos = pos; } Apos = Bpos; /* After exiting from the following loop, if Apos is positive, it is the next break position. */ while (1) { int indirect; int next = -1; /* Now Bpos == Apos. */ if (Blbc == LBC_LF || Blbc == LBC_BK || Blbc == LBC_CR) { Apos++; if (Blbc == LBC_CR) { GET_LBC (Blbc, mt, len, Bpos + 1, option); if (Blbc == LBC_LF) Apos++; } break; } do { Apos++; GET_LBC (Albc, mt, len, Apos, option); } while (Albc == LBC_SP); if (Blbc == LBC_SP) break; if (Apos == len) /* Explicit break at the end of text. */ break; indirect = Bpos + 1 < Apos; if (Albc == LBC_SA) Albc = mtext__word_segment (mt, Apos, NULL, &next) ? LBC_BB : LBC_AL; action = lba_pair_table[Blbc][Albc]; if (action == LBA_DIRECT) /* Direct break at Apos. */ break; else if (action == LBA_INDIRECT) { if (indirect) break; } else if (action == LBA_COMBINING_INDIRECT) { if (indirect) { if (option & MTEXT_LBO_SP_CM) Apos--; break; } } if (next >= 0) Bpos = next, Blbc = LBC_AL; else { Bpos = Apos; if (Albc != LBC_CM) Blbc = Albc; } } break_after = Apos; if (after) *after = break_after; return (break_before > 0 ? break_before : break_after); } /*** @} */ /* Local Variables: coding: euc-japan End: */