/* mtext-lbrk.c -- line break
   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
     National Institute of Advanced Industrial Science and Technology (AIST)
     Registration Number H15PRO112

   This file is part of the m17n library.

   The m17n library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public License
   as published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   The m17n library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the m17n library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301 USA.  */

#if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
/*** @addtogroup m17nInternal
     @{ */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config.h"
#include "m17n.h"
#include "m17n-misc.h"
#include "internal.h"
#include "mtext.h"

enum LineBreakClass
{
  /* Classes that have both a row and a column in the pair table,
     listed in the same order as the table.  */
  LBC_OP = 0,   /* open */
  LBC_CL,       /* close */
  LBC_QU,       /* quotation */
  LBC_GL,       /* glue */
  LBC_NS,       /* no-start */
  LBC_EX,       /* exclamation/interrogation */
  LBC_SY,       /* syntax (slash) */
  LBC_IS,       /* infix (numeric) separator */
  LBC_PR,       /* prefix */
  LBC_PO,       /* postfix */
  LBC_NU,       /* numeric */
  LBC_AL,       /* alphabetic */
  LBC_ID,       /* ideograph (atomic) */
  LBC_IN,       /* inseparable */
  LBC_HY,       /* hyphen */
  LBC_BA,       /* break after */
  LBC_BB,       /* break before */
  LBC_B2,       /* break both */
  LBC_ZW,       /* ZW space */
  LBC_CM,       /* combining mark */
  LBC_WJ,       /* word joiner */

  /* Used for the 4.1 pair table.  */
  LBC_H2,       /* Hangul 2 Jamo syllable */
  LBC_H3,       /* Hangul 3 Jamo syllable */
  LBC_JL,       /* Jamo leading consonant */
  LBC_JV,       /* Jamo vowel */
  LBC_JT,       /* Jamo trailing consonant */

  /* The classes below are not handled in the pair table; they must
     never be used as an index into it.  */
  LBC_SA,       /* south (east) asian */
  LBC_SP,       /* space */
  LBC_PS,       /* paragraph and line separators */
  LBC_BK,       /* hard break (newline) */
  LBC_CR,       /* carriage return */
  LBC_LF,       /* line feed */
  LBC_NL,       /* next line */
  LBC_CB,       /* contingent break opportunity */
  LBC_SG,       /* surrogate */
  LBC_AI,       /* ambiguous */
  LBC_XX,       /* unknown */

  LBC_MAX       /* number of classes above */
};

/* Actions stored in the pair table.  Each enumerator has the value of
   the character that represents it in the table strings.  */
enum LineBreakAction
{
  /* A break is allowed between the two characters.  */
  LBA_DIRECT = '_',
  /* A break is allowed only when spaces separate the two
     characters.  */
  LBA_INDIRECT = '%',
  /* Handled by mtext_line_break () like LBA_INDIRECT; appears in the
     column of combining marks.  */
  LBA_COMBINING_INDIRECT = '#',
  /* No break; appears in the column of combining marks.  */
  LBA_COMBINING_PROHIBITED = '@',
  /* No break, even across spaces.  */
  LBA_PROHIBITED = '^',
  /* Implicitly '^' + 1, which is the same value as '_' (LBA_DIRECT)
     in ASCII.  Not a usable upper bound for the values above.  */
  LBA_MAX
};

/* The pair table of line break actions.

   lba_pair_table[B][A] is the action character (a value of enum
   LineBreakAction) for a break opportunity between a character of
   class B and a following character of class A.  Rows and columns
   both follow the order of the first 26 members of enum
   LineBreakClass:

     OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ
     H2 H3 JL JV JT

   Classes after LBC_JT have no row and no column here.  The table is
   read-only, hence the const qualification of both the pointers and
   the characters.  */
static const char *const lba_pair_table[] =
  { "^^^^^^^^^^^^^^^^^^^@^^^^^^", /* OP */
    "_^%%^^^^_%____%%__^#^_____", /* CL */
    "^^%%%^^^%%%%%%%%%%^#^%%%%%", /* QU */
    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* GL */
    "_^%%%^^^______%%__^#^_____", /* NS */
    "_^%%%^^^______%%__^#^_____", /* EX */
    "_^%%%^^^__%___%%__^#^_____", /* SY */
    "_^%%%^^^__%%__%%__^#^_____", /* IS */
    "%^%%%^^^__%%%_%%__^#^%%%%%", /* PR */
    "_^%%%^^^______%%__^#^_____", /* PO */
    "_^%%%^^^_%%%_%%%__^#^_____", /* NU */
    "_^%%%^^^__%%_%%%__^#^_____", /* AL */
    "_^%%%^^^_%___%%%__^#^_____", /* ID */
    "_^%%%^^^_____%%%__^#^_____", /* IN */
    "_^%%%^^^__%___%%__^#^_____", /* HY */
    "_^%%%^^^______%%__^#^_____", /* BA */
    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* BB */
    "_^%%%^^^______%%_^^#^_____", /* B2 */
    "__________________^_______", /* ZW */
    "_^%%%^^^__%%_%%%__^#^_____", /* CM */
    "%^%%%^^^%%%%%%%%%%^#^%%%%%", /* WJ */
    "_^%%%^^^_%___%%%__^#^___%%", /* H2 */
    "_^%%%^^^_%___%%%__^#^____%", /* H3 */
    "_^%%%^^^_%___%%%__^#^%%%%_", /* JL */
    "_^%%%^^^_%___%%%__^#^___%%", /* JV */
    "_^%%%^^^_%___%%%__^#^____%"  /* JT */
  };

/* Character table giving the line break class of each character.  It
   is NULL until the first call of mtext_line_break (), which obtains
   it from the "linebreak" character property.  GET_LBC reads the
   stored values back as enum LineBreakClass.  */
static MCharTable *lbc_table;

/* Set LBC to the enum LineBreakClass of the character at POS of MT
   (whose length is LEN), looked up in lbc_table, after the following
   normalization:

     LBC_NL          -> LBC_BK
     LBC_AI          -> LBC_ID if OPTION has MTEXT_LBO_AI_AS_ID,
                        LBC_AL otherwise
     LBC_H2..LBC_JT  -> LBC_AL unless OPTION has MTEXT_LBO_KOREAN_SP
     LBC_CB          -> LBC_B2
     LBC_XX          -> LBC_AL

   Every other class, including LBC_CR, LBC_LF, LBC_SP and LBC_SA, is
   stored unchanged, so the caller has to handle them.  If POS is out
   of range (negative, or not less than LEN), LBC is set to LBC_BK
   without looking at MT.

   LBC must be an lvalue.  LBC, POS and OPTION are evaluated more than
   once, so they must be free of side effects.  */

#define GET_LBC(LBC, MT, LEN, POS, OPTION)                              \
  do {                                                                  \
    if ((POS) >= 0 && (POS) < (LEN))                                    \
      {                                                                 \
        int lbc_code_ = mtext_ref_char ((MT), (POS));                   \
                                                                        \
        (LBC) = (enum LineBreakClass) mchartable_lookup (lbc_table,     \
                                                         lbc_code_);    \
        switch (LBC)                                                    \
          {                                                             \
          case LBC_NL:                                                  \
            (LBC) = LBC_BK;                                             \
            break;                                                      \
          case LBC_AI:                                                  \
            if ((OPTION) & MTEXT_LBO_AI_AS_ID)                          \
              (LBC) = LBC_ID;                                           \
            else                                                        \
              (LBC) = LBC_AL;                                           \
            break;                                                      \
          case LBC_H2:                                                  \
          case LBC_H3:                                                  \
          case LBC_JL:                                                  \
          case LBC_JV:                                                  \
          case LBC_JT:                                                  \
            if (! ((OPTION) & MTEXT_LBO_KOREAN_SP))                     \
              (LBC) = LBC_AL;                                           \
            break;                                                      \
          case LBC_CB:                                                  \
            (LBC) = LBC_B2;                                             \
            break;                                                      \
          case LBC_XX:                                                  \
            (LBC) = LBC_AL;                                             \
            break;                                                      \
          default:                                                      \
            break;                                                      \
          }                                                             \
      }                                                                 \
    else                                                                \
      (LBC) = LBC_BK;                                                   \
  } while (0)


/*** @} */
#endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */


/* External API */

/*** @addtogroup m17nMtext */
/*** @{ */
/*=*/

/***en
    @brief Find a linebreak position of an M-text.

    The mtext_line_break () function checks if position $POS is a
    proper linebreak position of an M-text $MT according to the
    algorithm of The Unicode Standard 4.0 UAX#14.  If so, it returns
    $POS.  Otherwise, it returns a proper linebreak position before
    $POS.  If no such position is found before $POS, it returns the
    linebreak position found after $POS instead.  If $POS is not less
    than the length of $MT, $POS itself is returned.

    If $OPTION is nonzero, it controls the algorithm by logical-or of
    the members of #MTextLineBreakOption.

    If $AFTER is not NULL, a proper linebreak position after $POS (or
    $POS itself if $POS is a linebreak position) is stored there.  */

int
mtext_line_break (MText *mt, int pos, int option, int *after)
{
  int break_before, break_after;
  int len = mtext_len (mt);
  /* Number of classes that have a row and a column in
     lba_pair_table.  A class not less than this must never be used
     as an index into the table.  */
  const unsigned paired_classes
    = sizeof lba_pair_table / sizeof lba_pair_table[0];
  enum LineBreakClass lbc;
  enum LineBreakClass Blbc, Albc; /* B(efore) and A(fter) lbcs.  */
  int Bpos, Apos;		  /* B(efore) and A(fter) positions.  */
  enum LineBreakAction action;

  if (pos >= len)
    {
      /* The end of text is an explicit break position.  */
      if (after)
	*after = pos;
      return pos;
    }

  /* Fetch the line break class table on the first call.  */
  if (! lbc_table)
    {
      MSymbol key = mchar_define_property ("linebreak", Minteger);

      lbc_table = mchar_get_prop_table (key, NULL);
    }

  /* Part 1: decide which character acts as the "after" side of the
     first candidate break at or before POS.  */
  GET_LBC (lbc, mt, len, pos, option);
  Apos = pos;
  Albc = lbc;
  if (Albc == LBC_SP)
    {
      if (option & MTEXT_LBO_SP_CM)
	{
	  /* A space followed by a combining mark acts as ID.  */
	  GET_LBC (Albc, mt, len, Apos + 1, option);
	  Albc = (Albc == LBC_CM) ? LBC_ID : LBC_SP;
	}
      /* Otherwise move back to the character preceding the run of
	 spaces.  */
      while (Albc == LBC_SP)
	{
	  Apos--;
	  GET_LBC (Albc, mt, len, Apos, option);
	}
    }
  if ((option & MTEXT_LBO_SP_CM) && (Albc == LBC_CM))
    {
      /* A combining mark right after a space: the space is the base
	 and acts as ID.  Otherwise stay on the combining mark.  */
      Apos--;
      GET_LBC (Albc, mt, len, Apos, option);
      if (Albc == LBC_SP)
	Albc = LBC_ID;
      else
	Apos++, Albc = LBC_CM;
    }

  if (Albc == LBC_CR)
    Albc = LBC_BK;
  else if (Albc == LBC_LF)
    {
      /* Treat CR LF as one hard break starting at the CR.  */
      GET_LBC (Albc, mt, len, Apos - 1, option);
      if (Albc == LBC_CR)
	Apos--;
      Albc = LBC_BK;
    }
  else if (Albc == LBC_SA)
    Albc = mtext__word_segment (mt, Apos, &Apos, NULL) > 0 ? LBC_BB : LBC_AL;
  Bpos = Apos;
  /* After exiting from the following loop, if Apos is positive, it is
     the previous (including POS) break position.  */
  while (Apos > 0)
    {
      int indirect;
      int next = -1;

      /* Now Bpos == Apos.  Find the nearest preceding non-space
	 character.  */
      do {
	Bpos--;
	GET_LBC (Blbc, mt, len, Bpos, option);
      } while (Blbc == LBC_SP);

      if (Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
	{
	  /* Explicit break.  */
	  break;
	}

      /* Nonzero if at least one space was skipped.  */
      indirect = Bpos + 1 < Apos;

      if (Blbc == LBC_CM)
	{
	  /* Skip back over the combining marks to their base.  */
	  do {
	      Bpos--;
	      GET_LBC (Blbc, mt, len, Bpos, option);
	  } while (Blbc == LBC_CM);
	  if ((option & MTEXT_LBO_SP_CM) && (Blbc == LBC_SP))
	    Blbc = LBC_ID;
	  else if (Blbc == LBC_SP || Blbc == LBC_ZW
		   || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
	    {
	      /* No usable base: the combining marks act as AL and the
		 would-be base stays outside.  */
	      Blbc = LBC_AL;
	      Bpos++;
	    }
	}
      if (Blbc == LBC_SA)
	{
	  mtext__word_segment (mt, Bpos, &next, NULL);
	  Blbc = LBC_AL;
	}

      if (Albc != LBC_BK)
	{
	  /* Classes without a pair-table entry are handled as
	     alphabetic so that the lookup stays inside the table.  */
	  if ((unsigned) Blbc >= paired_classes)
	    Blbc = LBC_AL;
	  if ((unsigned) Albc >= paired_classes)
	    Albc = LBC_AL;

	  action = lba_pair_table[Blbc][Albc];
	  if (action == LBA_DIRECT)
	    break;
	  else if (action == LBA_INDIRECT)
	    {
	      if (indirect)
		break;
	    }
	  else if (action == LBA_COMBINING_INDIRECT)
	    {
	      if (indirect)
		break;
	    }
	}
      /* No break here; the "before" character becomes the "after"
	 side of the next candidate.  */
      if (next >= 0)
	Apos = next, Albc = LBC_BB;
      else
	Apos = Bpos, Albc = Blbc;
    }
  break_before = Apos;
  if (break_before > 0)
    {
      if (! after)
	return break_before;
      if (break_before == pos)
	{
	  /* POS itself is a break position; AFTER is non-NULL here.  */
	  *after = break_before;
	  return break_before;
	}
    }

  /* Part 2: now find a break position after POS.  */
  Bpos = pos;
  Blbc = lbc;
  if (Blbc == LBC_CM)
    {
      /* Take the class of the base character of the combining
	 sequence, but keep scanning from POS.  */
      do {
	Bpos--;
	GET_LBC (Blbc, mt, len, Bpos, option);
      } while (Blbc == LBC_CM);
      if (Blbc == LBC_SP || Blbc == LBC_ZW
	  || Blbc == LBC_BK || Blbc == LBC_LF || Blbc == LBC_CR)
	{
	  if ((Blbc == LBC_SP) && (option & MTEXT_LBO_SP_CM))
	    Blbc = LBC_ID;
	  else
	    Blbc = LBC_AL;
	}
      Bpos = pos;
    }
  if (Blbc == LBC_SA)
    {
      mtext__word_segment (mt, Bpos, NULL, &Bpos);
      Blbc = LBC_AL;
    }
  else if (Blbc == LBC_SP)
    {
      if (option & MTEXT_LBO_SP_CM)
	{
	  GET_LBC (Blbc, mt, len, Bpos + 1, option);
	  if (Blbc == LBC_CM)
	    Blbc = LBC_ID, Bpos++;
	  else
	    Blbc = LBC_SP;
	}
      while (Blbc == LBC_SP)
	{
	  Bpos--;
	  GET_LBC (Blbc, mt, len, Bpos, option);
	}
      /* Only spaces precede POS; restart from POS.  */
      if (Bpos < 0)
	Bpos = pos;
    }
  Apos = Bpos;
  /* After exiting from the following loop, if Apos is positive, it is
     the next break position.  */
  while (1)
    {
      int indirect;
      int next = -1;

      /* Now Bpos == Apos.  */
      if (Blbc == LBC_LF || Blbc == LBC_BK || Blbc == LBC_CR)
	{
	  /* Hard break: the break position is just after it, with
	     CR LF counted as a single break.  */
	  Apos++;
	  if (Blbc == LBC_CR)
	    {
	      GET_LBC (Blbc, mt, len, Bpos + 1, option);
	      if (Blbc == LBC_LF)
		Apos++;
	    }
	  break;
	}

      /* Find the next non-space character.  */
      do {
	Apos++;
	GET_LBC (Albc, mt, len, Apos, option);
      } while (Albc == LBC_SP);

      if (Blbc == LBC_SP)
	break;

      if (Apos == len)
	/* Explicit break at the end of text.  */
	break;

      if (Albc == LBC_BK || Albc == LBC_CR || Albc == LBC_LF)
	{
	  /* Never break before a hard line break.  These classes have
	     no column in lba_pair_table, so they must not be looked
	     up there; step onto the character and let the next
	     iteration report the break after it.  */
	  Bpos = Apos;
	  Blbc = Albc;
	  continue;
	}

      /* Nonzero if at least one space was skipped.  */
      indirect = Bpos + 1 < Apos;

      if (Albc == LBC_SA)
	Albc = mtext__word_segment (mt, Apos, NULL, &next) ? LBC_BB : LBC_AL;

      /* Classes without a pair-table entry are handled as alphabetic
	 so that the lookup stays inside the table.  */
      if ((unsigned) Blbc >= paired_classes)
	Blbc = LBC_AL;
      if ((unsigned) Albc >= paired_classes)
	Albc = LBC_AL;

      action = lba_pair_table[Blbc][Albc];
      if (action == LBA_DIRECT)
	/* Direct break at Apos.  */
	break;
      else if (action == LBA_INDIRECT)
	{
	  if (indirect)
	    break;
	}
      else if (action == LBA_COMBINING_INDIRECT)
	{
	  if (indirect)
	    {
	      /* With MTEXT_LBO_SP_CM the preceding space belongs to
		 the combining mark, so break before that space.  */
	      if (option & MTEXT_LBO_SP_CM)
		Apos--;
	      break;
	    }
	}
      /* No break here; advance.  A combining mark keeps the class of
	 the character it follows.  */
      if (next >= 0)
	Bpos = next, Blbc = LBC_AL;
      else
	{
	  Bpos = Apos;
	  if (Albc != LBC_CM)
	    Blbc = Albc;
	}
    }
  break_after = Apos;
  if (after)
    *after = break_after;

  return (break_before > 0 ? break_before : break_after);
}

/*** @} */ 

/*
  Local Variables:
  coding: euc-japan
  End:
*/