Blob Blame History Raw
/* Pango
 * break.c:
 *
 * Copyright (C) 1999 Red Hat Software
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "config.h"

#include "pango-break.h"
#include "pango-engine-private.h"
#include "pango-script-private.h"
#include "pango-impl-utils.h"
#include <string.h>

#define PARAGRAPH_SEPARATOR 0x2029
#define PARAGRAPH_SEPARATOR_STRING "\xE2\x80\xA9"

/* See http://www.unicode.org/unicode/reports/tr14/ if you hope
 * to understand the line breaking code.
 */

/* Break-opportunity classes (see TR 14).
 *
 * TR 14 defines two further classes, "indirect break opportunity for
 * combining marks following a space" and "prohibited break for combining
 * marks"; they have no enumerator here because they are handled inline
 * in the code.
 */
typedef enum
{
  /* didn't use the table */
  BREAK_ALREADY_HANDLED = 0,
  /* no break, even if spaces intervene */
  BREAK_PROHIBITED      = 1,
  /* "indirect break" (only if there are spaces) */
  BREAK_IF_SPACES       = 2,
  /* "direct break" (can always break here) */
  BREAK_ALLOWED         = 3
} BreakOpportunity;

/* Clamp a break type to the range this file handles: any value above
 * G_UNICODE_BREAK_ZERO_WIDTH_JOINER is replaced by G_UNICODE_BREAK_UNKNOWN,
 * everything else is passed through unchanged.
 *
 * The upper bound needs to be kept in sync with glib/gunicode.h.
 * The argument is evaluated more than once, so pass a side-effect-free
 * expression.
 */
#define BREAK_TYPE_SAFE(btype)            \
	 ((btype) > G_UNICODE_BREAK_ZERO_WIDTH_JOINER ? G_UNICODE_BREAK_UNKNOWN : (btype))


/*
 * Hangul Conjoining Jamo handling.
 *
 * The way we implement it is just a bit different from TR14,
 * but produces the same results.
 * The same algorithm is also used in TR29 for cluster boundaries.
 *
 */


/* States of the Hangul syllable system.
 *
 * The numeric values are significant: they index HangulJamoProps, and
 * JAMO_TYPE() derives them by subtracting G_UNICODE_BREAK_HANGUL_L_JAMO
 * from a break type, so the first five entries must stay in the order of
 * the corresponding G_UNICODE_BREAK_HANGUL_* values named below.
 **/
typedef enum
{
  JAMO_L   = 0,	/* G_UNICODE_BREAK_HANGUL_L_JAMO */
  JAMO_V   = 1,	/* G_UNICODE_BREAK_HANGUL_V_JAMO */
  JAMO_T   = 2,	/* G_UNICODE_BREAK_HANGUL_T_JAMO */
  JAMO_LV  = 3,	/* G_UNICODE_BREAK_HANGUL_LV_SYLLABLE */
  JAMO_LVT = 4,	/* G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE */
  NO_JAMO  = 5	/* Other */
} JamoType;

/* Some Hangul syllables are encoded as single characters yet behave like
 * a sequence of Jamos.  To model that, every character gets two
 * JamoTypes: the one it starts with and the one it ends with.  This
 * breaks JAMO_LV and JAMO_LVT down into the simple Jamos; for instance a
 * character whose LineBreak type is G_UNICODE_BREAK_HANGUL_LV_SYLLABLE
 * has start=JAMO_L and end=JAMO_V.
 */
typedef struct _CharJamoProps
{
  JamoType start;	/* simple Jamo the character begins with */
  JamoType end;		/* simple Jamo the character finishes with */
} CharJamoProps;

/* Map from JamoType to CharJamoProps that hold only simple
 * JamoTypes (no LV or LVT) or none.
 */
static const CharJamoProps HangulJamoProps[] = {
  {JAMO_L, JAMO_L},	/* JAMO_L */
  {JAMO_V, JAMO_V},	/* JAMO_V */
  {JAMO_T, JAMO_T},	/* JAMO_T */
  {JAMO_L, JAMO_V},	/* JAMO_LV */
  {JAMO_L, JAMO_T},	/* JAMO_LVT */
  {NO_JAMO, NO_JAMO}	/* NO_JAMO */
};

/* A character forms a syllable with the previous character if and only if:
 * JamoType(this) is not NO_JAMO and:
 *
 * HangulJamoProps[JamoType(prev)].end and
 * HangulJamoProps[JamoType(this)].start are equal,
 * or the former is one less than the latter.
 */

/* IS_JAMO(btype): nonzero when btype lies in the inclusive range
 * G_UNICODE_BREAK_HANGUL_L_JAMO .. G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE.
 *
 * JAMO_TYPE(btype): for a break type accepted by IS_JAMO, its offset from
 * G_UNICODE_BREAK_HANGUL_L_JAMO (intended to be used as a JamoType);
 * NO_JAMO otherwise.  This relies on the G_UNICODE_BREAK_HANGUL_* values
 * being consecutive and in the same order as the JamoType enumerators.
 *
 * The argument is fully parenthesized so that expression arguments
 * (e.g. `x & mask`) are not re-associated by operator precedence.  It is
 * still evaluated more than once, so it must be free of side effects.
 */
#define IS_JAMO(btype)              \
	(((btype) >= G_UNICODE_BREAK_HANGUL_L_JAMO) && \
	 ((btype) <= G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
#define JAMO_TYPE(btype)      \
	(IS_JAMO(btype) ? ((btype) - G_UNICODE_BREAK_HANGUL_L_JAMO) : NO_JAMO)

/* Code-point range predicates.  Each macro yields 1 when wc falls inside
 * the listed inclusive range(s) and 0 otherwise.  The argument is
 * evaluated more than once.
 */

/* Types of Japanese characters */
#define JAPANESE(wc) (0x2F00 <= (wc) && (wc) <= 0x30FF)
#define KANJI(wc)    (0x2F00 <= (wc) && (wc) <= 0x2FDF)
#define HIRAGANA(wc) (0x3040 <= (wc) && (wc) <= 0x309F)
#define KATAKANA(wc) (0x30A0 <= (wc) && (wc) <= 0x30FF)

#define LATIN(wc) ((0x0020 <= (wc) && (wc) <= 0x02AF) || (0x1E00 <= (wc) && (wc) <= 0x1EFF))
#define CYRILLIC(wc) (0x0400 <= (wc) && (wc) <= 0x052F)
#define GREEK(wc) ((0x0370 <= (wc) && (wc) <= 0x03FF) || (0x1F00 <= (wc) && (wc) <= 0x1FFF))
#define KANA(wc) (0x3040 <= (wc) && (wc) <= 0x30FF)
#define HANGUL(wc) (0xAC00 <= (wc) && (wc) <= 0xD7A3)
/* 1 when wc is in none of the LATIN, CYRILLIC, GREEK, KANA and HANGUL
 * ranges above, 0 when it is in at least one of them.
 */
#define BACKSPACE_DELETES_CHARACTER(wc) (!(LATIN (wc) || CYRILLIC (wc) || GREEK (wc) || KANA (wc) || HANGUL (wc)))

/* Kind of word being tracked.
 *
 * Previously "123foo" was two words, but following UAX 29 of Unicode
 * we now don't break words between consecutive letters and numbers.
 */
typedef enum
{
  WordNone    = 0,
  WordLetters = 1,
  WordNumbers = 2
} WordType;


/**
 * pango_default_break:
 * @text: text to break
 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
 * @analysis: (nullable): a #PangoAnalysis for the @text
 * @attrs: logical attributes to fill in
 * @attrs_len: size of the array passed as @attrs
 *
 * This is the default break algorithm, used if no language
 * engine overrides it. Normally you should use pango_break()
 * instead. Unlike pango_break(),
 * @analysis can be %NULL, but only do that if you know what
 * you're doing. If you need an analysis to pass to pango_break(),
 * you need to call pango_itemize().  In most cases however you should
 * simply use pango_get_log_attrs().
 **/
void
pango_default_break (const gchar   *text,
		     gint           length,
		     PangoAnalysis *analysis G_GNUC_UNUSED,
		     PangoLogAttr  *attrs,
		     int            attrs_len G_GNUC_UNUSED)
{
  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,
   * the line breaking stuff is also in TR14 on unicode.org
   */

  /* This is a default break implementation that should work for nearly all
   * languages. Language engines can override it optionally.
   */

  /* FIXME one cheesy optimization here would be to memset attrs to 0
   * before we start, and then never assign %FALSE to anything
   */

  const gchar *next;
  gint i;

  gunichar prev_wc;
  gunichar next_wc;

  JamoType prev_jamo;

  GUnicodeBreakType next_break_type;
  GUnicodeBreakType prev_break_type;
  GUnicodeBreakType prev_prev_break_type;

  /* See Grapheme_Cluster_Break Property Values table of UAX#29 */
  typedef enum
  {
    GB_Other,
    GB_ControlCRLF,
    GB_Extend,
    GB_ZWJ,
    GB_Prepend,
    GB_SpacingMark,
    GB_InHangulSyllable, /* Handles all of L, V, T, LV, LVT rules */
    /* Use state machine to handle emoji sequence */
    /* Rule GB10 and GB11 */
    GB_E_Base,
    GB_E_Modifier,
    GB_Glue_After_Zwj,
    GB_E_Base_GAZ,
    /* Rule GB12 and GB13 */
    GB_RI_Odd, /* Meets odd number of RI */
    GB_RI_Even, /* Meets even number of RI */
  } GraphemeBreakType;
  GraphemeBreakType prev_GB_type = GB_Other;

  /* See Word_Break Property Values table of UAX#29 */
  typedef enum
  {
    WB_Other,
    WB_NewlineCRLF,
    WB_ExtendFormat,
    WB_Katakana,
    WB_Hebrew_Letter,
    WB_ALetter,
    WB_MidNumLet,
    WB_MidLetter,
    WB_MidNum,
    WB_Numeric,
    WB_ExtendNumLet,
    WB_RI_Odd,
    WB_RI_Even,
  } WordBreakType;
  WordBreakType prev_prev_WB_type = WB_Other, prev_WB_type = WB_Other;
  gint prev_WB_i = -1;

  /* See Sentence_Break Property Values table of UAX#29 */
  typedef enum
  {
    SB_Other,
    SB_ExtendFormat,
    SB_ParaSep,
    SB_Sp,
    SB_Lower,
    SB_Upper,
    SB_OLetter,
    SB_Numeric,
    SB_ATerm,
    SB_SContinue,
    SB_STerm,
    SB_Close,
    /* Rules SB8 and SB8a */
    SB_ATerm_Close_Sp,
    SB_STerm_Close_Sp,
  } SentenceBreakType;
  SentenceBreakType prev_prev_SB_type = SB_Other, prev_SB_type = SB_Other;
  gint prev_SB_i = -1;

  /* Rule LB25 with Example 7 of Customization */
  typedef enum
  {
    LB_Other,
    LB_Numeric,
    LB_Numeric_Close,
    LB_RI_Odd,
    LB_RI_Even,
  } LineBreakType;
  LineBreakType prev_LB_type = LB_Other;

  WordType current_word_type = WordNone;
  gunichar last_word_letter = 0;
  gunichar base_character = 0;

  gint last_sentence_start = -1;
  gint last_non_space = -1;

  gboolean almost_done = FALSE;
  gboolean done = FALSE;

  g_return_if_fail (length == 0 || text != NULL);
  g_return_if_fail (attrs != NULL);

  next = text;

  prev_break_type = G_UNICODE_BREAK_UNKNOWN;
  prev_prev_break_type = G_UNICODE_BREAK_UNKNOWN;
  prev_wc = 0;
  prev_jamo = NO_JAMO;

  if (length == 0 || *text == '\0')
    {
      next_wc = PARAGRAPH_SEPARATOR;
      almost_done = TRUE;
    }
  else
    next_wc = g_utf8_get_char (next);

  next_break_type = g_unichar_break_type (next_wc);
  next_break_type = BREAK_TYPE_SAFE (next_break_type);

  for (i = 0; !done ; i++)
    {
      GUnicodeType type;
      gunichar wc;
      GUnicodeBreakType break_type;
      GUnicodeBreakType row_break_type;
      BreakOpportunity break_op;
      JamoType jamo;
      gboolean makes_hangul_syllable;

      /* UAX#29 boundaries */
      gboolean is_grapheme_boundary;
      gboolean is_word_boundary;
      gboolean is_sentence_boundary;


      wc = next_wc;
      break_type = next_break_type;

      if (almost_done)
	{
	  /*
	   * If we have already reached the end of @text g_utf8_next_char()
	   * may not increment next
	   */
	  next_wc = 0;
	  next_break_type = G_UNICODE_BREAK_UNKNOWN;
	  done = TRUE;
	}
      else
	{
	  next = g_utf8_next_char (next);

	  if ((length >= 0 && next >= text + length) || *next == '\0')
	    {
	      /* This is how we fill in the last element (end position) of the
	       * attr array - assume there's a paragraph separator off the end
	       * of @text.
	       */
	      next_wc = PARAGRAPH_SEPARATOR;
	      almost_done = TRUE;
	    }
	  else
	    next_wc = g_utf8_get_char (next);

	  next_break_type = g_unichar_break_type (next_wc);
	  next_break_type = BREAK_TYPE_SAFE (next_break_type);
	}

      type = g_unichar_type (wc);
      jamo = JAMO_TYPE (break_type);

      /* Determine whether this forms a Hangul syllable with prev. */
      if (jamo == NO_JAMO)
	makes_hangul_syllable = FALSE;
      else
	{
	  JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;
	  JamoType this_start = HangulJamoProps[     jamo].start;

	  /* See comments before IS_JAMO */
	  makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);
	}

      switch (type)
        {
        case G_UNICODE_SPACE_SEPARATOR:
        case G_UNICODE_LINE_SEPARATOR:
        case G_UNICODE_PARAGRAPH_SEPARATOR:
          attrs[i].is_white = TRUE;
          break;
        default:
          if (wc == '\t' || wc == '\n' || wc == '\r' || wc == '\f')
            attrs[i].is_white = TRUE;
          else
            attrs[i].is_white = FALSE;
          break;
        }

      /* Just few spaces have variable width. So explicitly mark them.
       */
      attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);

      /* ---- UAX#29 Grapheme Boundaries ---- */
      {
	GraphemeBreakType GB_type;
        /* Find the GraphemeBreakType of wc */
	GB_type = GB_Other;
	switch ((int) type)
	  {
	  case G_UNICODE_FORMAT:
	    if (G_UNLIKELY (wc == 0x200C))
	      {
		GB_type = GB_Extend;
		break;
	      }
	    if (G_UNLIKELY (wc == 0x200D))
	      {
		GB_type = GB_ZWJ;
		break;
	      }
            if (G_UNLIKELY((wc >= 0x600 && wc <= 0x605) ||
                            wc == 0x6DD ||
                            wc == 0x70F ||
                            wc == 0x8E2 ||
                            wc == 0xD4E ||
                            wc == 0x110BD ||
                            (wc >= 0x111C2 && wc <= 0x111C3)))
              {
                GB_type = GB_Prepend;
                break;
              }
	    /* fall through */
	  case G_UNICODE_CONTROL:
	  case G_UNICODE_LINE_SEPARATOR:
	  case G_UNICODE_PARAGRAPH_SEPARATOR:
	  case G_UNICODE_SURROGATE:
	    GB_type = GB_ControlCRLF;
	    break;

	  case G_UNICODE_UNASSIGNED:
	    /* Unassigned default ignorables */
	    if ((wc >= 0xFFF0 && wc <= 0xFFF8) ||
		(wc >= 0xE0000 && wc <= 0xE0FFF))
	      {
		GB_type = GB_ControlCRLF;
		break;
	      }

	  case G_UNICODE_OTHER_LETTER:
	    if (makes_hangul_syllable)
	      GB_type = GB_InHangulSyllable;
	    break;

	  case G_UNICODE_MODIFIER_LETTER:
	    if (wc >= 0xFF9E && wc <= 0xFF9F)
	      GB_type = GB_Extend; /* Other_Grapheme_Extend */
	    break;

	  case G_UNICODE_SPACING_MARK:
	    GB_type = GB_SpacingMark; /* SpacingMark */
	    if (wc >= 0x0900)
	      {
		if (wc == 0x09BE || wc == 0x09D7 ||
		    wc == 0x0B3E || wc == 0x0B57 || wc == 0x0BBE || wc == 0x0BD7 ||
		    wc == 0x0CC2 || wc == 0x0CD5 || wc == 0x0CD6 ||
		    wc == 0x0D3E || wc == 0x0D57 || wc == 0x0DCF || wc == 0x0DDF ||
		    wc == 0x1D165 || (wc >= 0x1D16E && wc <= 0x1D172))
		  GB_type = GB_Extend; /* Other_Grapheme_Extend */
	      }
	    break;

	  case G_UNICODE_ENCLOSING_MARK:
	  case G_UNICODE_NON_SPACING_MARK:
	    GB_type = GB_Extend; /* Grapheme_Extend */
	    break;

          case G_UNICODE_OTHER_SYMBOL:
            if (G_UNLIKELY(wc == 0x261D ||
                           wc == 0x26F9 ||
                           (wc >= 0x270A && wc <= 0x270D) ||
                           wc == 0x1F385 ||
                           (wc >= 0x1F3C2 && wc <= 0x1F3C4) ||
                           wc == 0x1F3C7 ||
                           (wc >= 0x1F3CA && wc <= 0x1F3CC) ||
                           (wc >= 0x1F442 && wc <= 0x1F443) ||
                           (wc >= 0x1F446 && wc <= 0x1F450) ||
                           wc == 0x1F46E ||
                           (wc >= 0x1F470 && wc <= 0x1F478) ||
                           wc == 0x1F47C ||
                           (wc >= 0x1F481 && wc <= 0x1F483) ||
                           (wc >= 0x1F485 && wc <= 0x1F487) ||
                           wc == 0x1F4AA ||
                           (wc >= 0x1F574 && wc <= 0x1F575) ||
                           wc == 0x1F57A ||
                           wc == 0x1F590 ||
                           (wc >= 0x1F595 && wc <= 0x1F596) ||
                           (wc >= 0x1F645 && wc <= 0x1F647) ||
                           (wc >= 0x1F64B && wc <= 0x1F64F) ||
                           wc == 0x1F6A3 ||
                           (wc >= 0x1F6B4 && wc <= 0x1F6B6) ||
                           wc == 0x1F6C0 ||
                           wc == 0x1F6CC ||
                           (wc >= 0x1F918 && wc <= 0x1F91C) ||
                           (wc >= 0x1F91E && wc <= 0x1F91F) ||
                           wc == 0x1F926 ||
                           (wc >= 0x1F930 && wc <= 0x1F939) ||
                           (wc >= 0x1F93D && wc <= 0x1F93E) ||
                           (wc >= 0x1F9D1 && wc <= 0x1F9DD)))
              {
                GB_type = GB_E_Base;
                break;
              }
            if (G_UNLIKELY(wc == 0x2640 ||
                           wc == 0x2642 ||
                           (wc >= 0x2695 && wc <= 0x2696) ||
                           wc == 0x2708 ||
                           wc == 0x2764 ||
                           wc == 0x1F308 ||
                           wc == 0x1F33E ||
                           wc == 0x1F373 ||
                           wc == 0x1F393 ||
                           wc == 0x1F3A4 ||
                           wc == 0x1F3A8 ||
                           wc == 0x1F3EB ||
                           wc == 0x1F3ED ||
                           wc == 0x1F48B ||
                           (wc >= 0x1F4BB && wc <= 0x1F4BC) ||
                           wc == 0x1F527 ||
                           wc == 0x1F52C ||
                           wc == 0x1F5E8 ||
                           wc == 0x1F680 ||
                           wc == 0x1F692))
              {
                GB_type = GB_Glue_After_Zwj;
                break;
              }
            if (G_UNLIKELY(wc >= 0x1F466 && wc <= 0x1F469))
              {
                GB_type = GB_E_Base_GAZ;
                break;
              }
            if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
              {
                if (prev_GB_type == GB_RI_Odd)
                  GB_type = GB_RI_Even;
                else if (prev_GB_type == GB_RI_Even)
                  GB_type = GB_RI_Odd;
                else
                  GB_type = GB_RI_Odd;
                break;
              }
            break;

          case G_UNICODE_MODIFIER_SYMBOL:
            if (wc >= 0x1F3FB && wc <= 0x1F3FF)
              GB_type = GB_E_Modifier;
            break;
	  }

	/* Grapheme Cluster Boundary Rules */

	/* We apply Rules GB1 and GB2 at the end of the function */
	if (wc == '\n' && prev_wc == '\r')
          is_grapheme_boundary = FALSE; /* Rule GB3 */
	else if (prev_GB_type == GB_ControlCRLF || GB_type == GB_ControlCRLF)
	  is_grapheme_boundary = TRUE; /* Rules GB4 and GB5 */
	else if (GB_type == GB_InHangulSyllable)
	  is_grapheme_boundary = FALSE; /* Rules GB6, GB7, GB8 */
	else if (GB_type == GB_Extend)
          {
            /* Rule GB10 */
            if (prev_GB_type == GB_E_Base || prev_GB_type == GB_E_Base_GAZ)
	      GB_type = prev_GB_type;
	    is_grapheme_boundary = FALSE; /* Rule GB9 */
          }
        else if (GB_type == GB_ZWJ)
	  is_grapheme_boundary = FALSE; /* Rule GB9 */
	else if (GB_type == GB_SpacingMark)
	  is_grapheme_boundary = FALSE; /* Rule GB9a */
	else if (prev_GB_type == GB_Prepend)
	  is_grapheme_boundary = FALSE; /* Rule GB9b */
	/* Rule GB10 */
	else if (prev_GB_type == GB_E_Base || prev_GB_type == GB_E_Base_GAZ)
	  {
            if (GB_type == GB_E_Modifier)
              is_grapheme_boundary = FALSE;
            else
              is_grapheme_boundary = TRUE;
          }
	else if (prev_GB_type == GB_ZWJ &&
                 (GB_type == GB_Glue_After_Zwj || GB_type == GB_E_Base_GAZ))
	  is_grapheme_boundary = FALSE; /* Rule GB11 */
	else if (prev_GB_type == GB_RI_Odd && GB_type == GB_RI_Even)
	  is_grapheme_boundary = FALSE; /* Rule GB12 and GB13 */
	else
	  is_grapheme_boundary = TRUE; /* Rule GB999 */

	attrs[i].is_cursor_position = is_grapheme_boundary;
	/* If this is a grapheme boundary, we have to decide if backspace
	 * deletes a character or the whole grapheme cluster */
	if (is_grapheme_boundary)
          {
            if (prev_GB_type == GB_E_Base ||
                prev_GB_type == GB_E_Base_GAZ ||
                prev_GB_type == GB_Glue_After_Zwj ||
                prev_GB_type == GB_Extend ||
                prev_GB_type == GB_E_Modifier ||
                prev_GB_type == GB_RI_Odd ||
                prev_GB_type == GB_RI_Even)
	      attrs[i].backspace_deletes_character = FALSE;
            else
	      attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);
          }
	else
	  attrs[i].backspace_deletes_character = FALSE;

	prev_GB_type = GB_type;
      }

      /* ---- UAX#29 Word Boundaries ---- */
      {
	is_word_boundary = FALSE;
	if (is_grapheme_boundary ||
	    G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF)) /* Rules WB3 and WB4 */
	  {
	    PangoScript script;
	    WordBreakType WB_type;

	    script = g_unichar_get_script (wc);

	    /* Find the WordBreakType of wc */
	    WB_type = WB_Other;

	    if (script == PANGO_SCRIPT_KATAKANA)
	      WB_type = WB_Katakana;

	    if (script == PANGO_SCRIPT_HEBREW && type == G_UNICODE_OTHER_LETTER)
	      WB_type = WB_Hebrew_Letter;

	    if (WB_type == WB_Other)
	      switch (wc >> 8)
	        {
		case 0x30:
		  if (wc == 0x3031 || wc == 0x3032 || wc == 0x3033 || wc == 0x3034 || wc == 0x3035 ||
		      wc == 0x309b || wc == 0x309c || wc == 0x30a0 || wc == 0x30fc)
		    WB_type = WB_Katakana; /* Katakana exceptions */
		  break;
		case 0xFF:
		  if (wc == 0xFF70)
		    WB_type = WB_Katakana; /* Katakana exceptions */
		  else if (wc >= 0xFF9E && wc <= 0xFF9F)
		    WB_type = WB_ExtendFormat; /* Other_Grapheme_Extend */
		  break;
		case 0x05:
		  if (wc == 0x05F3)
		    WB_type = WB_ALetter; /* ALetter exceptions */
		  break;
		}

	    if (WB_type == WB_Other)
	      switch ((int) break_type)
	        {
		case G_UNICODE_BREAK_NUMERIC:
		  if (wc != 0x066C)
		    WB_type = WB_Numeric; /* Numeric */
		  break;
		case G_UNICODE_BREAK_INFIX_SEPARATOR:
		  if (wc != 0x003A && wc != 0xFE13 && wc != 0x002E)
		    WB_type = WB_MidNum; /* MidNum */
		  break;
		}

	    if (WB_type == WB_Other)
	      switch ((int) type)
		{
		case G_UNICODE_CONTROL:
		  if (wc != 0x000D && wc != 0x000A && wc != 0x000B && wc != 0x000C && wc != 0x0085)
		    break;
		  /* fall through */
		case G_UNICODE_LINE_SEPARATOR:
		case G_UNICODE_PARAGRAPH_SEPARATOR:
		  WB_type = WB_NewlineCRLF; /* CR, LF, Newline */
		  break;

		case G_UNICODE_FORMAT:
		case G_UNICODE_SPACING_MARK:
		case G_UNICODE_ENCLOSING_MARK:
		case G_UNICODE_NON_SPACING_MARK:
		  WB_type = WB_ExtendFormat; /* Extend, Format */
		  break;

		case G_UNICODE_CONNECT_PUNCTUATION:
		  WB_type = WB_ExtendNumLet; /* ExtendNumLet */
		  break;

		case G_UNICODE_INITIAL_PUNCTUATION:
		case G_UNICODE_FINAL_PUNCTUATION:
		  if (wc == 0x2018 || wc == 0x2019)
		    WB_type = WB_MidNumLet; /* MidNumLet */
		  break;
		case G_UNICODE_OTHER_PUNCTUATION:
		  if (wc == 0x0027 || wc == 0x002e || wc == 0x2024 ||
		      wc == 0xfe52 || wc == 0xff07 || wc == 0xff0e)
		    WB_type = WB_MidNumLet; /* MidNumLet */
		  else if (wc == 0x00b7 || wc == 0x05f4 || wc == 0x2027 || wc == 0x003a || wc == 0x0387 ||
			   wc == 0xfe13 || wc == 0xfe55 || wc == 0xff1a)
		    WB_type = WB_MidLetter; /* WB_MidLetter */
		  else if (wc == 0x066c ||
			   wc == 0xfe50 || wc == 0xfe54 || wc == 0xff0c || wc == 0xff1b)
		    WB_type = WB_MidNum; /* MidNum */
		  break;

		case G_UNICODE_OTHER_SYMBOL:
		  if (wc >= 0x24B6 && wc <= 0x24E9) /* Other_Alphabetic */
		    goto Alphabetic;

		  if (G_UNLIKELY(wc >=0x1F1E6 && wc <=0x1F1FF))
		    {
			  if (prev_WB_type == WB_RI_Odd)
			   WB_type = WB_RI_Even;
			  else if (prev_WB_type == WB_RI_Even)
			   WB_type = WB_RI_Odd;
			  else
			   WB_type = WB_RI_Odd;
		    }

		  break;

		case G_UNICODE_OTHER_LETTER:
		case G_UNICODE_LETTER_NUMBER:
		  if (wc == 0x3006 || wc == 0x3007 ||
		      (wc >= 0x3021 && wc <= 0x3029) ||
		      (wc >= 0x3038 && wc <= 0x303A) ||
		      (wc >= 0x3400 && wc <= 0x4DB5) ||
		      (wc >= 0x4E00 && wc <= 0x9FC3) ||
		      (wc >= 0xF900 && wc <= 0xFA2D) ||
		      (wc >= 0xFA30 && wc <= 0xFA6A) ||
		      (wc >= 0xFA70 && wc <= 0xFAD9) ||
		      (wc >= 0x20000 && wc <= 0x2A6D6) ||
		      (wc >= 0x2F800 && wc <= 0x2FA1D))
		    break; /* ALetter exceptions: Ideographic */
		  goto Alphabetic;

		case G_UNICODE_LOWERCASE_LETTER:
		case G_UNICODE_MODIFIER_LETTER:
		case G_UNICODE_TITLECASE_LETTER:
		case G_UNICODE_UPPERCASE_LETTER:
		Alphabetic:
		  if (break_type != G_UNICODE_BREAK_COMPLEX_CONTEXT && script != PANGO_SCRIPT_HIRAGANA)
		    WB_type = WB_ALetter; /* ALetter */
		  break;
		}

	    /* Word Boundary Rules */

	    /* We apply Rules WB1 and WB2 at the end of the function */

	    if (prev_wc == 0x3031 && wc == 0x41)
	      g_debug ("Y %d %d", prev_WB_type, WB_type);
	    if (prev_WB_type == WB_NewlineCRLF && prev_WB_i + 1 == i)
	      {
	        /* The extra check for prev_WB_i is to correctly handle sequences like
		 * Newline ÷ Extend × Extend
		 * since we have not skipped ExtendFormat yet.
		 */
	        is_word_boundary = TRUE; /* Rule WB3a */
	      }
	    else if (WB_type == WB_NewlineCRLF)
	      is_word_boundary = TRUE; /* Rule WB3b */
	    else if (WB_type == WB_ExtendFormat)
	      is_word_boundary = FALSE; /* Rules WB4? */
	    else if ((prev_WB_type == WB_ALetter  ||
                  prev_WB_type == WB_Hebrew_Letter ||
                  prev_WB_type == WB_Numeric) &&
                 (WB_type == WB_ALetter  ||
                  WB_type == WB_Hebrew_Letter ||
                  WB_type == WB_Numeric))
	      is_word_boundary = FALSE; /* Rules WB5, WB8, WB9, WB10 */
	    else if (prev_WB_type == WB_Katakana && WB_type == WB_Katakana)
	      is_word_boundary = FALSE; /* Rule WB13 */
	    else if ((prev_WB_type == WB_ALetter ||
                  prev_WB_type == WB_Hebrew_Letter ||
                  prev_WB_type == WB_Numeric ||
                  prev_WB_type == WB_Katakana ||
                  prev_WB_type == WB_ExtendNumLet) &&
                 WB_type == WB_ExtendNumLet)
	      is_word_boundary = FALSE; /* Rule WB13a */
	    else if (prev_WB_type == WB_ExtendNumLet &&
                 (WB_type == WB_ALetter ||
                  WB_type == WB_Hebrew_Letter ||
                  WB_type == WB_Numeric ||
                  WB_type == WB_Katakana))
	      is_word_boundary = FALSE; /* Rule WB13b */
	    else if (((prev_prev_WB_type == WB_ALetter ||
                   prev_prev_WB_type == WB_Hebrew_Letter) &&
                  (WB_type == WB_ALetter ||
                   WB_type == WB_Hebrew_Letter)) &&
		     (prev_WB_type == WB_MidLetter ||
              prev_WB_type == WB_MidNumLet ||
              prev_wc == 0x0027))
	      {
		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB6 */
		is_word_boundary = FALSE; /* Rule WB7 */
	      }
	    else if (prev_WB_type == WB_Hebrew_Letter && wc == 0x0027)
          is_word_boundary = FALSE; /* Rule WB7a */
	    else if (prev_prev_WB_type == WB_Hebrew_Letter && prev_wc == 0x0022 &&
                 WB_type == WB_Hebrew_Letter) {
          attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB7b */
          is_word_boundary = FALSE; /* Rule WB7c */
        }
	    else if ((prev_prev_WB_type == WB_Numeric && WB_type == WB_Numeric) &&
                 (prev_WB_type == WB_MidNum || prev_WB_type == WB_MidNumLet ||
                  prev_wc == 0x0027))
	      {
		is_word_boundary = FALSE; /* Rule WB11 */
		attrs[prev_WB_i].is_word_boundary = FALSE; /* Rule WB12 */
	      }
	    else if (prev_WB_type == WB_RI_Odd && WB_type == WB_RI_Even)
	      is_word_boundary = FALSE; /* Rule WB15 and WB16 */
	    else
	      is_word_boundary = TRUE; /* Rule WB999 */

	    if (WB_type != WB_ExtendFormat)
	      {
		prev_prev_WB_type = prev_WB_type;
		prev_WB_type = WB_type;
		prev_WB_i = i;
	      }
	  }

	attrs[i].is_word_boundary = is_word_boundary;
      }

      /* ---- UAX#29 Sentence Boundaries ---- */
      {
	is_sentence_boundary = FALSE;
	if (is_word_boundary ||
	    wc == '\r' || wc == '\n') /* Rules SB3 and SB5 */
	  {
	    SentenceBreakType SB_type;

	    /* Find the SentenceBreakType of wc */
	    SB_type = SB_Other;

	    if (break_type == G_UNICODE_BREAK_NUMERIC)
	      SB_type = SB_Numeric; /* Numeric */

	    if (SB_type == SB_Other)
	      switch ((int) type)
		{
		case G_UNICODE_CONTROL:
		  if (wc == '\r' || wc == '\n')
		    SB_type = SB_ParaSep;
		  else if (wc == 0x0009 || wc == 0x000B || wc == 0x000C)
		    SB_type = SB_Sp;
		  else if (wc == 0x0085)
		    SB_type = SB_ParaSep;
		  break;

		case G_UNICODE_SPACE_SEPARATOR:
		  if (wc == 0x0020 || wc == 0x00A0 || wc == 0x1680 ||
		      (wc >= 0x2000 && wc <= 0x200A) ||
		      wc == 0x202F || wc == 0x205F || wc == 0x3000)
		    SB_type = SB_Sp;
		  break;

		case G_UNICODE_LINE_SEPARATOR:
		case G_UNICODE_PARAGRAPH_SEPARATOR:
		  SB_type = SB_ParaSep;
		  break;

		case G_UNICODE_FORMAT:
		case G_UNICODE_SPACING_MARK:
		case G_UNICODE_ENCLOSING_MARK:
		case G_UNICODE_NON_SPACING_MARK:
		  SB_type = SB_ExtendFormat; /* Extend, Format */
		  break;

		case G_UNICODE_MODIFIER_LETTER:
		  if (wc >= 0xFF9E && wc <= 0xFF9F)
		    SB_type = SB_ExtendFormat; /* Other_Grapheme_Extend */
		  break;

		case G_UNICODE_TITLECASE_LETTER:
		  SB_type = SB_Upper;
		  break;

		case G_UNICODE_DASH_PUNCTUATION:
		  if (wc == 0x002D ||
		      (wc >= 0x2013 && wc <= 0x2014) ||
		      (wc >= 0xFE31 && wc <= 0xFE32) ||
		      wc == 0xFE58 ||
		      wc == 0xFE63 ||
		      wc == 0xFF0D)
		    SB_type = SB_SContinue;
		  break;

		case G_UNICODE_OTHER_PUNCTUATION:
		  if (wc == 0x05F3)
		    SB_type = SB_OLetter;
		  else if (wc == 0x002E || wc == 0x2024 ||
		      wc == 0xFE52 || wc == 0xFF0E)
		    SB_type = SB_ATerm;

		  if (wc == 0x002C ||
		      wc == 0x003A ||
		      wc == 0x055D ||
		      (wc >= 0x060C && wc <= 0x060D) ||
		      wc == 0x07F8 ||
		      wc == 0x1802 ||
		      wc == 0x1808 ||
		      wc == 0x3001 ||
		      (wc >= 0xFE10 && wc <= 0xFE11) ||
		      wc == 0xFE13 ||
		      (wc >= 0xFE50 && wc <= 0xFE51) ||
		      wc == 0xFE55 ||
		      wc == 0xFF0C ||
		      wc == 0xFF1A ||
		      wc == 0xFF64)
		    SB_type = SB_SContinue;

		  if (wc == 0x0021 ||
		      wc == 0x003F ||
		      wc == 0x0589 ||
		      wc == 0x061F ||
		      wc == 0x06D4 ||
		      (wc >= 0x0700 && wc <= 0x0702) ||
		      wc == 0x07F9 ||
		      (wc >= 0x0964 && wc <= 0x0965) ||
		      (wc >= 0x104A && wc <= 0x104B) ||
		      wc == 0x1362 ||
		      (wc >= 0x1367 && wc <= 0x1368) ||
		      wc == 0x166E ||
		      (wc >= 0x1735 && wc <= 0x1736) ||
		      wc == 0x1803 ||
		      wc == 0x1809 ||
		      (wc >= 0x1944 && wc <= 0x1945) ||
		      (wc >= 0x1AA8 && wc <= 0x1AAB) ||
		      (wc >= 0x1B5A && wc <= 0x1B5B) ||
		      (wc >= 0x1B5E && wc <= 0x1B5F) ||
		      (wc >= 0x1C3B && wc <= 0x1C3C) ||
		      (wc >= 0x1C7E && wc <= 0x1C7F) ||
		      (wc >= 0x203C && wc <= 0x203D) ||
		      (wc >= 0x2047 && wc <= 0x2049) ||
		      wc == 0x2E2E ||
		      wc == 0x2E3C ||
		      wc == 0x3002 ||
		      wc == 0xA4FF ||
		      (wc >= 0xA60E && wc <= 0xA60F) ||
		      wc == 0xA6F3 ||
		      wc == 0xA6F7 ||
		      (wc >= 0xA876 && wc <= 0xA877) ||
		      (wc >= 0xA8CE && wc <= 0xA8CF) ||
		      wc == 0xA92F ||
		      (wc >= 0xA9C8 && wc <= 0xA9C9) ||
		      (wc >= 0xAA5D && wc <= 0xAA5F) ||
		      (wc >= 0xAAF0 && wc <= 0xAAF1) ||
		      wc == 0xABEB ||
		      (wc >= 0xFE56 && wc <= 0xFE57) ||
		      wc == 0xFF01 ||
		      wc == 0xFF1F ||
		      wc == 0xFF61 ||
		      (wc >= 0x10A56 && wc <= 0x10A57) ||
		      (wc >= 0x11047 && wc <= 0x11048) ||
		      (wc >= 0x110BE && wc <= 0x110C1) ||
		      (wc >= 0x11141 && wc <= 0x11143) ||
		      (wc >= 0x111C5 && wc <= 0x111C6) ||
		      wc == 0x111CD ||
		      (wc >= 0x111DE && wc <= 0x111DF) ||
		      (wc >= 0x11238 && wc <= 0x11239) ||
		      (wc >= 0x1123B && wc <= 0x1123C) ||
		      wc == 0x112A9 ||
		      (wc >= 0x1144B && wc <= 0x1144C) ||
		      (wc >= 0x115C2 && wc <= 0x115C3) ||
		      (wc >= 0x115C9 && wc <= 0x115D7) ||
		      (wc >= 0x11641 && wc <= 0x11642) ||
		      (wc >= 0x1173C && wc <= 0x1173E) ||
		      (wc >= 0x11C41 && wc <= 0x11C42) ||
		      (wc >= 0x16A6E && wc <= 0x16A6F) ||
		      wc == 0x16AF5 ||
		      (wc >= 0x16B37 && wc <= 0x16B38) ||
		      wc == 0x16B44 ||
		      wc == 0x1BC9F ||
		      wc == 0x1DA88)
		    SB_type = SB_STerm;

		  break;
		}

	    if (SB_type == SB_Other)
	      {
		if (g_unichar_islower(wc))
		  SB_type = SB_Lower;
		else if (g_unichar_isupper(wc))
		  SB_type = SB_Upper;
		else if (g_unichar_isalpha(wc))
		  SB_type = SB_OLetter;

		if (type == G_UNICODE_OPEN_PUNCTUATION ||
		    type == G_UNICODE_CLOSE_PUNCTUATION ||
		    break_type == G_UNICODE_BREAK_QUOTATION)
		  SB_type = SB_Close;
	      }

	    /* Sentence Boundary Rules */

	    /* We apply Rules SB1 and SB2 at the end of the function */

#define IS_OTHER_TERM(SB_type)						\
	    /* not in (OLetter | Upper | Lower | ParaSep | SATerm) */	\
	      !(SB_type == SB_OLetter ||				\
		SB_type == SB_Upper || SB_type == SB_Lower ||		\
		SB_type == SB_ParaSep ||				\
		SB_type == SB_ATerm || SB_type == SB_STerm ||		\
		SB_type == SB_ATerm_Close_Sp ||				\
		SB_type == SB_STerm_Close_Sp)


	    if (wc == '\n' && prev_wc == '\r')
	      is_sentence_boundary = FALSE; /* Rule SB3 */
	    else if (prev_SB_type == SB_ParaSep && prev_SB_i + 1 == i)
	      {
		/* The extra check for prev_SB_i is to correctly handle sequences like
		 * ParaSep ÷ Extend × Extend
		 * since we have not skipped ExtendFormat yet.
		 */

		is_sentence_boundary = TRUE; /* Rule SB4 */
	      }
	    else if (SB_type == SB_ExtendFormat)
	      is_sentence_boundary = FALSE; /* Rule SB5? */
	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Numeric)
	      is_sentence_boundary = FALSE; /* Rule SB6 */
	    else if ((prev_prev_SB_type == SB_Upper ||
		      prev_prev_SB_type == SB_Lower) &&
		     prev_SB_type == SB_ATerm &&
		     SB_type == SB_Upper)
	      is_sentence_boundary = FALSE; /* Rule SB7 */
	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Close)
		SB_type = SB_ATerm;
	    else if (prev_SB_type == SB_STerm && SB_type == SB_Close)
	      SB_type = SB_STerm;
	    else if (prev_SB_type == SB_ATerm && SB_type == SB_Sp)
	      SB_type = SB_ATerm_Close_Sp;
	    else if (prev_SB_type == SB_STerm && SB_type == SB_Sp)
	      SB_type = SB_STerm_Close_Sp;
	    /* Rule SB8 */
	    else if ((prev_SB_type == SB_ATerm ||
		      prev_SB_type == SB_ATerm_Close_Sp) &&
		     SB_type == SB_Lower)
	      is_sentence_boundary = FALSE;
	    else if ((prev_prev_SB_type == SB_ATerm ||
		      prev_prev_SB_type == SB_ATerm_Close_Sp) &&
		     IS_OTHER_TERM(prev_SB_type) &&
		     SB_type == SB_Lower)
	      attrs[prev_SB_i].is_sentence_boundary = FALSE;
	    else if ((prev_SB_type == SB_ATerm ||
		      prev_SB_type == SB_ATerm_Close_Sp ||
		      prev_SB_type == SB_STerm ||
		      prev_SB_type == SB_STerm_Close_Sp) &&
		     (SB_type == SB_SContinue ||
		      SB_type == SB_ATerm || SB_type == SB_STerm))
	      is_sentence_boundary = FALSE; /* Rule SB8a */
	    else if ((prev_SB_type == SB_ATerm ||
		      prev_SB_type == SB_STerm) &&
		     (SB_type == SB_Close || SB_type == SB_Sp ||
		      SB_type == SB_ParaSep))
	      is_sentence_boundary = FALSE; /* Rule SB9 */
	    else if ((prev_SB_type == SB_ATerm ||
		      prev_SB_type == SB_ATerm_Close_Sp ||
		      prev_SB_type == SB_STerm ||
		      prev_SB_type == SB_STerm_Close_Sp) &&
		     (SB_type == SB_Sp || SB_type == SB_ParaSep))
	      is_sentence_boundary = FALSE; /* Rule SB10 */
	    else if ((prev_SB_type == SB_ATerm ||
		      prev_SB_type == SB_ATerm_Close_Sp ||
		      prev_SB_type == SB_STerm ||
		      prev_SB_type == SB_STerm_Close_Sp) &&
		     SB_type != SB_ParaSep)
	      is_sentence_boundary = TRUE; /* Rule SB11 */
	    else
	      is_sentence_boundary = FALSE; /* Rule SB998 */

	    if (SB_type != SB_ExtendFormat &&
		!((prev_prev_SB_type == SB_ATerm ||
		   prev_prev_SB_type == SB_ATerm_Close_Sp) &&
		  IS_OTHER_TERM(prev_SB_type) &&
		  IS_OTHER_TERM(SB_type)))
              {
                prev_prev_SB_type = prev_SB_type;
                prev_SB_type = SB_type;
                prev_SB_i = i;
              }

#undef IS_OTHER_TERM

	  }

	if (i == 0 || done)
	  is_sentence_boundary = TRUE; /* Rules SB1 and SB2 */

	attrs[i].is_sentence_boundary = is_sentence_boundary;
      }

      /* ---- Line breaking ---- */

      break_op = BREAK_ALREADY_HANDLED;

      row_break_type = prev_break_type == G_UNICODE_BREAK_SPACE ?
	prev_prev_break_type : prev_break_type;
      g_assert (row_break_type != G_UNICODE_BREAK_SPACE);

      attrs[i].is_char_break = FALSE;
      attrs[i].is_line_break = FALSE;
      attrs[i].is_mandatory_break = FALSE;

      /* Rule LB1:
	 assign a line breaking class to each code point of the input. */
      switch ((int) break_type)
	{
	case G_UNICODE_BREAK_AMBIGUOUS:
	case G_UNICODE_BREAK_SURROGATE:
	case G_UNICODE_BREAK_UNKNOWN:
	  break_type = G_UNICODE_BREAK_ALPHABETIC;
	  break;

	case G_UNICODE_BREAK_COMPLEX_CONTEXT:
	  if (type == G_UNICODE_NON_SPACING_MARK ||
	      type == G_UNICODE_SPACING_MARK)
	    break_type = G_UNICODE_BREAK_COMBINING_MARK;
	  else
	    break_type = G_UNICODE_BREAK_ALPHABETIC;
	  break;

	case G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER:
	  break_type = G_UNICODE_BREAK_NON_STARTER;
	  break;

	default:
	  ;
	}

      /* If it's not a grapheme boundary, it's not a line break either */
      if (attrs[i].is_cursor_position ||
	  break_type == G_UNICODE_BREAK_COMBINING_MARK ||
	  break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER ||
	  break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
	  break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	  break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
	  break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
	  break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE ||
	  break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
	{
	  LineBreakType LB_type;

	  /* Find the LineBreakType of wc */
	  LB_type = LB_Other;

	  if (break_type == G_UNICODE_BREAK_NUMERIC)
	    LB_type = LB_Numeric;

	  if (break_type == G_UNICODE_BREAK_SYMBOL ||
	      break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
	    {
	      if (!(prev_LB_type == LB_Numeric))
		LB_type = LB_Other;
	    }

	  if (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
	      break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS)
	    {
	      if (prev_LB_type == LB_Numeric)
		LB_type = LB_Numeric_Close;
	      else
		LB_type = LB_Other;
	    }

	  if (break_type == G_UNICODE_BREAK_REGIONAL_INDICATOR)
	    {
	      if (prev_LB_type == LB_RI_Odd)
		LB_type = LB_RI_Even;
	      else if (prev_LB_type == LB_RI_Even)
		LB_type = LB_RI_Odd;
	      else
		LB_type = LB_RI_Odd;
	    }

	  attrs[i].is_line_break = TRUE; /* Rule LB31 */
	  /* Unicode doesn't specify char wrap;
	     we wrap around all chars currently. */
	  if (attrs[i].is_cursor_position)
	    attrs[i].is_char_break = TRUE;

	  /* Make any necessary replacements first */
	  if (row_break_type == G_UNICODE_BREAK_UNKNOWN)
	    row_break_type = G_UNICODE_BREAK_ALPHABETIC;

	  /* add the line break rules in reverse order to override
	     the lower priority rules. */

	  /* Rule LB30 */
	  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
	       prev_break_type == G_UNICODE_BREAK_NUMERIC) &&
	      break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS &&
	      (break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       break_type == G_UNICODE_BREAK_HEBREW_LETTER ||
	       break_type == G_UNICODE_BREAK_NUMERIC))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB30a */
	  if (prev_LB_type == LB_RI_Odd && LB_type == LB_RI_Even)
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB30b */
	  if (prev_break_type == G_UNICODE_BREAK_EMOJI_BASE &&
	      break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB29 */
	  if (prev_break_type == G_UNICODE_BREAK_INFIX_SEPARATOR &&
	      (break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       break_type == G_UNICODE_BREAK_HEBREW_LETTER))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB28 */
	  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
	      (break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       break_type == G_UNICODE_BREAK_HEBREW_LETTER))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB27 */
	  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
	      (break_type == G_UNICODE_BREAK_INSEPARABLE ||
	       break_type == G_UNICODE_BREAK_POSTFIX))
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
	      (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
	       break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB26 */
	  if (prev_break_type == G_UNICODE_BREAK_HANGUL_L_JAMO &&
	      (break_type == G_UNICODE_BREAK_HANGUL_L_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE ||
	       break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE))
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_LV_SYLLABLE) &&
	      (break_type == G_UNICODE_BREAK_HANGUL_V_JAMO ||
	       break_type == G_UNICODE_BREAK_HANGUL_T_JAMO))
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_HANGUL_T_JAMO ||
	       prev_break_type == G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE) &&
	      break_type == G_UNICODE_BREAK_HANGUL_T_JAMO)
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB25 with Example 7 of Customization */
	  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
	       prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
	      break_type == G_UNICODE_BREAK_NUMERIC)
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
	       prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
	      (break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
	       break_type == G_UNICODE_BREAK_HYPHEN) &&
	      next_break_type == G_UNICODE_BREAK_NUMERIC)
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION ||
	       prev_break_type == G_UNICODE_BREAK_HYPHEN) &&
	      break_type == G_UNICODE_BREAK_NUMERIC)
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
	      (break_type == G_UNICODE_BREAK_NUMERIC ||
	       break_type == G_UNICODE_BREAK_SYMBOL ||
	       break_type == G_UNICODE_BREAK_INFIX_SEPARATOR))
	    break_op = BREAK_PROHIBITED;

	  if (prev_LB_type == LB_Numeric &&
	      (break_type == G_UNICODE_BREAK_NUMERIC ||
	       break_type == G_UNICODE_BREAK_SYMBOL ||
	       break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
	       break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
	       break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS))
	    break_op = BREAK_PROHIBITED;

	  if ((prev_LB_type == LB_Numeric ||
	       prev_LB_type == LB_Numeric_Close) &&
	      (break_type == G_UNICODE_BREAK_POSTFIX ||
	       break_type == G_UNICODE_BREAK_PREFIX))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB24 */
	  if ((prev_break_type == G_UNICODE_BREAK_PREFIX ||
	       prev_break_type == G_UNICODE_BREAK_POSTFIX) &&
	      (break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       break_type == G_UNICODE_BREAK_HEBREW_LETTER))
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
	      (break_type == G_UNICODE_BREAK_PREFIX ||
	       break_type == G_UNICODE_BREAK_POSTFIX))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB23 */
	  if ((prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER) &&
	      break_type == G_UNICODE_BREAK_NUMERIC)
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type == G_UNICODE_BREAK_NUMERIC &&
	      (break_type == G_UNICODE_BREAK_ALPHABETIC ||
	       break_type == G_UNICODE_BREAK_HEBREW_LETTER))
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB23a */
	  if (prev_break_type == G_UNICODE_BREAK_PREFIX &&
	      (break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
	       break_type == G_UNICODE_BREAK_EMOJI_BASE ||
	       break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
	    break_op = BREAK_PROHIBITED;

	  if ((prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
	       prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
	       prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER) &&
	      break_type == G_UNICODE_BREAK_POSTFIX)
	    break_op = BREAK_PROHIBITED;

	  /* Rule LB22 */
	  if (break_type == G_UNICODE_BREAK_INSEPARABLE)
	    {
	      if (prev_break_type == G_UNICODE_BREAK_ALPHABETIC ||
		  prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER)
		break_op = BREAK_PROHIBITED;

	      if (prev_break_type == G_UNICODE_BREAK_EXCLAMATION)
		break_op = BREAK_PROHIBITED;

	      if (prev_break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
		  prev_break_type == G_UNICODE_BREAK_EMOJI_BASE ||
		  prev_break_type == G_UNICODE_BREAK_EMOJI_MODIFIER)
		break_op = BREAK_PROHIBITED;

	      if (prev_break_type == G_UNICODE_BREAK_INSEPARABLE)
		break_op = BREAK_PROHIBITED;

	      if (prev_break_type == G_UNICODE_BREAK_NUMERIC)
		break_op = BREAK_PROHIBITED;
	    }

	  if (break_type == G_UNICODE_BREAK_AFTER ||
	      break_type == G_UNICODE_BREAK_HYPHEN ||
	      break_type == G_UNICODE_BREAK_NON_STARTER ||
	      prev_break_type == G_UNICODE_BREAK_BEFORE)
	    break_op = BREAK_PROHIBITED; /* Rule LB21 */

	  if (prev_prev_break_type == G_UNICODE_BREAK_HEBREW_LETTER &&
	      (prev_break_type == G_UNICODE_BREAK_HYPHEN ||
	       prev_break_type == G_UNICODE_BREAK_AFTER))
	    break_op = BREAK_PROHIBITED; /* Rule LB21a */

	  if (prev_break_type == G_UNICODE_BREAK_SYMBOL &&
	      break_type == G_UNICODE_BREAK_HEBREW_LETTER)
	    break_op = BREAK_PROHIBITED; /* Rule LB21b */

	  if (prev_break_type == G_UNICODE_BREAK_CONTINGENT ||
	      break_type == G_UNICODE_BREAK_CONTINGENT)
	    break_op = BREAK_ALLOWED; /* Rule LB20 */

	  if (prev_break_type == G_UNICODE_BREAK_QUOTATION ||
	      break_type == G_UNICODE_BREAK_QUOTATION)
	    break_op = BREAK_PROHIBITED; /* Rule LB19 */

	  /* handle related rules for Space as state machine here,
	     and override the pair table result. */
	  if (prev_break_type == G_UNICODE_BREAK_SPACE) /* Rule LB18 */
	    break_op = BREAK_ALLOWED;

	  if (row_break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER &&
	      break_type == G_UNICODE_BREAK_BEFORE_AND_AFTER)
	    break_op = BREAK_PROHIBITED; /* Rule LB17 */

	  if ((row_break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
	       row_break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS) &&
	      break_type == G_UNICODE_BREAK_NON_STARTER)
	    break_op = BREAK_PROHIBITED; /* Rule LB16 */

	  if (row_break_type == G_UNICODE_BREAK_QUOTATION &&
	      break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
	    break_op = BREAK_PROHIBITED; /* Rule LB15 */

	  if (row_break_type == G_UNICODE_BREAK_OPEN_PUNCTUATION)
	    break_op = BREAK_PROHIBITED; /* Rule LB14 */

	  /* Rule LB13 with Example 7 of Customization */
	  if (break_type == G_UNICODE_BREAK_EXCLAMATION)
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type != G_UNICODE_BREAK_NUMERIC &&
	      (break_type == G_UNICODE_BREAK_CLOSE_PUNCTUATION ||
	       break_type == G_UNICODE_BREAK_CLOSE_PARANTHESIS ||
	       break_type == G_UNICODE_BREAK_INFIX_SEPARATOR ||
	       break_type == G_UNICODE_BREAK_SYMBOL))
	    break_op = BREAK_PROHIBITED;

	  if (prev_break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE)
	    break_op = BREAK_PROHIBITED; /* Rule LB12 */

	  if (break_type == G_UNICODE_BREAK_NON_BREAKING_GLUE &&
	      (prev_break_type != G_UNICODE_BREAK_SPACE &&
	       prev_break_type != G_UNICODE_BREAK_AFTER &&
	       prev_break_type != G_UNICODE_BREAK_HYPHEN))
	    break_op = BREAK_PROHIBITED; /* Rule LB12a */

	  if (prev_break_type == G_UNICODE_BREAK_WORD_JOINER ||
	      break_type == G_UNICODE_BREAK_WORD_JOINER)
	    break_op = BREAK_PROHIBITED; /* Rule LB11 */


	  /* Rule LB9 */
	  if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
              break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
	    {
	      if (!(prev_break_type == G_UNICODE_BREAK_MANDATORY ||
		    prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
		    prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
		    prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
		    prev_break_type == G_UNICODE_BREAK_SPACE ||
		    prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE))
		break_op = BREAK_PROHIBITED;
	    }

	  if (row_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
	    break_op = BREAK_ALLOWED; /* Rule LB8 */

	  if (prev_wc == 0x200D &&
	      (break_type == G_UNICODE_BREAK_IDEOGRAPHIC ||
	       break_type == G_UNICODE_BREAK_EMOJI_BASE ||
	       break_type == G_UNICODE_BREAK_EMOJI_MODIFIER))
	    break_op = BREAK_PROHIBITED; /* Rule LB8a */

	  if (break_type == G_UNICODE_BREAK_SPACE ||
	      break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
	    break_op = BREAK_PROHIBITED; /* Rule LB7 */

	  /* Rule LB6 */
	  if (break_type == G_UNICODE_BREAK_MANDATORY ||
	      break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
	      break_type == G_UNICODE_BREAK_LINE_FEED ||
	      break_type == G_UNICODE_BREAK_NEXT_LINE)
	    break_op = BREAK_PROHIBITED;

	  /* Rules LB4 and LB5 */
	  if (prev_break_type == G_UNICODE_BREAK_MANDATORY ||
	      (prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN &&
	       wc != '\n') ||
	      prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
	      prev_break_type == G_UNICODE_BREAK_NEXT_LINE)
	    {
	      attrs[i].is_mandatory_break = TRUE;
	      break_op = BREAK_ALLOWED;
	    }

	  switch (break_op)
	    {
	    case BREAK_PROHIBITED:
	      /* can't break here */
	      attrs[i].is_line_break = FALSE;
	      break;

	    case BREAK_IF_SPACES:
	      /* break if prev char was space */
	      if (prev_break_type != G_UNICODE_BREAK_SPACE)
		attrs[i].is_line_break = FALSE;
	      break;

	    case BREAK_ALLOWED:
	      attrs[i].is_line_break = TRUE;
	      break;

	    case BREAK_ALREADY_HANDLED:
	      break;

	    default:
	      g_assert_not_reached ();
	      break;
	    }

	  /* Rule LB9 */
	  if (!(break_type == G_UNICODE_BREAK_COMBINING_MARK ||
		break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER))
	    {
	      /* Rule LB25 with Example 7 of Customization */
	      if (break_type == G_UNICODE_BREAK_NUMERIC ||
		  break_type == G_UNICODE_BREAK_SYMBOL ||
		  break_type == G_UNICODE_BREAK_INFIX_SEPARATOR)
		{
		  if (prev_LB_type != LB_Numeric)
		    prev_LB_type = LB_type;
		  /* else don't change the prev_LB_type */
		}
	      else
		{
		  prev_LB_type = LB_type;
		}
	    }
	  /* else don't change the prev_LB_type for Rule LB9 */
	}

      if (break_type != G_UNICODE_BREAK_SPACE)
	{
	  /* Rule LB9 */
	  if (break_type == G_UNICODE_BREAK_COMBINING_MARK ||
	      break_type == G_UNICODE_BREAK_ZERO_WIDTH_JOINER)
	    {
	      if (i == 0 /* start of text */ ||
		  prev_break_type == G_UNICODE_BREAK_MANDATORY ||
		  prev_break_type == G_UNICODE_BREAK_CARRIAGE_RETURN ||
		  prev_break_type == G_UNICODE_BREAK_LINE_FEED ||
		  prev_break_type == G_UNICODE_BREAK_NEXT_LINE ||
		  prev_break_type == G_UNICODE_BREAK_SPACE ||
		  prev_break_type == G_UNICODE_BREAK_ZERO_WIDTH_SPACE)
		prev_break_type = G_UNICODE_BREAK_ALPHABETIC; /* Rule LB10 */
	      /* else don't change the prev_break_type for Rule LB9 */
	    }
	  else
	    {
	      prev_prev_break_type = prev_break_type;
	      prev_break_type = break_type;
	    }

	  prev_jamo = jamo;
	}
      else
	{
	  if (prev_break_type != G_UNICODE_BREAK_SPACE)
	    {
	      prev_prev_break_type = prev_break_type;
	      prev_break_type = break_type;
	    }
	  /* else don't change the prev_break_type */
	}

      /* ---- Word breaks ---- */

      /* default to not a word start/end */
      attrs[i].is_word_start = FALSE;
      attrs[i].is_word_end = FALSE;

      if (current_word_type != WordNone)
	{
	  /* Check for a word end */
	  switch ((int) type)
	    {
	    case G_UNICODE_SPACING_MARK:
	    case G_UNICODE_ENCLOSING_MARK:
	    case G_UNICODE_NON_SPACING_MARK:
	    case G_UNICODE_FORMAT:
	      /* nothing, we just eat these up as part of the word */
	      break;

	    case G_UNICODE_LOWERCASE_LETTER:
	    case G_UNICODE_MODIFIER_LETTER:
	    case G_UNICODE_OTHER_LETTER:
	    case G_UNICODE_TITLECASE_LETTER:
	    case G_UNICODE_UPPERCASE_LETTER:
	      if (current_word_type == WordLetters)
		{
		  /* Japanese special cases for ending the word */
		  if (JAPANESE (last_word_letter) ||
		      JAPANESE (wc))
		    {
		      if ((HIRAGANA (last_word_letter) &&
			   !HIRAGANA (wc)) ||
			  (KATAKANA (last_word_letter) &&
			   !(KATAKANA (wc) || HIRAGANA (wc))) ||
			  (KANJI (last_word_letter) &&
			   !(HIRAGANA (wc) || KANJI (wc))) ||
			  (JAPANESE (last_word_letter) &&
			   !JAPANESE (wc)) ||
			  (!JAPANESE (last_word_letter) &&
			   JAPANESE (wc)))
			attrs[i].is_word_end = TRUE;
		    }
		}
	      last_word_letter = wc;
	      break;

	    case G_UNICODE_DECIMAL_NUMBER:
	    case G_UNICODE_LETTER_NUMBER:
	    case G_UNICODE_OTHER_NUMBER:
	      last_word_letter = wc;
	      break;

	    default:
	      /* Punctuation, control/format chars, etc. all end a word. */
	      attrs[i].is_word_end = TRUE;
	      current_word_type = WordNone;
	      break;
	    }
	}
      else
	{
	  /* Check for a word start */
	  switch ((int) type)
	    {
	    case G_UNICODE_LOWERCASE_LETTER:
	    case G_UNICODE_MODIFIER_LETTER:
	    case G_UNICODE_OTHER_LETTER:
	    case G_UNICODE_TITLECASE_LETTER:
	    case G_UNICODE_UPPERCASE_LETTER:
	      current_word_type = WordLetters;
	      last_word_letter = wc;
	      attrs[i].is_word_start = TRUE;
	      break;

	    case G_UNICODE_DECIMAL_NUMBER:
	    case G_UNICODE_LETTER_NUMBER:
	    case G_UNICODE_OTHER_NUMBER:
	      current_word_type = WordNumbers;
	      last_word_letter = wc;
	      attrs[i].is_word_start = TRUE;
	      break;

	    default:
	      /* No word here */
	      break;
	    }
	}

      /* ---- Sentence breaks ---- */
      {

	/* default to not a sentence start/end */
	attrs[i].is_sentence_start = FALSE;
	attrs[i].is_sentence_end = FALSE;

	/* maybe start sentence */
	if (last_sentence_start == -1 && !is_sentence_boundary)
	  last_sentence_start = i - 1;

	/* remember last non space character position */
	if (i > 0 && !attrs[i - 1].is_white)
	  last_non_space = i;

	/* meets sentence end, mark both sentence start and end */
	if (last_sentence_start != -1 && is_sentence_boundary) {
	  if (last_non_space != -1) {
	    attrs[last_sentence_start].is_sentence_start = TRUE;
	    attrs[last_non_space].is_sentence_end = TRUE;
	  }

	  last_sentence_start = -1;
	  last_non_space = -1;
	}

	/* meets space character, move sentence start */
	if (last_sentence_start != -1 &&
	    last_sentence_start == i - 1 &&
	    attrs[i - 1].is_white)
	    last_sentence_start++;

      }

      prev_wc = wc;

      /* wc might not be a valid Unicode base character, but really all we
       * need to know is the last non-combining character */
      if (type != G_UNICODE_SPACING_MARK &&
	  type != G_UNICODE_ENCLOSING_MARK &&
	  type != G_UNICODE_NON_SPACING_MARK)
	base_character = wc;
    }

  i--;

  attrs[i].is_cursor_position = TRUE;  /* Rule GB2 */
  attrs[0].is_cursor_position = TRUE;  /* Rule GB1 */

  attrs[i].is_word_boundary = TRUE;  /* Rule WB2 */
  attrs[0].is_word_boundary = TRUE;  /* Rule WB1 */

  attrs[i].is_line_break = TRUE;  /* Rule LB3 */
  attrs[0].is_line_break = FALSE; /* Rule LB2 */

}

/* Hands @text to the script_break implementation of the language engine
 * stored in @analysis, if there is one.
 *
 * Returns TRUE when a script_break implementation was invoked, FALSE when
 * the analysis has no language engine or the engine class provides no
 * script_break hook (in which case @attrs is left untouched).
 */
static gboolean
tailor_break (const gchar   *text,
	      gint           length,
	      PangoAnalysis *analysis,
	      PangoLogAttr  *attrs,
	      int            attrs_len)
{
  /* Nothing to do without an engine that knows how to tailor breaks. */
  if (analysis->lang_engine == NULL ||
      PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break == NULL)
    return FALSE;

  if (length >= 0)
    {
      /* An explicit length with no text is treated as the empty string. */
      if (text == NULL)
	text = "";
    }
  else
    {
      /* Negative length means the text is nul-terminated. */
      length = strlen (text);
    }

  PANGO_ENGINE_LANG_GET_CLASS (analysis->lang_engine)->script_break (analysis->lang_engine,
								     text, length,
								     analysis,
								     attrs, attrs_len);

  return TRUE;
}

/**
 * pango_break:
 * @text:      the text to process
 * @length:    length of @text in bytes (may be -1 if @text is nul-terminated)
 * @analysis:  #PangoAnalysis structure from pango_itemize()
 * @attrs:     (array length=attrs_len): an array to store character
 *             information in
 * @attrs_len: size of the array passed as @attrs
 *
 * Determines possible line, word, and character breaks
 * for a string of Unicode text with a single analysis.  For most
 * purposes you may want to use pango_get_log_attrs().
 *
 * The attributes are first computed with pango_default_break(); the
 * result is then offered to the language engine referenced by @analysis,
 * which may tailor it if it provides a script_break implementation.
 *
 * @analysis and @attrs are both checked with g_return_if_fail() and
 * must not be %NULL.
 */
void
pango_break (const gchar   *text,
	     gint           length,
	     PangoAnalysis *analysis,
	     PangoLogAttr  *attrs,
	     int            attrs_len)
{
  /* Precondition checks: both pointers are dereferenced further down. */
  g_return_if_fail (analysis != NULL);
  g_return_if_fail (attrs != NULL);

  /* Generic algorithm first, script-specific tailoring second; the
   * return value of tailor_break() (whether tailoring happened) is not
   * needed here. */
  pango_default_break (text, length, analysis, attrs, attrs_len);
  tailor_break        (text, length, analysis, attrs, attrs_len);
}

/**
 * pango_find_paragraph_boundary:
 * @text: UTF-8 text
 * @length: length of @text in bytes, or -1 if nul-terminated
 * @paragraph_delimiter_index: (out): return location for index of
 *   delimiter
 * @next_paragraph_start: (out): return location for start of next
 *   paragraph
 *
 * Locates a paragraph boundary in @text. A boundary is caused by
 * delimiter characters, such as a newline, carriage return, carriage
 * return-newline pair, or Unicode paragraph separator character.  The
 * byte index of the first delimiter is returned in
 * @paragraph_delimiter_index. The byte index of the start of the next
 * paragraph (the index just after the delimiter) is stored in
 * @next_paragraph_start.
 *
 * If no delimiters are found, both @paragraph_delimiter_index and
 * @next_paragraph_start are filled with the length of @text (an index one
 * off the end).  If a delimiter is the last thing in @text,
 * @next_paragraph_start is likewise the length of @text.
 *
 * Either return location may be %NULL, in which case that value is not
 * stored.  Only the first @length bytes of @text are examined; a
 * paragraph separator sequence that is cut off by @length is not treated
 * as a delimiter.
 **/
void
pango_find_paragraph_boundary (const gchar *text,
			       gint         length,
			       gint        *paragraph_delimiter_index,
			       gint        *next_paragraph_start)
{
  const gchar *p = text;
  const gchar *end;
  const gchar *start = NULL;
  const gchar *delimiter = NULL;
  /* Number of bytes in the UTF-8 encoding of U+2029 (terminator excluded). */
  const size_t sep_len = sizeof (PARAGRAPH_SEPARATOR_STRING) - 1;

  /* Only one character has type G_UNICODE_PARAGRAPH_SEPARATOR in
   * Unicode 5.0; update the following code if that changes.
   */

  /* prev_sep is the first byte of the previous separator.  Since
   * the valid separators are \r, \n, and PARAGRAPH_SEPARATOR, the
   * first byte is enough to identify it.
   */
  gchar prev_sep;


  if (length < 0)
    length = strlen (text);

  end = text + length;

  /* Default results: "no delimiter found". */
  if (paragraph_delimiter_index)
    *paragraph_delimiter_index = length;

  if (next_paragraph_start)
    *next_paragraph_start = length;

  if (length == 0)
    return;

  prev_sep = 0;

  while (p < end)
    {
      if (prev_sep == '\n' ||
	  prev_sep == PARAGRAPH_SEPARATOR_STRING[0])
	{
	  /* The previous character ended the paragraph; the next one
	   * starts here. */
	  g_assert (delimiter);
	  start = p;
	  break;
	}
      else if (prev_sep == '\r')
	{
	  /* don't break between \r and \n */
	  if (*p != '\n')
	    {
	      g_assert (delimiter);
	      start = p;
	      break;
	    }
	}

      /* The multi-byte separator is only compared when it fits entirely
       * inside [p, end): this keeps the scan from reading bytes beyond
       * the caller-supplied length when the buffer is not
       * nul-terminated. */
      if (*p == '\n' ||
	  *p == '\r' ||
	  ((size_t) (end - p) >= sep_len &&
	   memcmp (p, PARAGRAPH_SEPARATOR_STRING, sep_len) == 0))
	{
	  /* Remember only the first delimiter of the run (e.g. the \r
	   * of a \r\n pair). */
	  if (delimiter == NULL)
	    delimiter = p;
	  prev_sep = *p;
	}
      else
	prev_sep = 0;

      p = g_utf8_next_char (p);
    }

  if (delimiter && paragraph_delimiter_index)
    *paragraph_delimiter_index = delimiter - text;

  if (start && next_paragraph_start)
    *next_paragraph_start = start - text;
}

/* Applies script-specific tailoring to the attributes of one run,
 * [range_start, range_end), whose first character corresponds to
 * log_attrs[chars_broken].
 *
 * Returns the number of characters in the run, as counted by
 * pango_utf8_strlen(), whether or not any tailoring took place.
 */
static int
tailor_segment (const char      *range_start,
		const char      *range_end,
		int              chars_broken,
		PangoAnalysis   *analysis,
		PangoLogAttr    *log_attrs)
{
  PangoLogAttr *first = log_attrs + chars_broken;
  /* Snapshot of the run's first attribute as it was before tailoring. */
  PangoLogAttr saved = *first;
  int n_chars = pango_utf8_strlen (range_start, range_end - range_start);

  /* One extra attribute slot: N characters have N + 1 positions. */
  if (!tailor_break (range_start,
		     range_end - range_start,
		     analysis,
		     first,
		     n_chars + 1))
    return n_chars;

  /* The run was tailored: at its leading boundary, re-impose some of
   * the attributes computed before tailoring. */
  first->backspace_deletes_character = saved.backspace_deletes_character;

  first->is_cursor_position |= saved.is_cursor_position;
  first->is_mandatory_break |= saved.is_mandatory_break;
  first->is_line_break      |= saved.is_line_break;

  return n_chars;
}

/**
 * pango_get_log_attrs:
 * @text: text to process
 * @length: length in bytes of @text
 * @level: embedding level, or -1 if unknown
 * @language: language tag
 * @log_attrs: (array length=attrs_len): array with one #PangoLogAttr
 *   per character in @text, plus one extra, to be filled in
 * @attrs_len: length of @log_attrs array
 *
 * Computes a #PangoLogAttr for each character in @text. The @log_attrs
 * array must have one #PangoLogAttr for each position in @text; if
 * @text contains N characters, it has N+1 positions, including the
 * last position at the end of the text. @text should be an entire
 * paragraph; logical attributes can't be computed without context
 * (for example you need to see spaces on either side of a word to know
 * the word is a word).
 *
 * The default break algorithm is run over the whole text first; the
 * text is then walked one script run at a time and each run is given to
 * the language engine for tailoring.  A warning is logged afterwards if
 * @attrs_len turns out to be smaller than the number of positions
 * processed.
 */
void
pango_get_log_attrs (const char    *text,
		     int            length,
		     int            level,
		     PangoLanguage *language,
		     PangoLogAttr  *log_attrs,
		     int            attrs_len)
{
  PangoAnalysis analysis = { NULL };
  PangoScriptIter iter;
  int n_tailored = 0;	/* characters covered by the runs seen so far */

  g_return_if_fail (length == 0 || text != NULL);
  g_return_if_fail (log_attrs != NULL);

  analysis.level = level;
  analysis.lang_engine = _pango_get_language_engine ();

  /* Pass 1: generic breaking over the whole text. */
  pango_default_break (text, length, &analysis, log_attrs, attrs_len);

  /* Pass 2: per-script tailoring.  The iterator always yields at least
   * one range, so the body runs before the first "next" query. */
  _pango_script_iter_init (&iter, text, length);
  for (;;)
    {
      const char *run_start, *run_end;
      PangoScript script;
      int run_chars;

      pango_script_iter_get_range (&iter, &run_start, &run_end, &script);
      analysis.script = script;

      run_chars = tailor_segment (run_start, run_end, n_tailored, &analysis, log_attrs);
      n_tailored += run_chars;

      if (!pango_script_iter_next (&iter))
	break;
    }
  _pango_script_iter_fini (&iter);

  if (attrs_len < n_tailored + 1)
    g_warning ("pango_get_log_attrs: attrs_len should have been at least %d, but was %d.  Expect corrupted memory.",
	       n_tailored + 1,
	       attrs_len);
}

#include "break-arabic.c"
#include "break-indic.c"
#include "break-thai.c"

/* Dispatches to the script-specific break tailoring routine selected by
 * analysis->script.
 *
 * Arabic is handled by break_arabic(), the listed Indic scripts by
 * break_indic() and Thai by break_thai().  Every other script is left
 * exactly as the default algorithm produced it.
 */
static void
break_script (const char          *item_text,
	      unsigned int         item_length,
	      const PangoAnalysis *analysis,
	      PangoLogAttr        *attrs,
	      int                  attrs_len)
{
  switch (analysis->script)
    {
    case PANGO_SCRIPT_ARABIC:
      break_arabic (item_text, item_length, analysis, attrs, attrs_len);
      break;

    case PANGO_SCRIPT_DEVANAGARI:
    case PANGO_SCRIPT_BENGALI:
    case PANGO_SCRIPT_GURMUKHI:
    case PANGO_SCRIPT_GUJARATI:
    case PANGO_SCRIPT_ORIYA:
    case PANGO_SCRIPT_TAMIL:
    case PANGO_SCRIPT_TELUGU:
    case PANGO_SCRIPT_KANNADA:
    case PANGO_SCRIPT_MALAYALAM:
    case PANGO_SCRIPT_SINHALA:
      break_indic (item_text, item_length, analysis, attrs, attrs_len);
      break;

    case PANGO_SCRIPT_THAI:
      break_thai (item_text, item_length, analysis, attrs, attrs_len);
      break;

    default:
      /* No tailoring for this script; explicit so that the omission of
       * the remaining scripts is visibly intentional. */
      break;
    }
}


/* Wrap the language breaker in a PangoEngineLang so that it can be
 * passed through the old API, which dates from the time when there were
 * modules and engines.
 *
 * PangoLanguageEngine is only an alias for PangoEngineLang; the
 * G_DEFINE_TYPE below registers it as a type derived from
 * PANGO_TYPE_ENGINE_LANG.  The deprecation guards surround the type
 * definition only. */
typedef PangoEngineLang      PangoLanguageEngine;
typedef PangoEngineLangClass PangoLanguageEngineClass;
static GType pango_language_engine_get_type (void) G_GNUC_CONST;
G_GNUC_BEGIN_IGNORE_DEPRECATIONS
G_DEFINE_TYPE (PangoLanguageEngine, pango_language_engine, PANGO_TYPE_ENGINE_LANG);
G_GNUC_END_IGNORE_DEPRECATIONS
/* script_break implementation installed on the language engine class.
 * The engine instance itself is not used; the call is forwarded
 * unchanged to break_script(), which picks the tailoring routine from
 * analysis->script.  Note that break_script() takes its length as an
 * unsigned int, so @item_length is converted on the way through. */
static void
_pango_language_engine_break (PangoEngineLang *engine G_GNUC_UNUSED,
			      const char      *item_text,
			      int              item_length,
			      PangoAnalysis   *analysis,
			      PangoLogAttr    *attrs,
			      int              attrs_len)
{
  break_script (item_text, item_length, analysis, attrs, attrs_len);
}
/* Class initialiser: points the class's script_break hook at
 * _pango_language_engine_break(). */
static void
pango_language_engine_class_init (PangoEngineLangClass *class)
{
  class->script_break = _pango_language_engine_break;
}
/* Instance initialiser.  The body is empty: nothing is set up on the
 * instance here. */
static void
pango_language_engine_init (PangoEngineLang *object)
{
}

/* Returns the shared language engine object, creating it the first time
 * the function is called.  Creation is wrapped in
 * g_once_init_enter()/g_once_init_leave() and the result is kept in a
 * function-local static, so every call returns the same pointer. */
PangoEngineLang *
_pango_get_language_engine (void)
{
  static PangoEngineLang *engine;

  if (g_once_init_enter (&engine))
    {
      PangoEngineLang *instance;

      instance = g_object_new (pango_language_engine_get_type (), NULL);
      g_once_init_leave (&engine, instance);
    }

  return engine;
}