Blame lib/unistring/uninorm/u-normalize-internal.h

Packit aea12f
/* Decomposition and composition of Unicode strings.
Packit Service 991b93
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
Packit aea12f
   Written by Bruno Haible <bruno@clisp.org>, 2009.
Packit aea12f
Packit aea12f
   This program is free software: you can redistribute it and/or
Packit aea12f
   modify it under the terms of either:
Packit aea12f
Packit aea12f
     * the GNU Lesser General Public License as published by the Free
Packit aea12f
       Software Foundation; either version 3 of the License, or (at your
Packit aea12f
       option) any later version.
Packit aea12f
Packit aea12f
   or
Packit aea12f
Packit aea12f
     * the GNU General Public License as published by the Free
Packit aea12f
       Software Foundation; either version 2 of the License, or (at your
Packit aea12f
       option) any later version.
Packit aea12f
Packit aea12f
   or both in parallel, as here.
Packit aea12f
   This program is distributed in the hope that it will be useful,
Packit aea12f
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit aea12f
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit aea12f
   Lesser General Public License for more details.
Packit aea12f
Packit aea12f
   You should have received a copy of the GNU Lesser General Public License
Packit aea12f
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
Packit aea12f
Packit aea12f
UNIT *
Packit aea12f
FUNC (uninorm_t nf, const UNIT *s, size_t n,
Packit aea12f
      UNIT *resultbuf, size_t *lengthp)
Packit aea12f
{
Packit aea12f
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
Packit aea12f
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
Packit aea12f
Packit aea12f
  /* The result being accumulated.  */
Packit aea12f
  UNIT *result;
Packit aea12f
  size_t length;
Packit aea12f
  size_t allocated;
Packit aea12f
  /* The buffer for sorting.  */
Packit aea12f
  #define SORTBUF_PREALLOCATED 64
Packit aea12f
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
Packit aea12f
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
Packit aea12f
  size_t sortbuf_allocated;
Packit aea12f
  size_t sortbuf_count;
Packit aea12f
Packit aea12f
  /* Initialize the accumulator.  */
Packit aea12f
  if (resultbuf == NULL)
Packit aea12f
    {
Packit aea12f
      result = NULL;
Packit aea12f
      allocated = 0;
Packit aea12f
    }
Packit aea12f
  else
Packit aea12f
    {
Packit aea12f
      result = resultbuf;
Packit aea12f
      allocated = *lengthp;
Packit aea12f
    }
Packit aea12f
  length = 0;
Packit aea12f
Packit aea12f
  /* Initialize the buffer for sorting.  */
Packit aea12f
  sortbuf = sortbuf_preallocated;
Packit aea12f
  sortbuf_allocated = SORTBUF_PREALLOCATED;
Packit aea12f
  sortbuf_count = 0;
Packit aea12f
Packit aea12f
  {
Packit aea12f
    const UNIT *s_end = s + n;
Packit aea12f
Packit aea12f
    for (;;)
Packit aea12f
      {
Packit aea12f
        int count;
Packit aea12f
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
Packit aea12f
        int decomposed_count;
Packit aea12f
        int i;
Packit aea12f
Packit aea12f
        if (s < s_end)
Packit aea12f
          {
Packit aea12f
            /* Fetch the next character.  */
Packit aea12f
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
Packit aea12f
            decomposed_count = 1;
Packit aea12f
Packit aea12f
            /* Decompose it, recursively.
Packit aea12f
               It would be possible to precompute the recursive decomposition
Packit aea12f
               and store it in a table.  But this would significantly increase
Packit aea12f
               the size of the decomposition tables, because for example for
Packit aea12f
               U+1FC1 the recursive canonical decomposition and the recursive
Packit aea12f
               compatibility decomposition are different.  */
Packit aea12f
            {
Packit aea12f
              int curr;
Packit aea12f
Packit aea12f
              for (curr = 0; curr < decomposed_count; )
Packit aea12f
                {
Packit aea12f
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
Packit aea12f
                     all elements are atomic.  */
Packit aea12f
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
Packit aea12f
                  int curr_decomposed_count;
Packit aea12f
Packit aea12f
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
Packit aea12f
                  if (curr_decomposed_count >= 0)
Packit aea12f
                    {
Packit aea12f
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
Packit aea12f
                         decomposed[curr], making room.  It's not worth using
Packit aea12f
                         memcpy() here, since the counts are so small.  */
Packit aea12f
                      int shift = curr_decomposed_count - 1;
Packit aea12f
Packit aea12f
                      if (shift < 0)
Packit aea12f
                        abort ();
Packit aea12f
                      if (shift > 0)
Packit aea12f
                        {
Packit aea12f
                          int j;
Packit aea12f
Packit aea12f
                          decomposed_count += shift;
Packit aea12f
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
Packit aea12f
                            abort ();
Packit aea12f
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
Packit aea12f
                            decomposed[j + shift] = decomposed[j];
Packit aea12f
                        }
Packit aea12f
                      for (; shift >= 0; shift--)
Packit aea12f
                        decomposed[curr + shift] = curr_decomposed[shift];
Packit aea12f
                    }
Packit aea12f
                  else
Packit aea12f
                    {
Packit aea12f
                      /* decomposed[curr] is atomic.  */
Packit aea12f
                      curr++;
Packit aea12f
                    }
Packit aea12f
                }
Packit aea12f
            }
Packit aea12f
          }
Packit aea12f
        else
Packit aea12f
          {
Packit aea12f
            count = 0;
Packit aea12f
            decomposed_count = 0;
Packit aea12f
          }
Packit aea12f
Packit aea12f
        i = 0;
Packit aea12f
        for (;;)
Packit aea12f
          {
Packit aea12f
            ucs4_t uc;
Packit aea12f
            int ccc;
Packit aea12f
Packit aea12f
            if (s < s_end)
Packit aea12f
              {
Packit aea12f
                /* Fetch the next character from the decomposition.  */
Packit aea12f
                if (i == decomposed_count)
Packit aea12f
                  break;
Packit aea12f
                uc = decomposed[i];
Packit aea12f
                ccc = uc_combining_class (uc);
Packit aea12f
              }
Packit aea12f
            else
Packit aea12f
              {
Packit aea12f
                /* End of string reached.  */
Packit aea12f
                uc = 0;
Packit aea12f
                ccc = 0;
Packit aea12f
              }
Packit aea12f
Packit aea12f
            if (ccc == 0)
Packit aea12f
              {
Packit aea12f
                size_t j;
Packit aea12f
Packit aea12f
                /* Apply the canonical ordering algorithm to the accumulated
Packit aea12f
                   sequence of characters.  */
Packit aea12f
                if (sortbuf_count > 1)
Packit aea12f
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
Packit aea12f
                                                           sortbuf + sortbuf_count);
Packit aea12f
Packit aea12f
                if (composer != NULL)
Packit aea12f
                  {
Packit aea12f
                    /* Attempt to combine decomposed characters, as specified
Packit aea12f
                       in the Unicode Standard Annex #15 "Unicode Normalization
Packit aea12f
                       Forms".  We need to check
Packit aea12f
                         1. whether the first accumulated character is a
Packit aea12f
                            "starter" (i.e. has ccc = 0).  This is usually the
Packit aea12f
                            case.  But when the string starts with a
Packit aea12f
                            non-starter, the sortbuf also starts with a
Packit aea12f
                            non-starter.  Btw, this check could also be
Packit aea12f
                            omitted, because the composition table has only
Packit aea12f
                            entries (code1, code2) for which code1 is a
Packit aea12f
                            starter; if the first accumulated character is not
Packit aea12f
                            a starter, no lookup will succeed.
Packit aea12f
                         2. If the sortbuf has more than one character, check
Packit aea12f
                            for each of these characters that are not "blocked"
Packit aea12f
                            from the starter (i.e. have a ccc that is higher
Packit aea12f
                            than the ccc of the previous character) whether it
Packit aea12f
                            can be combined with the first character.
Packit aea12f
                         3. If only one character is left in sortbuf, check
Packit aea12f
                            whether it can be combined with the next character
Packit aea12f
                            (also a starter).  */
Packit aea12f
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
Packit aea12f
                      {
Packit aea12f
                        for (j = 1; j < sortbuf_count; )
Packit aea12f
                          {
Packit aea12f
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
Packit aea12f
                              {
Packit aea12f
                                ucs4_t combined =
Packit aea12f
                                  composer (sortbuf[0].code, sortbuf[j].code);
Packit aea12f
                                if (combined)
Packit aea12f
                                  {
Packit aea12f
                                    size_t k;
Packit aea12f
Packit aea12f
                                    sortbuf[0].code = combined;
Packit aea12f
                                    /* sortbuf[0].ccc = 0, still valid.  */
Packit aea12f
                                    for (k = j + 1; k < sortbuf_count; k++)
Packit aea12f
                                      sortbuf[k - 1] = sortbuf[k];
Packit aea12f
                                    sortbuf_count--;
Packit aea12f
                                    continue;
Packit aea12f
                                  }
Packit aea12f
                              }
Packit aea12f
                            j++;
Packit aea12f
                          }
Packit aea12f
                        if (s < s_end && sortbuf_count == 1)
Packit aea12f
                          {
Packit aea12f
                            ucs4_t combined =
Packit aea12f
                              composer (sortbuf[0].code, uc);
Packit aea12f
                            if (combined)
Packit aea12f
                              {
Packit aea12f
                                uc = combined;
Packit aea12f
                                ccc = 0;
Packit aea12f
                                /* uc could be further combined with subsequent
Packit aea12f
                                   characters.  So don't put it into sortbuf[0] in
Packit aea12f
                                   this round, only in the next round.  */
Packit aea12f
                                sortbuf_count = 0;
Packit aea12f
                              }
Packit aea12f
                          }
Packit aea12f
                      }
Packit aea12f
                  }
Packit aea12f
Packit aea12f
                for (j = 0; j < sortbuf_count; j++)
Packit aea12f
                  {
Packit aea12f
                    ucs4_t muc = sortbuf[j].code;
Packit aea12f
Packit aea12f
                    /* Append muc to the result accumulator.  */
Packit aea12f
                    if (length < allocated)
Packit aea12f
                      {
Packit aea12f
                        int ret =
Packit aea12f
                          U_UCTOMB (result + length, muc, allocated - length);
Packit aea12f
                        if (ret == -1)
Packit aea12f
                          {
Packit aea12f
                            errno = EINVAL;
Packit aea12f
                            goto fail;
Packit aea12f
                          }
Packit aea12f
                        if (ret >= 0)
Packit aea12f
                          {
Packit aea12f
                            length += ret;
Packit aea12f
                            goto done_appending;
Packit aea12f
                          }
Packit aea12f
                      }
Packit aea12f
                    {
Packit aea12f
                      size_t old_allocated = allocated;
Packit aea12f
                      size_t new_allocated = 2 * old_allocated;
Packit aea12f
                      if (new_allocated < 64)
Packit aea12f
                        new_allocated = 64;
Packit aea12f
                      if (new_allocated < old_allocated) /* integer overflow? */
Packit aea12f
                        abort ();
Packit aea12f
                      {
Packit aea12f
                        UNIT *larger_result;
Packit aea12f
                        if (result == NULL)
Packit aea12f
                          {
Packit aea12f
                            larger_result =
Packit aea12f
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
Packit aea12f
                            if (larger_result == NULL)
Packit aea12f
                              {
Packit aea12f
                                errno = ENOMEM;
Packit aea12f
                                goto fail;
Packit aea12f
                              }
Packit aea12f
                          }
Packit aea12f
                        else if (result == resultbuf)
Packit aea12f
                          {
Packit aea12f
                            larger_result =
Packit aea12f
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
Packit aea12f
                            if (larger_result == NULL)
Packit aea12f
                              {
Packit aea12f
                                errno = ENOMEM;
Packit aea12f
                                goto fail;
Packit aea12f
                              }
Packit aea12f
                            U_CPY (larger_result, resultbuf, length);
Packit aea12f
                          }
Packit aea12f
                        else
Packit aea12f
                          {
Packit aea12f
                            larger_result =
Packit aea12f
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
Packit aea12f
                            if (larger_result == NULL)
Packit aea12f
                              {
Packit aea12f
                                errno = ENOMEM;
Packit aea12f
                                goto fail;
Packit aea12f
                              }
Packit aea12f
                          }
Packit aea12f
                        result = larger_result;
Packit aea12f
                        allocated = new_allocated;
Packit aea12f
                        {
Packit aea12f
                          int ret =
Packit aea12f
                            U_UCTOMB (result + length, muc, allocated - length);
Packit aea12f
                          if (ret == -1)
Packit aea12f
                            {
Packit aea12f
                              errno = EINVAL;
Packit aea12f
                              goto fail;
Packit aea12f
                            }
Packit aea12f
                          if (ret < 0)
Packit aea12f
                            abort ();
Packit aea12f
                          length += ret;
Packit aea12f
                          goto done_appending;
Packit aea12f
                        }
Packit aea12f
                      }
Packit aea12f
                    }
Packit aea12f
                   done_appending: ;
Packit aea12f
                  }
Packit aea12f
Packit aea12f
                /* sortbuf is now empty.  */
Packit aea12f
                sortbuf_count = 0;
Packit aea12f
              }
Packit aea12f
Packit aea12f
            if (!(s < s_end))
Packit aea12f
              /* End of string reached.  */
Packit aea12f
              break;
Packit aea12f
Packit aea12f
            /* Append (uc, ccc) to sortbuf.  */
Packit aea12f
            if (sortbuf_count == sortbuf_allocated)
Packit aea12f
              {
Packit aea12f
                struct ucs4_with_ccc *new_sortbuf;
Packit aea12f
Packit aea12f
                sortbuf_allocated = 2 * sortbuf_allocated;
Packit aea12f
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
Packit aea12f
                  abort ();
Packit aea12f
                new_sortbuf =
Packit aea12f
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
Packit aea12f
                if (new_sortbuf == NULL)
Packit aea12f
                  {
Packit aea12f
                    errno = ENOMEM;
Packit aea12f
                    goto fail;
Packit aea12f
                  }
Packit aea12f
                memcpy (new_sortbuf, sortbuf,
Packit aea12f
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
Packit aea12f
                if (sortbuf != sortbuf_preallocated)
Packit aea12f
                  free (sortbuf);
Packit aea12f
                sortbuf = new_sortbuf;
Packit aea12f
              }
Packit aea12f
            sortbuf[sortbuf_count].code = uc;
Packit aea12f
            sortbuf[sortbuf_count].ccc = ccc;
Packit aea12f
            sortbuf_count++;
Packit aea12f
Packit aea12f
            i++;
Packit aea12f
          }
Packit aea12f
Packit aea12f
        if (!(s < s_end))
Packit aea12f
          /* End of string reached.  */
Packit aea12f
          break;
Packit aea12f
Packit aea12f
        s += count;
Packit aea12f
      }
Packit aea12f
  }
Packit aea12f
Packit aea12f
  if (length == 0)
Packit aea12f
    {
Packit aea12f
      if (result == NULL)
Packit aea12f
        {
Packit aea12f
          /* Return a non-NULL value.  NULL means error.  */
Packit aea12f
          result = (UNIT *) malloc (1);
Packit aea12f
          if (result == NULL)
Packit aea12f
            {
Packit aea12f
              errno = ENOMEM;
Packit aea12f
              goto fail;
Packit aea12f
            }
Packit aea12f
        }
Packit aea12f
    }
Packit aea12f
  else if (result != resultbuf && length < allocated)
Packit aea12f
    {
Packit aea12f
      /* Shrink the allocated memory if possible.  */
Packit aea12f
      UNIT *memory;
Packit aea12f
Packit aea12f
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
Packit aea12f
      if (memory != NULL)
Packit aea12f
        result = memory;
Packit aea12f
    }
Packit aea12f
Packit aea12f
  if (sortbuf_count > 0)
Packit aea12f
    abort ();
Packit aea12f
  if (sortbuf != sortbuf_preallocated)
Packit aea12f
    free (sortbuf);
Packit aea12f
Packit aea12f
  *lengthp = length;
Packit aea12f
  return result;
Packit aea12f
Packit aea12f
 fail:
Packit aea12f
  {
Packit aea12f
    int saved_errno = errno;
Packit aea12f
    if (sortbuf != sortbuf_preallocated)
Packit aea12f
      free (sortbuf);
Packit aea12f
    if (result != resultbuf)
Packit aea12f
      free (result);
Packit aea12f
    errno = saved_errno;
Packit aea12f
  }
Packit aea12f
  return NULL;
Packit aea12f
}