Blame lib/unistring/uninorm/u-normalize-internal.h

Packit Service 4684c1
/* Decomposition and composition of Unicode strings.
Packit Service 4684c1
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
Packit Service 4684c1
   Written by Bruno Haible <bruno@clisp.org>, 2009.
Packit Service 4684c1
Packit Service 4684c1
   This program is free software: you can redistribute it and/or
Packit Service 4684c1
   modify it under the terms of either:
Packit Service 4684c1
Packit Service 4684c1
     * the GNU Lesser General Public License as published by the Free
Packit Service 4684c1
       Software Foundation; either version 3 of the License, or (at your
Packit Service 4684c1
       option) any later version.
Packit Service 4684c1
Packit Service 4684c1
   or
Packit Service 4684c1
Packit Service 4684c1
     * the GNU General Public License as published by the Free
Packit Service 4684c1
       Software Foundation; either version 2 of the License, or (at your
Packit Service 4684c1
       option) any later version.
Packit Service 4684c1
Packit Service 4684c1
   or both in parallel, as here.
Packit Service 4684c1
   This program is distributed in the hope that it will be useful,
Packit Service 4684c1
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit Service 4684c1
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit Service 4684c1
   Lesser General Public License for more details.
Packit Service 4684c1
Packit Service 4684c1
   You should have received a copy of the GNU Lesser General Public License
Packit Service 4684c1
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
Packit Service 4684c1
Packit Service 4684c1
/* Normalizes the N units starting at S according to the normalization
   form NF and returns the normalized string.

   FUNC, UNIT, U_MBTOUC_UNSAFE, U_UCTOMB and U_CPY are not defined in this
   file; they are expected to be provided by the file that includes it
   (presumably one instantiation per unit type - TODO confirm).

   Processing, per input character:
     1. it is decomposed recursively through nf->decomposer,
     2. the resulting characters are collected in a sort buffer until the
        next character with combining class 0 is seen,
     3. the collected run is sorted with
        gl_uninorm_decompose_merge_sort_inplace, recombined through
        nf->composer if that is non-NULL, and appended to the result.

   RESULTBUF / LENGTHP: if RESULTBUF is non-NULL it is used as the initial
   output buffer, with a capacity of *LENGTHP units; when the output does
   not fit, it is continued in malloc()ed memory and RESULTBUF itself is
   never freed or reallocated.  If RESULTBUF is NULL the result is always
   malloc()ed (a 1-byte block is returned for an empty result, so that
   the return value is non-NULL).

   Returns the result and stores its length, counted in units, in
   *LENGTHP.  On failure, returns NULL with errno set to ENOMEM (an
   allocation failed) or EINVAL (U_UCTOMB returned -1); memory allocated
   by this function is released and *LENGTHP is left unmodified.  */
UNIT *
Packit Service 4684c1
FUNC (uninorm_t nf, const UNIT *s, size_t n,
Packit Service 4684c1
      UNIT *resultbuf, size_t *lengthp)
Packit Service 4684c1
{
Packit Service 4684c1
  /* The two callbacks of the normalization form.  decomposer is called
     unconditionally; composer may be NULL, in which case the composition
     step is skipped.  */
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
Packit Service 4684c1
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;
Packit Service 4684c1
Packit Service 4684c1
  /* The result being accumulated.  */
Packit Service 4684c1
  UNIT *result;
Packit Service 4684c1
  size_t length;
Packit Service 4684c1
  size_t allocated;
Packit Service 4684c1
  /* The buffer for sorting.  */
Packit Service 4684c1
  #define SORTBUF_PREALLOCATED 64
Packit Service 4684c1
  /* Twice the nominal capacity: the first half holds the characters, the
     second half (sortbuf + sortbuf_count) is handed to the merge sort as
     its third argument, presumably as scratch space.  */
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
Packit Service 4684c1
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
Packit Service 4684c1
  size_t sortbuf_allocated;
Packit Service 4684c1
  size_t sortbuf_count;
Packit Service 4684c1
Packit Service 4684c1
  /* Initialize the accumulator.  */
Packit Service 4684c1
  if (resultbuf == NULL)
Packit Service 4684c1
    {
Packit Service 4684c1
      result = NULL;
Packit Service 4684c1
      allocated = 0;
Packit Service 4684c1
    }
Packit Service 4684c1
  else
Packit Service 4684c1
    {
Packit Service 4684c1
      result = resultbuf;
Packit Service 4684c1
      allocated = *lengthp;
Packit Service 4684c1
    }
Packit Service 4684c1
  length = 0;
Packit Service 4684c1
Packit Service 4684c1
  /* Initialize the buffer for sorting.  */
Packit Service 4684c1
  sortbuf = sortbuf_preallocated;
Packit Service 4684c1
  sortbuf_allocated = SORTBUF_PREALLOCATED;
Packit Service 4684c1
  sortbuf_count = 0;
Packit Service 4684c1
Packit Service 4684c1
  {
Packit Service 4684c1
    const UNIT *s_end = s + n;
Packit Service 4684c1
Packit Service 4684c1
    /* One iteration per input character, plus one final iteration with
       s == s_end whose only purpose is to flush what is left in
       sortbuf.  */
    for (;;)
Packit Service 4684c1
      {
Packit Service 4684c1
        /* Number of units occupied by the current input character; s is
           advanced by this amount at the end of the iteration.  */
        int count;
Packit Service 4684c1
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
Packit Service 4684c1
        int decomposed_count;
Packit Service 4684c1
        int i;
Packit Service 4684c1
Packit Service 4684c1
        if (s < s_end)
Packit Service 4684c1
          {
Packit Service 4684c1
            /* Fetch the next character.  */
Packit Service 4684c1
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
Packit Service 4684c1
            decomposed_count = 1;
Packit Service 4684c1
Packit Service 4684c1
            /* Decompose it, recursively.
Packit Service 4684c1
               It would be possible to precompute the recursive decomposition
Packit Service 4684c1
               and store it in a table.  But this would significantly increase
Packit Service 4684c1
               the size of the decomposition tables, because for example for
Packit Service 4684c1
               U+1FC1 the recursive canonical decomposition and the recursive
Packit Service 4684c1
               compatibility decomposition are different.  */
Packit Service 4684c1
            {
Packit Service 4684c1
              int curr;
Packit Service 4684c1
Packit Service 4684c1
              /* curr is advanced only when decomposed[curr] turns out to
                 be atomic; after a replacement the same index is examined
                 again, which is what makes the decomposition recursive.  */
              for (curr = 0; curr < decomposed_count; )
Packit Service 4684c1
                {
Packit Service 4684c1
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
Packit Service 4684c1
                     all elements are atomic.  */
Packit Service 4684c1
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
Packit Service 4684c1
                  int curr_decomposed_count;
Packit Service 4684c1
Packit Service 4684c1
                  /* A negative return value means "no decomposition";
                     otherwise it is the number of code points stored in
                     curr_decomposed.  */
                  curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);
Packit Service 4684c1
                  if (curr_decomposed_count >= 0)
Packit Service 4684c1
                    {
Packit Service 4684c1
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
Packit Service 4684c1
                         decomposed[curr], making room.  It's not worth using
Packit Service 4684c1
                         memcpy() here, since the counts are so small.  */
Packit Service 4684c1
                      int shift = curr_decomposed_count - 1;
Packit Service 4684c1
Packit Service 4684c1
                      /* An empty decomposition (count 0) is treated as an
                         internal error.  */
                      if (shift < 0)
Packit Service 4684c1
                        abort ();
Packit Service 4684c1
                      if (shift > 0)
Packit Service 4684c1
                        {
Packit Service 4684c1
                          int j;
Packit Service 4684c1
Packit Service 4684c1
                          decomposed_count += shift;
Packit Service 4684c1
                          /* decomposed[] has UC_DECOMPOSITION_MAX_LENGTH
                             slots; growing past that would overrun it.  */
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
Packit Service 4684c1
                            abort ();
Packit Service 4684c1
                          /* Move the elements after curr up by shift
                             positions, starting from the last one so that
                             nothing is overwritten before it is moved.  */
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
Packit Service 4684c1
                            decomposed[j + shift] = decomposed[j];
Packit Service 4684c1
                        }
Packit Service 4684c1
                      /* Store the replacement at
                         decomposed[curr..curr+shift].  */
                      for (; shift >= 0; shift--)
Packit Service 4684c1
                        decomposed[curr + shift] = curr_decomposed[shift];
Packit Service 4684c1
                    }
Packit Service 4684c1
                  else
Packit Service 4684c1
                    {
Packit Service 4684c1
                      /* decomposed[curr] is atomic.  */
Packit Service 4684c1
                      curr++;
Packit Service 4684c1
                    }
Packit Service 4684c1
                }
Packit Service 4684c1
            }
Packit Service 4684c1
          }
Packit Service 4684c1
        else
Packit Service 4684c1
          {
Packit Service 4684c1
            /* End of input: nothing was fetched.  */
            count = 0;
Packit Service 4684c1
            decomposed_count = 0;
Packit Service 4684c1
          }
Packit Service 4684c1
Packit Service 4684c1
        /* Feed decomposed[0..decomposed_count-1] into sortbuf.  At the end
           of the input this loop body runs once with a dummy starter
           (uc = 0, ccc = 0) so that the pending run gets flushed.  */
        i = 0;
Packit Service 4684c1
        for (;;)
Packit Service 4684c1
          {
Packit Service 4684c1
            ucs4_t uc;
Packit Service 4684c1
            int ccc;
Packit Service 4684c1
Packit Service 4684c1
            if (s < s_end)
Packit Service 4684c1
              {
Packit Service 4684c1
                /* Fetch the next character from the decomposition.  */
Packit Service 4684c1
                if (i == decomposed_count)
Packit Service 4684c1
                  break;
Packit Service 4684c1
                uc = decomposed[i];
Packit Service 4684c1
                ccc = uc_combining_class (uc);
Packit Service 4684c1
              }
Packit Service 4684c1
            else
Packit Service 4684c1
              {
Packit Service 4684c1
                /* End of string reached.  */
Packit Service 4684c1
                uc = 0;
Packit Service 4684c1
                ccc = 0;
Packit Service 4684c1
              }
Packit Service 4684c1
Packit Service 4684c1
            /* A character with combining class 0 ends the run collected so
               far: sort it, optionally compose it, and emit it.  */
            if (ccc == 0)
Packit Service 4684c1
              {
Packit Service 4684c1
                size_t j;
Packit Service 4684c1
Packit Service 4684c1
                /* Apply the canonical ordering algorithm to the accumulated
Packit Service 4684c1
                   sequence of characters.  */
Packit Service 4684c1
                if (sortbuf_count > 1)
Packit Service 4684c1
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,
Packit Service 4684c1
                                                           sortbuf + sortbuf_count);
Packit Service 4684c1
Packit Service 4684c1
                if (composer != NULL)
Packit Service 4684c1
                  {
Packit Service 4684c1
                    /* Attempt to combine decomposed characters, as specified
Packit Service 4684c1
                       in the Unicode Standard Annex #15 "Unicode Normalization
Packit Service 4684c1
                       Forms".  We need to check
Packit Service 4684c1
                         1. whether the first accumulated character is a
Packit Service 4684c1
                            "starter" (i.e. has ccc = 0).  This is usually the
Packit Service 4684c1
                            case.  But when the string starts with a
Packit Service 4684c1
                            non-starter, the sortbuf also starts with a
Packit Service 4684c1
                            non-starter.  Btw, this check could also be
Packit Service 4684c1
                            omitted, because the composition table has only
Packit Service 4684c1
                            entries (code1, code2) for which code1 is a
Packit Service 4684c1
                            starter; if the first accumulated character is not
Packit Service 4684c1
                            a starter, no lookup will succeed.
Packit Service 4684c1
                         2. If the sortbuf has more than one character, check
Packit Service 4684c1
                            for each of these characters that are not "blocked"
Packit Service 4684c1
                            from the starter (i.e. have a ccc that is higher
Packit Service 4684c1
                            than the ccc of the previous character) whether it
Packit Service 4684c1
                            can be combined with the first character.
Packit Service 4684c1
                         3. If only one character is left in sortbuf, check
Packit Service 4684c1
                            whether it can be combined with the next character
Packit Service 4684c1
                            (also a starter).  */
Packit Service 4684c1
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
Packit Service 4684c1
                      {
Packit Service 4684c1
                        /* j is advanced only when sortbuf[j] stays in the
                           buffer; after a successful combination the
                           element is removed and the same index, now
                           holding the former successor, is tested again.  */
                        for (j = 1; j < sortbuf_count; )
Packit Service 4684c1
                          {
Packit Service 4684c1
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
Packit Service 4684c1
                              {
Packit Service 4684c1
                                /* composer returns 0 when the pair has no
                                   composition.  */
                                ucs4_t combined =
Packit Service 4684c1
                                  composer (sortbuf[0].code, sortbuf[j].code);
Packit Service 4684c1
                                if (combined)
Packit Service 4684c1
                                  {
Packit Service 4684c1
                                    size_t k;
Packit Service 4684c1
Packit Service 4684c1
                                    sortbuf[0].code = combined;
Packit Service 4684c1
                                    /* sortbuf[0].ccc = 0, still valid.  */
Packit Service 4684c1
                                    /* Remove sortbuf[j] by moving the tail
                                       down one position.  */
                                    for (k = j + 1; k < sortbuf_count; k++)
Packit Service 4684c1
                                      sortbuf[k - 1] = sortbuf[k];
Packit Service 4684c1
                                    sortbuf_count--;
Packit Service 4684c1
                                    continue;
Packit Service 4684c1
                                  }
Packit Service 4684c1
                              }
Packit Service 4684c1
                            j++;
Packit Service 4684c1
                          }
Packit Service 4684c1
                        /* s < s_end excludes the dummy starter used at the
                           end of the string: only a real next starter uc
                           may be combined with the single remaining
                           character.  */
                        if (s < s_end && sortbuf_count == 1)
Packit Service 4684c1
                          {
Packit Service 4684c1
                            ucs4_t combined =
Packit Service 4684c1
                              composer (sortbuf[0].code, uc);
Packit Service 4684c1
                            if (combined)
Packit Service 4684c1
                              {
Packit Service 4684c1
                                uc = combined;
Packit Service 4684c1
                                ccc = 0;
Packit Service 4684c1
                                /* uc could be further combined with subsequent
Packit Service 4684c1
                                   characters.  So don't put it into sortbuf[0] in
Packit Service 4684c1
                                   this round, only in the next round.  */
Packit Service 4684c1
                                sortbuf_count = 0;
Packit Service 4684c1
                              }
Packit Service 4684c1
                          }
Packit Service 4684c1
                      }
Packit Service 4684c1
                  }
Packit Service 4684c1
Packit Service 4684c1
                /* Emit what is left in sortbuf (nothing, if the starter was
                   just merged into uc above).  */
                for (j = 0; j < sortbuf_count; j++)
Packit Service 4684c1
                  {
Packit Service 4684c1
                    ucs4_t muc = sortbuf[j].code;
Packit Service 4684c1
Packit Service 4684c1
                    /* Append muc to the result accumulator.  */
Packit Service 4684c1
                    /* First try to store it in the space that is already
                       available.  -1 is reported as EINVAL; any other
                       negative return value (presumably "buffer too
                       small") falls through to the growth path below.  */
                    if (length < allocated)
Packit Service 4684c1
                      {
Packit Service 4684c1
                        int ret =
Packit Service 4684c1
                          U_UCTOMB (result + length, muc, allocated - length);
Packit Service 4684c1
                        if (ret == -1)
Packit Service 4684c1
                          {
Packit Service 4684c1
                            errno = EINVAL;
Packit Service 4684c1
                            goto fail;
Packit Service 4684c1
                          }
Packit Service 4684c1
                        if (ret >= 0)
Packit Service 4684c1
                          {
Packit Service 4684c1
                            length += ret;
Packit Service 4684c1
                            goto done_appending;
Packit Service 4684c1
                          }
Packit Service 4684c1
                      }
Packit Service 4684c1
                    /* Not enough room: double the capacity (at least 64
                       units) and retry.  */
                    {
Packit Service 4684c1
                      size_t old_allocated = allocated;
Packit Service 4684c1
                      size_t new_allocated = 2 * old_allocated;
Packit Service 4684c1
                      if (new_allocated < 64)
Packit Service 4684c1
                        new_allocated = 64;
Packit Service 4684c1
                      if (new_allocated < old_allocated) /* integer overflow? */
Packit Service 4684c1
                        abort ();
Packit Service 4684c1
                      {
Packit Service 4684c1
                        UNIT *larger_result;
Packit Service 4684c1
                        /* Three cases, depending on who owns the current
                           buffer: none yet, the caller, or this
                           function.  */
                        if (result == NULL)
Packit Service 4684c1
                          {
Packit Service 4684c1
                            larger_result =
Packit Service 4684c1
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
Packit Service 4684c1
                            if (larger_result == NULL)
Packit Service 4684c1
                              {
Packit Service 4684c1
                                errno = ENOMEM;
Packit Service 4684c1
                                goto fail;
Packit Service 4684c1
                              }
Packit Service 4684c1
                          }
Packit Service 4684c1
                        else if (result == resultbuf)
Packit Service 4684c1
                          {
Packit Service 4684c1
                            /* The caller's buffer must not be passed to
                               realloc(); allocate a fresh block and copy
                               what has been produced so far.  */
                            larger_result =
Packit Service 4684c1
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
Packit Service 4684c1
                            if (larger_result == NULL)
Packit Service 4684c1
                              {
Packit Service 4684c1
                                errno = ENOMEM;
Packit Service 4684c1
                                goto fail;
Packit Service 4684c1
                              }
Packit Service 4684c1
                            U_CPY (larger_result, resultbuf, length);
Packit Service 4684c1
                          }
Packit Service 4684c1
                        else
Packit Service 4684c1
                          {
Packit Service 4684c1
                            /* result stays valid if realloc() fails, so
                               the fail path can still free it.  */
                            larger_result =
Packit Service 4684c1
                              (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
Packit Service 4684c1
                            if (larger_result == NULL)
Packit Service 4684c1
                              {
Packit Service 4684c1
                                errno = ENOMEM;
Packit Service 4684c1
                                goto fail;
Packit Service 4684c1
                              }
Packit Service 4684c1
                          }
Packit Service 4684c1
                        result = larger_result;
Packit Service 4684c1
                        allocated = new_allocated;
Packit Service 4684c1
                        {
Packit Service 4684c1
                          /* Second attempt.  After the enlargement any
                             negative return value other than -1 is treated
                             as an internal error.  */
                          int ret =
Packit Service 4684c1
                            U_UCTOMB (result + length, muc, allocated - length);
Packit Service 4684c1
                          if (ret == -1)
Packit Service 4684c1
                            {
Packit Service 4684c1
                              errno = EINVAL;
Packit Service 4684c1
                              goto fail;
Packit Service 4684c1
                            }
Packit Service 4684c1
                          if (ret < 0)
Packit Service 4684c1
                            abort ();
Packit Service 4684c1
                          length += ret;
Packit Service 4684c1
                          goto done_appending;
Packit Service 4684c1
                        }
Packit Service 4684c1
                      }
Packit Service 4684c1
                    }
Packit Service 4684c1
                   done_appending: ;
Packit Service 4684c1
                  }
Packit Service 4684c1
Packit Service 4684c1
                /* sortbuf is now empty.  */
Packit Service 4684c1
                sortbuf_count = 0;
Packit Service 4684c1
              }
Packit Service 4684c1
Packit Service 4684c1
            if (!(s < s_end))
Packit Service 4684c1
              /* End of string reached.  */
Packit Service 4684c1
              break;
Packit Service 4684c1
Packit Service 4684c1
            /* Append (uc, ccc) to sortbuf.  */
Packit Service 4684c1
            if (sortbuf_count == sortbuf_allocated)
Packit Service 4684c1
              {
Packit Service 4684c1
                struct ucs4_with_ccc *new_sortbuf;
Packit Service 4684c1
Packit Service 4684c1
                /* Double the nominal capacity.  The block allocated below
                   has 2 * sortbuf_allocated entries, matching the layout
                   of sortbuf_preallocated; only the first sortbuf_count
                   entries carry data and need to be copied.  */
                sortbuf_allocated = 2 * sortbuf_allocated;
Packit Service 4684c1
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
Packit Service 4684c1
                  abort ();
Packit Service 4684c1
                new_sortbuf =
Packit Service 4684c1
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));
Packit Service 4684c1
                if (new_sortbuf == NULL)
Packit Service 4684c1
                  {
Packit Service 4684c1
                    errno = ENOMEM;
Packit Service 4684c1
                    goto fail;
Packit Service 4684c1
                  }
Packit Service 4684c1
                memcpy (new_sortbuf, sortbuf,
Packit Service 4684c1
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
Packit Service 4684c1
                /* The preallocated array is an automatic variable of this
                   function and must not be freed.  */
                if (sortbuf != sortbuf_preallocated)
Packit Service 4684c1
                  free (sortbuf);
Packit Service 4684c1
                sortbuf = new_sortbuf;
Packit Service 4684c1
              }
Packit Service 4684c1
            sortbuf[sortbuf_count].code = uc;
Packit Service 4684c1
            sortbuf[sortbuf_count].ccc = ccc;
Packit Service 4684c1
            sortbuf_count++;
Packit Service 4684c1
Packit Service 4684c1
            i++;
Packit Service 4684c1
          }
Packit Service 4684c1
Packit Service 4684c1
        if (!(s < s_end))
Packit Service 4684c1
          /* End of string reached.  */
Packit Service 4684c1
          break;
Packit Service 4684c1
Packit Service 4684c1
        /* Advance past the input character that was just processed.  */
        s += count;
Packit Service 4684c1
      }
Packit Service 4684c1
  }
Packit Service 4684c1
Packit Service 4684c1
  if (length == 0)
Packit Service 4684c1
    {
Packit Service 4684c1
      if (result == NULL)
Packit Service 4684c1
        {
Packit Service 4684c1
          /* Return a non-NULL value.  NULL means error.  */
Packit Service 4684c1
          result = (UNIT *) malloc (1);
Packit Service 4684c1
          if (result == NULL)
Packit Service 4684c1
            {
Packit Service 4684c1
              errno = ENOMEM;
Packit Service 4684c1
              goto fail;
Packit Service 4684c1
            }
Packit Service 4684c1
        }
Packit Service 4684c1
    }
Packit Service 4684c1
  else if (result != resultbuf && length < allocated)
Packit Service 4684c1
    {
Packit Service 4684c1
      /* Shrink the allocated memory if possible.  */
Packit Service 4684c1
      UNIT *memory;
Packit Service 4684c1
Packit Service 4684c1
      /* A failing realloc() is not an error here: the larger block is
         simply kept.  */
      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
Packit Service 4684c1
      if (memory != NULL)
Packit Service 4684c1
        result = memory;
Packit Service 4684c1
    }
Packit Service 4684c1
Packit Service 4684c1
  /* Consistency check: the final flush in the loop above leaves sortbuf
     empty.  */
  if (sortbuf_count > 0)
Packit Service 4684c1
    abort ();
Packit Service 4684c1
  if (sortbuf != sortbuf_preallocated)
Packit Service 4684c1
    free (sortbuf);
Packit Service 4684c1
Packit Service 4684c1
  *lengthp = length;
Packit Service 4684c1
  return result;
Packit Service 4684c1
Packit Service 4684c1
 fail:
Packit Service 4684c1
  {
Packit Service 4684c1
    /* Release everything this function allocated (never the caller's
       resultbuf), preserving the errno value across the free() calls.  */
    int saved_errno = errno;
Packit Service 4684c1
    if (sortbuf != sortbuf_preallocated)
Packit Service 4684c1
      free (sortbuf);
Packit Service 4684c1
    if (result != resultbuf)
Packit Service 4684c1
      free (result);
Packit Service 4684c1
    errno = saved_errno;
Packit Service 4684c1
  }
Packit Service 4684c1
  return NULL;
Packit Service 4684c1
}