Blob Blame History Raw
/* Association between Unicode characters and their names.
   Copyright (C) 2000-2002, 2005-2007, 2009-2017 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include "uniname.h"

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


/* Table of Unicode character names, derived from UnicodeData.txt.
   This table is generated in a way to minimize the memory footprint:
     1. its compiled size is small (less than 350 KB),
     2. it resides entirely in the text or read-only data segment of the
        executable or shared library: the table contains only immediate
        integers, no pointers, and the functions don't do heap allocation.
 */
#include "uninames.h"
/* It contains:
  static const char unicode_name_words[36303] = ...;
  #define UNICODE_CHARNAME_NUM_WORDS 6260
  static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  #define UNICODE_CHARNAME_WORD_HANGUL 3902
  #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  #define UNICODE_CHARNAME_WORD_CJK 417
  #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  static const uint16_t unicode_names[68940] = ...;
  static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
  static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
  #define UNICODE_CHARNAME_MAX_LENGTH 83
  #define UNICODE_CHARNAME_MAX_WORDS 13
  static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
*/

/* Returns the word with a given index.  */
static const char *
unicode_name_word (unsigned int index, unsigned int *lengthp)
{
  unsigned int i1;
  unsigned int i2;
  unsigned int i;

  assert (index < UNICODE_CHARNAME_NUM_WORDS);

  /* Binary search for i with
       unicode_name_by_length[i].ind_offset <= index
     and
       index < unicode_name_by_length[i+1].ind_offset
   */

  i1 = 0;
  i2 = SIZEOF (unicode_name_by_length) - 1;
  while (i2 - i1 > 1)
    {
      unsigned int i = (i1 + i2) >> 1;
      if (unicode_name_by_length[i].ind_offset <= index)
        i1 = i;
      else
        i2 = i;
    }
  i = i1;
  assert (unicode_name_by_length[i].ind_offset <= index
          && index < unicode_name_by_length[i+1].ind_offset);
  *lengthp = i;
  return &unicode_name_words[unicode_name_by_length[i].extra_offset
                             + (index-unicode_name_by_length[i].ind_offset)*i];
}

/* Looks up the index of a word.  */
static int
unicode_name_word_lookup (const char *word, unsigned int length)
{
  if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
    {
      /* Binary search among the words of given length.  */
      unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
      unsigned int i0 = unicode_name_by_length[length].ind_offset;
      unsigned int i1 = i0;
      unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
      while (i2 - i1 > 0)
        {
          unsigned int i = (i1 + i2) >> 1;
          const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
          const char *w = word;
          unsigned int n = length;
          for (;;)
            {
              if (*p < *w)
                {
                  if (i1 == i)
                    return -1;
                  /* Note here: i1 < i < i2.  */
                  i1 = i;
                  break;
                }
              if (*p > *w)
                {
                  /* Note here: i1 <= i < i2.  */
                  i2 = i;
                  break;
                }
              p++; w++; n--;
              if (n == 0)
                return i;
            }
        }
    }
  return -1;
}

#define UNINAME_INVALID_INDEX UINT16_MAX

/* Looks up the internal index of a Unicode character.  */
static uint16_t
unicode_code_to_index (ucs4_t c)
{
  /* Binary search in unicode_ranges.  */
  unsigned int i1 = 0;
  unsigned int i2 = SIZEOF (unicode_ranges);

  for (;;)
    {
      unsigned int i = (i1 + i2) >> 1;
      ucs4_t start_code =
        unicode_ranges[i].index + unicode_ranges[i].gap;
      ucs4_t end_code =
        start_code + unicode_ranges[i].length - 1;

      if (start_code <= c && c <= end_code)
        return c - unicode_ranges[i].gap;

      if (end_code < c)
        {
          if (i1 == i)
            break;
          /* Note here: i1 < i < i2.  */
          i1 = i;
        }
      else if (c < start_code)
        {
          if (i2 == i)
            break;
          /* Note here: i1 <= i < i2.  */
          i2 = i;
        }
    }
  return UNINAME_INVALID_INDEX;
}

/* Looks up the codepoint of a Unicode character, from the given
   internal index.  */
static ucs4_t
unicode_index_to_code (uint16_t index)
{
  /* Binary search in unicode_ranges.  */
  unsigned int i1 = 0;
  unsigned int i2 = SIZEOF (unicode_ranges);

  for (;;)
    {
      unsigned int i = (i1 + i2) >> 1;
      uint16_t start_index = unicode_ranges[i].index;
      uint16_t end_index = start_index + unicode_ranges[i].length - 1;

      if (start_index <= index && index <= end_index)
        return index + unicode_ranges[i].gap;

      if (end_index < index)
        {
          if (i1 == i)
            break;
          /* Note here: i1 < i < i2.  */
          i1 = i;
        }
      else if (index < start_index)
        {
          if (i2 == i)
            break;
          /* Note here: i1 <= i < i2.  */
          i2 = i;
        }
    }
  return UNINAME_INVALID;
}


/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};

/* Looks up the name of a Unicode character, in uppercase ASCII.
   Returns the filled buf, or NULL if the character does not have a name.  */
char *
unicode_character_name (ucs4_t c, char *buf)
{
  if (c >= 0xAC00 && c <= 0xD7A3)
    {
      /* Special case for Hangul syllables. Keeps the tables small.  */
      char *ptr;
      unsigned int tmp;
      unsigned int index1;
      unsigned int index2;
      unsigned int index3;
      const char *q;

      /* buf needs to have at least 16 + 7 bytes here.  */
      memcpy (buf, "HANGUL SYLLABLE ", 16);
      ptr = buf + 16;

      tmp = c - 0xAC00;
      index3 = tmp % 28; tmp = tmp / 28;
      index2 = tmp % 21; tmp = tmp / 21;
      index1 = tmp;

      q = jamo_initial_short_name[index1];
      while (*q != '\0')
        *ptr++ = *q++;
      q = jamo_medial_short_name[index2];
      while (*q != '\0')
        *ptr++ = *q++;
      q = jamo_final_short_name[index3];
      while (*q != '\0')
        *ptr++ = *q++;
      *ptr = '\0';
      return buf;
    }
  else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
           || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
    {
      /* Special case for CJK compatibility ideographs. Keeps the tables
         small.  */
      char *ptr;
      int i;

      /* buf needs to have at least 28 + 5 bytes here.  */
      memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
      ptr = buf + 28;

      for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
        {
          unsigned int x = (c >> i) & 0xf;
          *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
        }
      *ptr = '\0';
      return buf;
    }
  else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
    {
      /* Special case for variation selectors. Keeps the tables
         small.  */

      /* buf needs to have at least 19 + 3 bytes here.  */
      sprintf (buf, "VARIATION SELECTOR-%d",
               c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
      return buf;
    }
  else
    {
      uint16_t index = unicode_code_to_index (c);
      const uint16_t *words = NULL;

      if (index != UNINAME_INVALID_INDEX)
        {
          /* Binary search in unicode_code_to_name.  */
          unsigned int i1 = 0;
          unsigned int i2 = SIZEOF (unicode_index_to_name);
          for (;;)
            {
              unsigned int i = (i1 + i2) >> 1;
              if (unicode_index_to_name[i].index == index)
                {
                  words = &unicode_names[unicode_index_to_name[i].name];
                  break;
                }
              else if (unicode_index_to_name[i].index < index)
                {
                  if (i1 == i)
                    {
                      words = NULL;
                      break;
                    }
                  /* Note here: i1 < i < i2.  */
                  i1 = i;
                }
              else if (unicode_index_to_name[i].index > index)
                {
                  if (i2 == i)
                    {
                      words = NULL;
                      break;
                    }
                  /* Note here: i1 <= i < i2.  */
                  i2 = i;
                }
            }
        }
      if (words != NULL)
        {
          /* Found it in unicode_index_to_name. Now concatenate the words.  */
          /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
          char *ptr = buf;
          for (;;)
            {
              unsigned int wordlen;
              const char *word = unicode_name_word (*words>>1, &wordlen);
              do
                *ptr++ = *word++;
              while (--wordlen > 0);
              if ((*words & 1) == 0)
                break;
              *ptr++ = ' ';
              words++;
            }
          *ptr = '\0';
          return buf;
        }
      return NULL;
    }
}

/* Looks up the Unicode character with a given name, in upper- or lowercase
   ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
ucs4_t
unicode_name_character (const char *name)
{
  unsigned int len = strlen (name);
  if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
    {
      /* Test for "word1 word2 ..." syntax.  */
      char buf[UNICODE_CHARNAME_MAX_LENGTH];
      char *ptr = buf;
      for (;;)
        {
          char c = *name++;
          if (!(c >= ' ' && c <= '~'))
            break;
          *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
          if (--len == 0)
            goto filled_buf;
        }
      if (false)
      filled_buf:
        {
          {
            /* Special case for variation selector aliases. Keeps the
               tables small.  */
            const char *p1 = buf;
            if (ptr >= buf + 3 && *p1++ == 'V')
              {
                if (*p1++ == 'S')
                  {
                    if (*p1 != '0')
                      {
                        unsigned int c = 0;
                        for (;;)
                          {
                            if (*p1 >= '0' && *p1 <= '9')
                              c += (*p1 - '0');
                            p1++;
                            if (p1 == ptr)
                              {
                                if (c >= 1 && c <= 16)
                                  return c - 1 + 0xFE00;
                                else if (c >= 17 && c <= 256)
                                  return c - 17 + 0xE0100;
                                else
                                  break;
                              }
                            c = c * 10;
                          }
                      }
                  }
              }
          }
          {
            /* Convert the constituents to uint16_t words.  */
            uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
            uint16_t *wordptr = words;
            {
              const char *p1 = buf;
              for (;;)
                {
                  {
                    int word;
                    const char *p2 = p1;
                    while (p2 < ptr && *p2 != ' ')
                      p2++;
                    word = unicode_name_word_lookup (p1, p2 - p1);
                    if (word < 0)
                      break;
                    if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
                      break;
                    *wordptr++ = word;
                    if (p2 == ptr)
                      goto filled_words;
                    p1 = p2 + 1;
                  }
                  /* Special case for Hangul syllables. Keeps the tables small. */
                  if (wordptr == &words[2]
                      && words[0] == UNICODE_CHARNAME_WORD_HANGUL
                      && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
                    {
                      /* Split the last word [p1..ptr) into three parts:
                           1) [BCDGHJKMNPRST]
                           2) [AEIOUWY]
                           3) [BCDGHIJKLMNPST]
                       */
                      const char *p2;
                      const char *p3;
                      const char *p4;

                      p2 = p1;
                      while (p2 < ptr
                             && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
                                 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
                                 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
                                 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
                                 || *p2 == 'T'))
                        p2++;
                      p3 = p2;
                      while (p3 < ptr
                             && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
                                 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
                                 || *p3 == 'Y'))
                        p3++;
                      p4 = p3;
                      while (p4 < ptr
                             && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
                                 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
                                 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
                                 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
                                 || *p4 == 'S' || *p4 == 'T'))
                        p4++;
                      if (p4 == ptr)
                        {
                          unsigned int n1 = p2 - p1;
                          unsigned int n2 = p3 - p2;
                          unsigned int n3 = p4 - p3;

                          if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
                            {
                              unsigned int index1;

                              for (index1 = 0; index1 < 19; index1++)
                                if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
                                    && jamo_initial_short_name[index1][n1] == '\0')
                                  {
                                    unsigned int index2;

                                    for (index2 = 0; index2 < 21; index2++)
                                      if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
                                          && jamo_medial_short_name[index2][n2] == '\0')
                                        {
                                          unsigned int index3;

                                          for (index3 = 0; index3 < 28; index3++)
                                            if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
                                                && jamo_final_short_name[index3][n3] == '\0')
                                              {
                                                return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
                                              }
                                          break;
                                        }
                                    break;
                                  }
                            }
                        }
                    }
                  /* Special case for CJK compatibility ideographs. Keeps the
                     tables small.  */
                  if (wordptr == &words[2]
                      && words[0] == UNICODE_CHARNAME_WORD_CJK
                      && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
                      && p1 + 14 <= ptr
                      && p1 + 15 >= ptr
                      && memcmp (p1, "IDEOGRAPH-", 10) == 0)
                    {
                      const char *p2 = p1 + 10;

                      if (*p2 != '0')
                        {
                          unsigned int c = 0;

                          for (;;)
                            {
                              if (*p2 >= '0' && *p2 <= '9')
                                c += (*p2 - '0');
                              else if (*p2 >= 'A' && *p2 <= 'F')
                                c += (*p2 - 'A' + 10);
                              else
                                break;
                              p2++;
                              if (p2 == ptr)
                                {
                                  if ((c >= 0xF900 && c <= 0xFA2D)
                                      || (c >= 0xFA30 && c <= 0xFA6A)
                                      || (c >= 0xFA70 && c <= 0xFAD9)
                                      || (c >= 0x2F800 && c <= 0x2FA1D))
                                    return c;
                                  else
                                    break;
                                }
                              c = c << 4;
                            }
                        }
                    }
                  /* Special case for variation selectors. Keeps the
                     tables small.  */
                  if (wordptr == &words[1]
                      && words[0] == UNICODE_CHARNAME_WORD_VARIATION
                      && p1 + 10 <= ptr
                      && p1 + 12 >= ptr
                      && memcmp (p1, "SELECTOR-", 9) == 0)
                    {
                      const char *p2 = p1 + 9;

                      if (*p2 != '0')
                        {
                          unsigned int c = 0;

                          for (;;)
                            {
                              if (*p2 >= '0' && *p2 <= '9')
                                c += (*p2 - '0');
                              p2++;
                              if (p2 == ptr)
                                {
                                  if (c >= 1 && c <= 16)
                                    return c - 1 + 0xFE00;
                                  else if (c >= 17 && c <= 256)
                                    return c - 17 + 0xE0100;
                                  else
                                    break;
                                }
                              c = c * 10;
                            }
                        }
                    }
                }
            }
            if (false)
            filled_words:
              {
                /* Multiply by 2, to simplify later comparisons.  */
                unsigned int words_length = wordptr - words;
                {
                  int i = words_length - 1;
                  words[i] = 2 * words[i];
                  for (; --i >= 0; )
                    words[i] = 2 * words[i] + 1;
                }
                /* Binary search in unicode_name_to_index.  */
                {
                  unsigned int i1 = 0;
                  unsigned int i2 = SIZEOF (unicode_name_to_index);
                  for (;;)
                    {
                      unsigned int i = (i1 + i2) >> 1;
                      const uint16_t *w = words;
                      const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
                      unsigned int n = words_length;
                      for (;;)
                        {
                          if (*p < *w)
                            {
                              if (i1 == i)
                                goto name_not_found;
                              /* Note here: i1 < i < i2.  */
                              i1 = i;
                              break;
                            }
                          else if (*p > *w)
                            {
                              if (i2 == i)
                                goto name_not_found;
                              /* Note here: i1 <= i < i2.  */
                              i2 = i;
                              break;
                            }
                          p++; w++; n--;
                          if (n == 0)
                            return unicode_index_to_code (unicode_name_to_index[i].index);
                        }
                    }
                }
              name_not_found: ;
              }
          }
        }
    }
  return UNINAME_INVALID;
}