Blame lib/propername.c

Packit 709fb3
/* Localization of proper names.
Packit 709fb3
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
Packit 709fb3
   Written by Bruno Haible <bruno@clisp.org>, 2006.
Packit 709fb3
Packit 709fb3
   This program is free software: you can redistribute it and/or modify
Packit 709fb3
   it under the terms of the GNU General Public License as published by
Packit 709fb3
   the Free Software Foundation; either version 3 of the License, or
Packit 709fb3
   (at your option) any later version.
Packit 709fb3
Packit 709fb3
   This program is distributed in the hope that it will be useful,
Packit 709fb3
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 709fb3
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 709fb3
   GNU General Public License for more details.
Packit 709fb3
Packit 709fb3
   You should have received a copy of the GNU General Public License
Packit 709fb3
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 709fb3
Packit 709fb3
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
Packit 709fb3
#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
Packit 709fb3
# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
Packit 709fb3
#endif
Packit 709fb3
Packit 709fb3
#include <config.h>
Packit 709fb3
Packit 709fb3
/* Specification.  */
Packit 709fb3
#include "propername.h"
Packit 709fb3
Packit 709fb3
#include <ctype.h>
Packit 709fb3
#include <stdbool.h>
Packit 709fb3
#include <stdio.h>
Packit 709fb3
#include <stdlib.h>
Packit 709fb3
#include <string.h>
Packit 709fb3
#if HAVE_ICONV
Packit 709fb3
# include <iconv.h>
Packit 709fb3
#endif
Packit 709fb3
Packit 709fb3
#include "trim.h"
Packit 709fb3
#include "mbchar.h"
Packit 709fb3
#include "mbuiter.h"
Packit 709fb3
#include "localcharset.h"
Packit 709fb3
#include "c-strcase.h"
Packit 709fb3
#include "xstriconv.h"
Packit 709fb3
#include "xalloc.h"
Packit 709fb3
#include "gettext.h"
Packit 709fb3
Packit 709fb3
Packit 709fb3
/* Tests whether STRING contains trim (SUB), starting and ending at word
Packit 709fb3
   boundaries.
Packit 709fb3
   Here, instead of implementing Unicode Standard Annex #29 for determining
Packit 709fb3
   word boundaries, we assume that trim (SUB) starts and ends with words and
Packit 709fb3
   only test whether the part before it ends with a non-word and the part
Packit 709fb3
   after it starts with a non-word.  */
Packit 709fb3
static bool
Packit 709fb3
mbsstr_trimmed_wordbounded (const char *string, const char *sub)
Packit 709fb3
{
Packit 709fb3
  char *tsub = trim (sub);
Packit 709fb3
  bool found = false;
Packit 709fb3
Packit 709fb3
  for (; *string != '\0';)
Packit 709fb3
    {
Packit 709fb3
      const char *tsub_in_string = mbsstr (string, tsub);
Packit 709fb3
      if (tsub_in_string == NULL)
Packit 709fb3
        break;
Packit 709fb3
      else
Packit 709fb3
        {
Packit 709fb3
          if (MB_CUR_MAX > 1)
Packit 709fb3
            {
Packit 709fb3
              mbui_iterator_t string_iter;
Packit 709fb3
              bool word_boundary_before;
Packit 709fb3
              bool word_boundary_after;
Packit 709fb3
Packit 709fb3
              mbui_init (string_iter, string);
Packit 709fb3
              word_boundary_before = true;
Packit 709fb3
              if (mbui_cur_ptr (string_iter) < tsub_in_string)
Packit 709fb3
                {
Packit 709fb3
                  mbchar_t last_char_before_tsub;
Packit 709fb3
                  do
Packit 709fb3
                    {
Packit 709fb3
                      if (!mbui_avail (string_iter))
Packit 709fb3
                        abort ();
Packit 709fb3
                      last_char_before_tsub = mbui_cur (string_iter);
Packit 709fb3
                      mbui_advance (string_iter);
Packit 709fb3
                    }
Packit 709fb3
                  while (mbui_cur_ptr (string_iter) < tsub_in_string);
Packit 709fb3
                  if (mb_isalnum (last_char_before_tsub))
Packit 709fb3
                    word_boundary_before = false;
Packit 709fb3
                }
Packit 709fb3
Packit 709fb3
              mbui_init (string_iter, tsub_in_string);
Packit 709fb3
              {
Packit 709fb3
                mbui_iterator_t tsub_iter;
Packit 709fb3
Packit 709fb3
                for (mbui_init (tsub_iter, tsub);
Packit 709fb3
                     mbui_avail (tsub_iter);
Packit 709fb3
                     mbui_advance (tsub_iter))
Packit 709fb3
                  {
Packit 709fb3
                    if (!mbui_avail (string_iter))
Packit 709fb3
                      abort ();
Packit 709fb3
                    mbui_advance (string_iter);
Packit 709fb3
                  }
Packit 709fb3
              }
Packit 709fb3
              word_boundary_after = true;
Packit 709fb3
              if (mbui_avail (string_iter))
Packit 709fb3
                {
Packit 709fb3
                  mbchar_t first_char_after_tsub = mbui_cur (string_iter);
Packit 709fb3
                  if (mb_isalnum (first_char_after_tsub))
Packit 709fb3
                    word_boundary_after = false;
Packit 709fb3
                }
Packit 709fb3
Packit 709fb3
              if (word_boundary_before && word_boundary_after)
Packit 709fb3
                {
Packit 709fb3
                  found = true;
Packit 709fb3
                  break;
Packit 709fb3
                }
Packit 709fb3
Packit 709fb3
              mbui_init (string_iter, tsub_in_string);
Packit 709fb3
              if (!mbui_avail (string_iter))
Packit 709fb3
                break;
Packit 709fb3
              string = tsub_in_string + mb_len (mbui_cur (string_iter));
Packit 709fb3
            }
Packit 709fb3
          else
Packit 709fb3
            {
Packit 709fb3
              bool word_boundary_before;
Packit 709fb3
              const char *p;
Packit 709fb3
              bool word_boundary_after;
Packit 709fb3
Packit 709fb3
              word_boundary_before = true;
Packit 709fb3
              if (string < tsub_in_string)
Packit 709fb3
                if (isalnum ((unsigned char) tsub_in_string[-1]))
Packit 709fb3
                  word_boundary_before = false;
Packit 709fb3
Packit 709fb3
              p = tsub_in_string + strlen (tsub);
Packit 709fb3
              word_boundary_after = true;
Packit 709fb3
              if (*p != '\0')
Packit 709fb3
                if (isalnum ((unsigned char) *p))
Packit 709fb3
                  word_boundary_after = false;
Packit 709fb3
Packit 709fb3
              if (word_boundary_before && word_boundary_after)
Packit 709fb3
                {
Packit 709fb3
                  found = true;
Packit 709fb3
                  break;
Packit 709fb3
                }
Packit 709fb3
Packit 709fb3
              if (*tsub_in_string == '\0')
Packit 709fb3
                break;
Packit 709fb3
              string = tsub_in_string + 1;
Packit 709fb3
            }
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
  free (tsub);
Packit 709fb3
  return found;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Return the localization of NAME.  NAME is written in ASCII.  */
Packit 709fb3
Packit 709fb3
const char *
Packit 709fb3
proper_name (const char *name)
Packit 709fb3
{
Packit 709fb3
  /* See whether there is a translation.   */
Packit 709fb3
  const char *translation = gettext (name);
Packit 709fb3
Packit 709fb3
  if (translation != name)
Packit 709fb3
    {
Packit 709fb3
      /* See whether the translation contains the original name.  */
Packit 709fb3
      if (mbsstr_trimmed_wordbounded (translation, name))
Packit 709fb3
        return translation;
Packit 709fb3
      else
Packit 709fb3
        {
Packit 709fb3
          /* Return "TRANSLATION (NAME)".  */
Packit 709fb3
          char *result =
Packit 709fb3
            XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
Packit 709fb3
Packit 709fb3
          sprintf (result, "%s (%s)", translation, name);
Packit 709fb3
          return result;
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    return name;
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
/* Return the localization of a name whose original writing is not ASCII.
Packit 709fb3
   NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
Packit 709fb3
   escape sequences.  NAME_ASCII is a fallback written only with ASCII
Packit 709fb3
   characters.  */
Packit 709fb3
Packit 709fb3
const char *
Packit 709fb3
proper_name_utf8 (const char *name_ascii, const char *name_utf8)
Packit 709fb3
{
Packit 709fb3
  /* See whether there is a translation.   */
Packit 709fb3
  const char *translation = gettext (name_ascii);
Packit 709fb3
Packit 709fb3
  /* Try to convert NAME_UTF8 to the locale encoding.  */
Packit 709fb3
  const char *locale_code = locale_charset ();
Packit 709fb3
  char *alloc_name_converted = NULL;
Packit 709fb3
  char *alloc_name_converted_translit = NULL;
Packit 709fb3
  const char *name_converted = NULL;
Packit 709fb3
  const char *name_converted_translit = NULL;
Packit 709fb3
  const char *name;
Packit 709fb3
Packit 709fb3
  if (c_strcasecmp (locale_code, "UTF-8") != 0)
Packit 709fb3
    {
Packit 709fb3
#if HAVE_ICONV
Packit 709fb3
      name_converted = alloc_name_converted =
Packit 709fb3
        xstr_iconv (name_utf8, "UTF-8", locale_code);
Packit 709fb3
Packit 709fb3
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
Packit 709fb3
      && !defined __UCLIBC__) \
Packit 709fb3
     || _LIBICONV_VERSION >= 0x0105
Packit 709fb3
      {
Packit 709fb3
        char *converted_translit;
Packit 709fb3
Packit 709fb3
        size_t len = strlen (locale_code);
Packit 709fb3
        char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
Packit 709fb3
        memcpy (locale_code_translit, locale_code, len);
Packit 709fb3
        memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
Packit 709fb3
Packit 709fb3
        converted_translit =
Packit 709fb3
          xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
Packit 709fb3
Packit 709fb3
        free (locale_code_translit);
Packit 709fb3
Packit 709fb3
        if (converted_translit != NULL)
Packit 709fb3
          {
Packit 709fb3
#  if !_LIBICONV_VERSION
Packit 709fb3
            /* Don't use the transliteration if it added question marks.
Packit 709fb3
               glibc's transliteration falls back to question marks; libiconv's
Packit 709fb3
               transliteration does not.
Packit 709fb3
               mbschr is equivalent to strchr in this case.  */
Packit 709fb3
            if (strchr (converted_translit, '?') != NULL)
Packit 709fb3
              free (converted_translit);
Packit 709fb3
            else
Packit 709fb3
#  endif
Packit 709fb3
              name_converted_translit = alloc_name_converted_translit =
Packit 709fb3
                converted_translit;
Packit 709fb3
          }
Packit 709fb3
      }
Packit 709fb3
# endif
Packit 709fb3
#endif
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    {
Packit 709fb3
      name_converted = name_utf8;
Packit 709fb3
      name_converted_translit = name_utf8;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  /* The name in locale encoding.  */
Packit 709fb3
  name = (name_converted != NULL ? name_converted :
Packit 709fb3
          name_converted_translit != NULL ? name_converted_translit :
Packit 709fb3
          name_ascii);
Packit 709fb3
Packit 709fb3
  /* See whether we have a translation.  Some translators have not understood
Packit 709fb3
     that they should use the UTF-8 form of the name, if possible.  So if the
Packit 709fb3
     translator provided a no-op translation, we ignore it.  */
Packit 709fb3
  if (strcmp (translation, name_ascii) != 0)
Packit 709fb3
    {
Packit 709fb3
      /* See whether the translation contains the original name.  */
Packit 709fb3
      if (mbsstr_trimmed_wordbounded (translation, name_ascii)
Packit 709fb3
          || (name_converted != NULL
Packit 709fb3
              && mbsstr_trimmed_wordbounded (translation, name_converted))
Packit 709fb3
          || (name_converted_translit != NULL
Packit 709fb3
              && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
Packit 709fb3
        {
Packit 709fb3
          if (alloc_name_converted != NULL)
Packit 709fb3
            free (alloc_name_converted);
Packit 709fb3
          if (alloc_name_converted_translit != NULL)
Packit 709fb3
            free (alloc_name_converted_translit);
Packit 709fb3
          return translation;
Packit 709fb3
        }
Packit 709fb3
      else
Packit 709fb3
        {
Packit 709fb3
          /* Return "TRANSLATION (NAME)".  */
Packit 709fb3
          char *result =
Packit 709fb3
            XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
Packit 709fb3
Packit 709fb3
          sprintf (result, "%s (%s)", translation, name);
Packit 709fb3
Packit 709fb3
          if (alloc_name_converted != NULL)
Packit 709fb3
            free (alloc_name_converted);
Packit 709fb3
          if (alloc_name_converted_translit != NULL)
Packit 709fb3
            free (alloc_name_converted_translit);
Packit 709fb3
          return result;
Packit 709fb3
        }
Packit 709fb3
    }
Packit 709fb3
  else
Packit 709fb3
    {
Packit 709fb3
      if (alloc_name_converted != NULL && alloc_name_converted != name)
Packit 709fb3
        free (alloc_name_converted);
Packit 709fb3
      if (alloc_name_converted_translit != NULL
Packit 709fb3
          && alloc_name_converted_translit != name)
Packit 709fb3
        free (alloc_name_converted_translit);
Packit 709fb3
      return name;
Packit 709fb3
    }
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
#ifdef TEST1
# include <locale.h>
/* Test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   After selecting the locale from the environment, prints "found" if
   STRING contains the trimmed SUB at word boundaries, and nothing
   otherwise.  Exits with status 1 and a usage message on stderr when
   fewer than two arguments are given, instead of passing a null pointer
   to the string routines.  */
int
main (int argc, char *argv[])
{
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 ? argv[0] : "propername");
      return 1;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf("found\n");
  return 0;
}
#endif
Packit 709fb3
Packit 709fb3
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Test driver for proper_name_utf8: selects the locale from the
   environment and prints the localized form of a fixed sample name,
   followed by a newline.  The command-line arguments are not used.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  setlocale (LC_ALL, "");
  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif