Blame lib/propername.c

Packit 33f14e
/* Localization of proper names.
Packit 33f14e
   Copyright (C) 2006-2017 Free Software Foundation, Inc.
Packit 33f14e
   Written by Bruno Haible <bruno@clisp.org>, 2006.
Packit 33f14e
Packit 33f14e
   This program is free software: you can redistribute it and/or modify
Packit 33f14e
   it under the terms of the GNU General Public License as published by
Packit 33f14e
   the Free Software Foundation; either version 3 of the License, or
Packit 33f14e
   (at your option) any later version.
Packit 33f14e
Packit 33f14e
   This program is distributed in the hope that it will be useful,
Packit 33f14e
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 33f14e
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 33f14e
   GNU General Public License for more details.
Packit 33f14e
Packit 33f14e
   You should have received a copy of the GNU General Public License
Packit 33f14e
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 33f14e
Packit 33f14e
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
Packit 33f14e
   the proper_name function might be a candidate for attribute 'const'.  */
Packit 33f14e
#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
Packit 33f14e
# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
Packit 33f14e
#endif
Packit 33f14e
Packit 33f14e
#include <config.h>
Packit 33f14e
Packit 33f14e
/* Specification.  */
Packit 33f14e
#include "propername.h"
Packit 33f14e
Packit 33f14e
#include <ctype.h>
Packit 33f14e
#include <stdbool.h>
Packit 33f14e
#include <stdio.h>
Packit 33f14e
#include <stdlib.h>
Packit 33f14e
#include <string.h>
Packit 33f14e
#if HAVE_ICONV
Packit 33f14e
# include <iconv.h>
Packit 33f14e
#endif
Packit 33f14e
Packit 33f14e
#include "trim.h"
Packit 33f14e
#include "mbchar.h"
Packit 33f14e
#include "mbuiter.h"
Packit 33f14e
#include "localcharset.h"
Packit 33f14e
#include "c-strcase.h"
Packit 33f14e
#include "xstriconv.h"
Packit 33f14e
#include "xalloc.h"
Packit 33f14e
#include "gettext.h"
Packit 33f14e
Packit 33f14e
Packit 33f14e
/* Tests whether STRING contains trim (SUB), starting and ending at word
Packit 33f14e
   boundaries.
Packit 33f14e
   Here, instead of implementing Unicode Standard Annex #29 for determining
Packit 33f14e
   word boundaries, we assume that trim (SUB) starts and ends with words and
Packit 33f14e
   only test whether the part before it ends with a non-word and the part
Packit 33f14e
   after it starts with a non-word.  */
Packit 33f14e
static bool
Packit 33f14e
mbsstr_trimmed_wordbounded (const char *string, const char *sub)
Packit 33f14e
{
Packit 33f14e
  char *tsub = trim (sub);
Packit 33f14e
  bool found = false;
Packit 33f14e
Packit 33f14e
  for (; *string != '\0';)
Packit 33f14e
    {
Packit 33f14e
      const char *tsub_in_string = mbsstr (string, tsub);
Packit 33f14e
      if (tsub_in_string == NULL)
Packit 33f14e
        break;
Packit 33f14e
      else
Packit 33f14e
        {
Packit 33f14e
          if (MB_CUR_MAX > 1)
Packit 33f14e
            {
Packit 33f14e
              mbui_iterator_t string_iter;
Packit 33f14e
              bool word_boundary_before;
Packit 33f14e
              bool word_boundary_after;
Packit 33f14e
Packit 33f14e
              mbui_init (string_iter, string);
Packit 33f14e
              word_boundary_before = true;
Packit 33f14e
              if (mbui_cur_ptr (string_iter) < tsub_in_string)
Packit 33f14e
                {
Packit 33f14e
                  mbchar_t last_char_before_tsub;
Packit 33f14e
                  do
Packit 33f14e
                    {
Packit 33f14e
                      if (!mbui_avail (string_iter))
Packit 33f14e
                        abort ();
Packit 33f14e
                      last_char_before_tsub = mbui_cur (string_iter);
Packit 33f14e
                      mbui_advance (string_iter);
Packit 33f14e
                    }
Packit 33f14e
                  while (mbui_cur_ptr (string_iter) < tsub_in_string);
Packit 33f14e
                  if (mb_isalnum (last_char_before_tsub))
Packit 33f14e
                    word_boundary_before = false;
Packit 33f14e
                }
Packit 33f14e
Packit 33f14e
              mbui_init (string_iter, tsub_in_string);
Packit 33f14e
              {
Packit 33f14e
                mbui_iterator_t tsub_iter;
Packit 33f14e
Packit 33f14e
                for (mbui_init (tsub_iter, tsub);
Packit 33f14e
                     mbui_avail (tsub_iter);
Packit 33f14e
                     mbui_advance (tsub_iter))
Packit 33f14e
                  {
Packit 33f14e
                    if (!mbui_avail (string_iter))
Packit 33f14e
                      abort ();
Packit 33f14e
                    mbui_advance (string_iter);
Packit 33f14e
                  }
Packit 33f14e
              }
Packit 33f14e
              word_boundary_after = true;
Packit 33f14e
              if (mbui_avail (string_iter))
Packit 33f14e
                {
Packit 33f14e
                  mbchar_t first_char_after_tsub = mbui_cur (string_iter);
Packit 33f14e
                  if (mb_isalnum (first_char_after_tsub))
Packit 33f14e
                    word_boundary_after = false;
Packit 33f14e
                }
Packit 33f14e
Packit 33f14e
              if (word_boundary_before && word_boundary_after)
Packit 33f14e
                {
Packit 33f14e
                  found = true;
Packit 33f14e
                  break;
Packit 33f14e
                }
Packit 33f14e
Packit 33f14e
              mbui_init (string_iter, tsub_in_string);
Packit 33f14e
              if (!mbui_avail (string_iter))
Packit 33f14e
                break;
Packit 33f14e
              string = tsub_in_string + mb_len (mbui_cur (string_iter));
Packit 33f14e
            }
Packit 33f14e
          else
Packit 33f14e
            {
Packit 33f14e
              bool word_boundary_before;
Packit 33f14e
              const char *p;
Packit 33f14e
              bool word_boundary_after;
Packit 33f14e
Packit 33f14e
              word_boundary_before = true;
Packit 33f14e
              if (string < tsub_in_string)
Packit 33f14e
                if (isalnum ((unsigned char) tsub_in_string[-1]))
Packit 33f14e
                  word_boundary_before = false;
Packit 33f14e
Packit 33f14e
              p = tsub_in_string + strlen (tsub);
Packit 33f14e
              word_boundary_after = true;
Packit 33f14e
              if (*p != '\0')
Packit 33f14e
                if (isalnum ((unsigned char) *p))
Packit 33f14e
                  word_boundary_after = false;
Packit 33f14e
Packit 33f14e
              if (word_boundary_before && word_boundary_after)
Packit 33f14e
                {
Packit 33f14e
                  found = true;
Packit 33f14e
                  break;
Packit 33f14e
                }
Packit 33f14e
Packit 33f14e
              if (*tsub_in_string == '\0')
Packit 33f14e
                break;
Packit 33f14e
              string = tsub_in_string + 1;
Packit 33f14e
            }
Packit 33f14e
        }
Packit 33f14e
    }
Packit 33f14e
  free (tsub);
Packit 33f14e
  return found;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
/* Return the localization of NAME.  NAME is written in ASCII.  */
Packit 33f14e
Packit 33f14e
const char *
Packit 33f14e
proper_name (const char *name)
Packit 33f14e
{
Packit 33f14e
  /* See whether there is a translation.   */
Packit 33f14e
  const char *translation = gettext (name);
Packit 33f14e
Packit 33f14e
  if (translation != name)
Packit 33f14e
    {
Packit 33f14e
      /* See whether the translation contains the original name.  */
Packit 33f14e
      if (mbsstr_trimmed_wordbounded (translation, name))
Packit 33f14e
        return translation;
Packit 33f14e
      else
Packit 33f14e
        {
Packit 33f14e
          /* Return "TRANSLATION (NAME)".  */
Packit 33f14e
          char *result =
Packit 33f14e
            XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
Packit 33f14e
Packit 33f14e
          sprintf (result, "%s (%s)", translation, name);
Packit 33f14e
          return result;
Packit 33f14e
        }
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    return name;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
/* Return the localization of a name whose original writing is not ASCII.
Packit 33f14e
   NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
Packit 33f14e
   escape sequences.  NAME_ASCII is a fallback written only with ASCII
Packit 33f14e
   characters.  */
Packit 33f14e
Packit 33f14e
const char *
Packit 33f14e
proper_name_utf8 (const char *name_ascii, const char *name_utf8)
Packit 33f14e
{
Packit 33f14e
  /* See whether there is a translation.   */
Packit 33f14e
  const char *translation = gettext (name_ascii);
Packit 33f14e
Packit 33f14e
  /* Try to convert NAME_UTF8 to the locale encoding.  */
Packit 33f14e
  const char *locale_code = locale_charset ();
Packit 33f14e
  char *alloc_name_converted = NULL;
Packit 33f14e
  char *alloc_name_converted_translit = NULL;
Packit 33f14e
  const char *name_converted = NULL;
Packit 33f14e
  const char *name_converted_translit = NULL;
Packit 33f14e
  const char *name;
Packit 33f14e
Packit 33f14e
  if (c_strcasecmp (locale_code, "UTF-8") != 0)
Packit 33f14e
    {
Packit 33f14e
#if HAVE_ICONV
Packit 33f14e
      name_converted = alloc_name_converted =
Packit 33f14e
        xstr_iconv (name_utf8, "UTF-8", locale_code);
Packit 33f14e
Packit 33f14e
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
Packit 33f14e
      && !defined __UCLIBC__) \
Packit 33f14e
     || _LIBICONV_VERSION >= 0x0105
Packit 33f14e
      {
Packit 33f14e
        char *converted_translit;
Packit 33f14e
Packit 33f14e
        size_t len = strlen (locale_code);
Packit 33f14e
        char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
Packit 33f14e
        memcpy (locale_code_translit, locale_code, len);
Packit 33f14e
        memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
Packit 33f14e
Packit 33f14e
        converted_translit =
Packit 33f14e
          xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
Packit 33f14e
Packit 33f14e
        free (locale_code_translit);
Packit 33f14e
Packit 33f14e
        if (converted_translit != NULL)
Packit 33f14e
          {
Packit 33f14e
#  if !_LIBICONV_VERSION
Packit 33f14e
            /* Don't use the transliteration if it added question marks.
Packit 33f14e
               glibc's transliteration falls back to question marks; libiconv's
Packit 33f14e
               transliteration does not.
Packit 33f14e
               mbschr is equivalent to strchr in this case.  */
Packit 33f14e
            if (strchr (converted_translit, '?') != NULL)
Packit 33f14e
              free (converted_translit);
Packit 33f14e
            else
Packit 33f14e
#  endif
Packit 33f14e
              name_converted_translit = alloc_name_converted_translit =
Packit 33f14e
                converted_translit;
Packit 33f14e
          }
Packit 33f14e
      }
Packit 33f14e
# endif
Packit 33f14e
#endif
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      name_converted = name_utf8;
Packit 33f14e
      name_converted_translit = name_utf8;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  /* The name in locale encoding.  */
Packit 33f14e
  name = (name_converted != NULL ? name_converted :
Packit 33f14e
          name_converted_translit != NULL ? name_converted_translit :
Packit 33f14e
          name_ascii);
Packit 33f14e
Packit 33f14e
  /* See whether we have a translation.  Some translators have not understood
Packit 33f14e
     that they should use the UTF-8 form of the name, if possible.  So if the
Packit 33f14e
     translator provided a no-op translation, we ignore it.  */
Packit 33f14e
  if (strcmp (translation, name_ascii) != 0)
Packit 33f14e
    {
Packit 33f14e
      /* See whether the translation contains the original name.  */
Packit 33f14e
      if (mbsstr_trimmed_wordbounded (translation, name_ascii)
Packit 33f14e
          || (name_converted != NULL
Packit 33f14e
              && mbsstr_trimmed_wordbounded (translation, name_converted))
Packit 33f14e
          || (name_converted_translit != NULL
Packit 33f14e
              && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
Packit 33f14e
        {
Packit 33f14e
          if (alloc_name_converted != NULL)
Packit 33f14e
            free (alloc_name_converted);
Packit 33f14e
          if (alloc_name_converted_translit != NULL)
Packit 33f14e
            free (alloc_name_converted_translit);
Packit 33f14e
          return translation;
Packit 33f14e
        }
Packit 33f14e
      else
Packit 33f14e
        {
Packit 33f14e
          /* Return "TRANSLATION (NAME)".  */
Packit 33f14e
          char *result =
Packit 33f14e
            XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
Packit 33f14e
Packit 33f14e
          sprintf (result, "%s (%s)", translation, name);
Packit 33f14e
Packit 33f14e
          if (alloc_name_converted != NULL)
Packit 33f14e
            free (alloc_name_converted);
Packit 33f14e
          if (alloc_name_converted_translit != NULL)
Packit 33f14e
            free (alloc_name_converted_translit);
Packit 33f14e
          return result;
Packit 33f14e
        }
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    {
Packit 33f14e
      if (alloc_name_converted != NULL && alloc_name_converted != name)
Packit 33f14e
        free (alloc_name_converted);
Packit 33f14e
      if (alloc_name_converted_translit != NULL
Packit 33f14e
          && alloc_name_converted_translit != name)
Packit 33f14e
        free (alloc_name_converted_translit);
Packit 33f14e
      return name;
Packit 33f14e
    }
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#ifdef TEST1
# include <locale.h>
/* Manual test driver for mbsstr_trimmed_wordbounded.
   Takes two command-line arguments, STRING and SUB, and prints "found" if
   STRING contains the trimmed SUB at word boundaries.
   Returns 1 if fewer than two arguments were given, 0 otherwise.  */
int
main (int argc, char *argv[])
{
  setlocale (LC_ALL, "");
  /* Without this check, missing arguments would pass NULL to the search.  */
  if (argc < 3)
    {
      fprintf (stderr, "error: expected two arguments: STRING SUB\n");
      return 1;
    }
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf("found\n");
  return 0;
}
#endif
Packit 33f14e
Packit 33f14e
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Manual test driver for proper_name_utf8.
   Prints the localization of a fixed sample name, given in an ASCII
   fallback form and in UTF-8, followed by a newline.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  /* Adopt the locale of the environment before doing the lookup.  */
  setlocale (LC_ALL, "");
  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif