Blame glib/libcharset/localcharset.c

/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2006 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
   USA.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */
Packit ae235b
#include "config.h"
Packit ae235b
Packit ae235b
/* Specification.  */
Packit ae235b
#include "localcharset.h"
Packit ae235b
Packit ae235b
#include <stddef.h>
Packit ae235b
#include <stdio.h>
Packit ae235b
#include <string.h>
Packit ae235b
#include <stdlib.h>
Packit ae235b
Packit ae235b
/* Platform detection.  WIN32_NATIVE is defined for native Windows builds
   (Cygwin does not define _WIN32/__WIN32__ and is handled separately via
   __CYGWIN__); OS2 is defined for EMX builds.  */
#if defined _WIN32 || defined __WIN32__
# define WIN32_NATIVE
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
# define OS2
#endif

/* Platform headers needed to query the locale's encoding.  */
#if !defined WIN32_NATIVE
# if HAVE_LANGINFO_CODESET
#  include <langinfo.h>
# else
#  if 0 /* setlocale is deliberately not used; see _g_locale_charset_raw */
#   include <locale.h>
#  endif
# endif
# ifdef __CYGWIN__
#  define WIN32_LEAN_AND_MEAN
#  include <windows.h>
# endif
#elif defined WIN32_NATIVE
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif

/* Without relocation support, relocate() is the identity on its argument.  */
#if ENABLE_RELOCATABLE
# include "relocatable.h"
#else
# define relocate(pathname) (pathname)
#endif

/* Get GLIB_CHARSETALIAS_DIR.  It defaults to LIBDIR unless the build
   system already provided a value.  */
#ifndef GLIB_CHARSETALIAS_DIR
# define GLIB_CHARSETALIAS_DIR LIBDIR
#endif

/* ISSLASH (C) tests whether C is a directory separator character.  */
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  /* Win32, Cygwin, OS/2, DOS */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#endif

#ifndef DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

#ifndef ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Use the unlocked variant of getc when the platform declares one.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif
Packit ae235b
Packit ae235b
/* The following static variable is declared 'volatile' to avoid a
   possible multithread problem in the function get_charset_aliases. If we
   are running in a threaded environment, and if two threads initialize
   'charset_aliases' simultaneously, both will produce the same value,
   and everything will be ok if the two assignments to 'charset_aliases'
   are atomic. But I don't know what will happen if the two assignments mix.
   Note that 'volatile' by itself does not make the assignment atomic; it
   only prevents the compiler from caching or eliding the accesses.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* Pointer to the contents of the charset.alias file, if it has already been
   read, else NULL.  Its format is:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases;
Packit ae235b
Packit ae235b
/* Return a pointer to the contents of the charset.alias file.  */
Packit ae235b
const char *
Packit ae235b
_g_locale_get_charset_aliases (void)
Packit ae235b
{
Packit ae235b
  const char *cp;
Packit ae235b
Packit ae235b
  cp = charset_aliases;
Packit ae235b
  if (cp == NULL)
Packit ae235b
    {
Packit ae235b
#if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
Packit ae235b
      FILE *fp;
Packit ae235b
      const char *dir;
Packit ae235b
      const char *base = "charset.alias";
Packit ae235b
      char *file_name;
Packit ae235b
Packit ae235b
      /* Make it possible to override the charset.alias location.  This is
Packit ae235b
	 necessary for running the testsuite before "make install".  */
Packit ae235b
      dir = getenv ("CHARSETALIASDIR");
Packit ae235b
      if (dir == NULL || dir[0] == '\0')
Packit ae235b
	dir = relocate (GLIB_CHARSETALIAS_DIR);
Packit ae235b
Packit ae235b
      /* Concatenate dir and base into freshly allocated file_name.  */
Packit ae235b
      {
Packit ae235b
	size_t dir_len = strlen (dir);
Packit ae235b
	size_t base_len = strlen (base);
Packit ae235b
	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
Packit ae235b
	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
Packit ae235b
	if (file_name != NULL)
Packit ae235b
	  {
Packit ae235b
	    memcpy (file_name, dir, dir_len);
Packit ae235b
	    if (add_slash)
Packit ae235b
	      file_name[dir_len] = DIRECTORY_SEPARATOR;
Packit ae235b
	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
Packit ae235b
	  }
Packit ae235b
      }
Packit ae235b
Packit ae235b
      if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
Packit ae235b
	/* Out of memory or file not found, treat it as empty.  */
Packit ae235b
	cp = "";
Packit ae235b
      else
Packit ae235b
	{
Packit ae235b
	  /* Parse the file's contents.  */
Packit ae235b
	  char *res_ptr = NULL;
Packit ae235b
	  size_t res_size = 0;
Packit ae235b
Packit ae235b
	  for (;;)
Packit ae235b
	    {
Packit ae235b
	      int c;
Packit ae235b
	      char buf1[50+1];
Packit ae235b
	      char buf2[50+1];
Packit ae235b
	      size_t l1, l2;
Packit ae235b
	      char *old_res_ptr;
Packit ae235b
Packit ae235b
	      c = getc (fp);
Packit ae235b
	      if (c == EOF)
Packit ae235b
		break;
Packit ae235b
	      if (c == '\n' || c == ' ' || c == '\t')
Packit ae235b
		continue;
Packit ae235b
	      if (c == '#')
Packit ae235b
		{
Packit ae235b
		  /* Skip comment, to end of line.  */
Packit ae235b
		  do
Packit ae235b
		    c = getc (fp);
Packit ae235b
		  while (!(c == EOF || c == '\n'));
Packit ae235b
		  if (c == EOF)
Packit ae235b
		    break;
Packit ae235b
		  continue;
Packit ae235b
		}
Packit ae235b
	      ungetc (c, fp);
Packit ae235b
	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
Packit ae235b
		break;
Packit ae235b
	      l1 = strlen (buf1);
Packit ae235b
	      l2 = strlen (buf2);
Packit ae235b
	      old_res_ptr = res_ptr;
Packit ae235b
	      if (res_size == 0)
Packit ae235b
		{
Packit ae235b
		  res_size = l1 + 1 + l2 + 1;
Packit ae235b
		  res_ptr = (char *) malloc (res_size + 1);
Packit ae235b
		}
Packit ae235b
	      else
Packit ae235b
		{
Packit ae235b
		  res_size += l1 + 1 + l2 + 1;
Packit ae235b
		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
Packit ae235b
		}
Packit ae235b
	      if (res_ptr == NULL)
Packit ae235b
		{
Packit ae235b
		  /* Out of memory. */
Packit ae235b
		  res_size = 0;
Packit ae235b
		  if (old_res_ptr != NULL)
Packit ae235b
		    free (old_res_ptr);
Packit ae235b
		  break;
Packit ae235b
		}
Packit ae235b
	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
Packit ae235b
	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
Packit ae235b
	    }
Packit ae235b
	  fclose (fp);
Packit ae235b
	  if (res_size == 0)
Packit ae235b
	    cp = "";
Packit ae235b
	  else
Packit ae235b
	    {
Packit ae235b
	      *(res_ptr + res_size) = '\0';
Packit ae235b
	      cp = res_ptr;
Packit ae235b
	    }
Packit ae235b
	}
Packit ae235b
Packit ae235b
      if (file_name != NULL)
Packit ae235b
	free (file_name);
Packit ae235b
Packit ae235b
#else
Packit ae235b
Packit ae235b
# if defined VMS
Packit ae235b
      /* To avoid the troubles of an extra file charset.alias_vms in the
Packit ae235b
	 sources of many GNU packages, simply inline the aliases here.  */
Packit ae235b
      /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
Packit ae235b
	 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
Packit ae235b
	 section 10.7 "Handling Different Character Sets".  */
Packit ae235b
      cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
Packit ae235b
	   "ISO8859-2" "\0" "ISO-8859-2" "\0"
Packit ae235b
	   "ISO8859-5" "\0" "ISO-8859-5" "\0"
Packit ae235b
	   "ISO8859-7" "\0" "ISO-8859-7" "\0"
Packit ae235b
	   "ISO8859-8" "\0" "ISO-8859-8" "\0"
Packit ae235b
	   "ISO8859-9" "\0" "ISO-8859-9" "\0"
Packit ae235b
	   /* Japanese */
Packit ae235b
	   "eucJP" "\0" "EUC-JP" "\0"
Packit ae235b
	   "SJIS" "\0" "SHIFT_JIS" "\0"
Packit ae235b
	   "DECKANJI" "\0" "DEC-KANJI" "\0"
Packit ae235b
	   "SDECKANJI" "\0" "EUC-JP" "\0"
Packit ae235b
	   /* Chinese */
Packit ae235b
	   "eucTW" "\0" "EUC-TW" "\0"
Packit ae235b
	   "DECHANYU" "\0" "DEC-HANYU" "\0"
Packit ae235b
	   "DECHANZI" "\0" "GB2312" "\0"
Packit ae235b
	   /* Korean */
Packit ae235b
	   "DECKOREAN" "\0" "EUC-KR" "\0";
Packit ae235b
# endif
Packit ae235b
Packit ae235b
# if defined WIN32_NATIVE || defined __CYGWIN__
Packit ae235b
      /* To avoid the troubles of installing a separate file in the same
Packit ae235b
	 directory as the DLL and of retrieving the DLL's directory at
Packit ae235b
	 runtime, simply inline the aliases here.  */
Packit ae235b
Packit ae235b
      cp = "CP936" "\0" "GBK" "\0"
Packit ae235b
	   "CP1361" "\0" "JOHAB" "\0"
Packit ae235b
	   "CP20127" "\0" "ASCII" "\0"
Packit ae235b
	   "CP20866" "\0" "KOI8-R" "\0"
Packit ae235b
	   "CP20936" "\0" "GB2312" "\0"
Packit ae235b
	   "CP21866" "\0" "KOI8-RU" "\0"
Packit ae235b
	   "CP28591" "\0" "ISO-8859-1" "\0"
Packit ae235b
	   "CP28592" "\0" "ISO-8859-2" "\0"
Packit ae235b
	   "CP28593" "\0" "ISO-8859-3" "\0"
Packit ae235b
	   "CP28594" "\0" "ISO-8859-4" "\0"
Packit ae235b
	   "CP28595" "\0" "ISO-8859-5" "\0"
Packit ae235b
	   "CP28596" "\0" "ISO-8859-6" "\0"
Packit ae235b
	   "CP28597" "\0" "ISO-8859-7" "\0"
Packit ae235b
	   "CP28598" "\0" "ISO-8859-8" "\0"
Packit ae235b
	   "CP28599" "\0" "ISO-8859-9" "\0"
Packit ae235b
	   "CP28605" "\0" "ISO-8859-15" "\0"
Packit ae235b
	   "CP38598" "\0" "ISO-8859-8" "\0"
Packit ae235b
	   "CP51932" "\0" "EUC-JP" "\0"
Packit ae235b
	   "CP51936" "\0" "GB2312" "\0"
Packit ae235b
	   "CP51949" "\0" "EUC-KR" "\0"
Packit ae235b
	   "CP51950" "\0" "EUC-TW" "\0"
Packit ae235b
	   "CP54936" "\0" "GB18030" "\0"
Packit ae235b
	   "CP65001" "\0" "UTF-8" "\0";
Packit ae235b
# endif
Packit ae235b
#endif
Packit ae235b
Packit ae235b
      charset_aliases = cp;
Packit ae235b
    }
Packit ae235b
Packit ae235b
  return cp;
Packit ae235b
}
Packit ae235b
Packit ae235b
#if !defined WIN32_NATIVE \
    && (defined OS2 || !HAVE_LANGINFO_CODESET || defined __CYGWIN__)

/* Return the locale name selected through the environment: the value of
   LC_ALL if set and non-empty, else LC_CTYPE if set and non-empty, else
   the value of LANG (which may be NULL or empty).  */
static const char *
locale_name_from_environment (void)
{
  const char *locale = getenv ("LC_ALL");

  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  return locale;
}

#endif

#if !defined WIN32_NATIVE \
    && (defined OS2 || (HAVE_LANGINFO_CODESET && defined __CYGWIN__))

/* If the locale name LOCALE contains an encoding after a dot, return it,
   with a possible "@modifier" trailer removed.  Without a trailer the
   result points into LOCALE; with a trailer the encoding is copied into
   BUF (of size BUF_SIZE) and BUF is returned.  Return NULL if LOCALE has
   no dot or if the encoding does not fit into BUF.  */
static const char *
encoding_from_locale_name (const char *locale, char *buf, size_t buf_size)
{
  const char *dot = strchr (locale, '.');
  const char *modifier;
  size_t len;

  if (dot == NULL)
    return NULL;

  dot++;
  /* Look for the possible @... trailer and remove it, if any.  */
  modifier = strchr (dot, '@');
  if (modifier == NULL)
    return dot;

  len = (size_t) (modifier - dot);
  if (len >= buf_size)
    return NULL;
  memcpy (buf, dot, len);
  buf[len] = '\0';
  return buf;
}

#endif

/* Determine the current locale's character encoding, without resolving it
   through the charset.alias table (see _g_locale_charset_unalias for that
   step).
   The result must not be freed or modified.  Depending on the platform it
   is the value returned by nl_langinfo (CODESET), a pointer into the
   environment (a locale name or its ".encoding" suffix), or a static
   buffer holding "CP<number>".  It may be NULL when the encoding has to be
   taken from the environment and none of LC_ALL, LC_CTYPE and LANG is
   set.  */

const char *
_g_locale_charset_raw (void)
{
  const char *codeset;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin 2006 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  As long as this is not fixed, return the suffix
     of the locale name from the environment variables (if present) or
     the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      static char buf[2 + 10 + 1];
      const char *locale = locale_name_from_environment ();

      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *encoding
            = encoding_from_locale_name (locale, buf, sizeof (buf));

          if (encoding != NULL)
            return encoding;
        }

      /* Woe32 has a function returning the locale's codepage as a number.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack nl_langinfo (CODESET), use getenv.
     Most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale (LC_CTYPE, NULL) here; it would return "C" when it
     doesn't support the locale name the user has set.

     On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale_name_from_environment ();

# endif

#elif defined WIN32_NATIVE

  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = locale_name_from_environment ();
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *encoding
        = encoding_from_locale_name (locale, buf, sizeof (buf));

      if (encoding != NULL)
        return encoding;

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* The cast makes the argument type match the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  return codeset;
}
Packit ae235b
Packit ae235b
/* Resolve CODESET through the table returned by
   _g_locale_get_charset_aliases and return the canonical name of the
   first entry whose alias equals CODESET or is the wildcard "*".  If no
   entry matches, CODESET itself is returned.  A NULL CODESET is treated
   as the empty string.  The result is never NULL and never empty: an
   empty name is replaced by "ASCII".  The result must not be freed; it
   points either to CODESET, into the alias table, or to a string
   literal.  */
const char *
_g_locale_charset_unalias (const char *codeset)
{
  const char *aliases;

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical) string
     pairs, terminated by an empty alias.  */
  aliases = _g_locale_get_charset_aliases ();
  while (*aliases != '\0')
    {
      const char *canonical = aliases + strlen (aliases) + 1;

      if (strcmp (codeset, aliases) == 0
          || (aliases[0] == '*' && aliases[1] == '\0'))
        {
          codeset = canonical;
          break;
        }

      /* Step over the canonical name to the next alias.  */
      aliases = canonical + strlen (canonical) + 1;
    }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}