/*
  convert charset and surface names to internal representation and back

  Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include "enca.h"
#include "internal.h"
#include "tools/encodings.h"

/* Sizes of the lookup tables, computed with ELEMENTS() (presumably an
 * array-length macro from internal.h -- TODO confirm). */
#define NCHARSETS (ELEMENTS(CHARSET_INFO))
#define NALIASES (ELEMENTS(ALIAS_LIST))
#define NSURFACES (ELEMENTS(SURFACE_INFO))

/* Initializer of an EncaEncoding that starts with ENCA_CS_UNKNOWN and zero
 * (see enca_parse_encoding_name()). */
#define ENCODING_UNKNOWN { ENCA_CS_UNKNOWN, 0 }

/* tolower() and toupper() which never fail.
 * NOTE: the argument is evaluated more than once, so it must not have side
 * effects. */
#define enca_tolower(c) (enca_isupper(c) ? (c) + ('a' - 'A') : (c))
#define enca_toupper(c) (enca_islower(c) ? (c) - ('a' - 'A') : (c))

/* Names enca_charset_name() returns for ENCA_CS_UNKNOWN, depending on the
 * requested naming style. */
static const char *UNKNOWN_CHARSET_NAME = "unknown";
static const char *UNKNOWN_CHARSET_HUMAN = "Unrecognized encoding";
static const char *UNKNOWN_CHARSET_SYM = "???";

/* Surface separator (sometimes we need a character, sometimes a string). */
#define SURF_SEPARATOR '/'
#define SURF_SEPARATOR_STR "/"

/**
 * EncaSurfaceInfo:
 * @enca: Canonical surface identifier (#NULL when the surface has none).
 * @human: Human readable description of the surface.
 * @bit: The corresponding ENCA_SURFACE_<something> flag.
 *
 * Description of one surface, i.e. one row of the surface table.
 **/
typedef struct _EncaSurfaceInfo {
  const char *enca;
  const char *human;
  EncaSurface bit;
} EncaSurfaceInfo;

/* Local prototypes. */

/* Compares two strings looking at alphanumeric characters only, after
 * lowercasing them. */
static int squeeze_compare(const char *x,
                           const char *y);
/* Looks up a string in a sorted array of strings with squeeze_compare();
 * returns its index or -1. */
static int alias_search(const char *const *alist,
                        int n,
                        const char *s);
/* Returns nonzero when a combination of surface flags makes sense. */
static int check_surface_consistency(EncaSurface surface);
/* Returns the number of bits set in a flag field. */
static int count_bits(unsigned long int x);
/* Returns the number of alphanumeric characters in a name, or -1 when the
 * name is NULL or contains a character rejected by enca_isname(). */
static int check_encoding_name(const char *name);

/* Surface table: canonical name, human readable description, surface flag.
 *
 * The order of rows is the order in which enca_get_surface_name() emits the
 * names.  Rows with a NULL canonical name are left out of enca-style names,
 * and rows with a NULL or empty canonical name are never matched by
 * enca_name_to_surface(). */
static const EncaSurfaceInfo SURFACE_INFO[] = {
  { "CR",   "CR line terminators",    ENCA_SURFACE_EOL_CR },
  { "LF",   "LF line terminators",    ENCA_SURFACE_EOL_LF },
  { "CRLF", "CRLF line terminators",  ENCA_SURFACE_EOL_CRLF },
  { NULL,   "Mixed line terminators", ENCA_SURFACE_EOL_MIX },
  { NULL,
    "Surrounded by/intermixed with non-text data",
    ENCA_SURFACE_EOL_BIN },
  { "21",
    "Byte order reversed in pairs (1,2 -> 2,1)",
    ENCA_SURFACE_PERM_21 },
  { "4321",
    "Byte order reversed in quadruples (1,2,3,4 -> 4,3,2,1)",
    ENCA_SURFACE_PERM_4321 },
  { NULL,
    "Both little and big endian chunks, concatenated",
    ENCA_SURFACE_PERM_MIX },
  { "qp",   "Quoted-printable encoded", ENCA_SURFACE_QP },
  { "",     "",                         ENCA_SURFACE_REMOVE },
};

/**
 * enca_charset_name:
 * @charset: A charset id.
 * @whatname: The type of name you request.
 *
 * Translates numeric charset id @charset to a name of the requested style.
 *
 * Returns: The requested charset name; #NULL for invalid @whatname or
 * @charset, or when a name of style @whatname doesn't exist for charset
 * @charset (#ENCA_CS_UNKNOWN is OK).
 **/
const char*
enca_charset_name(int charset,
                  EncaNameStyle whatname)
{
  const EncaCharsetInfo *info;
  const char *name = NULL;

  /* The unknown charset is handled separately, with fixed names. */
  if (charset == ENCA_CS_UNKNOWN) {
    switch (whatname) {
      case ENCA_NAME_STYLE_HUMAN:
      name = UNKNOWN_CHARSET_HUMAN;
      break;

      case ENCA_NAME_STYLE_CSTOCS:
      case ENCA_NAME_STYLE_ICONV:
      name = UNKNOWN_CHARSET_SYM;
      break;

      case ENCA_NAME_STYLE_ENCA:
      case ENCA_NAME_STYLE_RFC1345:
      case ENCA_NAME_STYLE_MIME:
      name = UNKNOWN_CHARSET_NAME;
      break;

      default:
      break;
    }
    return name;
  }

  /* The cast makes negative ids fail the range check too. */
  if ((size_t)charset >= NCHARSETS)
    return NULL;

  info = &CHARSET_INFO[charset];
  switch (whatname) {
    case ENCA_NAME_STYLE_HUMAN:
    name = info->human;
    break;

    case ENCA_NAME_STYLE_ENCA:
    name = ALIAS_LIST[info->enca];
    break;

    case ENCA_NAME_STYLE_RFC1345:
    name = ALIAS_LIST[info->rfc1345];
    break;

    /* The following names are optional; a negative index means "none". */
    case ENCA_NAME_STYLE_CSTOCS:
    if (info->cstocs >= 0)
      name = ALIAS_LIST[info->cstocs];
    break;

    case ENCA_NAME_STYLE_ICONV:
    if (info->iconv >= 0)
      name = ALIAS_LIST[info->iconv];
    break;

    case ENCA_NAME_STYLE_MIME:
    if (info->mime >= 0)
      name = ALIAS_LIST[info->mime];
    break;

    default:
    break;
  }

  return name;
}

/**
 * enca_get_charset_aliases:
 * @charset: A charset id.
 * @n: The number of aliases will be stored here.
 *
 * Returns list of accepted aliases for charset @charset.
 *
 * The list of aliases has to be freed by caller; the strings themselves
 * must be considered constant and must NOT be freed.
 *
 * Returns: The list of aliases, storing their number into *@n; #NULL for
 * invalid @charset, i.e. when no alias maps to it (*@n is zero then).
 **/
const char**
enca_get_charset_aliases(int charset,
                         size_t *n)
{
  const char **aliases;
  size_t i, j;
  size_t count = 0;

  /* Compute total length.
   * FIXME: The list is known at compile time. */
  for (i = 0; i < NALIASES; i++) {
    if (INDEX_LIST[i] == charset)
      count++;
  }
  *n = count;

  /* No alias maps to @charset: honour the documented contract and return
   * NULL instead of a zero-length allocation. */
  if (count == 0)
    return NULL;

  /* Create the list; only the array is allocated, the strings are shared
   * with ALIAS_LIST. */
  aliases = NEW(const char*, count);
  for (i = j = 0; i < NALIASES; i++) {
    if (INDEX_LIST[i] == charset)
      aliases[j++] = ALIAS_LIST[i];
  }

  return aliases;
}

/**
 * enca_get_surface_name:
 * @surface: A surface.
 * @whatname: The type of name you request.
 *
 * Constructs surface name from surface flags @surface.
 *
 * Returns: The requested surface name; #NULL for invalid @whatname; empty
 * string for naming style not supporting surfaces.  In all cases, the
 * returned string must be freed by caller when no longer used.
 **/
char*
enca_get_surface_name(EncaSurface surface,
                      EncaNameStyle whatname)
{
  const EncaSurfaceInfo *info;
  const EncaSurfaceInfo *const end = SURFACE_INFO + NSURFACES;
  char *name;

  /* Human readable name: one description per line, in table order. */
  if (whatname == ENCA_NAME_STYLE_HUMAN) {
    name = enca_strdup("");
    for (info = SURFACE_INFO; info != end; info++) {
      if (!(info->bit & surface))
        continue;
      name = enca_strappend(name, info->human, "\n", NULL);
    }
    return name;
  }

  /* Canonical name (/recode style): separator followed by the identifier,
   * for every set surface that has an identifier. */
  if (whatname == ENCA_NAME_STYLE_ENCA) {
    name = enca_strdup("");
    for (info = SURFACE_INFO; info != end; info++) {
      if (!(info->bit & surface) || info->enca == NULL)
        continue;
      name = enca_strappend(name, SURF_SEPARATOR_STR, info->enca, NULL);
    }
    return name;
  }

  /* These styles don't know/define surfaces, so the name is empty. */
  if (whatname == ENCA_NAME_STYLE_CSTOCS
      || whatname == ENCA_NAME_STYLE_RFC1345
      || whatname == ENCA_NAME_STYLE_ICONV
      || whatname == ENCA_NAME_STYLE_MIME)
    return enca_strdup("");

  return NULL;
}

/**
 * enca_charset_properties:
 * @charset: A charset.
 *
 * Looks up the property flags of charset @charset.
 *
 * Returns: The requested charset properties; zero for invalid @charset.
 **/
EncaCharsetFlags
enca_charset_properties(int charset)
{
  EncaCharsetFlags flags = 0;

  /* The cast makes negative ids fail the range check too. */
  if ((size_t)charset < NCHARSETS)
    flags = CHARSET_INFO[charset].flags;

  return flags;
}

/**
 * enca_charset_natural_surface:
 * @charset: A charset.
 *
 * Returns natural surface of a charset.
 *
 * Returns: The requested charset natural surface (called `implied' in recode),
 *          zero for invalid @charset or for charsets with no natural surface.
 *
 *          Natural surface is the surface one expects for a given charset --
 *          e.g. CRLF EOLs for IBM/Microsoft charsets, CR EOLs for Macintosh
 *          charsets and LF EOLs for ISO/Unix charsets.
 **/
EncaSurface
enca_charset_natural_surface(int charset)
{
  /* The cast makes negative ids fail the range check too. */
  if ((size_t)charset < NCHARSETS)
    return CHARSET_INFO[charset].nsurface;

  return 0;
}

/**
 * enca_number_of_charsets:
 *
 * Returns number of known charsets.
 *
 * Charset identifiers are assigned successively starting from zero, so the
 * last charset has identifier enca_number_of_charsets() - 1.
 *
 * Returns: The number of charsets.
 **/
size_t
enca_number_of_charsets(void)
{
  const size_t ncharsets = NCHARSETS;

  return ncharsets;
}

/**
 * enca_parse_encoding_name:
 * @name: An encoding specification.
 *
 * Transforms encoding specification charset/surface into numeric
 * #EncaEncoding.
 *
 * When the charset name is not recognized, surfaces are not parsed at all and
 * #ENCA_CS_UNKNOWN is returned as charset.  However, unrecognized surfaces are
 * considered only a minor problem causing %ENCA_SURFACE_UNKNOWN flag to be
 * set in the result, beside recognized surface flags.  The same flag is set
 * when the combination of surfaces is not consistent.
 *
 * Returns: The charset/surface pair; unknown charset with no surface for
 * #NULL @name.
 **/
EncaEncoding
enca_parse_encoding_name(const char *name)
{
  EncaEncoding enc = ENCODING_UNKNOWN;
  char *p, *q;

  if (name == NULL)
    return enc;

  /* Work on a private copy, it is cut into pieces in place. */
  p = enca_strdup(name);
  /* separate pure charset name into p */
  q = strchr(p, SURF_SEPARATOR);
  if (q != NULL)
    *q++ = '\0';
  enc.charset = enca_name_to_charset(p);
  /* surfaces, one by one */
  while (q != NULL && enc.charset != ENCA_CS_UNKNOWN) {
    /* Search for the next separator in the rest of the surface list; p is
     * already terminated after the charset name and contains none. */
    char *r = strchr(q, SURF_SEPARATOR);

    if (r != NULL)
      *r++ = '\0';
    enc.surface |= enca_name_to_surface(q);
    q = r;
  }
  if (!check_surface_consistency(enc.surface))
    enc.surface |= ENCA_SURFACE_UNKNOWN;
  free(p);

  return enc;
}

/**
 * squeeze_compare:
 * @x: A string.
 * @y: Another string.
 *
 * Compares two strings taking into account only alphanumeric characters,
 * lowercased with enca_tolower().  A #NULL string sorts before any
 * non-#NULL string.
 *
 * Returns: Less than zero, more than zero, or zero, when the first string is
 *          squeeze-alphabetically before, after, or equal to second string.
 **/
static int
squeeze_compare(const char *x,
                const char *y)
{
  if (x == NULL)
    return y == NULL ? 0 : -1;
  if (y == NULL)
    return 1;

  while (*x != '\0' || *y != '\0') {
    int cx, cy;

    /* Move both strings to their next alphanumeric character (or end). */
    while (*x != '\0' && !enca_isalnum(*x))
      x++;
    while (*y != '\0' && !enca_isalnum(*y))
      y++;

    cx = (int)enca_tolower(*x);
    cy = (int)enca_tolower(*y);
    if (cx != cy)
      return cx - cy;

    /* Never step over the terminating NUL. */
    if (*x != '\0')
      x++;
    if (*y != '\0')
      y++;
  }

  return 0;
}

/* Compiled out: stable_compare() is not part of the build. */
#if 0
/**
 * stable_compare:
 * @x: A string.
 * @y: Another string.
 *
 * Compares two strings taking into account only alphanumeric characters first.
 *
 * When the strings are equal, compares them normally, too.  Zero is thus
 * returned for really identical strings only.
 *
 * Returns: Less than zero, more than zero, or zero, when the first string is
 *          squeeze-alphabetically before, after, or equal to second string.
 **/
static int
stable_compare(const char *x,
               const char *y)
{
  int i;

  i = squeeze_compare(x, y);
  /* to stabilize the sort */
  /* NOTE(review): squeeze_compare() returns 0 for two NULL strings, so
   * strcmp() would be called with NULL arguments in that case. */
  if (i == 0)
    return strcmp(x, y);

  return i;
}
#endif

/**
 * alias_search:
 * @alist: Sorted array of strings.
 * @n: Size of @alist.
 * @s: String to find.
 *
 * Finds string @s in stable-sorted array of strings by bisection, comparing
 * with squeeze_compare().
 *
 * Returns: Index of @s in @alist; -1 if not found or when @alist is empty.
 **/
static int
alias_search(const char *const *alist,
             int n,
             const char *s)
{
  int i1 = 0;
  int i2 = n-1;
  int i;

  /* Nothing to search in; also keeps alist[0] and alist[n-1] below inside
   * the array. */
  if (alist == NULL || n <= 0)
    return -1;

  /* Check the end points first, so that alist[i1] < s < alist[i2] holds
   * during the bisection. */
  i = squeeze_compare(s, alist[i1]);
  if (i < 0)
    return -1;
  if (i == 0)
    return i1;

  i = squeeze_compare(s, alist[i2]);
  if (i > 0)
    return -1;
  if (i == 0)
    return i2;

  while (i1+1 < i2) {
    int im = (i1 + i2)/2;

    i = squeeze_compare(s, alist[im]);
    if (i == 0)
      return im;

    if (i > 0)
      i1 = im;
    else
      i2 = im;
  }
  /* Final check of the element right above the lower bound. */
  if (squeeze_compare(s, alist[i1+1]) == 0)
    return i1+1;

  return -1;
}

/**
 * enca_name_to_charset:
 * @csname: The charset name.
 *
 * Transforms charset name to numeric charset id.
 *
 * Returns: The charset id; #ENCA_CS_UNKNOWN when the name is not recognized.
 **/
int
enca_name_to_charset(const char *csname)
{
  /* Only names that pass check_encoding_name() and contain at least one
   * alphanumeric character are looked up among the aliases. */
  if (check_encoding_name(csname) > 0) {
    int pos = alias_search(ALIAS_LIST, NALIASES, csname);

    if (pos >= 0)
      return INDEX_LIST[pos];
  }

  return ENCA_CS_UNKNOWN;
}

/**
 * enca_name_to_surface:
 * @sname: The surface name.
 *
 * Transforms surface name to numeric surface id.  Names are compared with
 * squeeze_compare(), i.e. only alphanumeric characters matter and letter
 * case is ignored.
 *
 * Returns: The surface id; %ENCA_SURFACE_UNKNOWN when the name is not
 * recognized (or is #NULL).
 **/
EncaSurface
enca_name_to_surface(const char *sname)
{
  size_t i;

  if (sname == NULL)
    return ENCA_SURFACE_UNKNOWN;

  for (i = 0; i < NSURFACES; i++) {
    const char *canonical = SURFACE_INFO[i].enca;

    /* Surfaces without a (nonempty) canonical name cannot be requested. */
    if (canonical == NULL || *canonical == '\0')
      continue;
    /* squeeze_compare() returns zero for equal names. */
    if (squeeze_compare(canonical, sname) == 0)
      return SURFACE_INFO[i].bit;
  }
  return ENCA_SURFACE_UNKNOWN;
}

/**
 * check_surface_consistency:
 * @surface: The surface.
 *
 * Checks whether the specified surface makes sense.  Unlike recode we don't
 * consider /cr/cr/crlf/cr/lf/lf/crlf a reasonable surface.
 *
 * A surface is OK when it contains at most one end-of-line flag, at most
 * one permutation flag, and neither the `remove' nor the unknown flag.
 *
 * Returns: Nonzero when the surface is OK, zero otherwise.
 **/
static int
check_surface_consistency(EncaSurface surface)
{
  const unsigned long int bits = (unsigned long int)surface;

  if (count_bits(bits & ENCA_SURFACE_MASK_EOL) > 1)
    return 0;
  if (count_bits(bits & ENCA_SURFACE_MASK_PERM) > 1)
    return 0;
  if (surface & ENCA_SURFACE_REMOVE)
    return 0;
  if (surface & ENCA_SURFACE_UNKNOWN)
    return 0;

  return 1;
}

/**
 * count_bits:
 * @x: A flag field.
 *
 * Counts the bits set in @x by repeatedly clearing the lowest set bit.
 *
 * Returns: The number of bits set in @x.
 **/
static int
count_bits(unsigned long int x)
{
  int nbits;

  /* x & (x - 1) clears the lowest set bit, so the loop runs once per set
   * bit. */
  for (nbits = 0; x != 0; x &= x - 1)
    nbits++;

  return nbits;
}

/**
 * check_encoding_name:
 * @name: A charset/surface/encoding name.
 *
 * Checks whether @name contains only allowed characters (as decided by
 * enca_isname()) and counts the number of alphanumeric characters in @name.
 *
 * Returns: The number of alphanumeric characters in @name; -1 when @name
 * is invalid (including #NULL).
 **/
static int
check_encoding_name(const char *name)
{
  const char *p;
  size_t alnums = 0;

  if (name == NULL)
    return -1;

  for (p = name; *p != '\0'; p++) {
    /* One forbidden character makes the whole name invalid. */
    if (!enca_isname(*p))
      return -1;

    if (enca_isalnum(*p))
      alnums++;
  }

  return alnums;
}

/***** Documentation *********************************************************/

/**
 * EncaSurface:
 * @ENCA_SURFACE_EOL_CR: End-of-lines are represented with CR's.
 * @ENCA_SURFACE_EOL_LF: End-of-lines are represented with LF's.
 * @ENCA_SURFACE_EOL_CRLF: End-of-lines are represented with CRLF's.
 * @ENCA_SURFACE_EOL_MIX: Several end-of-line types, mixed.
 * @ENCA_SURFACE_EOL_BIN: End-of-line concept not applicable (binary data).
 * @ENCA_SURFACE_MASK_EOL: Mask for end-of-line surfaces.
 * @ENCA_SURFACE_PERM_21: Odd and even bytes swapped.
 * @ENCA_SURFACE_PERM_4321: Reversed byte sequence in 4byte words.
 * @ENCA_SURFACE_PERM_MIX: Chunks with both endianness, concatenated.
 * @ENCA_SURFACE_MASK_PERM: Mask for permutation surfaces.
 * @ENCA_SURFACE_QP: Quoted printables.
 * @ENCA_SURFACE_HZ: HZ encoded.
 * @ENCA_SURFACE_REMOVE: Recode `remove' surface.
 * @ENCA_SURFACE_UNKNOWN: Unknown surface.
 * @ENCA_SURFACE_MASK_ALL: Mask for all bits, without #ENCA_SURFACE_UNKNOWN.
 *
 * Surface flags.
 **/

/**
 * EncaNameStyle:
 * @ENCA_NAME_STYLE_ENCA: Default, implicit charset name in Enca.
 * @ENCA_NAME_STYLE_RFC1345: RFC 1345 or otherwise canonical charset name.
 * @ENCA_NAME_STYLE_CSTOCS: Cstocs charset name (may not exist).
 * @ENCA_NAME_STYLE_ICONV: Iconv charset name (may not exist).
 * @ENCA_NAME_STYLE_HUMAN: Human comprehensible description.
 * @ENCA_NAME_STYLE_MIME: Preferred MIME name (may not exist).
 *
 * Charset naming styles and conventions.
 **/

/**
 * EncaCharsetFlags:
 * @ENCA_CHARSET_7BIT: Characters are represented with 7bit characters.
 * @ENCA_CHARSET_8BIT: Characters are represented with bytes.
 * @ENCA_CHARSET_16BIT: Characters are represented with 2byte words.
 * @ENCA_CHARSET_32BIT: Characters are represented with 4byte words.
 * @ENCA_CHARSET_FIXED: One character consists of one fundamental piece.
 * @ENCA_CHARSET_VARIABLE: One character consists of variable number of
 * fundamental pieces.
 * @ENCA_CHARSET_BINARY: Charset is binary from ASCII viewpoint.
 * @ENCA_CHARSET_REGULAR: Language dependent (8bit) charset.
 * @ENCA_CHARSET_MULTIBYTE: Multibyte charset.
 *
 * Charset properties.
 *
 * Flags %ENCA_CHARSET_7BIT, %ENCA_CHARSET_8BIT, %ENCA_CHARSET_16BIT,
 * %ENCA_CHARSET_32BIT tell how many bits a `fundamental piece' consists of.
 * This is different from bits per character; e.g. UTF-8 consists of 8bit
 * pieces (bytes), but character can be composed from 1 to 6 of them.
 **/

/**
 * ENCA_CS_UNKNOWN:
 *
 * Unknown character set id.
 *
 * Use enca_charset_is_known() to check for unknown charset instead of direct
 * comparison.
 **/

/**
 * EncaEncoding:
 * @charset: Numeric charset identifier.
 * @surface: Surface flags.
 *
 * Encoding, i.e. charset and surface.
 *
 * This is what enca_analyse() and enca_analyse_const() return.
 *
 * The @charset field is an opaque numerical charset identifier, which has no
 * meaning outside Enca library.
 * You will probably want to use it only as enca_charset_name() argument.
 * It is only guaranteed not to change meaning
 * during program execution time; change of its interpretation (e.g. due to
 * addition of new charsets) is not considered API change.
 *
 * The @surface field is a combination of #EncaSurface flags.  You may want
 * to ignore it completely; you should use enca_set_interpreted_surfaces()
 * to disable weird surfaces then.
 **/

/**
 * enca_charset_is_known:
 * @cs: Charset id.
 *
 * Expands to nonzero when the charset is known (i.e. it's not
 * ENCA_CS_UNKNOWN).
 **/

/**
 * enca_charset_is_7bit:
 * @cs: Charset id.
 *
 * Expands to nonzero when characters are represented with 7bit characters.
 **/

/**
 * enca_charset_is_8bit:
 * @cs: Charset id.
 *
 * Expands to nonzero when characters are represented with bytes.
 **/

/**
 * enca_charset_is_16bit:
 * @cs: Charset id.
 *
 * Expands to nonzero when characters are represented with 2byte words.
 **/

/**
 * enca_charset_is_32bit:
 * @cs: Charset id.
 *
 * Expands to nonzero when characters are represented with 4byte words.
 **/

/**
 * enca_charset_is_fixed:
 * @cs: Charset id.
 *
 * Expands to nonzero when one character consists of one fundamental piece.
 **/

/**
 * enca_charset_is_variable:
 * @cs: Charset id.
 *
 * Expands to nonzero when one character consists of variable number of
 * fundamental pieces.
 **/

/**
 * enca_charset_is_binary:
 * @cs: Charset id.
 *
 * Expands to nonzero when charset is binary from ASCII viewpoint.
 **/

/**
 * enca_charset_is_regular:
 * @cs: Charset id.
 *
 * Expands to nonzero when charset is language dependent (8bit) charset.
 **/

/**
 * enca_charset_is_multibyte:
 * @cs: Charset id.
 *
 * Expands to nonzero when charset is multibyte.
 **/

/* vim: ts=2 sw=2 et
 */