Blame lib/mbrtowc.c

/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
Packit Service a2489d
Packit Service a2489d
#include <config.h>
Packit Service a2489d
Packit Service a2489d
/* Specification.  */
Packit Service a2489d
#include <wchar.h>
Packit Service a2489d
Packit Service a2489d
#if C_LOCALE_MAYBE_EILSEQ
Packit Service a2489d
# include "hard-locale.h"
Packit Service a2489d
# include <locale.h>
Packit Service a2489d
#endif
Packit Service a2489d
Packit Service a2489d
#if GNULIB_defined_mbstate_t
Packit Service a2489d
/* Implement mbrtowc() on top of mbtowc().  */
Packit Service a2489d
Packit Service a2489d
# include <errno.h>
Packit Service a2489d
# include <stdlib.h>
Packit Service a2489d
Packit Service a2489d
# include "localcharset.h"
Packit Service a2489d
# include "streq.h"
Packit Service a2489d
# include "verify.h"
Packit Service a2489d
Packit Service a2489d
/* FALLTHROUGH marks an intentional fall-through between 'case' labels.
   It is only defined here when nothing earlier has provided it.  Compilers
   reporting __GNUC__ >= 7 get the statement attribute; all others get a
   harmless no-op expression, so that 'FALLTHROUGH;' is always a valid
   statement.  */
# ifndef FALLTHROUGH
#  if __GNUC__ < 7
#   define FALLTHROUGH ((void) 0)
#  else
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  endif
# endif
Packit Service a2489d
Packit Service a2489d
/* Classification of the encoding of the current locale.  Only the
   encodings that mbrtowc needs to treat specially get their own value;
   everything else is enc_other.  */
typedef enum {
  enc_other,      /* other */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;
Packit Service a2489d
static inline enc_t
Packit Service a2489d
locale_enc (void)
Packit Service a2489d
{
Packit Service a2489d
  const char *encoding = locale_charset ();
Packit Service a2489d
  if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit Service a2489d
    return enc_utf8;
Packit Service a2489d
  if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit Service a2489d
    return enc_eucjp;
Packit Service a2489d
  if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit Service a2489d
      || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit Service a2489d
      || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit Service a2489d
    return enc_94;
Packit Service a2489d
  if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit Service a2489d
    return enc_euctw;
Packit Service a2489d
  if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit Service a2489d
    return enc_gb18030;
Packit Service a2489d
  if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit Service a2489d
    return enc_sjis;
Packit Service a2489d
  return enc_other;
Packit Service a2489d
}
Packit Service a2489d
Packit Service a2489d
#if GNULIB_WCHAR_SINGLE
Packit Service a2489d
/* When we know that the locale does not change, provide a speedup by
Packit Service a2489d
   caching the value of locale_enc.  */
Packit Service a2489d
static int cached_locale_enc = -1;
Packit Service a2489d
static inline enc_t
Packit Service a2489d
locale_enc_cached (void)
Packit Service a2489d
{
Packit Service a2489d
  if (cached_locale_enc < 0)
Packit Service a2489d
    cached_locale_enc = locale_enc ();
Packit Service a2489d
  return cached_locale_enc;
Packit Service a2489d
}
Packit Service a2489d
#else
Packit Service a2489d
/* By default, don't make assumptions, hence no caching.  */
Packit Service a2489d
# define locale_enc_cached locale_enc
Packit Service a2489d
#endif
Packit Service a2489d
Packit Service a2489d
verify (sizeof (mbstate_t) >= 4);
Packit Service a2489d
Packit Service a2489d
static char internal_state[4];
Packit Service a2489d
Packit Service a2489d
size_t
Packit Service a2489d
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit Service a2489d
{
Packit Service a2489d
  char *pstate = (char *)ps;
Packit Service a2489d
Packit Service a2489d
  if (s == NULL)
Packit Service a2489d
    {
Packit Service a2489d
      pwc = NULL;
Packit Service a2489d
      s = "";
Packit Service a2489d
      n = 1;
Packit Service a2489d
    }
Packit Service a2489d
Packit Service a2489d
  if (n == 0)
Packit Service a2489d
    return (size_t)(-2);
Packit Service a2489d
Packit Service a2489d
  /* Here n > 0.  */
Packit Service a2489d
Packit Service a2489d
  if (pstate == NULL)
Packit Service a2489d
    pstate = internal_state;
Packit Service a2489d
Packit Service a2489d
  {
Packit Service a2489d
    size_t nstate = pstate[0];
Packit Service a2489d
    char buf[4];
Packit Service a2489d
    const char *p;
Packit Service a2489d
    size_t m;
Packit Service a2489d
Packit Service a2489d
    switch (nstate)
Packit Service a2489d
      {
Packit Service a2489d
      case 0:
Packit Service a2489d
        p = s;
Packit Service a2489d
        m = n;
Packit Service a2489d
        break;
Packit Service a2489d
      case 3:
Packit Service a2489d
        buf[2] = pstate[3];
Packit Service a2489d
        FALLTHROUGH;
Packit Service a2489d
      case 2:
Packit Service a2489d
        buf[1] = pstate[2];
Packit Service a2489d
        FALLTHROUGH;
Packit Service a2489d
      case 1:
Packit Service a2489d
        buf[0] = pstate[1];
Packit Service a2489d
        p = buf;
Packit Service a2489d
        m = nstate;
Packit Service a2489d
        buf[m++] = s[0];
Packit Service a2489d
        if (n >= 2 && m < 4)
Packit Service a2489d
          {
Packit Service a2489d
            buf[m++] = s[1];
Packit Service a2489d
            if (n >= 3 && m < 4)
Packit Service a2489d
              buf[m++] = s[2];
Packit Service a2489d
          }
Packit Service a2489d
        break;
Packit Service a2489d
      default:
Packit Service a2489d
        errno = EINVAL;
Packit Service a2489d
        return (size_t)(-1);
Packit Service a2489d
      }
Packit Service a2489d
Packit Service a2489d
    /* Here m > 0.  */
Packit Service a2489d
Packit Service a2489d
# if __GLIBC__ || defined __UCLIBC__
Packit Service a2489d
    /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit Service a2489d
    mbtowc (NULL, NULL, 0);
Packit Service a2489d
# endif
Packit Service a2489d
    {
Packit Service a2489d
      int res = mbtowc (pwc, p, m);
Packit Service a2489d
Packit Service a2489d
      if (res >= 0)
Packit Service a2489d
        {
Packit Service a2489d
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit Service a2489d
            abort ();
Packit Service a2489d
          if (nstate >= (res > 0 ? res : 1))
Packit Service a2489d
            abort ();
Packit Service a2489d
          res -= nstate;
Packit Service a2489d
          pstate[0] = 0;
Packit Service a2489d
          return res;
Packit Service a2489d
        }
Packit Service a2489d
Packit Service a2489d
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit Service a2489d
         sequences.  But mbrtowc needs to make this distinction.
Packit Service a2489d
         There are two possible approaches:
Packit Service a2489d
           - Use iconv() and its return value.
Packit Service a2489d
           - Use built-in knowledge about the possible encodings.
Packit Service a2489d
         Given the low quality of implementation of iconv() on the systems that
Packit Service a2489d
         lack mbrtowc(), we use the second approach.
Packit Service a2489d
         The possible encodings are:
Packit Service a2489d
           - 8-bit encodings,
Packit Service a2489d
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit Service a2489d
           - UTF-8.
Packit Service a2489d
         Use specialized code for each.  */
Packit Service a2489d
      if (m >= 4 || m >= MB_CUR_MAX)
Packit Service a2489d
        goto invalid;
Packit Service a2489d
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit Service a2489d
      switch (locale_enc_cached ())
Packit Service a2489d
        {
Packit Service a2489d
        case enc_utf8: /* UTF-8 */
Packit Service a2489d
          {
Packit Service a2489d
            /* Cf. unistr/u8-mblen.c.  */
Packit Service a2489d
            unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
            if (c >= 0xc2)
Packit Service a2489d
              {
Packit Service a2489d
                if (c < 0xe0)
Packit Service a2489d
                  {
Packit Service a2489d
                    if (m == 1)
Packit Service a2489d
                      goto incomplete;
Packit Service a2489d
                  }
Packit Service a2489d
                else if (c < 0xf0)
Packit Service a2489d
                  {
Packit Service a2489d
                    if (m == 1)
Packit Service a2489d
                      goto incomplete;
Packit Service a2489d
                    if (m == 2)
Packit Service a2489d
                      {
Packit Service a2489d
                        unsigned char c2 = (unsigned char) p[1];
Packit Service a2489d
Packit Service a2489d
                        if ((c2 ^ 0x80) < 0x40
Packit Service a2489d
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit Service a2489d
                            && (c != 0xed || c2 < 0xa0))
Packit Service a2489d
                          goto incomplete;
Packit Service a2489d
                      }
Packit Service a2489d
                  }
Packit Service a2489d
                else if (c <= 0xf4)
Packit Service a2489d
                  {
Packit Service a2489d
                    if (m == 1)
Packit Service a2489d
                      goto incomplete;
Packit Service a2489d
                    else /* m == 2 || m == 3 */
Packit Service a2489d
                      {
Packit Service a2489d
                        unsigned char c2 = (unsigned char) p[1];
Packit Service a2489d
Packit Service a2489d
                        if ((c2 ^ 0x80) < 0x40
Packit Service a2489d
                            && (c >= 0xf1 || c2 >= 0x90)
Packit Service a2489d
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit Service a2489d
                          {
Packit Service a2489d
                            if (m == 2)
Packit Service a2489d
                              goto incomplete;
Packit Service a2489d
                            else /* m == 3 */
Packit Service a2489d
                              {
Packit Service a2489d
                                unsigned char c3 = (unsigned char) p[2];
Packit Service a2489d
Packit Service a2489d
                                if ((c3 ^ 0x80) < 0x40)
Packit Service a2489d
                                  goto incomplete;
Packit Service a2489d
                              }
Packit Service a2489d
                          }
Packit Service a2489d
                      }
Packit Service a2489d
                  }
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        /* As a reference for this code, you can use the GNU libiconv
Packit Service a2489d
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit Service a2489d
Packit Service a2489d
        case enc_eucjp: /* EUC-JP */
Packit Service a2489d
          {
Packit Service a2489d
            if (m == 1)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            if (m == 2)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if (c == 0x8f)
Packit Service a2489d
                  {
Packit Service a2489d
                    unsigned char c2 = (unsigned char) p[1];
Packit Service a2489d
Packit Service a2489d
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit Service a2489d
                      goto incomplete;
Packit Service a2489d
                  }
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        case enc_94: /* EUC-KR, GB2312, BIG5 */
Packit Service a2489d
          {
Packit Service a2489d
            if (m == 1)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if (c >= 0xa1 && c < 0xff)
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        case enc_euctw: /* EUC-TW */
Packit Service a2489d
          {
Packit Service a2489d
            if (m == 1)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            else /* m == 2 || m == 3 */
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if (c == 0x8e)
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        case enc_gb18030: /* GB18030 */
Packit Service a2489d
          {
Packit Service a2489d
            if (m == 1)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            else /* m == 2 || m == 3 */
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if (c >= 0x90 && c <= 0xe3)
Packit Service a2489d
                  {
Packit Service a2489d
                    unsigned char c2 = (unsigned char) p[1];
Packit Service a2489d
Packit Service a2489d
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit Service a2489d
                      {
Packit Service a2489d
                        if (m == 2)
Packit Service a2489d
                          goto incomplete;
Packit Service a2489d
                        else /* m == 3 */
Packit Service a2489d
                          {
Packit Service a2489d
                            unsigned char c3 = (unsigned char) p[2];
Packit Service a2489d
Packit Service a2489d
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit Service a2489d
                              goto incomplete;
Packit Service a2489d
                          }
Packit Service a2489d
                      }
Packit Service a2489d
                  }
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        case enc_sjis: /* SJIS */
Packit Service a2489d
          {
Packit Service a2489d
            if (m == 1)
Packit Service a2489d
              {
Packit Service a2489d
                unsigned char c = (unsigned char) p[0];
Packit Service a2489d
Packit Service a2489d
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit Service a2489d
                    || (c >= 0xf0 && c <= 0xf9))
Packit Service a2489d
                  goto incomplete;
Packit Service a2489d
              }
Packit Service a2489d
            goto invalid;
Packit Service a2489d
          }
Packit Service a2489d
Packit Service a2489d
        default:
Packit Service a2489d
          /* An unknown multibyte encoding.  */
Packit Service a2489d
          goto incomplete;
Packit Service a2489d
        }
Packit Service a2489d
Packit Service a2489d
     incomplete:
Packit Service a2489d
      {
Packit Service a2489d
        size_t k = nstate;
Packit Service a2489d
        /* Here 0 <= k < m < 4.  */
Packit Service a2489d
        pstate[++k] = s[0];
Packit Service a2489d
        if (k < m)
Packit Service a2489d
          {
Packit Service a2489d
            pstate[++k] = s[1];
Packit Service a2489d
            if (k < m)
Packit Service a2489d
              pstate[++k] = s[2];
Packit Service a2489d
          }
Packit Service a2489d
        if (k != m)
Packit Service a2489d
          abort ();
Packit Service a2489d
      }
Packit Service a2489d
      pstate[0] = m;
Packit Service a2489d
      return (size_t)(-2);
Packit Service a2489d
Packit Service a2489d
     invalid:
Packit Service a2489d
      errno = EILSEQ;
Packit Service a2489d
      /* The conversion state is undefined, says POSIX.  */
Packit Service a2489d
      return (size_t)(-1);
Packit Service a2489d
    }
Packit Service a2489d
  }
Packit Service a2489d
}
Packit Service a2489d
Packit Service a2489d
#else
Packit Service a2489d
/* Override the system's mbrtowc() function.  */
Packit Service a2489d
Packit Service a2489d
# undef mbrtowc
Packit Service a2489d
Packit Service a2489d
/* Replacement for the system's mbrtowc that works around the platform bugs
   selected at configure time through the MBRTOWC_* and
   C_LOCALE_MAYBE_EILSEQ macros.  Arguments and return value are those of
   mbrtowc: the number of bytes consumed (0 for the null wide character),
   (size_t) -2 for an incomplete character, (size_t) -1 for an invalid
   one.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

  /* A NULL S means the same as mbrtowc (NULL, "", 1, ps).  Normalize it
     whenever code below may read through S: the RETVAL workaround loops
     over S, and the C-locale fallback reads *s, which would otherwise be
     a null pointer dereference.  */
# if (MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG \
      || C_LOCALE_MAYBE_EILSEQ)
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so that
     the workarounds below can inspect the converted character.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* Outside a hard locale, do not report failure: hand back the first
     byte as a one-byte character instead.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
Packit Service a2489d
Packit Service a2489d
#endif