Blame gnulib/mbrtowc.c

Packit Service 392537
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2016 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit Service 392537
Packit Service 392537
#include <config.h>
Packit Service 392537
Packit Service 392537
/* Specification.  */
Packit Service 392537
#include <wchar.h>
Packit Service 392537
Packit Service 392537
#if GNULIB_defined_mbstate_t
Packit Service 392537
/* Implement mbrtowc() on top of mbtowc().  */
Packit Service 392537
Packit Service 392537
# include <errno.h>
Packit Service 392537
# include <stdlib.h>
Packit Service 392537
Packit Service 392537
# include "localcharset.h"
Packit Service 392537
# include "streq.h"
Packit Service 392537
# include "verify.h"
Packit Service 392537
Packit Service 392537
Packit Service 392537
/* mbrtowc() below treats an mbstate_t as an array of 4 chars:
   byte 0 holds the number of buffered bytes (0..3) of an incomplete
   multibyte character, bytes 1..3 hold those buffered bytes.  Make sure
   the type is large enough for that.  */
verify (sizeof (mbstate_t) >= 4);

/* Conversion state used by mbrtowc() when the caller passes ps == NULL.  */
static char internal_state[4];
Packit Service 392537
Packit Service 392537
size_t
Packit Service 392537
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit Service 392537
{
Packit Service 392537
  char *pstate = (char *)ps;
Packit Service 392537
Packit Service 392537
  if (s == NULL)
Packit Service 392537
    {
Packit Service 392537
      pwc = NULL;
Packit Service 392537
      s = "";
Packit Service 392537
      n = 1;
Packit Service 392537
    }
Packit Service 392537
Packit Service 392537
  if (n == 0)
Packit Service 392537
    return (size_t)(-2);
Packit Service 392537
Packit Service 392537
  /* Here n > 0.  */
Packit Service 392537
Packit Service 392537
  if (pstate == NULL)
Packit Service 392537
    pstate = internal_state;
Packit Service 392537
Packit Service 392537
  {
Packit Service 392537
    size_t nstate = pstate[0];
Packit Service 392537
    char buf[4];
Packit Service 392537
    const char *p;
Packit Service 392537
    size_t m;
Packit Service 392537
Packit Service 392537
    switch (nstate)
Packit Service 392537
      {
Packit Service 392537
      case 0:
Packit Service 392537
        p = s;
Packit Service 392537
        m = n;
Packit Service 392537
        break;
Packit Service 392537
      case 3:
Packit Service 392537
        buf[2] = pstate[3];
Packit Service 392537
        /*FALLTHROUGH*/
Packit Service 392537
      case 2:
Packit Service 392537
        buf[1] = pstate[2];
Packit Service 392537
        /*FALLTHROUGH*/
Packit Service 392537
      case 1:
Packit Service 392537
        buf[0] = pstate[1];
Packit Service 392537
        p = buf;
Packit Service 392537
        m = nstate;
Packit Service 392537
        buf[m++] = s[0];
Packit Service 392537
        if (n >= 2 && m < 4)
Packit Service 392537
          {
Packit Service 392537
            buf[m++] = s[1];
Packit Service 392537
            if (n >= 3 && m < 4)
Packit Service 392537
              buf[m++] = s[2];
Packit Service 392537
          }
Packit Service 392537
        break;
Packit Service 392537
      default:
Packit Service 392537
        errno = EINVAL;
Packit Service 392537
        return (size_t)(-1);
Packit Service 392537
      }
Packit Service 392537
Packit Service 392537
    /* Here m > 0.  */
Packit Service 392537
Packit Service 392537
# if __GLIBC__ || defined __UCLIBC__
Packit Service 392537
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit Service 392537
    mbtowc (NULL, NULL, 0);
Packit Service 392537
# endif
Packit Service 392537
    {
Packit Service 392537
      int res = mbtowc (pwc, p, m);
Packit Service 392537
Packit Service 392537
      if (res >= 0)
Packit Service 392537
        {
Packit Service 392537
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit Service 392537
            abort ();
Packit Service 392537
          if (nstate >= (res > 0 ? res : 1))
Packit Service 392537
            abort ();
Packit Service 392537
          res -= nstate;
Packit Service 392537
          pstate[0] = 0;
Packit Service 392537
          return res;
Packit Service 392537
        }
Packit Service 392537
Packit Service 392537
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit Service 392537
         sequences.  But mbrtowc needs to make this distinction.
Packit Service 392537
         There are two possible approaches:
Packit Service 392537
           - Use iconv() and its return value.
Packit Service 392537
           - Use built-in knowledge about the possible encodings.
Packit Service 392537
         Given the low quality of implementation of iconv() on the systems that
Packit Service 392537
         lack mbrtowc(), we use the second approach.
Packit Service 392537
         The possible encodings are:
Packit Service 392537
           - 8-bit encodings,
Packit Service 392537
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit Service 392537
           - UTF-8.
Packit Service 392537
         Use specialized code for each.  */
Packit Service 392537
      if (m >= 4 || m >= MB_CUR_MAX)
Packit Service 392537
        goto invalid;
Packit Service 392537
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit Service 392537
      {
Packit Service 392537
        const char *encoding = locale_charset ();
Packit Service 392537
Packit Service 392537
        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit Service 392537
          {
Packit Service 392537
            /* Cf. unistr/u8-mblen.c.  */
Packit Service 392537
            unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
            if (c >= 0xc2)
Packit Service 392537
              {
Packit Service 392537
                if (c < 0xe0)
Packit Service 392537
                  {
Packit Service 392537
                    if (m == 1)
Packit Service 392537
                      goto incomplete;
Packit Service 392537
                  }
Packit Service 392537
                else if (c < 0xf0)
Packit Service 392537
                  {
Packit Service 392537
                    if (m == 1)
Packit Service 392537
                      goto incomplete;
Packit Service 392537
                    if (m == 2)
Packit Service 392537
                      {
Packit Service 392537
                        unsigned char c2 = (unsigned char) p[1];
Packit Service 392537
Packit Service 392537
                        if ((c2 ^ 0x80) < 0x40
Packit Service 392537
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit Service 392537
                            && (c != 0xed || c2 < 0xa0))
Packit Service 392537
                          goto incomplete;
Packit Service 392537
                      }
Packit Service 392537
                  }
Packit Service 392537
                else if (c <= 0xf4)
Packit Service 392537
                  {
Packit Service 392537
                    if (m == 1)
Packit Service 392537
                      goto incomplete;
Packit Service 392537
                    else /* m == 2 || m == 3 */
Packit Service 392537
                      {
Packit Service 392537
                        unsigned char c2 = (unsigned char) p[1];
Packit Service 392537
Packit Service 392537
                        if ((c2 ^ 0x80) < 0x40
Packit Service 392537
                            && (c >= 0xf1 || c2 >= 0x90)
Packit Service 392537
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit Service 392537
                          {
Packit Service 392537
                            if (m == 2)
Packit Service 392537
                              goto incomplete;
Packit Service 392537
                            else /* m == 3 */
Packit Service 392537
                              {
Packit Service 392537
                                unsigned char c3 = (unsigned char) p[2];
Packit Service 392537
Packit Service 392537
                                if ((c3 ^ 0x80) < 0x40)
Packit Service 392537
                                  goto incomplete;
Packit Service 392537
                              }
Packit Service 392537
                          }
Packit Service 392537
                      }
Packit Service 392537
                  }
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
Packit Service 392537
        /* As a reference for this code, you can use the GNU libiconv
Packit Service 392537
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit Service 392537
Packit Service 392537
        if (STREQ_OPT (encoding,
Packit Service 392537
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit Service 392537
          {
Packit Service 392537
            if (m == 1)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            if (m == 2)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if (c == 0x8f)
Packit Service 392537
                  {
Packit Service 392537
                    unsigned char c2 = (unsigned char) p[1];
Packit Service 392537
Packit Service 392537
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit Service 392537
                      goto incomplete;
Packit Service 392537
                  }
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
        if (STREQ_OPT (encoding,
Packit Service 392537
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit Service 392537
            || STREQ_OPT (encoding,
Packit Service 392537
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit Service 392537
            || STREQ_OPT (encoding,
Packit Service 392537
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit Service 392537
          {
Packit Service 392537
            if (m == 1)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if (c >= 0xa1 && c < 0xff)
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
        if (STREQ_OPT (encoding,
Packit Service 392537
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit Service 392537
          {
Packit Service 392537
            if (m == 1)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            else /* m == 2 || m == 3 */
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if (c == 0x8e)
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
        if (STREQ_OPT (encoding,
Packit Service 392537
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit Service 392537
          {
Packit Service 392537
            if (m == 1)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            else /* m == 2 || m == 3 */
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if (c >= 0x90 && c <= 0xe3)
Packit Service 392537
                  {
Packit Service 392537
                    unsigned char c2 = (unsigned char) p[1];
Packit Service 392537
Packit Service 392537
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit Service 392537
                      {
Packit Service 392537
                        if (m == 2)
Packit Service 392537
                          goto incomplete;
Packit Service 392537
                        else /* m == 3 */
Packit Service 392537
                          {
Packit Service 392537
                            unsigned char c3 = (unsigned char) p[2];
Packit Service 392537
Packit Service 392537
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit Service 392537
                              goto incomplete;
Packit Service 392537
                          }
Packit Service 392537
                      }
Packit Service 392537
                  }
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit Service 392537
          {
Packit Service 392537
            if (m == 1)
Packit Service 392537
              {
Packit Service 392537
                unsigned char c = (unsigned char) p[0];
Packit Service 392537
Packit Service 392537
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit Service 392537
                    || (c >= 0xf0 && c <= 0xf9))
Packit Service 392537
                  goto incomplete;
Packit Service 392537
              }
Packit Service 392537
            goto invalid;
Packit Service 392537
          }
Packit Service 392537
Packit Service 392537
        /* An unknown multibyte encoding.  */
Packit Service 392537
        goto incomplete;
Packit Service 392537
      }
Packit Service 392537
Packit Service 392537
     incomplete:
Packit Service 392537
      {
Packit Service 392537
        size_t k = nstate;
Packit Service 392537
        /* Here 0 <= k < m < 4.  */
Packit Service 392537
        pstate[++k] = s[0];
Packit Service 392537
        if (k < m)
Packit Service 392537
          {
Packit Service 392537
            pstate[++k] = s[1];
Packit Service 392537
            if (k < m)
Packit Service 392537
              pstate[++k] = s[2];
Packit Service 392537
          }
Packit Service 392537
        if (k != m)
Packit Service 392537
          abort ();
Packit Service 392537
      }
Packit Service 392537
      pstate[0] = m;
Packit Service 392537
      return (size_t)(-2);
Packit Service 392537
Packit Service 392537
     invalid:
Packit Service 392537
      errno = EILSEQ;
Packit Service 392537
      /* The conversion state is undefined, says POSIX.  */
Packit Service 392537
      return (size_t)(-1);
Packit Service 392537
    }
Packit Service 392537
  }
Packit Service 392537
}
Packit Service 392537
Packit Service 392537
#else
Packit Service 392537
/* Override the system's mbrtowc() function.  */
Packit Service 392537
Packit Service 392537
# undef mbrtowc
Packit Service 392537
Packit Service 392537
/* Replacement for the system's mbrtowc() that works around the bugs
   selected by the MBRTOWC_*_BUG macros (presumably defined through
   <config.h> -- confirm against the build configuration) and otherwise
   forwards to the system function.

   PWC, S, N, PS have the same meaning as for mbrtowc().  The return value
   is that of mbrtowc(), corrected as described at each workaround.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Map a NULL S explicitly to the call (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  /* No input bytes: report an incomplete character without calling the
     system function.  */
  if (n == 0)
    return (size_t) -2;
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            /* COUNT is the number of bytes of S fed to mbrtowc so far.  */
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed and the character is still
           incomplete.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Convert into a local variable so that the result can be inspected
       even when PWC is NULL, and return 0 for the null wide character.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  {
#   if MBRTOWC_NULL_ARG1_BUG
    /* Never pass a NULL first argument to the system function.  */
    wchar_t dummy;

    if (pwc == NULL)
      pwc = &dummy;
#   endif

    return mbrtowc (pwc, s, n, ps);
  }
# endif
}
Packit Service 392537
Packit Service 392537
#endif