Blame gl/mbrtowc.c

Packit a4aae4
/* Convert multibyte character to wide character.
Packit a4aae4
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
Packit a4aae4
   Written by Bruno Haible <bruno@clisp.org>, 2008.
Packit a4aae4
Packit a4aae4
   This program is free software: you can redistribute it and/or modify
Packit a4aae4
   it under the terms of the GNU Lesser General Public License as published by
Packit a4aae4
   the Free Software Foundation; either version 3 of the License, or
Packit a4aae4
   (at your option) any later version.
Packit a4aae4
Packit a4aae4
   This program is distributed in the hope that it will be useful,
Packit a4aae4
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit a4aae4
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit a4aae4
   GNU Lesser General Public License for more details.
Packit a4aae4
Packit a4aae4
   You should have received a copy of the GNU Lesser General Public License
Packit a4aae4
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit a4aae4
Packit a4aae4
#include <config.h>
Packit a4aae4
Packit a4aae4
/* Specification.  */
Packit a4aae4
#include <wchar.h>
Packit a4aae4
Packit a4aae4
#if C_LOCALE_MAYBE_EILSEQ
Packit a4aae4
# include "hard-locale.h"
Packit a4aae4
# include <locale.h>
Packit a4aae4
#endif
Packit a4aae4
Packit a4aae4
#if GNULIB_defined_mbstate_t
Packit a4aae4
/* Implement mbrtowc() on top of mbtowc().  */
Packit a4aae4
Packit a4aae4
# include <errno.h>
Packit a4aae4
# include <stdlib.h>
Packit a4aae4
Packit a4aae4
# include "localcharset.h"
Packit a4aae4
# include "streq.h"
Packit a4aae4
# include "verify.h"
Packit a4aae4
Packit a4aae4
/* FALLTHROUGH marks an intentional fall through from one switch case into
   the next (see its uses in mbrtowc below).  Unless already defined, it
   expands to the __fallthrough__ attribute when __GNUC__ is 7 or greater,
   and to a no-op expression otherwise.  */
#ifndef FALLTHROUGH
Packit a4aae4
# if __GNUC__ < 7
Packit a4aae4
#  define FALLTHROUGH ((void) 0)
Packit a4aae4
# else
Packit a4aae4
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
Packit a4aae4
# endif
Packit a4aae4
#endif
Packit a4aae4
Packit a4aae4
/* mbrtowc below accesses an mbstate_t as an array of char and uses its
   first 4 bytes, so the type must be at least that large.
   NOTE(review): verify presumably is the compile-time assertion from
   "verify.h" -- confirm.  */
verify (sizeof (mbstate_t) >= 4);
Packit a4aae4
Packit a4aae4
/* Conversion state used by mbrtowc below when its PS argument is NULL.
   Byte 0 holds the number of pending bytes, bytes 1..3 hold the bytes.  */
static char internal_state[4];
Packit a4aae4
Packit a4aae4
/* Convert the next multibyte character of S (at most N bytes) to a wide
   character, implemented on top of mbtowc().

   PWC is passed through to mbtowc() as the place to store the result; it
   may be NULL.
   S is the input; if it is NULL, the call behaves like
   mbrtowc (NULL, "", 1, PS).
   PS points to the conversion state; if it is NULL, the file-level
   internal_state is used instead.  The state is accessed as a char array:
   byte 0 is the number of pending bytes (0..3) left over from previous
   calls, and bytes 1..3 are those pending bytes.

   Return value:
     - On success, mbtowc()'s result minus the number of pending bytes that
       were already held in the state (0 for the null character).  The
       state is reset to "no pending bytes".
     - (size_t)(-2) if N is 0, or if the bytes examined form an incomplete
       character; in the latter case the bytes are saved in the state.
     - (size_t)(-1) with errno = EILSEQ for an invalid sequence, or with
       errno = EINVAL if the state's count byte is not in the range 0..3.

   Calls abort() if mbtowc()'s result is inconsistent with the stored
   wide character or with the number of pending bytes.  */
size_t
Packit a4aae4
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit a4aae4
{
Packit a4aae4
  char *pstate = (char *)ps;
Packit a4aae4
Packit a4aae4
  /* A null S is handled like the call mbrtowc (NULL, "", 1, ps).  */
  if (s == NULL)
Packit a4aae4
    {
Packit a4aae4
      pwc = NULL;
Packit a4aae4
      s = "";
Packit a4aae4
      n = 1;
Packit a4aae4
    }
Packit a4aae4
Packit a4aae4
  if (n == 0)
Packit a4aae4
    return (size_t)(-2);
Packit a4aae4
Packit a4aae4
  /* Here n > 0.  */
Packit a4aae4
Packit a4aae4
  if (pstate == NULL)
Packit a4aae4
    pstate = internal_state;
Packit a4aae4
Packit a4aae4
  {
Packit a4aae4
    /* Number of pending bytes stored in the state by earlier calls.  */
    size_t nstate = pstate[0];
Packit a4aae4
    char buf[4];
Packit a4aae4
    const char *p;
Packit a4aae4
    size_t m;
Packit a4aae4
Packit a4aae4
    /* Set p and m to the bytes to convert: S itself when nothing is
       pending, otherwise buf, filled with the pending bytes followed by
       bytes from S (as many as N allows, up to 4 bytes in total).  */
    switch (nstate)
Packit a4aae4
      {
Packit a4aae4
      case 0:
Packit a4aae4
        p = s;
Packit a4aae4
        m = n;
Packit a4aae4
        break;
Packit a4aae4
      case 3:
Packit a4aae4
        buf[2] = pstate[3];
Packit a4aae4
        FALLTHROUGH;
Packit a4aae4
      case 2:
Packit a4aae4
        buf[1] = pstate[2];
Packit a4aae4
        FALLTHROUGH;
Packit a4aae4
      case 1:
Packit a4aae4
        buf[0] = pstate[1];
Packit a4aae4
        p = buf;
Packit a4aae4
        m = nstate;
Packit a4aae4
        buf[m++] = s[0];
Packit a4aae4
        if (n >= 2 && m < 4)
Packit a4aae4
          {
Packit a4aae4
            buf[m++] = s[1];
Packit a4aae4
            if (n >= 3 && m < 4)
Packit a4aae4
              buf[m++] = s[2];
Packit a4aae4
          }
Packit a4aae4
        break;
Packit a4aae4
      default:
Packit a4aae4
        /* The count byte of the state is not in 0..3.  */
        errno = EINVAL;
Packit a4aae4
        return (size_t)(-1);
Packit a4aae4
      }
Packit a4aae4
Packit a4aae4
    /* Here m > 0.  */
Packit a4aae4
Packit a4aae4
# if __GLIBC__ || defined __UCLIBC__
Packit a4aae4
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit a4aae4
    mbtowc (NULL, NULL, 0);
Packit a4aae4
# endif
Packit a4aae4
    {
Packit a4aae4
      int res = mbtowc (pwc, p, m);
Packit a4aae4
Packit a4aae4
      if (res >= 0)
Packit a4aae4
        {
Packit a4aae4
          /* A result of 0 must coincide with a stored null wide
             character, and vice versa.  */
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit a4aae4
            abort ();
Packit a4aae4
          /* The character must extend beyond the pending bytes, so that
             at least one byte of S is consumed.  */
          if (nstate >= (res > 0 ? res : 1))
Packit a4aae4
            abort ();
Packit a4aae4
          /* Report only the bytes taken from S, and clear the state.  */
          res -= nstate;
Packit a4aae4
          pstate[0] = 0;
Packit a4aae4
          return res;
Packit a4aae4
        }
Packit a4aae4
Packit a4aae4
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit a4aae4
         sequences.  But mbrtowc needs to make this distinction.
Packit a4aae4
         There are two possible approaches:
Packit a4aae4
           - Use iconv() and its return value.
Packit a4aae4
           - Use built-in knowledge about the possible encodings.
Packit a4aae4
         Given the low quality of implementation of iconv() on the systems that
Packit a4aae4
         lack mbrtowc(), we use the second approach.
Packit a4aae4
         The possible encodings are:
Packit a4aae4
           - 8-bit encodings,
Packit a4aae4
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit a4aae4
           - UTF-8.
Packit a4aae4
         Use specialized code for each.  */
Packit a4aae4
      if (m >= 4 || m >= MB_CUR_MAX)
Packit a4aae4
        goto invalid;
Packit a4aae4
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit a4aae4
      {
Packit a4aae4
        const char *encoding = locale_charset ();
Packit a4aae4
Packit a4aae4
        /* Each encoding-specific branch below jumps to "incomplete" when
           the m bytes at p are a possible prefix of a character, and to
           "invalid" otherwise.  */
        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit a4aae4
          {
Packit a4aae4
            /* Cf. unistr/u8-mblen.c.  */
Packit a4aae4
            unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
            if (c >= 0xc2)
Packit a4aae4
              {
Packit a4aae4
                if (c < 0xe0)
Packit a4aae4
                  {
Packit a4aae4
                    if (m == 1)
Packit a4aae4
                      goto incomplete;
Packit a4aae4
                  }
Packit a4aae4
                else if (c < 0xf0)
Packit a4aae4
                  {
Packit a4aae4
                    if (m == 1)
Packit a4aae4
                      goto incomplete;
Packit a4aae4
                    if (m == 2)
Packit a4aae4
                      {
Packit a4aae4
                        unsigned char c2 = (unsigned char) p[1];
Packit a4aae4
Packit a4aae4
                        if ((c2 ^ 0x80) < 0x40
Packit a4aae4
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit a4aae4
                            && (c != 0xed || c2 < 0xa0))
Packit a4aae4
                          goto incomplete;
Packit a4aae4
                      }
Packit a4aae4
                  }
Packit a4aae4
                else if (c <= 0xf4)
Packit a4aae4
                  {
Packit a4aae4
                    if (m == 1)
Packit a4aae4
                      goto incomplete;
Packit a4aae4
                    else /* m == 2 || m == 3 */
Packit a4aae4
                      {
Packit a4aae4
                        unsigned char c2 = (unsigned char) p[1];
Packit a4aae4
Packit a4aae4
                        if ((c2 ^ 0x80) < 0x40
Packit a4aae4
                            && (c >= 0xf1 || c2 >= 0x90)
Packit a4aae4
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit a4aae4
                          {
Packit a4aae4
                            if (m == 2)
Packit a4aae4
                              goto incomplete;
Packit a4aae4
                            else /* m == 3 */
Packit a4aae4
                              {
Packit a4aae4
                                unsigned char c3 = (unsigned char) p[2];
Packit a4aae4
Packit a4aae4
                                if ((c3 ^ 0x80) < 0x40)
Packit a4aae4
                                  goto incomplete;
Packit a4aae4
                              }
Packit a4aae4
                          }
Packit a4aae4
                      }
Packit a4aae4
                  }
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
Packit a4aae4
        /* As a reference for this code, you can use the GNU libiconv
Packit a4aae4
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit a4aae4
Packit a4aae4
        if (STREQ_OPT (encoding,
Packit a4aae4
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit a4aae4
          {
Packit a4aae4
            if (m == 1)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            if (m == 2)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if (c == 0x8f)
Packit a4aae4
                  {
Packit a4aae4
                    unsigned char c2 = (unsigned char) p[1];
Packit a4aae4
Packit a4aae4
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit a4aae4
                      goto incomplete;
Packit a4aae4
                  }
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
        if (STREQ_OPT (encoding,
Packit a4aae4
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit a4aae4
            || STREQ_OPT (encoding,
Packit a4aae4
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit a4aae4
            || STREQ_OPT (encoding,
Packit a4aae4
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit a4aae4
          {
Packit a4aae4
            if (m == 1)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if (c >= 0xa1 && c < 0xff)
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
        if (STREQ_OPT (encoding,
Packit a4aae4
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit a4aae4
          {
Packit a4aae4
            if (m == 1)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            else /* m == 2 || m == 3 */
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if (c == 0x8e)
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
        if (STREQ_OPT (encoding,
Packit a4aae4
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit a4aae4
          {
Packit a4aae4
            if (m == 1)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            else /* m == 2 || m == 3 */
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if (c >= 0x90 && c <= 0xe3)
Packit a4aae4
                  {
Packit a4aae4
                    unsigned char c2 = (unsigned char) p[1];
Packit a4aae4
Packit a4aae4
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit a4aae4
                      {
Packit a4aae4
                        if (m == 2)
Packit a4aae4
                          goto incomplete;
Packit a4aae4
                        else /* m == 3 */
Packit a4aae4
                          {
Packit a4aae4
                            unsigned char c3 = (unsigned char) p[2];
Packit a4aae4
Packit a4aae4
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit a4aae4
                              goto incomplete;
Packit a4aae4
                          }
Packit a4aae4
                      }
Packit a4aae4
                  }
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit a4aae4
          {
Packit a4aae4
            if (m == 1)
Packit a4aae4
              {
Packit a4aae4
                unsigned char c = (unsigned char) p[0];
Packit a4aae4
Packit a4aae4
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit a4aae4
                    || (c >= 0xf0 && c <= 0xf9))
Packit a4aae4
                  goto incomplete;
Packit a4aae4
              }
Packit a4aae4
            goto invalid;
Packit a4aae4
          }
Packit a4aae4
Packit a4aae4
        /* An unknown multibyte encoding.  */
Packit a4aae4
        goto incomplete;
Packit a4aae4
      }
Packit a4aae4
Packit a4aae4
     incomplete:
Packit a4aae4
      {
Packit a4aae4
        /* Append the m - nstate bytes taken from S to the pending bytes
           in the state, then record the new pending count m.  */
        size_t k = nstate;
Packit a4aae4
        /* Here 0 <= k < m < 4.  */
Packit a4aae4
        pstate[++k] = s[0];
Packit a4aae4
        if (k < m)
Packit a4aae4
          {
Packit a4aae4
            pstate[++k] = s[1];
Packit a4aae4
            if (k < m)
Packit a4aae4
              pstate[++k] = s[2];
Packit a4aae4
          }
Packit a4aae4
        if (k != m)
Packit a4aae4
          abort ();
Packit a4aae4
      }
Packit a4aae4
      pstate[0] = m;
Packit a4aae4
      return (size_t)(-2);
Packit a4aae4
Packit a4aae4
     invalid:
Packit a4aae4
      errno = EILSEQ;
Packit a4aae4
      /* The conversion state is undefined, says POSIX.  */
Packit a4aae4
      return (size_t)(-1);
Packit a4aae4
    }
Packit a4aae4
  }
Packit a4aae4
}
Packit a4aae4
Packit a4aae4
#else
Packit a4aae4
/* Override the system's mbrtowc() function.  */
Packit a4aae4
Packit a4aae4
# undef mbrtowc
Packit a4aae4
Packit a4aae4
/* Replacement for the system's mbrtowc() that works around the bugs
   selected by the configuration macros tested below.

   PWC, if non-NULL, receives the converted wide character.
   S points to at most N bytes of input; a null S makes the call equivalent
   to mbrtowc (NULL, "", 1, PS).
   PS is the conversion state; NULL selects an internal state.

   Returns what mbrtowc() is specified to return: the number of bytes
   consumed, 0 for the null character, (size_t) -2 for an incomplete
   character, or (size_t) -1 for an invalid sequence.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

  /* ISO C defines a call with a null S as mbrtowc (NULL, "", 1, ps).
     Normalize the arguments unconditionally, not only when one of the
     MBRTOWC_*_BUG workarounds is enabled: the fixups after the system call
     read *pwc (MBRTOWC_NUL_RETVAL_BUG) and *s (C_LOCALE_MAYBE_EILSEQ), and
     without the normalization they would read a wide character that was
     never stored, or dereference a null pointer.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function a place to store the result, so that
     the fixups below can inspect it.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* Outside a hard locale, every byte is a character of its own, so turn
     an "invalid" or "incomplete" result into a one-byte conversion.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
Packit a4aae4
Packit a4aae4
#endif