/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 33f14e
#include <config.h>
Packit 33f14e
Packit 33f14e
/* Specification.  */
Packit 33f14e
#include <wchar.h>
Packit 33f14e
Packit 33f14e
#if C_LOCALE_MAYBE_EILSEQ
Packit 33f14e
# include "hard-locale.h"
Packit 33f14e
# include <locale.h>
Packit 33f14e
#endif
Packit 33f14e
Packit 33f14e
#if GNULIB_defined_mbstate_t
Packit 33f14e
/* Implement mbrtowc() on top of mbtowc().  */
Packit 33f14e
Packit 33f14e
# include <errno.h>
Packit 33f14e
# include <stdlib.h>
Packit 33f14e
Packit 33f14e
# include "localcharset.h"
Packit 33f14e
# include "streq.h"
Packit 33f14e
# include "verify.h"
Packit 33f14e
Packit 33f14e
/* FALLTHROUGH marks an intentional fall-through from one switch case into
   the next.  When the compiler reports __GNUC__ >= 7 it expands to the
   fallthrough statement attribute; otherwise it expands to a no-op
   expression.  A definition supplied earlier takes precedence.  */
#ifndef FALLTHROUGH
# if __GNUC__ < 7
#  define FALLTHROUGH ((void) 0)
# else
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# endif
#endif
Packit 33f14e
Packit 33f14e
/* mbrtowc() below uses the first 4 bytes of the caller's mbstate_t as its
   conversion state, so the type must be at least that large.  */
verify (sizeof (mbstate_t) >= 4);

/* Conversion state used by mbrtowc() when the caller passes ps == NULL.  */
static char internal_state[4];
Packit 33f14e
Packit 33f14e
size_t
Packit 33f14e
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit 33f14e
{
Packit 33f14e
  char *pstate = (char *)ps;
Packit 33f14e
Packit 33f14e
  if (s == NULL)
Packit 33f14e
    {
Packit 33f14e
      pwc = NULL;
Packit 33f14e
      s = "";
Packit 33f14e
      n = 1;
Packit 33f14e
    }
Packit 33f14e
Packit 33f14e
  if (n == 0)
Packit 33f14e
    return (size_t)(-2);
Packit 33f14e
Packit 33f14e
  /* Here n > 0.  */
Packit 33f14e
Packit 33f14e
  if (pstate == NULL)
Packit 33f14e
    pstate = internal_state;
Packit 33f14e
Packit 33f14e
  {
Packit 33f14e
    size_t nstate = pstate[0];
Packit 33f14e
    char buf[4];
Packit 33f14e
    const char *p;
Packit 33f14e
    size_t m;
Packit 33f14e
Packit 33f14e
    switch (nstate)
Packit 33f14e
      {
Packit 33f14e
      case 0:
Packit 33f14e
        p = s;
Packit 33f14e
        m = n;
Packit 33f14e
        break;
Packit 33f14e
      case 3:
Packit 33f14e
        buf[2] = pstate[3];
Packit 33f14e
        FALLTHROUGH;
Packit 33f14e
      case 2:
Packit 33f14e
        buf[1] = pstate[2];
Packit 33f14e
        FALLTHROUGH;
Packit 33f14e
      case 1:
Packit 33f14e
        buf[0] = pstate[1];
Packit 33f14e
        p = buf;
Packit 33f14e
        m = nstate;
Packit 33f14e
        buf[m++] = s[0];
Packit 33f14e
        if (n >= 2 && m < 4)
Packit 33f14e
          {
Packit 33f14e
            buf[m++] = s[1];
Packit 33f14e
            if (n >= 3 && m < 4)
Packit 33f14e
              buf[m++] = s[2];
Packit 33f14e
          }
Packit 33f14e
        break;
Packit 33f14e
      default:
Packit 33f14e
        errno = EINVAL;
Packit 33f14e
        return (size_t)(-1);
Packit 33f14e
      }
Packit 33f14e
Packit 33f14e
    /* Here m > 0.  */
Packit 33f14e
Packit 33f14e
# if __GLIBC__ || defined __UCLIBC__
Packit 33f14e
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit 33f14e
    mbtowc (NULL, NULL, 0);
Packit 33f14e
# endif
Packit 33f14e
    {
Packit 33f14e
      int res = mbtowc (pwc, p, m);
Packit 33f14e
Packit 33f14e
      if (res >= 0)
Packit 33f14e
        {
Packit 33f14e
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit 33f14e
            abort ();
Packit 33f14e
          if (nstate >= (res > 0 ? res : 1))
Packit 33f14e
            abort ();
Packit 33f14e
          res -= nstate;
Packit 33f14e
          pstate[0] = 0;
Packit 33f14e
          return res;
Packit 33f14e
        }
Packit 33f14e
Packit 33f14e
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit 33f14e
         sequences.  But mbrtowc needs to make this distinction.
Packit 33f14e
         There are two possible approaches:
Packit 33f14e
           - Use iconv() and its return value.
Packit 33f14e
           - Use built-in knowledge about the possible encodings.
Packit 33f14e
         Given the low quality of implementation of iconv() on the systems that
Packit 33f14e
         lack mbrtowc(), we use the second approach.
Packit 33f14e
         The possible encodings are:
Packit 33f14e
           - 8-bit encodings,
Packit 33f14e
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit 33f14e
           - UTF-8.
Packit 33f14e
         Use specialized code for each.  */
Packit 33f14e
      if (m >= 4 || m >= MB_CUR_MAX)
Packit 33f14e
        goto invalid;
Packit 33f14e
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit 33f14e
      {
Packit 33f14e
        const char *encoding = locale_charset ();
Packit 33f14e
Packit 33f14e
        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit 33f14e
          {
Packit 33f14e
            /* Cf. unistr/u8-mblen.c.  */
Packit 33f14e
            unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
            if (c >= 0xc2)
Packit 33f14e
              {
Packit 33f14e
                if (c < 0xe0)
Packit 33f14e
                  {
Packit 33f14e
                    if (m == 1)
Packit 33f14e
                      goto incomplete;
Packit 33f14e
                  }
Packit 33f14e
                else if (c < 0xf0)
Packit 33f14e
                  {
Packit 33f14e
                    if (m == 1)
Packit 33f14e
                      goto incomplete;
Packit 33f14e
                    if (m == 2)
Packit 33f14e
                      {
Packit 33f14e
                        unsigned char c2 = (unsigned char) p[1];
Packit 33f14e
Packit 33f14e
                        if ((c2 ^ 0x80) < 0x40
Packit 33f14e
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit 33f14e
                            && (c != 0xed || c2 < 0xa0))
Packit 33f14e
                          goto incomplete;
Packit 33f14e
                      }
Packit 33f14e
                  }
Packit 33f14e
                else if (c <= 0xf4)
Packit 33f14e
                  {
Packit 33f14e
                    if (m == 1)
Packit 33f14e
                      goto incomplete;
Packit 33f14e
                    else /* m == 2 || m == 3 */
Packit 33f14e
                      {
Packit 33f14e
                        unsigned char c2 = (unsigned char) p[1];
Packit 33f14e
Packit 33f14e
                        if ((c2 ^ 0x80) < 0x40
Packit 33f14e
                            && (c >= 0xf1 || c2 >= 0x90)
Packit 33f14e
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit 33f14e
                          {
Packit 33f14e
                            if (m == 2)
Packit 33f14e
                              goto incomplete;
Packit 33f14e
                            else /* m == 3 */
Packit 33f14e
                              {
Packit 33f14e
                                unsigned char c3 = (unsigned char) p[2];
Packit 33f14e
Packit 33f14e
                                if ((c3 ^ 0x80) < 0x40)
Packit 33f14e
                                  goto incomplete;
Packit 33f14e
                              }
Packit 33f14e
                          }
Packit 33f14e
                      }
Packit 33f14e
                  }
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
Packit 33f14e
        /* As a reference for this code, you can use the GNU libiconv
Packit 33f14e
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit 33f14e
Packit 33f14e
        if (STREQ_OPT (encoding,
Packit 33f14e
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit 33f14e
          {
Packit 33f14e
            if (m == 1)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            if (m == 2)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if (c == 0x8f)
Packit 33f14e
                  {
Packit 33f14e
                    unsigned char c2 = (unsigned char) p[1];
Packit 33f14e
Packit 33f14e
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit 33f14e
                      goto incomplete;
Packit 33f14e
                  }
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
        if (STREQ_OPT (encoding,
Packit 33f14e
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit 33f14e
            || STREQ_OPT (encoding,
Packit 33f14e
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit 33f14e
            || STREQ_OPT (encoding,
Packit 33f14e
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit 33f14e
          {
Packit 33f14e
            if (m == 1)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if (c >= 0xa1 && c < 0xff)
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
        if (STREQ_OPT (encoding,
Packit 33f14e
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit 33f14e
          {
Packit 33f14e
            if (m == 1)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            else /* m == 2 || m == 3 */
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if (c == 0x8e)
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
        if (STREQ_OPT (encoding,
Packit 33f14e
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit 33f14e
          {
Packit 33f14e
            if (m == 1)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            else /* m == 2 || m == 3 */
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if (c >= 0x90 && c <= 0xe3)
Packit 33f14e
                  {
Packit 33f14e
                    unsigned char c2 = (unsigned char) p[1];
Packit 33f14e
Packit 33f14e
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit 33f14e
                      {
Packit 33f14e
                        if (m == 2)
Packit 33f14e
                          goto incomplete;
Packit 33f14e
                        else /* m == 3 */
Packit 33f14e
                          {
Packit 33f14e
                            unsigned char c3 = (unsigned char) p[2];
Packit 33f14e
Packit 33f14e
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit 33f14e
                              goto incomplete;
Packit 33f14e
                          }
Packit 33f14e
                      }
Packit 33f14e
                  }
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit 33f14e
          {
Packit 33f14e
            if (m == 1)
Packit 33f14e
              {
Packit 33f14e
                unsigned char c = (unsigned char) p[0];
Packit 33f14e
Packit 33f14e
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit 33f14e
                    || (c >= 0xf0 && c <= 0xf9))
Packit 33f14e
                  goto incomplete;
Packit 33f14e
              }
Packit 33f14e
            goto invalid;
Packit 33f14e
          }
Packit 33f14e
Packit 33f14e
        /* An unknown multibyte encoding.  */
Packit 33f14e
        goto incomplete;
Packit 33f14e
      }
Packit 33f14e
Packit 33f14e
     incomplete:
Packit 33f14e
      {
Packit 33f14e
        size_t k = nstate;
Packit 33f14e
        /* Here 0 <= k < m < 4.  */
Packit 33f14e
        pstate[++k] = s[0];
Packit 33f14e
        if (k < m)
Packit 33f14e
          {
Packit 33f14e
            pstate[++k] = s[1];
Packit 33f14e
            if (k < m)
Packit 33f14e
              pstate[++k] = s[2];
Packit 33f14e
          }
Packit 33f14e
        if (k != m)
Packit 33f14e
          abort ();
Packit 33f14e
      }
Packit 33f14e
      pstate[0] = m;
Packit 33f14e
      return (size_t)(-2);
Packit 33f14e
Packit 33f14e
     invalid:
Packit 33f14e
      errno = EILSEQ;
Packit 33f14e
      /* The conversion state is undefined, says POSIX.  */
Packit 33f14e
      return (size_t)(-1);
Packit 33f14e
    }
Packit 33f14e
  }
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#else
Packit 33f14e
/* Override the system's mbrtowc() function.  */

# undef mbrtowc

/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ configuration
   macros.

   PWC  where to store the resulting wide character; may be NULL.
   S    the input bytes.  S == NULL is handled as PWC = NULL, S = "", N = 1.
   N    the number of bytes of S that may be examined.
   PS   the conversion state; may be NULL.

   Returns what the system's mbrtowc() returns, with these adjustments:
     - with MBRTOWC_EMPTY_INPUT_BUG, (size_t) -2 when N is 0,
     - with MBRTOWC_RETVAL_BUG, a pending character is completed one byte
       at a time and the number of bytes consumed by this call is returned,
     - with MBRTOWC_NUL_RETVAL_BUG, 0 when the null wide character was
       converted,
     - with C_LOCALE_MAYBE_EILSEQ, when the locale is not a hard locale, a
       failing conversion yields the first byte of S as the wide character
       and returns 1.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

  /* Normalize a null S unconditionally.  ISO C defines the call with
     S == NULL as equivalent to mbrtowc (NULL, "", 1, ps), so this is
     harmless on conforming systems, and it guarantees that the code below
     never dereferences a null S nor reads a *PWC that the system function
     did not store.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so
     that the result can be inspected below.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successful conversion of the null wide character must return 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* Outside of hard locales, treat a failing byte as a character of its
     own instead of reporting (size_t) -1 or (size_t) -2.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
Packit 33f14e
Packit 33f14e
#endif