Blame lib/mbrtowc.c

/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 709fb3
Packit 709fb3
#include <config.h>
Packit 709fb3
Packit 709fb3
/* Specification.  */
Packit 709fb3
#include <wchar.h>
Packit 709fb3
Packit 709fb3
#if C_LOCALE_MAYBE_EILSEQ
Packit 709fb3
# include "hard-locale.h"
Packit 709fb3
# include <locale.h>
Packit 709fb3
#endif
Packit 709fb3
Packit 709fb3
#if GNULIB_defined_mbstate_t
Packit 709fb3
/* Implement mbrtowc() on top of mbtowc().  */
Packit 709fb3
Packit 709fb3
# include <errno.h>
Packit 709fb3
# include <stdlib.h>
Packit 709fb3
Packit 709fb3
# include "localcharset.h"
Packit 709fb3
# include "streq.h"
Packit 709fb3
# include "verify.h"
Packit 709fb3
Packit 709fb3
/* FALLTHROUGH marks an intentional fall-through from one switch case into
   the next.  Supply a fallback only when nothing included earlier has
   defined it: GCC 7 and newer get the statement attribute, every other
   compiler (including those that do not define __GNUC__ at all) gets a
   no-op expression.  */
#ifndef FALLTHROUGH
# if !defined __GNUC__ || __GNUC__ < 7
#  define FALLTHROUGH ((void) 0)
# else
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# endif
#endif
Packit 709fb3
Packit 709fb3
/* mbrtowc() below treats the caller's mbstate_t as an array of char and
   reads/writes its bytes 0..3, so the type must be at least 4 bytes large.
   verify() comes from "verify.h" (presumably a compile-time assertion).  */
verify (sizeof (mbstate_t) >= 4);
Packit 709fb3
Packit 709fb3
/* Conversion state used by mbrtowc() when it is called with ps == NULL.
   Byte 0 holds the number of pending bytes (0..3); bytes 1..3 hold the
   pending bytes themselves.  */
static char internal_state[4];
Packit 709fb3
Packit 709fb3
size_t
Packit 709fb3
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit 709fb3
{
Packit 709fb3
  char *pstate = (char *)ps;
Packit 709fb3
Packit 709fb3
  if (s == NULL)
Packit 709fb3
    {
Packit 709fb3
      pwc = NULL;
Packit 709fb3
      s = "";
Packit 709fb3
      n = 1;
Packit 709fb3
    }
Packit 709fb3
Packit 709fb3
  if (n == 0)
Packit 709fb3
    return (size_t)(-2);
Packit 709fb3
Packit 709fb3
  /* Here n > 0.  */
Packit 709fb3
Packit 709fb3
  if (pstate == NULL)
Packit 709fb3
    pstate = internal_state;
Packit 709fb3
Packit 709fb3
  {
Packit 709fb3
    size_t nstate = pstate[0];
Packit 709fb3
    char buf[4];
Packit 709fb3
    const char *p;
Packit 709fb3
    size_t m;
Packit 709fb3
Packit 709fb3
    switch (nstate)
Packit 709fb3
      {
Packit 709fb3
      case 0:
Packit 709fb3
        p = s;
Packit 709fb3
        m = n;
Packit 709fb3
        break;
Packit 709fb3
      case 3:
Packit 709fb3
        buf[2] = pstate[3];
Packit 709fb3
        FALLTHROUGH;
Packit 709fb3
      case 2:
Packit 709fb3
        buf[1] = pstate[2];
Packit 709fb3
        FALLTHROUGH;
Packit 709fb3
      case 1:
Packit 709fb3
        buf[0] = pstate[1];
Packit 709fb3
        p = buf;
Packit 709fb3
        m = nstate;
Packit 709fb3
        buf[m++] = s[0];
Packit 709fb3
        if (n >= 2 && m < 4)
Packit 709fb3
          {
Packit 709fb3
            buf[m++] = s[1];
Packit 709fb3
            if (n >= 3 && m < 4)
Packit 709fb3
              buf[m++] = s[2];
Packit 709fb3
          }
Packit 709fb3
        break;
Packit 709fb3
      default:
Packit 709fb3
        errno = EINVAL;
Packit 709fb3
        return (size_t)(-1);
Packit 709fb3
      }
Packit 709fb3
Packit 709fb3
    /* Here m > 0.  */
Packit 709fb3
Packit 709fb3
# if __GLIBC__ || defined __UCLIBC__
Packit 709fb3
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit 709fb3
    mbtowc (NULL, NULL, 0);
Packit 709fb3
# endif
Packit 709fb3
    {
Packit 709fb3
      int res = mbtowc (pwc, p, m);
Packit 709fb3
Packit 709fb3
      if (res >= 0)
Packit 709fb3
        {
Packit 709fb3
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit 709fb3
            abort ();
Packit 709fb3
          if (nstate >= (res > 0 ? res : 1))
Packit 709fb3
            abort ();
Packit 709fb3
          res -= nstate;
Packit 709fb3
          pstate[0] = 0;
Packit 709fb3
          return res;
Packit 709fb3
        }
Packit 709fb3
Packit 709fb3
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit 709fb3
         sequences.  But mbrtowc needs to make this distinction.
Packit 709fb3
         There are two possible approaches:
Packit 709fb3
           - Use iconv() and its return value.
Packit 709fb3
           - Use built-in knowledge about the possible encodings.
Packit 709fb3
         Given the low quality of implementation of iconv() on the systems that
Packit 709fb3
         lack mbrtowc(), we use the second approach.
Packit 709fb3
         The possible encodings are:
Packit 709fb3
           - 8-bit encodings,
Packit 709fb3
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit 709fb3
           - UTF-8.
Packit 709fb3
         Use specialized code for each.  */
Packit 709fb3
      if (m >= 4 || m >= MB_CUR_MAX)
Packit 709fb3
        goto invalid;
Packit 709fb3
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit 709fb3
      {
Packit 709fb3
        const char *encoding = locale_charset ();
Packit 709fb3
Packit 709fb3
        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit 709fb3
          {
Packit 709fb3
            /* Cf. unistr/u8-mblen.c.  */
Packit 709fb3
            unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
            if (c >= 0xc2)
Packit 709fb3
              {
Packit 709fb3
                if (c < 0xe0)
Packit 709fb3
                  {
Packit 709fb3
                    if (m == 1)
Packit 709fb3
                      goto incomplete;
Packit 709fb3
                  }
Packit 709fb3
                else if (c < 0xf0)
Packit 709fb3
                  {
Packit 709fb3
                    if (m == 1)
Packit 709fb3
                      goto incomplete;
Packit 709fb3
                    if (m == 2)
Packit 709fb3
                      {
Packit 709fb3
                        unsigned char c2 = (unsigned char) p[1];
Packit 709fb3
Packit 709fb3
                        if ((c2 ^ 0x80) < 0x40
Packit 709fb3
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit 709fb3
                            && (c != 0xed || c2 < 0xa0))
Packit 709fb3
                          goto incomplete;
Packit 709fb3
                      }
Packit 709fb3
                  }
Packit 709fb3
                else if (c <= 0xf4)
Packit 709fb3
                  {
Packit 709fb3
                    if (m == 1)
Packit 709fb3
                      goto incomplete;
Packit 709fb3
                    else /* m == 2 || m == 3 */
Packit 709fb3
                      {
Packit 709fb3
                        unsigned char c2 = (unsigned char) p[1];
Packit 709fb3
Packit 709fb3
                        if ((c2 ^ 0x80) < 0x40
Packit 709fb3
                            && (c >= 0xf1 || c2 >= 0x90)
Packit 709fb3
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit 709fb3
                          {
Packit 709fb3
                            if (m == 2)
Packit 709fb3
                              goto incomplete;
Packit 709fb3
                            else /* m == 3 */
Packit 709fb3
                              {
Packit 709fb3
                                unsigned char c3 = (unsigned char) p[2];
Packit 709fb3
Packit 709fb3
                                if ((c3 ^ 0x80) < 0x40)
Packit 709fb3
                                  goto incomplete;
Packit 709fb3
                              }
Packit 709fb3
                          }
Packit 709fb3
                      }
Packit 709fb3
                  }
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
Packit 709fb3
        /* As a reference for this code, you can use the GNU libiconv
Packit 709fb3
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit 709fb3
Packit 709fb3
        if (STREQ_OPT (encoding,
Packit 709fb3
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit 709fb3
          {
Packit 709fb3
            if (m == 1)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            if (m == 2)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if (c == 0x8f)
Packit 709fb3
                  {
Packit 709fb3
                    unsigned char c2 = (unsigned char) p[1];
Packit 709fb3
Packit 709fb3
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit 709fb3
                      goto incomplete;
Packit 709fb3
                  }
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
        if (STREQ_OPT (encoding,
Packit 709fb3
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit 709fb3
            || STREQ_OPT (encoding,
Packit 709fb3
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit 709fb3
            || STREQ_OPT (encoding,
Packit 709fb3
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit 709fb3
          {
Packit 709fb3
            if (m == 1)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if (c >= 0xa1 && c < 0xff)
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
        if (STREQ_OPT (encoding,
Packit 709fb3
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit 709fb3
          {
Packit 709fb3
            if (m == 1)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            else /* m == 2 || m == 3 */
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if (c == 0x8e)
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
        if (STREQ_OPT (encoding,
Packit 709fb3
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit 709fb3
          {
Packit 709fb3
            if (m == 1)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            else /* m == 2 || m == 3 */
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if (c >= 0x90 && c <= 0xe3)
Packit 709fb3
                  {
Packit 709fb3
                    unsigned char c2 = (unsigned char) p[1];
Packit 709fb3
Packit 709fb3
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit 709fb3
                      {
Packit 709fb3
                        if (m == 2)
Packit 709fb3
                          goto incomplete;
Packit 709fb3
                        else /* m == 3 */
Packit 709fb3
                          {
Packit 709fb3
                            unsigned char c3 = (unsigned char) p[2];
Packit 709fb3
Packit 709fb3
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit 709fb3
                              goto incomplete;
Packit 709fb3
                          }
Packit 709fb3
                      }
Packit 709fb3
                  }
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit 709fb3
          {
Packit 709fb3
            if (m == 1)
Packit 709fb3
              {
Packit 709fb3
                unsigned char c = (unsigned char) p[0];
Packit 709fb3
Packit 709fb3
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit 709fb3
                    || (c >= 0xf0 && c <= 0xf9))
Packit 709fb3
                  goto incomplete;
Packit 709fb3
              }
Packit 709fb3
            goto invalid;
Packit 709fb3
          }
Packit 709fb3
Packit 709fb3
        /* An unknown multibyte encoding.  */
Packit 709fb3
        goto incomplete;
Packit 709fb3
      }
Packit 709fb3
Packit 709fb3
     incomplete:
Packit 709fb3
      {
Packit 709fb3
        size_t k = nstate;
Packit 709fb3
        /* Here 0 <= k < m < 4.  */
Packit 709fb3
        pstate[++k] = s[0];
Packit 709fb3
        if (k < m)
Packit 709fb3
          {
Packit 709fb3
            pstate[++k] = s[1];
Packit 709fb3
            if (k < m)
Packit 709fb3
              pstate[++k] = s[2];
Packit 709fb3
          }
Packit 709fb3
        if (k != m)
Packit 709fb3
          abort ();
Packit 709fb3
      }
Packit 709fb3
      pstate[0] = m;
Packit 709fb3
      return (size_t)(-2);
Packit 709fb3
Packit 709fb3
     invalid:
Packit 709fb3
      errno = EILSEQ;
Packit 709fb3
      /* The conversion state is undefined, says POSIX.  */
Packit 709fb3
      return (size_t)(-1);
Packit 709fb3
    }
Packit 709fb3
  }
Packit 709fb3
}
Packit 709fb3
Packit 709fb3
#else
Packit 709fb3
/* Override the system's mbrtowc() function.  */
Packit 709fb3
Packit 709fb3
# undef mbrtowc
Packit 709fb3
Packit 709fb3
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected through the MBRTOWC_* and C_LOCALE_MAYBE_EILSEQ macros
   (presumably set by the configure step; each section below is compiled
   only when its macro is nonzero).

   PWC  receives the wide character, unless it is NULL.
   S    is the input, N the number of bytes available at S.
   PS   is the conversion state; it is passed through to the system
        function, except that under MBRTOWC_RETVAL_BUG a NULL PS is
        replaced by a static state owned by this function.

   Returns what the system's mbrtowc() returns, adjusted as described in
   the individual sections.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Rewrite a NULL input into the equivalent explicit call, so that the
     code below never sees S == NULL.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so that
     the checks below can look at *pwc.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When the system function fails and hard_locale (LC_CTYPE) is false,
     map the first input byte to a wide character of the same value.
     S can still be NULL here when none of the MBRTOWC_*_BUG macros that
     rewrite a NULL input is enabled; there is no byte to map in that case,
     so the system's result is returned unchanged instead of reading *s.  */
  if ((size_t) -2 <= ret && n != 0 && s != NULL && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
Packit 709fb3
Packit 709fb3
#endif