Blame lib/mbrtowc.c

Packit Service fdd496
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit Service fdd496
Packit Service fdd496
#include <config.h>
Packit Service fdd496
Packit Service fdd496
/* Specification.  */
Packit Service fdd496
#include <wchar.h>
Packit Service fdd496
Packit Service fdd496
#if C_LOCALE_MAYBE_EILSEQ
Packit Service fdd496
# include "hard-locale.h"
Packit Service fdd496
# include <locale.h>
Packit Service fdd496
#endif
Packit Service fdd496
Packit Service fdd496
#if GNULIB_defined_mbstate_t
Packit Service fdd496
/* Implement mbrtowc() on top of mbtowc().  */
Packit Service fdd496
Packit Service fdd496
# include <errno.h>
Packit Service fdd496
# include <stdlib.h>
Packit Service fdd496
Packit Service fdd496
# include "localcharset.h"
Packit Service fdd496
# include "streq.h"
Packit Service fdd496
# include "verify.h"
Packit Service fdd496
Packit Service fdd496
#ifndef FALLTHROUGH
Packit Service fdd496
# if __GNUC__ < 7
Packit Service fdd496
#  define FALLTHROUGH ((void) 0)
Packit Service fdd496
# else
Packit Service fdd496
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
Packit Service fdd496
# endif
Packit Service fdd496
#endif
Packit Service fdd496
Packit Service fdd496
verify (sizeof (mbstate_t) >= 4);
Packit Service fdd496
Packit Service fdd496
static char internal_state[4];
Packit Service fdd496
Packit Service fdd496
size_t
Packit Service fdd496
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
Packit Service fdd496
{
Packit Service fdd496
  char *pstate = (char *)ps;
Packit Service fdd496
Packit Service fdd496
  if (s == NULL)
Packit Service fdd496
    {
Packit Service fdd496
      pwc = NULL;
Packit Service fdd496
      s = "";
Packit Service fdd496
      n = 1;
Packit Service fdd496
    }
Packit Service fdd496
Packit Service fdd496
  if (n == 0)
Packit Service fdd496
    return (size_t)(-2);
Packit Service fdd496
Packit Service fdd496
  /* Here n > 0.  */
Packit Service fdd496
Packit Service fdd496
  if (pstate == NULL)
Packit Service fdd496
    pstate = internal_state;
Packit Service fdd496
Packit Service fdd496
  {
Packit Service fdd496
    size_t nstate = pstate[0];
Packit Service fdd496
    char buf[4];
Packit Service fdd496
    const char *p;
Packit Service fdd496
    size_t m;
Packit Service fdd496
Packit Service fdd496
    switch (nstate)
Packit Service fdd496
      {
Packit Service fdd496
      case 0:
Packit Service fdd496
        p = s;
Packit Service fdd496
        m = n;
Packit Service fdd496
        break;
Packit Service fdd496
      case 3:
Packit Service fdd496
        buf[2] = pstate[3];
Packit Service fdd496
        FALLTHROUGH;
Packit Service fdd496
      case 2:
Packit Service fdd496
        buf[1] = pstate[2];
Packit Service fdd496
        FALLTHROUGH;
Packit Service fdd496
      case 1:
Packit Service fdd496
        buf[0] = pstate[1];
Packit Service fdd496
        p = buf;
Packit Service fdd496
        m = nstate;
Packit Service fdd496
        buf[m++] = s[0];
Packit Service fdd496
        if (n >= 2 && m < 4)
Packit Service fdd496
          {
Packit Service fdd496
            buf[m++] = s[1];
Packit Service fdd496
            if (n >= 3 && m < 4)
Packit Service fdd496
              buf[m++] = s[2];
Packit Service fdd496
          }
Packit Service fdd496
        break;
Packit Service fdd496
      default:
Packit Service fdd496
        errno = EINVAL;
Packit Service fdd496
        return (size_t)(-1);
Packit Service fdd496
      }
Packit Service fdd496
Packit Service fdd496
    /* Here m > 0.  */
Packit Service fdd496
Packit Service fdd496
# if __GLIBC__ || defined __UCLIBC__
Packit Service fdd496
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
Packit Service fdd496
    mbtowc (NULL, NULL, 0);
Packit Service fdd496
# endif
Packit Service fdd496
    {
Packit Service fdd496
      int res = mbtowc (pwc, p, m);
Packit Service fdd496
Packit Service fdd496
      if (res >= 0)
Packit Service fdd496
        {
Packit Service fdd496
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
Packit Service fdd496
            abort ();
Packit Service fdd496
          if (nstate >= (res > 0 ? res : 1))
Packit Service fdd496
            abort ();
Packit Service fdd496
          res -= nstate;
Packit Service fdd496
          pstate[0] = 0;
Packit Service fdd496
          return res;
Packit Service fdd496
        }
Packit Service fdd496
Packit Service fdd496
      /* mbtowc does not distinguish between invalid and incomplete multibyte
Packit Service fdd496
         sequences.  But mbrtowc needs to make this distinction.
Packit Service fdd496
         There are two possible approaches:
Packit Service fdd496
           - Use iconv() and its return value.
Packit Service fdd496
           - Use built-in knowledge about the possible encodings.
Packit Service fdd496
         Given the low quality of implementation of iconv() on the systems that
Packit Service fdd496
         lack mbrtowc(), we use the second approach.
Packit Service fdd496
         The possible encodings are:
Packit Service fdd496
           - 8-bit encodings,
Packit Service fdd496
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
Packit Service fdd496
           - UTF-8.
Packit Service fdd496
         Use specialized code for each.  */
Packit Service fdd496
      if (m >= 4 || m >= MB_CUR_MAX)
Packit Service fdd496
        goto invalid;
Packit Service fdd496
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
Packit Service fdd496
      {
Packit Service fdd496
        const char *encoding = locale_charset ();
Packit Service fdd496
Packit Service fdd496
        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            /* Cf. unistr/u8-mblen.c.  */
Packit Service fdd496
            unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
            if (c >= 0xc2)
Packit Service fdd496
              {
Packit Service fdd496
                if (c < 0xe0)
Packit Service fdd496
                  {
Packit Service fdd496
                    if (m == 1)
Packit Service fdd496
                      goto incomplete;
Packit Service fdd496
                  }
Packit Service fdd496
                else if (c < 0xf0)
Packit Service fdd496
                  {
Packit Service fdd496
                    if (m == 1)
Packit Service fdd496
                      goto incomplete;
Packit Service fdd496
                    if (m == 2)
Packit Service fdd496
                      {
Packit Service fdd496
                        unsigned char c2 = (unsigned char) p[1];
Packit Service fdd496
Packit Service fdd496
                        if ((c2 ^ 0x80) < 0x40
Packit Service fdd496
                            && (c >= 0xe1 || c2 >= 0xa0)
Packit Service fdd496
                            && (c != 0xed || c2 < 0xa0))
Packit Service fdd496
                          goto incomplete;
Packit Service fdd496
                      }
Packit Service fdd496
                  }
Packit Service fdd496
                else if (c <= 0xf4)
Packit Service fdd496
                  {
Packit Service fdd496
                    if (m == 1)
Packit Service fdd496
                      goto incomplete;
Packit Service fdd496
                    else /* m == 2 || m == 3 */
Packit Service fdd496
                      {
Packit Service fdd496
                        unsigned char c2 = (unsigned char) p[1];
Packit Service fdd496
Packit Service fdd496
                        if ((c2 ^ 0x80) < 0x40
Packit Service fdd496
                            && (c >= 0xf1 || c2 >= 0x90)
Packit Service fdd496
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
Packit Service fdd496
                          {
Packit Service fdd496
                            if (m == 2)
Packit Service fdd496
                              goto incomplete;
Packit Service fdd496
                            else /* m == 3 */
Packit Service fdd496
                              {
Packit Service fdd496
                                unsigned char c3 = (unsigned char) p[2];
Packit Service fdd496
Packit Service fdd496
                                if ((c3 ^ 0x80) < 0x40)
Packit Service fdd496
                                  goto incomplete;
Packit Service fdd496
                              }
Packit Service fdd496
                          }
Packit Service fdd496
                      }
Packit Service fdd496
                  }
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
Packit Service fdd496
        /* As a reference for this code, you can use the GNU libiconv
Packit Service fdd496
           implementation.  Look for uses of the RET_TOOFEW macro.  */
Packit Service fdd496
Packit Service fdd496
        if (STREQ_OPT (encoding,
Packit Service fdd496
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            if (m == 1)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            if (m == 2)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if (c == 0x8f)
Packit Service fdd496
                  {
Packit Service fdd496
                    unsigned char c2 = (unsigned char) p[1];
Packit Service fdd496
Packit Service fdd496
                    if (c2 >= 0xa1 && c2 < 0xff)
Packit Service fdd496
                      goto incomplete;
Packit Service fdd496
                  }
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
        if (STREQ_OPT (encoding,
Packit Service fdd496
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
Packit Service fdd496
            || STREQ_OPT (encoding,
Packit Service fdd496
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
Packit Service fdd496
            || STREQ_OPT (encoding,
Packit Service fdd496
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            if (m == 1)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if (c >= 0xa1 && c < 0xff)
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
        if (STREQ_OPT (encoding,
Packit Service fdd496
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            if (m == 1)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            else /* m == 2 || m == 3 */
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if (c == 0x8e)
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
        if (STREQ_OPT (encoding,
Packit Service fdd496
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            if (m == 1)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            else /* m == 2 || m == 3 */
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if (c >= 0x90 && c <= 0xe3)
Packit Service fdd496
                  {
Packit Service fdd496
                    unsigned char c2 = (unsigned char) p[1];
Packit Service fdd496
Packit Service fdd496
                    if (c2 >= 0x30 && c2 <= 0x39)
Packit Service fdd496
                      {
Packit Service fdd496
                        if (m == 2)
Packit Service fdd496
                          goto incomplete;
Packit Service fdd496
                        else /* m == 3 */
Packit Service fdd496
                          {
Packit Service fdd496
                            unsigned char c3 = (unsigned char) p[2];
Packit Service fdd496
Packit Service fdd496
                            if (c3 >= 0x81 && c3 <= 0xfe)
Packit Service fdd496
                              goto incomplete;
Packit Service fdd496
                          }
Packit Service fdd496
                      }
Packit Service fdd496
                  }
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
Packit Service fdd496
          {
Packit Service fdd496
            if (m == 1)
Packit Service fdd496
              {
Packit Service fdd496
                unsigned char c = (unsigned char) p[0];
Packit Service fdd496
Packit Service fdd496
                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
Packit Service fdd496
                    || (c >= 0xf0 && c <= 0xf9))
Packit Service fdd496
                  goto incomplete;
Packit Service fdd496
              }
Packit Service fdd496
            goto invalid;
Packit Service fdd496
          }
Packit Service fdd496
Packit Service fdd496
        /* An unknown multibyte encoding.  */
Packit Service fdd496
        goto incomplete;
Packit Service fdd496
      }
Packit Service fdd496
Packit Service fdd496
     incomplete:
Packit Service fdd496
      {
Packit Service fdd496
        size_t k = nstate;
Packit Service fdd496
        /* Here 0 <= k < m < 4.  */
Packit Service fdd496
        pstate[++k] = s[0];
Packit Service fdd496
        if (k < m)
Packit Service fdd496
          {
Packit Service fdd496
            pstate[++k] = s[1];
Packit Service fdd496
            if (k < m)
Packit Service fdd496
              pstate[++k] = s[2];
Packit Service fdd496
          }
Packit Service fdd496
        if (k != m)
Packit Service fdd496
          abort ();
Packit Service fdd496
      }
Packit Service fdd496
      pstate[0] = m;
Packit Service fdd496
      return (size_t)(-2);
Packit Service fdd496
Packit Service fdd496
     invalid:
Packit Service fdd496
      errno = EILSEQ;
Packit Service fdd496
      /* The conversion state is undefined, says POSIX.  */
Packit Service fdd496
      return (size_t)(-1);
Packit Service fdd496
    }
Packit Service fdd496
  }
Packit Service fdd496
}
Packit Service fdd496
Packit Service fdd496
#else
Packit Service fdd496
/* Override the system's mbrtowc() function.  */
Packit Service fdd496
Packit Service fdd496
# undef mbrtowc
Packit Service fdd496
Packit Service fdd496
/* Replacement for the system's mbrtowc() that wraps the system function
   and compensates for the defects selected by these macros:

     MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG
       When any of them is set, a NULL S is rewritten to the equivalent
       call with PWC == NULL, S == "" and N == 1.
     MBRTOWC_EMPTY_INPUT_BUG
       N == 0 yields (size_t) -2 without calling the system function.
     MBRTOWC_RETVAL_BUG
       When the state is not initial, the rest of the character is parsed
       one byte at a time and the byte count is computed here.  A private
       static state stands in for PS == NULL.
     MBRTOWC_NUL_RETVAL_BUG
       A successful conversion that produced the null wide character
       yields 0.
     C_LOCALE_MAYBE_EILSEQ
       If the system function reports an invalid or incomplete sequence,
       N is nonzero and hard_locale (LC_CTYPE) is false, the first byte of
       S is stored as the wide character and 1 is returned.

   PWC, S, N and PS have the same meaning as for mbrtowc().  When PWC is
   NULL, a local variable receives the wide character instead.
   Returns the (possibly corrected) mbrtowc() result.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* From here on PWC is never NULL, so the code below can always read
     the converted character through it.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
Packit Service fdd496
Packit Service fdd496
#endif