Blame lib/mbchar.h

Packit 33f14e
/* Multibyte character data type.
Packit 33f14e
   Copyright (C) 2001, 2005-2007, 2009-2017 Free Software Foundation, Inc.
Packit 33f14e
Packit 33f14e
   This program is free software: you can redistribute it and/or modify
Packit 33f14e
   it under the terms of the GNU General Public License as published by
Packit 33f14e
   the Free Software Foundation; either version 3 of the License, or
Packit 33f14e
   (at your option) any later version.
Packit 33f14e
Packit 33f14e
   This program is distributed in the hope that it will be useful,
Packit 33f14e
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit 33f14e
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit 33f14e
   GNU General Public License for more details.
Packit 33f14e
Packit 33f14e
   You should have received a copy of the GNU General Public License
Packit 33f14e
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
Packit 33f14e
Packit 33f14e
/* Written by Bruno Haible <bruno@clisp.org>.  */
Packit 33f14e
Packit 33f14e
/* A multibyte character is a short subsequence of a char* string,
Packit 33f14e
   representing a single wide character.
Packit 33f14e
Packit 33f14e
   We use multibyte characters instead of wide characters because of
Packit 33f14e
   the following goals:
Packit 33f14e
   1) correct multibyte handling, i.e. operate according to the LC_CTYPE
Packit 33f14e
      locale,
Packit 33f14e
   2) ease of maintenance, i.e. the maintainer need not know all details
Packit 33f14e
      of the ISO C 99 standard,
Packit 33f14e
   3) don't fail grossly if the input is not in the encoding set by the
Packit 33f14e
      locale, because often different encodings are in use in the same
Packit 33f14e
      countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
Packit 33f14e
   4) fast in the case of ASCII characters,
Packit 33f14e
   5) portability, i.e. don't make unportable assumptions about wchar_t.
Packit 33f14e
Packit 33f14e
   Multibyte characters are only accessed through the mb* macros.
Packit 33f14e
Packit 33f14e
   mb_ptr (mbc)
Packit 33f14e
     returns a pointer to the beginning of the multibyte sequence.
Packit 33f14e
Packit 33f14e
   mb_len (mbc)
Packit 33f14e
     returns the number of bytes occupied by the multibyte sequence.
Packit 33f14e
     Always > 0.
Packit 33f14e
Packit 33f14e
   mb_iseq (mbc, sc)
Packit 33f14e
     returns true if mbc is the standard ASCII character sc.
Packit 33f14e
Packit 33f14e
   mb_isnul (mbc)
Packit 33f14e
     returns true if mbc is the nul character.
Packit 33f14e
Packit 33f14e
   mb_cmp (mbc1, mbc2)
Packit 33f14e
     returns a positive, zero, or negative value depending on whether mbc1
Packit 33f14e
     sorts after, same or before mbc2.
Packit 33f14e
Packit 33f14e
   mb_casecmp (mbc1, mbc2)
Packit 33f14e
     returns a positive, zero, or negative value depending on whether mbc1
Packit 33f14e
     sorts after, same or before mbc2, modulo upper/lowercase conversion.
Packit 33f14e
Packit 33f14e
   mb_equal (mbc1, mbc2)
Packit 33f14e
     returns true if mbc1 and mbc2 are equal.
Packit 33f14e
Packit 33f14e
   mb_caseequal (mbc1, mbc2)
Packit 33f14e
     returns true if mbc1 and mbc2 are equal modulo upper/lowercase conversion.
Packit 33f14e
Packit 33f14e
   mb_isalnum (mbc)
Packit 33f14e
     returns true if mbc is alphanumeric.
Packit 33f14e
Packit 33f14e
   mb_isalpha (mbc)
Packit 33f14e
     returns true if mbc is alphabetic.
Packit 33f14e
Packit 33f14e
   mb_isascii (mbc)
Packit 33f14e
     returns true if mbc is plain ASCII.
Packit 33f14e
Packit 33f14e
   mb_isblank (mbc)
Packit 33f14e
     returns true if mbc is a blank.
Packit 33f14e
Packit 33f14e
   mb_iscntrl (mbc)
Packit 33f14e
     returns true if mbc is a control character.
Packit 33f14e
Packit 33f14e
   mb_isdigit (mbc)
Packit 33f14e
     returns true if mbc is a decimal digit.
Packit 33f14e
Packit 33f14e
   mb_isgraph (mbc)
Packit 33f14e
     returns true if mbc is a graphic character.
Packit 33f14e
Packit 33f14e
   mb_islower (mbc)
Packit 33f14e
     returns true if mbc is lowercase.
Packit 33f14e
Packit 33f14e
   mb_isprint (mbc)
Packit 33f14e
     returns true if mbc is a printable character.
Packit 33f14e
Packit 33f14e
   mb_ispunct (mbc)
Packit 33f14e
     returns true if mbc is a punctuation character.
Packit 33f14e
Packit 33f14e
   mb_isspace (mbc)
Packit 33f14e
     returns true if mbc is a space character.
Packit 33f14e
Packit 33f14e
   mb_isupper (mbc)
Packit 33f14e
     returns true if mbc is uppercase.
Packit 33f14e
Packit 33f14e
   mb_isxdigit (mbc)
Packit 33f14e
     returns true if mbc is a hexadecimal digit.
Packit 33f14e
Packit 33f14e
   mb_width (mbc)
Packit 33f14e
     returns the number of columns on the output device occupied by mbc.
Packit 33f14e
     Always >= 0.
Packit 33f14e
Packit 33f14e
   mb_putc (mbc, stream)
Packit 33f14e
     outputs mbc on stream, a byte oriented FILE stream opened for output.
Packit 33f14e
Packit 33f14e
   mb_setascii (&mbc, sc)
Packit 33f14e
     assigns the standard ASCII character sc to mbc.
Packit 33f14e
Packit 33f14e
   mb_copy (&destmbc, &srcmbc)
Packit 33f14e
     copies srcmbc to destmbc.
Packit 33f14e
Packit 33f14e
   Here are the function prototypes of the macros.
Packit 33f14e
Packit 33f14e
   extern const char *  mb_ptr (const mbchar_t mbc);
Packit 33f14e
   extern size_t        mb_len (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_iseq (const mbchar_t mbc, char sc);
Packit 33f14e
   extern bool          mb_isnul (const mbchar_t mbc);
Packit 33f14e
   extern int           mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2);
Packit 33f14e
   extern int           mb_casecmp (const mbchar_t mbc1, const mbchar_t mbc2);
Packit 33f14e
   extern bool          mb_equal (const mbchar_t mbc1, const mbchar_t mbc2);
Packit 33f14e
   extern bool          mb_caseequal (const mbchar_t mbc1, const mbchar_t mbc2);
Packit 33f14e
   extern bool          mb_isalnum (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isalpha (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isascii (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isblank (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_iscntrl (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isdigit (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isgraph (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_islower (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isprint (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_ispunct (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isspace (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isupper (const mbchar_t mbc);
Packit 33f14e
   extern bool          mb_isxdigit (const mbchar_t mbc);
Packit 33f14e
   extern int           mb_width (const mbchar_t mbc);
Packit 33f14e
   extern void          mb_putc (const mbchar_t mbc, FILE *stream);
Packit 33f14e
   extern void          mb_setascii (mbchar_t *new, char sc);
Packit 33f14e
   extern void          mb_copy (mbchar_t *new, const mbchar_t *old);
Packit 33f14e
 */
Packit 33f14e
Packit 33f14e
#ifndef _MBCHAR_H
Packit 33f14e
#define _MBCHAR_H 1
Packit 33f14e
Packit 33f14e
#include <stdbool.h>
Packit 33f14e
#include <string.h>
Packit 33f14e
Packit 33f14e
/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
Packit 33f14e
   <wchar.h>.
Packit 33f14e
   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
Packit 33f14e
   <wchar.h>.  */
Packit 33f14e
#include <stdio.h>
Packit 33f14e
#include <time.h>
Packit 33f14e
#include <wchar.h>
Packit 33f14e
#include <wctype.h>
Packit 33f14e
Packit 33f14e
#ifndef _GL_INLINE_HEADER_BEGIN
Packit 33f14e
 #error "Please include config.h first."
Packit 33f14e
#endif
Packit 33f14e
_GL_INLINE_HEADER_BEGIN
Packit 33f14e
/* Linkage specifier used for the inline functions defined in this header.
   It defaults to _GL_INLINE unless MBCHAR_INLINE has already been defined
   before this point.  */
#ifndef MBCHAR_INLINE
# define MBCHAR_INLINE _GL_INLINE
#endif
Packit 33f14e
Packit 33f14e
/* Size in bytes of the buffer embedded in every struct mbchar.  */
#define MBCHAR_BUF_SIZE 24

/* One multibyte character: a byte sequence, plus the corresponding wide
   character when wc_valid is set.  */
struct mbchar
{
  const char *ptr;      /* pointer to current character */
  size_t bytes;         /* number of bytes of current character, > 0 */
  bool wc_valid;        /* true if wc is a valid wide character */
  wchar_t wc;           /* if wc_valid: the current character */
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
};

/* EOF (not a real character) is represented with bytes = 0 and
   wc_valid = false.  */

typedef struct mbchar mbchar_t;
Packit 33f14e
Packit 33f14e
/* Access the current character.
   mb_ptr (mbc) yields the pointer stored in MBC (its first byte);
   mb_len (mbc) yields the number of bytes stored in MBC.  */
#define mb_ptr(mbc) ((mbc).ptr)
#define mb_len(mbc) ((mbc).bytes)
Packit 33f14e
Packit 33f14e
/* Comparison of characters.  */

/* mb_iseq (mbc, sc): true if MBC has a valid wide character equal to SC.
   mb_isnul (mbc): true if MBC has a valid wide character equal to 0.
   Both are false for a character whose wc_valid flag is not set.  */
#define mb_iseq(mbc, sc) ((mbc).wc_valid && (mbc).wc == (sc))
#define mb_isnul(mbc) ((mbc).wc_valid && (mbc).wc == 0)
Packit 33f14e
/* mb_cmp (mbc1, mbc2): three-way comparison of two characters.
   - Both valid: the difference of their wide character values.
   - Exactly one valid: the valid one sorts first (-1 if MBC1 is the valid
     one, 1 otherwise).
   - Neither valid: the raw bytes are compared with memcmp; when the lengths
     differ and the common prefix is equal, the shorter sequence sorts first.
   Note that each argument is evaluated more than once.  */
#define mb_cmp(mbc1, mbc2) \
  ((mbc1).wc_valid                                                      \
   ? ((mbc2).wc_valid                                                   \
      ? (int) (mbc1).wc - (int) (mbc2).wc                               \
      : -1)                                                             \
   : ((mbc2).wc_valid                                                   \
      ? 1                                                               \
      : (mbc1).bytes == (mbc2).bytes                                    \
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)                 \
        : (mbc1).bytes < (mbc2).bytes                                   \
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
Packit 33f14e
/* mb_casecmp (mbc1, mbc2): like a three-way comparison of the two
   characters, except that two valid wide characters are compared after
   mapping both through towlower.
   - Exactly one valid: the valid one sorts first (-1 if MBC1 is the valid
     one, 1 otherwise).
   - Neither valid: the raw bytes are compared with memcmp (no case
     folding); when the lengths differ and the common prefix is equal, the
     shorter sequence sorts first.
   Note that each argument is evaluated more than once.  */
#define mb_casecmp(mbc1, mbc2) \
  ((mbc1).wc_valid                                                      \
   ? ((mbc2).wc_valid                                                   \
      ? (int) towlower ((mbc1).wc) - (int) towlower ((mbc2).wc)         \
      : -1)                                                             \
   : ((mbc2).wc_valid                                                   \
      ? 1                                                               \
      : (mbc1).bytes == (mbc2).bytes                                    \
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)                 \
        : (mbc1).bytes < (mbc2).bytes                                   \
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
Packit 33f14e
/* mb_equal (mbc1, mbc2): true if the two characters are equal.
   When both have a valid wide character, the wide characters are compared;
   in every other case the characters are equal only if they have the same
   byte length and identical bytes.
   Note that each argument is evaluated more than once.  */
#define mb_equal(mbc1, mbc2) \
  ((mbc1).wc_valid && (mbc2).wc_valid                                   \
   ? (mbc1).wc == (mbc2).wc                                             \
   : (mbc1).bytes == (mbc2).bytes                                       \
     && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
Packit 33f14e
/* mb_caseequal (mbc1, mbc2): true if the two characters are equal modulo
   upper/lowercase conversion.
   When both have a valid wide character, the results of towlower on each
   are compared; in every other case the characters are equal only if they
   have the same byte length and identical bytes (no case folding).
   Note that each argument is evaluated more than once.  */
#define mb_caseequal(mbc1, mbc2) \
  ((mbc1).wc_valid && (mbc2).wc_valid                                   \
   ? towlower ((mbc1).wc) == towlower ((mbc2).wc)                       \
   : (mbc1).bytes == (mbc2).bytes                                       \
     && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
Packit 33f14e
Packit 33f14e
/* <ctype.h>, <wctype.h> classification.  */
Packit 33f14e
/* Each predicate below is false for a character whose wc_valid flag is not
   set.  mb_isascii tests the wide character against the range 0..127; the
   others forward the wide character to the <wctype.h> function of the
   corresponding name (mb_isalnum -> iswalnum, and so on).  */
#define mb_isascii(mbc) \
  ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc))
#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc))
#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc))
#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc))
#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc))
#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc))
#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc))
#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc))
#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc))
#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc))
#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc))
#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc))
Packit 33f14e
Packit 33f14e
/* Extra <wchar.h> function.  */
Packit 33f14e
Packit 33f14e
/* Unprintable characters appear as a small box of width 1.  */
Packit 33f14e
#define MB_UNPRINTABLE_WIDTH 1
Packit 33f14e
Packit 33f14e
MBCHAR_INLINE int
Packit 33f14e
mb_width_aux (wint_t wc)
Packit 33f14e
{
Packit 33f14e
  int w = wcwidth (wc);
Packit 33f14e
  /* For unprintable characters, arbitrarily return 0 for control characters
Packit 33f14e
     and MB_UNPRINTABLE_WIDTH otherwise.  */
Packit 33f14e
  return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#define mb_width(mbc) \
Packit 33f14e
  ((mbc).wc_valid ? mb_width_aux ((mbc).wc) : MB_UNPRINTABLE_WIDTH)
Packit 33f14e
Packit 33f14e
/* Output.  */

/* mb_putc (mbc, stream): write the bytes of MBC to STREAM with fwrite.
   The expansion evaluates to fwrite's return value.  */
#define mb_putc(mbc, stream)  fwrite ((mbc).ptr, 1, (mbc).bytes, (stream))
Packit 33f14e
Packit 33f14e
/* Assignment.  */

/* mb_setascii (mbc, sc): make *MBC hold the single character SC.
   MBC is a pointer.  SC is stored in the first byte of MBC's own buffer,
   ptr is pointed at that buffer, the length is set to 1, and the wide
   character is marked valid and set to the same value.
   Note that MBC is evaluated more than once.  */
#define mb_setascii(mbc, sc) \
  ((mbc)->ptr = (mbc)->buf, (mbc)->bytes = 1, (mbc)->wc_valid = 1, \
   (mbc)->wc = (mbc)->buf[0] = (sc))
Packit 33f14e
Packit 33f14e
/* Copying a character.  */
Packit 33f14e
MBCHAR_INLINE void
Packit 33f14e
mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
Packit 33f14e
{
Packit 33f14e
  if (old_mbc->ptr == &old_mbc->buf[0])
Packit 33f14e
    {
Packit 33f14e
      memcpy (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
Packit 33f14e
      new_mbc->ptr = &new_mbc->buf[0];
Packit 33f14e
    }
Packit 33f14e
  else
Packit 33f14e
    new_mbc->ptr = old_mbc->ptr;
Packit 33f14e
  new_mbc->bytes = old_mbc->bytes;
Packit 33f14e
  if ((new_mbc->wc_valid = old_mbc->wc_valid))
Packit 33f14e
    new_mbc->wc = old_mbc->wc;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
Packit 33f14e
/* is_basic(c) tests whether the single-byte character c is in the
Packit 33f14e
   ISO C "basic character set".
Packit 33f14e
   This is a convenience function, and is in this file only to share code
Packit 33f14e
   between mbiter_multi.h and mbfile_multi.h.  */
Packit 33f14e
#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
Packit 33f14e
    && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
Packit 33f14e
    && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
Packit 33f14e
    && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
Packit 33f14e
    && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
Packit 33f14e
    && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
Packit 33f14e
    && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
Packit 33f14e
    && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
Packit 33f14e
    && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
Packit 33f14e
    && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
Packit 33f14e
    && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
Packit 33f14e
    && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
Packit 33f14e
    && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
Packit 33f14e
    && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
Packit 33f14e
    && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
Packit 33f14e
    && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
Packit 33f14e
    && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
Packit 33f14e
    && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
Packit 33f14e
    && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
Packit 33f14e
    && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
Packit 33f14e
    && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
Packit 33f14e
    && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
Packit 33f14e
    && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)
Packit 33f14e
/* The character set is ISO-646, not EBCDIC. */
Packit 33f14e
# define IS_BASIC_ASCII 1
Packit 33f14e
Packit 33f14e
extern const unsigned int is_basic_table[];
Packit 33f14e
Packit 33f14e
MBCHAR_INLINE bool
Packit 33f14e
is_basic (char c)
Packit 33f14e
{
Packit 33f14e
  return (is_basic_table [(unsigned char) c >> 5] >> ((unsigned char) c & 31))
Packit 33f14e
         & 1;
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#else
Packit 33f14e
Packit 33f14e
MBCHAR_INLINE bool
Packit 33f14e
is_basic (char c)
Packit 33f14e
{
Packit 33f14e
  switch (c)
Packit 33f14e
    {
Packit 33f14e
    case '\t': case '\v': case '\f':
Packit 33f14e
    case ' ': case '!': case '"': case '#': case '%':
Packit 33f14e
    case '&': case '\'': case '(': case ')': case '*':
Packit 33f14e
    case '+': case ',': case '-': case '.': case '/':
Packit 33f14e
    case '0': case '1': case '2': case '3': case '4':
Packit 33f14e
    case '5': case '6': case '7': case '8': case '9':
Packit 33f14e
    case ':': case ';': case '<': case '=': case '>':
Packit 33f14e
    case '?':
Packit 33f14e
    case 'A': case 'B': case 'C': case 'D': case 'E':
Packit 33f14e
    case 'F': case 'G': case 'H': case 'I': case 'J':
Packit 33f14e
    case 'K': case 'L': case 'M': case 'N': case 'O':
Packit 33f14e
    case 'P': case 'Q': case 'R': case 'S': case 'T':
Packit 33f14e
    case 'U': case 'V': case 'W': case 'X': case 'Y':
Packit 33f14e
    case 'Z':
Packit 33f14e
    case '[': case '\\': case ']': case '^': case '_':
Packit 33f14e
    case 'a': case 'b': case 'c': case 'd': case 'e':
Packit 33f14e
    case 'f': case 'g': case 'h': case 'i': case 'j':
Packit 33f14e
    case 'k': case 'l': case 'm': case 'n': case 'o':
Packit 33f14e
    case 'p': case 'q': case 'r': case 's': case 't':
Packit 33f14e
    case 'u': case 'v': case 'w': case 'x': case 'y':
Packit 33f14e
    case 'z': case '{': case '|': case '}': case '~':
Packit 33f14e
      return 1;
Packit 33f14e
    default:
Packit 33f14e
      return 0;
Packit 33f14e
    }
Packit 33f14e
}
Packit 33f14e
Packit 33f14e
#endif
Packit 33f14e
Packit 33f14e
_GL_INLINE_HEADER_END
Packit 33f14e
Packit 33f14e
#endif /* _MBCHAR_H */