Blame lib/UTF8CodingSystem.cxx

Packit 8a864e
// Copyright (c) 1994 James Clark
Packit 8a864e
// See the file COPYING for copying permission.
Packit 8a864e
Packit 8a864e
#include "splib.h"
Packit 8a864e
Packit 8a864e
#ifdef SP_MULTI_BYTE
Packit 8a864e
Packit 8a864e
#include "UTF8CodingSystem.h"
Packit 8a864e
#include "constant.h"
Packit 8a864e
Packit 8a864e
#ifdef SP_NAMESPACE
Packit 8a864e
namespace SP_NAMESPACE {
Packit 8a864e
#endif
Packit 8a864e
Packit 8a864e
// Bit patterns for the UTF-8 encoding in its original form, which allows
// sequences of one to six bytes.  For an N byte sequence:
//   cmaskN  mask applied to the first byte to test for an N byte sequence
//   cvalN   value of the masked first byte of an N byte sequence
//   vmaskN  mask that extracts the value bits from the first byte
//   minN    minimum legal resulting value for an N byte sequence
enum {
  // one byte: 0xxxxxxx
  cmask1 = 0x80, cval1 = 0x00,
  // two bytes: 110xxxxx + 1 continuation byte
  cmask2 = 0xE0, cval2 = 0xC0, vmask2 = 0x1F, min2 = 0x80,
  // three bytes: 1110xxxx + 2 continuation bytes
  cmask3 = 0xF0, cval3 = 0xE0, vmask3 = 0x0F, min3 = 0x800,
  // four bytes: 11110xxx + 3 continuation bytes
  cmask4 = 0xF8, cval4 = 0xF0, vmask4 = 0x07, min4 = 0x10000,
  // five bytes: 111110xx + 4 continuation bytes
  cmask5 = 0xFC, cval5 = 0xF8, vmask5 = 0x03, min5 = 0x200000,
  // six bytes: 1111110x + 5 continuation bytes
  cmask6 = 0xFE, cval6 = 0xFC, vmask6 = 0x01, min6 = 0x4000000,
  // largest value a six byte sequence can represent
  max6 = 0x7FFFFFFF
};
Packit 8a864e
Packit 8a864e
class UTF8Decoder : public Decoder {
Packit 8a864e
public:
Packit 8a864e
  UTF8Decoder();
Packit 8a864e
  size_t decode(Char *, const char *, size_t, const char **);
Packit 8a864e
  Boolean convertOffset(unsigned long &offset) const;
Packit 8a864e
private:
Packit 8a864e
  // value for encoding error
Packit 8a864e
  enum { invalid = 0xfffd };
Packit 8a864e
  Boolean recovering_;
Packit 8a864e
  PackedBoolean hadFirstChar_;
Packit 8a864e
  PackedBoolean hadByteOrderMark_;
Packit 8a864e
};
Packit 8a864e
Packit 8a864e
class UTF8Encoder : public Encoder {
Packit 8a864e
public:
Packit 8a864e
  UTF8Encoder();
Packit 8a864e
  void output(const Char *, size_t, OutputByteStream *);
Packit 8a864e
};
Packit 8a864e
Packit 8a864e
// Returns a newly allocated UTF-8 decoder.
Decoder *UTF8CodingSystem::makeDecoder() const
{
  Decoder *decoder = new UTF8Decoder;
  return decoder;
}
Packit 8a864e
Packit 8a864e
// Returns a newly allocated UTF-8 encoder.
Encoder *UTF8CodingSystem::makeEncoder() const
{
  Encoder *encoder = new UTF8Encoder;
  return encoder;
}
Packit 8a864e
Packit 8a864e
Packit 8a864e
// A fresh decoder is not recovering from an error and has not yet
// examined the start of the input for a byte-order mark.
UTF8Decoder::UTF8Decoder()
  : recovering_(0),
    hadFirstChar_(0),
    hadByteOrderMark_(0)
{
}
Packit 8a864e
Packit 8a864e
size_t UTF8Decoder::decode(Char *to, const char *s,
Packit 8a864e
			  size_t slen, const char **result)
Packit 8a864e
{
Packit 8a864e
  // Check for byte-order mark
Packit 8a864e
  if (!hadFirstChar_ && slen >= 3) {
Packit 8a864e
    hadFirstChar_ = 1;
Packit 8a864e
Packit 8a864e
    if ((unsigned char)s[0] == 0xEF &&
Packit 8a864e
        (unsigned char)s[1] == 0xBB &&
Packit 8a864e
        (unsigned char)s[2] == 0xBF) {
Packit 8a864e
      s += 3;
Packit 8a864e
      slen -= 3;
Packit 8a864e
      hadByteOrderMark_ = 1;
Packit 8a864e
    }
Packit 8a864e
  }
Packit 8a864e
  Char *start = to;
Packit 8a864e
  const unsigned char *us = (const unsigned char *)s;
Packit 8a864e
  if (recovering_) {
Packit 8a864e
    recovering_ = 0;
Packit 8a864e
    goto recover;
Packit 8a864e
  }
Packit 8a864e
  while (slen > 0) {
Packit 8a864e
    unsigned c0;
Packit 8a864e
    c0 = us[0];
Packit 8a864e
    if ((c0 & cmask1) == cval1) {
Packit 8a864e
      *to++ = c0;
Packit 8a864e
      us++;
Packit 8a864e
      slen--;
Packit 8a864e
    }
Packit 8a864e
    else if ((c0 & cmask2) == cval2) {
Packit 8a864e
      if (slen < 2)
Packit 8a864e
	goto done;
Packit 8a864e
      unsigned c1 = us[1] ^ 0x80;
Packit 8a864e
      if (c1 & 0xc0)
Packit 8a864e
	goto error;
Packit 8a864e
      unsigned c = ((c0 & vmask2) << 6) | c1;
Packit 8a864e
      if (c < min2)
Packit 8a864e
	c = invalid;
Packit 8a864e
      *to++ = c;
Packit 8a864e
      slen -= 2;
Packit 8a864e
      us += 2;
Packit 8a864e
    }
Packit 8a864e
    else if ((c0 & cmask3) == cval3) {
Packit 8a864e
      if (slen < 3)
Packit 8a864e
	goto done;
Packit 8a864e
      unsigned c1 = us[1] ^ 0x80;
Packit 8a864e
      unsigned c2 = us[2] ^ 0x80;
Packit 8a864e
      if ((c1 | c2) & 0xc0)
Packit 8a864e
	goto error;
Packit 8a864e
      unsigned c = ((((c0 & vmask3) << 6) | c1) << 6) | c2;
Packit 8a864e
      if (c < min3)
Packit 8a864e
	c = invalid;
Packit 8a864e
      *to++ = c;
Packit 8a864e
      slen -= 3;
Packit 8a864e
      us += 3;
Packit 8a864e
    }
Packit 8a864e
    else if ((c0 & cmask4) == cval4) {
Packit 8a864e
      if (slen < 4)
Packit 8a864e
	goto done;
Packit 8a864e
      unsigned c1 = us[1] ^ 0x80;
Packit 8a864e
      unsigned c2 = us[2] ^ 0x80;
Packit 8a864e
      unsigned c3 = us[3] ^ 0x80;
Packit 8a864e
      if ((c1 | c2 | c3) & 0xc0)
Packit 8a864e
	goto error;
Packit 8a864e
      if (charMax < min5 - 1)
Packit 8a864e
	*to++ = invalid;
Packit 8a864e
      else {
Packit 8a864e
	unsigned long c = ((((c0 & vmask4) << 6) | c1) << 6) | c2;
Packit 8a864e
	c = (c << 6) | c3;
Packit 8a864e
	if (c < min4)
Packit 8a864e
	  c = invalid;
Packit 8a864e
	*to++ = c;
Packit 8a864e
      }
Packit 8a864e
      slen -= 4;
Packit 8a864e
      us += 4;
Packit 8a864e
    }
Packit 8a864e
    else if ((c0 & cmask5) == cval5) {
Packit 8a864e
      if (slen < 5)
Packit 8a864e
	goto done;
Packit 8a864e
      unsigned c1 = us[1] ^ 0x80;
Packit 8a864e
      unsigned c2 = us[2] ^ 0x80;
Packit 8a864e
      unsigned c3 = us[3] ^ 0x80;
Packit 8a864e
      unsigned c4 = us[4] ^ 0x80;
Packit 8a864e
      if ((c1 | c2 | c3 | c4) & 0xc0)
Packit 8a864e
	goto error;
Packit 8a864e
      if (charMax < min6 - 1)
Packit 8a864e
	*to++ = invalid;
Packit 8a864e
      else {
Packit 8a864e
	unsigned long c = ((((c0 & vmask5) << 6) | c1) << 6) | c2;
Packit 8a864e
	c = (((c << 6) | c3) << 6) | c4;
Packit 8a864e
	if (c < min5)
Packit 8a864e
	  c = invalid;
Packit 8a864e
	*to++ = c;
Packit 8a864e
      }
Packit 8a864e
      slen -= 5;
Packit 8a864e
      us += 5;
Packit 8a864e
    }
Packit 8a864e
    else if ((c0 & cmask6) == cval6) {
Packit 8a864e
      if (slen < 6)
Packit 8a864e
	goto done;
Packit 8a864e
      unsigned c1 = us[1] ^ 0x80;
Packit 8a864e
      unsigned c2 = us[2] ^ 0x80;
Packit 8a864e
      unsigned c3 = us[3] ^ 0x80;
Packit 8a864e
      unsigned c4 = us[4] ^ 0x80;
Packit 8a864e
      unsigned c5 = us[5] ^ 0x80;
Packit 8a864e
      if ((c1 | c2 | c3 | c4 | c5) & 0xc0)
Packit 8a864e
	goto error;
Packit 8a864e
      if (charMax < max6)
Packit 8a864e
	*to++ = invalid;
Packit 8a864e
      else {
Packit 8a864e
	unsigned long c = ((((c0 & vmask6) << 6) | c1) << 6) | c2;
Packit 8a864e
	c = (((((c << 6) | c3) << 6) | c4) << 6) | c5;
Packit 8a864e
	if (c < min6)
Packit 8a864e
	  c = invalid;
Packit 8a864e
	*to++ = c;
Packit 8a864e
      }
Packit 8a864e
      slen -= 6;
Packit 8a864e
      us += 6;
Packit 8a864e
    }
Packit 8a864e
    else {
Packit 8a864e
    error:
Packit 8a864e
      us++;
Packit 8a864e
      slen--;
Packit 8a864e
      *to++ = invalid;
Packit 8a864e
    recover:
Packit 8a864e
      for (;;) {
Packit 8a864e
	if (slen == 0) {
Packit 8a864e
	  recovering_ = 1;
Packit 8a864e
	  goto done;
Packit 8a864e
	}
Packit 8a864e
	if ((*us & 0xc0) != 0x80)
Packit 8a864e
	  break;
Packit 8a864e
	us++;
Packit 8a864e
	slen--;
Packit 8a864e
      }
Packit 8a864e
    }
Packit 8a864e
  }
Packit 8a864e
 done:
Packit 8a864e
  *result = (char *)us;
Packit 8a864e
  return to - start;
Packit 8a864e
}
Packit 8a864e
Packit 8a864e
// Adjusts `n` to account for a byte-order mark that decode() skipped:
// three is added when one was seen.  Always returns true.
Boolean UTF8Decoder::convertOffset(unsigned long &n) const
{
  const unsigned long bomLength = 3;
  if (hadByteOrderMark_) {
    n += bomLength;
  }
  return true;
}
Packit 8a864e
Packit 8a864e
// The encoder has no members of its own to initialize.
UTF8Encoder::UTF8Encoder() {}
Packit 8a864e
Packit 8a864e
void UTF8Encoder::output(const Char *s, size_t n, OutputByteStream *sb)
Packit 8a864e
{
Packit 8a864e
  for (; n > 0; s++, n--) {
Packit 8a864e
    Char c = *s;
Packit 8a864e
    if (c < min2)
Packit 8a864e
      sb->sputc((unsigned char)c);
Packit 8a864e
    else if (c < min3) {
Packit 8a864e
      sb->sputc((c >> 6) | cval2);
Packit 8a864e
      sb->sputc((c & 0x3f) | 0x80);
Packit 8a864e
    }
Packit 8a864e
    else if (c < min4) {
Packit 8a864e
      sb->sputc((c >> 12) | cval3);
Packit 8a864e
      sb->sputc(((c >> 6) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc((c & 0x3f) | 0x80);
Packit 8a864e
    }
Packit 8a864e
    else if (c < min5) {
Packit 8a864e
      sb->sputc((c >> 18) | cval4);
Packit 8a864e
      sb->sputc(((c >> 12) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 6) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc((c & 0x3f) | 0x80);
Packit 8a864e
    }
Packit 8a864e
    else if (c < min6) {
Packit 8a864e
      sb->sputc((c >> 24) | cval5);
Packit 8a864e
      sb->sputc(((c >> 18) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 12) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 6) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc((c & 0x3f) | 0x80);
Packit 8a864e
    }
Packit 8a864e
    else if (c <= max6) {
Packit 8a864e
      sb->sputc((c >> 30) | cval6);
Packit 8a864e
      sb->sputc(((c >> 24) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 18) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 12) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc(((c >> 6) & 0x3f) | 0x80);
Packit 8a864e
      sb->sputc((c & 0x3f) | 0x80);
Packit 8a864e
    }
Packit 8a864e
  }
Packit 8a864e
}
Packit 8a864e
#ifdef SP_NAMESPACE
Packit 8a864e
}
Packit 8a864e
#endif
Packit 8a864e
Packit 8a864e
#else /* not SP_MULTI_BYTE */
Packit 8a864e
Packit 8a864e
#ifndef __GNUG__
Packit 8a864e
// Placeholder declaration so that this translation unit is not empty when
// SP_MULTI_BYTE is not defined (only emitted when __GNUG__ is not defined).
static char non_empty_translation_unit;	// sigh
Packit 8a864e
#endif
Packit 8a864e
Packit 8a864e
#endif /* not SP_MULTI_BYTE */