Blame cbits/text-helper.c

/*
 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
 *
 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
 *
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 */
Packit de35d7
Packit de35d7
#include <string.h>
Packit de35d7
#include <stdint.h>
Packit de35d7
#include <stdio.h>
Packit de35d7
#include "text_cbits.h"
Packit de35d7
Packit de35d7
/*
 * Copy n 16-bit units from src to dest.
 *
 * doff, soff and n are all counted in 16-bit units; each is shifted
 * left by one to obtain the corresponding byte offset / byte count.
 * The copy is done with memcpy, so the two regions must not overlap.
 */
void _hs_streaming_commons_memcpy(void *dest, size_t doff, const void *src, size_t soff,
                     size_t n)
{
  /* Arithmetic on void * is a GNU extension; use byte pointers instead. */
  unsigned char *to = (unsigned char *) dest + (doff << 1);
  const unsigned char *from = (const unsigned char *) src + (soff << 1);

  memcpy(to, from, n << 1);
}
Packit de35d7
Packit de35d7
/*
 * Compare n 16-bit units of a (starting at unit offset aoff) with n
 * 16-bit units of b (starting at unit offset boff).
 *
 * Offsets and length are counted in 16-bit units and converted to bytes
 * by shifting left by one.  The result is whatever memcmp returns for
 * the two byte ranges (a bytewise comparison, zero when equal).
 */
int _hs_streaming_commons_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
                    size_t n)
{
  /* Arithmetic on void * is a GNU extension; use byte pointers instead. */
  const unsigned char *lhs = (const unsigned char *) a + (aoff << 1);
  const unsigned char *rhs = (const unsigned char *) b + (boff << 1);

  return memcmp(lhs, rhs, n << 1);
}
Packit de35d7
Packit de35d7
/*
 * Decoder states.  A state value is the offset of a 12-entry row in the
 * transition part of utf8d: row 0 is "accept", row 1 (offset 12) is
 * "reject".
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  /*
   * The first part of the table (256 entries) maps bytes to character
   * classes, to reduce the size of the transition table and create
   * bitmasks.
   */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /*
   * The second part is a transition table that maps a combination of
   * a state of the automaton and a character class to a state.
   */
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * Feed one byte to the UTF-8 automaton.
 *
 *   state  in/out: current automaton state (UTF8_ACCEPT to start a new
 *          code point); overwritten with the state after this byte.
 *   codep  in/out: code point accumulated so far; holds the complete
 *          code point once the returned state is UTF8_ACCEPT.
 *   byte   the next input byte.  It indexes the 256-entry class table
 *          directly, so it must be in the range 0..255.
 *
 * Returns the new state (the same value stored in *state).
 */
static inline uint32_t
decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  /*
   * Mid-sequence: append the low six payload bits of a continuation
   * byte.  At the start of a sequence: mask off the length marker bits
   * of the lead byte, using the character class as the shift count.
   */
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  /* Transition rows start right after the 256 class entries. */
  return *state = utf8d[256 + *state + type];
}
Packit de35d7
Packit de35d7
/*
 * The ISO 8859-1 (aka latin-1) code points correspond exactly to the
 * first 256 Unicode code points, therefore we can trivially convert from
 * a latin-1 encoded bytestring to a UTF-16 array: every input byte is
 * widened to one 16-bit unit.
 *
 *   dest    output array; receives (srcend - src) 16-bit units.  No
 *           bounds check is performed on it.
 *   src     first input byte.
 *   srcend  one past the last input byte (src == srcend means empty).
 */
void
_hs_streaming_commons_decode_latin1(uint16_t *dest, const uint8_t *src,
                       const uint8_t *srcend)
{
  const uint8_t *p = src;

#if defined(__i386__) || defined(__x86_64__)
  /* This optimization works on little-endian systems by using
     (aligned) 32-bit loads instead of 8-bit loads
   */

  /* consume unaligned prefix */
  while (p != srcend && ((uintptr_t)p & 0x3))
    *dest++ = *p++;

  /*
   * Iterate over 32-bit aligned loads.  The remaining length is compared
   * instead of computing srcend - 3, which would point before the buffer
   * for very short inputs.  memcpy performs the load without type-punning
   * through a uint32_t pointer; compilers turn it into a single load.
   */
  while ((size_t)(srcend - p) >= 4) {
    uint32_t w;

    memcpy(&w, p, sizeof w);

    *dest++ =  w        & 0xff;
    *dest++ = (w >> 8)  & 0xff;
    *dest++ = (w >> 16) & 0xff;
    *dest++ = (w >> 24) & 0xff;

    p += 4;
  }
#endif

  /* handle unaligned suffix */
  while (p != srcend)
    *dest++ = *p++;
}
Packit de35d7
Packit de35d7
/*
Packit de35d7
 * A best-effort decoder. Runs until it hits either end of input or
Packit de35d7
 * the start of an invalid byte sequence.
Packit de35d7
 *
Packit de35d7
 * At exit, we update *destoff with the next offset to write to, *src
Packit de35d7
 * with the next source location past the last one successfully
Packit de35d7
 * decoded, and return the next source location to read from.
Packit de35d7
 *
Packit de35d7
 * Moreover, we expose the internal decoder state (state0 and
Packit de35d7
 * codepoint0), allowing one to restart the decoder after it
Packit de35d7
 * terminates (say, due to a partial codepoint).
Packit de35d7
 *
Packit de35d7
 * In particular, there are a few possible outcomes,
Packit de35d7
 *
Packit de35d7
 *   1) We decoded the buffer entirely:
Packit de35d7
 *      In this case we return srcend
Packit de35d7
 *      state0 == UTF8_ACCEPT
Packit de35d7
 *
Packit de35d7
 *   2) We met an invalid encoding
Packit de35d7
 *      In this case we return the address of the first invalid byte
Packit de35d7
 *      state0 == UTF8_REJECT
Packit de35d7
 *
Packit de35d7
 *   3) We reached the end of the buffer while decoding a codepoint
Packit de35d7
 *      In this case we return a pointer to the first byte of the partial codepoint
Packit de35d7
 *      state0 != UTF8_ACCEPT, UTF8_REJECT
Packit de35d7
 *
Packit de35d7
 */
Packit de35d7
#if defined(__GNUC__) || defined(__clang__)
Packit de35d7
static inline uint8_t const *
Packit de35d7
_hs_streaming_commons_decode_utf8_int(uint16_t *const dest, size_t *destoff,
Packit de35d7
			 const uint8_t const **src, const uint8_t const *srcend,
Packit de35d7
			 uint32_t *codepoint0, uint32_t *state0)
Packit de35d7
  __attribute((always_inline));
Packit de35d7
#endif
Packit de35d7
Packit de35d7
static inline uint8_t const *
Packit de35d7
_hs_streaming_commons_decode_utf8_int(uint16_t *const dest, size_t *destoff,
Packit de35d7
			 const uint8_t const **src, const uint8_t const *srcend,
Packit de35d7
			 uint32_t *codepoint0, uint32_t *state0)
Packit de35d7
{
Packit de35d7
  uint16_t *d = dest + *destoff;
Packit de35d7
  const uint8_t *s = *src, *last = *src;
Packit de35d7
  uint32_t state = *state0;
Packit de35d7
  uint32_t codepoint = *codepoint0;
Packit de35d7
Packit de35d7
  while (s < srcend) {
Packit de35d7
#if defined(__i386__) || defined(__x86_64__)
Packit de35d7
    /*
Packit de35d7
     * This code will only work on a little-endian system that
Packit de35d7
     * supports unaligned loads.
Packit de35d7
     *
Packit de35d7
     * It gives a substantial speed win on data that is purely or
Packit de35d7
     * partly ASCII (e.g. HTML), at only a slight cost on purely
Packit de35d7
     * non-ASCII text.
Packit de35d7
     */
Packit de35d7
Packit de35d7
    if (state == UTF8_ACCEPT) {
Packit de35d7
      while (s < srcend - 4) {
Packit de35d7
	codepoint = *((uint32_t *) s);
Packit de35d7
	if ((codepoint & 0x80808080) != 0)
Packit de35d7
	  break;
Packit de35d7
	s += 4;
Packit de35d7
Packit de35d7
	/*
Packit de35d7
	 * Tried 32-bit stores here, but the extra bit-twiddling
Packit de35d7
	 * slowed the code down.
Packit de35d7
	 */
Packit de35d7
Packit de35d7
	*d++ = (uint16_t) (codepoint & 0xff);
Packit de35d7
	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
Packit de35d7
	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
Packit de35d7
	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
Packit de35d7
      }
Packit de35d7
      last = s;
Packit de35d7
    }
Packit de35d7
#endif
Packit de35d7
Packit de35d7
    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
Packit de35d7
      if (state != UTF8_REJECT)
Packit de35d7
	continue;
Packit de35d7
      break;
Packit de35d7
    }
Packit de35d7
Packit de35d7
    if (codepoint <= 0xffff)
Packit de35d7
      *d++ = (uint16_t) codepoint;
Packit de35d7
    else {
Packit de35d7
      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
Packit de35d7
      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
Packit de35d7
    }
Packit de35d7
    last = s;
Packit de35d7
  }
Packit de35d7
Packit de35d7
  *destoff = d - dest;
Packit de35d7
  *codepoint0 = codepoint;
Packit de35d7
  *state0 = state;
Packit de35d7
  *src = last;
Packit de35d7
Packit de35d7
  return s;
Packit de35d7
}
Packit de35d7
Packit de35d7
uint8_t const *
Packit de35d7
_hs_streaming_commons_decode_utf8_state(uint16_t *const dest, size_t *destoff,
Packit de35d7
                           const uint8_t const **src,
Packit de35d7
			   const uint8_t const *srcend,
Packit de35d7
                           uint32_t *codepoint0, uint32_t *state0)
Packit de35d7
{
Packit de35d7
  uint8_t const *ret = _hs_streaming_commons_decode_utf8_int(dest, destoff, src, srcend,
Packit de35d7
						codepoint0, state0);
Packit de35d7
  if (*state0 == UTF8_REJECT)
Packit de35d7
    ret -=1;
Packit de35d7
  return ret;
Packit de35d7
}
Packit de35d7
Packit de35d7
/*
Packit de35d7
 * Helper to decode buffer and discard final decoder state
Packit de35d7
 */
Packit de35d7
const uint8_t *
Packit de35d7
_hs_streaming_commons_decode_utf8(uint16_t *const dest, size_t *destoff,
Packit de35d7
                     const uint8_t *src, const uint8_t *const srcend)
Packit de35d7
{
Packit de35d7
  uint32_t codepoint;
Packit de35d7
  uint32_t state = UTF8_ACCEPT;
Packit de35d7
  uint8_t const *ret = _hs_streaming_commons_decode_utf8_int(dest, destoff, &src, srcend,
Packit de35d7
						&codepoint, &state);
Packit de35d7
  /* Back up if we have an incomplete or invalid encoding */
Packit de35d7
  if (state != UTF8_ACCEPT)
Packit de35d7
    ret -= 1;
Packit de35d7
  return ret;
Packit de35d7
}
Packit de35d7
Packit de35d7
/*
 * Encode srclen UTF-16 code units, starting at src[srcoff], as UTF-8.
 *
 *   destp   in: *destp is where the first output byte goes;
 *           out: *destp is one past the last byte written.
 *           The output is not bounds-checked: each unit produces at most
 *           three bytes, and a surrogate pair (two units) four bytes.
 *   src     base of the UTF-16 array.
 *   srcoff  index of the first unit to encode.
 *   srclen  number of units to encode.
 *
 * A unit in 0xD800..0xDBFF is combined with the unit that follows it as
 * a surrogate pair.  That following unit is read without checking that
 * it lies before the end of the input or that it is a low surrogate.
 * NOTE(review): this presumably relies on callers passing well-formed
 * UTF-16 -- confirm.
 */
void
_hs_streaming_commons_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
                     size_t srclen)
{
  const uint16_t *srcend;
  uint8_t *dest = *destp;

  src += srcoff;
  srcend = src + srclen;

 ascii:
#if defined(__x86_64__)
  /* Fast path: test four units at a time for being ASCII. */
  while (srcend - src >= 4) {
    uint64_t w;

    /* memcpy: possibly unaligned load without casting away const or
       type-punning through a uint64_t pointer. */
    memcpy(&w, src, sizeof w);

    if (w & 0xFF80FF80FF80FF80ULL) {
      /* Some unit is non-ASCII: emit the leading ASCII units (at most
         three), then leave the rest to the general loop. */
      if (!(w & 0x000000000000FF80ULL)) {
        *dest++ = w & 0xFFFF;
        src++;
        if (!(w & 0x00000000FF800000ULL)) {
          *dest++ = (w >> 16) & 0xFFFF;
          src++;
          if (!(w & 0x0000FF8000000000ULL)) {
            *dest++ = (w >> 32) & 0xFFFF;
            src++;
          }
        }
      }
      break;
    }
    *dest++ = w & 0xFFFF;
    *dest++ = (w >> 16) & 0xFFFF;
    *dest++ = (w >> 32) & 0xFFFF;
    *dest++ = w >> 48;
    src += 4;
  }
#endif

#if defined(__i386__)
  /* Fast path: test two units at a time for being ASCII. */
  while (srcend - src >= 2) {
    uint32_t w;

    memcpy(&w, src, sizeof w);

    if (w & 0xFF80FF80)
      break;
    *dest++ = w & 0xFFFF;
    *dest++ = w >> 16;
    src += 2;
  }
#endif

  while (src < srcend) {
    uint16_t w = *src++;

    if (w <= 0x7F) {
      *dest++ = w;
      /* An ASCII byte is likely to begin a run of ASCII bytes.
         Falling back into the fast path really helps performance. */
      goto ascii;
    }
    else if (w <= 0x7FF) {
      /* two-byte sequence */
      *dest++ = (w >> 6) | 0xC0;
      *dest++ = (w & 0x3f) | 0x80;
    }
    else if (w < 0xD800 || w > 0xDBFF) {
      /* three-byte sequence (everything that is not a high surrogate) */
      *dest++ = (w >> 12) | 0xE0;
      *dest++ = ((w >> 6) & 0x3F) | 0x80;
      *dest++ = (w & 0x3F) | 0x80;
    } else {
      /* high surrogate: combine with the next unit, four-byte sequence */
      uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
        (((uint32_t) *src++) - 0xDC00) + 0x10000;
      *dest++ = (c >> 18) | 0xF0;
      *dest++ = ((c >> 12) & 0x3F) | 0x80;
      *dest++ = ((c >> 6) & 0x3F) | 0x80;
      *dest++ = (c & 0x3F) | 0x80;
    }
  }

  *destp = dest;
}