/*
 * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
 *
 * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
 *
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 */

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "text_cbits.h"

/*
 * Copy n two-byte units from src to dest.
 *
 * doff, soff and n are all counted in two-byte units (presumably UTF-16
 * code units, matching the uint16_t buffers used elsewhere in this file);
 * each is doubled to obtain a byte offset / byte count.
 *
 * The copy is done with memcpy, so the two regions must not overlap.
 */
void _hs_streaming_commons_memcpy(void *dest, size_t doff, const void *src, size_t soff,
                                  size_t n)
{
  /* Arithmetic on void * is a GNU extension; step in bytes explicitly. */
  unsigned char *to = (unsigned char *) dest + (doff << 1);
  const unsigned char *from = (const unsigned char *) src + (soff << 1);

  memcpy(to, from, n << 1);
}

/*
 * Compare n two-byte units of a (starting at unit offset aoff) with n
 * two-byte units of b (starting at unit offset boff).
 *
 * aoff, boff and n are counted in two-byte units and doubled to obtain
 * byte offsets / a byte count.  The result is that of memcmp on the
 * underlying bytes: negative, zero or positive.  Note that this is a
 * byte-wise comparison, not a comparison of 16-bit values.
 */
int _hs_streaming_commons_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
                                 size_t n)
{
  /* Arithmetic on void * is a GNU extension; step in bytes explicitly. */
  const unsigned char *lhs = (const unsigned char *) a + (aoff << 1);
  const unsigned char *rhs = (const unsigned char *) b + (boff << 1);

  return memcmp(lhs, rhs, n << 1);
}

/*
 * DFA states with special meaning.  Every state is the offset of its row
 * in the transition table below, hence a multiple of 12.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  /*
   * The first part of the table maps bytes to character classes, to
   * reduce the size of the transition table and create bitmasks.
   */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  /*
   * The second part is a transition table that maps a combination of
   * a state of the automaton and a character class to a state.
   * Each row holds 12 entries, one per character class.
   */
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/*
 * Feed one input byte to the UTF-8 DFA.
 *
 *   state  in/out: current automaton state (UTF8_ACCEPT to begin a new
 *          code point); replaced by the successor state.
 *   codep  in/out: code point accumulated so far.
 *   byte   the next input byte; it indexes the first 256 table entries,
 *          so it must be in the range 0..255.
 *
 * Returns the new state.  *codep holds a complete code point only when
 * the returned state is UTF8_ACCEPT.
 */
static inline uint32_t
decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
  const uint32_t type = utf8d[byte];

  if (*state != UTF8_ACCEPT) {
    /* Continuation byte: append its low six payload bits. */
    *codep = (byte & 0x3fu) | (*codep << 6);
  } else {
    /* First byte of a sequence: the class doubles as the shift that
       strips the length-marker bits from a lead byte. */
    *codep = (0xff >> type) & (byte);
  }

  *state = utf8d[256 + *state + type];
  return *state;
}

/*
 * The ISO 8859-1 (aka latin-1) code points correspond exactly to the
 * first 256 Unicode code points, therefore we can trivially convert from
 * a latin-1 encoded bytestring to a UTF-16 array: every input byte is
 * zero-extended to one 16-bit unit.
 *
 *   dest    output buffer; one uint16_t is written per input byte.  No
 *           bounds check is done here, so the caller must provide room
 *           for (srcend - src) units.
 *   src     first input byte.
 *   srcend  one past the last input byte.
 */
void
_hs_streaming_commons_decode_latin1(uint16_t *dest, const uint8_t *src,
                                    const uint8_t *srcend)
{
  const uint8_t *p = src;

#if defined(__i386__) || defined(__x86_64__)
  /* This optimization works on little-endian systems by using
     (aligned) 32-bit loads instead of 8-bit loads
  */

  /* consume unaligned prefix */
  while (p != srcend && ((uintptr_t) p & 0x3) != 0) {
    *dest++ = *p++;
  }

  /* Iterate over 32-bit aligned loads.  The remaining length is tested
     as a difference so that no pointer before the start of a short
     buffer is ever formed. */
  while (srcend - p >= 4) {
    uint32_t w;

    /* memcpy instead of a pointer cast: no strict-aliasing violation,
       and compilers turn it into a single load. */
    memcpy(&w, p, sizeof w);

    *dest++ = (uint16_t) (w & 0xff);
    *dest++ = (uint16_t) ((w >> 8) & 0xff);
    *dest++ = (uint16_t) ((w >> 16) & 0xff);
    *dest++ = (uint16_t) ((w >> 24) & 0xff);

    p += 4;
  }
#endif

  /* handle unaligned suffix */
  while (p != srcend) {
    *dest++ = *p++;
  }
}

/*
|
|
Packit |
de35d7 |
* A best-effort decoder. Runs until it hits either end of input or
|
|
Packit |
de35d7 |
* the start of an invalid byte sequence.
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* At exit, we update *destoff with the next offset to write to, *src
|
|
Packit |
de35d7 |
* with the next source location past the last one successfully
|
|
Packit |
de35d7 |
* decoded, and return the next source location to read from.
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* Moreover, we expose the internal decoder state (state0 and
|
|
Packit |
de35d7 |
* codepoint0), allowing one to restart the decoder after it
|
|
Packit |
de35d7 |
* terminates (say, due to a partial codepoint).
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* In particular, there are a few possible outcomes,
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* 1) We decoded the buffer entirely:
|
|
Packit |
de35d7 |
* In this case we return srcend
|
|
Packit |
de35d7 |
* state0 == UTF8_ACCEPT
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* 2) We met an invalid encoding
|
|
Packit |
de35d7 |
* In this case we return the address of the first invalid byte
|
|
Packit |
de35d7 |
* state0 == UTF8_REJECT
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* 3) We reached the end of the buffer while decoding a codepoint
|
|
Packit |
de35d7 |
* In this case we return a pointer to the first byte of the partial codepoint
|
|
Packit |
de35d7 |
* state0 != UTF8_ACCEPT, UTF8_REJECT
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
*/
|
|
Packit |
de35d7 |
#if defined(__GNUC__) || defined(__clang__)
|
|
Packit |
de35d7 |
static inline uint8_t const *
|
|
Packit |
de35d7 |
_hs_streaming_commons_decode_utf8_int(uint16_t *const dest, size_t *destoff,
|
|
Packit |
de35d7 |
const uint8_t const **src, const uint8_t const *srcend,
|
|
Packit |
de35d7 |
uint32_t *codepoint0, uint32_t *state0)
|
|
Packit |
de35d7 |
__attribute((always_inline));
|
|
Packit |
de35d7 |
#endif
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
static inline uint8_t const *
|
|
Packit |
de35d7 |
_hs_streaming_commons_decode_utf8_int(uint16_t *const dest, size_t *destoff,
|
|
Packit |
de35d7 |
const uint8_t const **src, const uint8_t const *srcend,
|
|
Packit |
de35d7 |
uint32_t *codepoint0, uint32_t *state0)
|
|
Packit |
de35d7 |
{
|
|
Packit |
de35d7 |
uint16_t *d = dest + *destoff;
|
|
Packit |
de35d7 |
const uint8_t *s = *src, *last = *src;
|
|
Packit |
de35d7 |
uint32_t state = *state0;
|
|
Packit |
de35d7 |
uint32_t codepoint = *codepoint0;
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
while (s < srcend) {
|
|
Packit |
de35d7 |
#if defined(__i386__) || defined(__x86_64__)
|
|
Packit |
de35d7 |
/*
|
|
Packit |
de35d7 |
* This code will only work on a little-endian system that
|
|
Packit |
de35d7 |
* supports unaligned loads.
|
|
Packit |
de35d7 |
*
|
|
Packit |
de35d7 |
* It gives a substantial speed win on data that is purely or
|
|
Packit |
de35d7 |
* partly ASCII (e.g. HTML), at only a slight cost on purely
|
|
Packit |
de35d7 |
* non-ASCII text.
|
|
Packit |
de35d7 |
*/
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
if (state == UTF8_ACCEPT) {
|
|
Packit |
de35d7 |
while (s < srcend - 4) {
|
|
Packit |
de35d7 |
codepoint = *((uint32_t *) s);
|
|
Packit |
de35d7 |
if ((codepoint & 0x80808080) != 0)
|
|
Packit |
de35d7 |
break;
|
|
Packit |
de35d7 |
s += 4;
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
/*
|
|
Packit |
de35d7 |
* Tried 32-bit stores here, but the extra bit-twiddling
|
|
Packit |
de35d7 |
* slowed the code down.
|
|
Packit |
de35d7 |
*/
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
*d++ = (uint16_t) (codepoint & 0xff);
|
|
Packit |
de35d7 |
*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
|
|
Packit |
de35d7 |
*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
|
|
Packit |
de35d7 |
*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
last = s;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
#endif
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
|
|
Packit |
de35d7 |
if (state != UTF8_REJECT)
|
|
Packit |
de35d7 |
continue;
|
|
Packit |
de35d7 |
break;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
if (codepoint <= 0xffff)
|
|
Packit |
de35d7 |
*d++ = (uint16_t) codepoint;
|
|
Packit |
de35d7 |
else {
|
|
Packit |
de35d7 |
*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
|
|
Packit |
de35d7 |
*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
last = s;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
*destoff = d - dest;
|
|
Packit |
de35d7 |
*codepoint0 = codepoint;
|
|
Packit |
de35d7 |
*state0 = state;
|
|
Packit |
de35d7 |
*src = last;
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
return s;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
uint8_t const *
|
|
Packit |
de35d7 |
_hs_streaming_commons_decode_utf8_state(uint16_t *const dest, size_t *destoff,
|
|
Packit |
de35d7 |
const uint8_t const **src,
|
|
Packit |
de35d7 |
const uint8_t const *srcend,
|
|
Packit |
de35d7 |
uint32_t *codepoint0, uint32_t *state0)
|
|
Packit |
de35d7 |
{
|
|
Packit |
de35d7 |
uint8_t const *ret = _hs_streaming_commons_decode_utf8_int(dest, destoff, src, srcend,
|
|
Packit |
de35d7 |
codepoint0, state0);
|
|
Packit |
de35d7 |
if (*state0 == UTF8_REJECT)
|
|
Packit |
de35d7 |
ret -=1;
|
|
Packit |
de35d7 |
return ret;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
/*
|
|
Packit |
de35d7 |
* Helper to decode buffer and discard final decoder state
|
|
Packit |
de35d7 |
*/
|
|
Packit |
de35d7 |
const uint8_t *
|
|
Packit |
de35d7 |
_hs_streaming_commons_decode_utf8(uint16_t *const dest, size_t *destoff,
|
|
Packit |
de35d7 |
const uint8_t *src, const uint8_t *const srcend)
|
|
Packit |
de35d7 |
{
|
|
Packit |
de35d7 |
uint32_t codepoint;
|
|
Packit |
de35d7 |
uint32_t state = UTF8_ACCEPT;
|
|
Packit |
de35d7 |
uint8_t const *ret = _hs_streaming_commons_decode_utf8_int(dest, destoff, &src, srcend,
|
|
Packit |
de35d7 |
&codepoint, &state);
|
|
Packit |
de35d7 |
/* Back up if we have an incomplete or invalid encoding */
|
|
Packit |
de35d7 |
if (state != UTF8_ACCEPT)
|
|
Packit |
de35d7 |
ret -= 1;
|
|
Packit |
de35d7 |
return ret;
|
|
Packit |
de35d7 |
}
|
|
Packit |
de35d7 |
|
|
Packit |
de35d7 |
/*
 * Encode srclen UTF-16 code units, starting at src[srcoff], as UTF-8.
 *
 *   destp   in/out: *destp is the first byte to write; on return it
 *           points one past the last byte written.  No bounds check is
 *           done here: a unit produces at most three bytes and a
 *           surrogate pair (two units) produces four, so the caller must
 *           provide up to 3 * srclen bytes.
 *   src     UTF-16 input, expected to be well formed.
 *   srcoff  index of the first unit to encode.
 *   srclen  number of units to encode.
 *
 * A unit in 0xD800..0xDBFF is combined with the unit that follows it
 * without checking that the latter is a low surrogate.  If such a unit is
 * the very last one of the input, U+FFFD is emitted for it instead of
 * reading past the end of the buffer.
 */
void
_hs_streaming_commons_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
                                  size_t srclen)
{
  const uint16_t *srcend;
  uint8_t *dest = *destp;

  src += srcoff;
  srcend = src + srclen;

 ascii:
#if defined(__x86_64__)
  /* Fast path: test four units at once for being plain ASCII. */
  while (srcend - src >= 4) {
    uint64_t w;

    /* src is only 2-byte aligned; memcpy gives a well-defined load. */
    memcpy(&w, src, sizeof w);

    if (w & 0xFF80FF80FF80FF80ULL) {
      /* Some unit is not ASCII: flush the leading ASCII units (little
         endian, so the lowest 16 bits are the first unit) and leave the
         rest to the general loop. */
      if (!(w & 0x000000000000FF80ULL)) {
        *dest++ = (uint8_t) (w & 0xFFFF);
        src++;
        if (!(w & 0x00000000FF800000ULL)) {
          *dest++ = (uint8_t) ((w >> 16) & 0xFFFF);
          src++;
          if (!(w & 0x0000FF8000000000ULL)) {
            *dest++ = (uint8_t) ((w >> 32) & 0xFFFF);
            src++;
          }
        }
      }
      break;
    }
    *dest++ = (uint8_t) (w & 0xFFFF);
    *dest++ = (uint8_t) ((w >> 16) & 0xFFFF);
    *dest++ = (uint8_t) ((w >> 32) & 0xFFFF);
    *dest++ = (uint8_t) (w >> 48);
    src += 4;
  }
#endif

#if defined(__i386__)
  /* Fast path: test two units at once for being plain ASCII. */
  while (srcend - src >= 2) {
    uint32_t w;

    memcpy(&w, src, sizeof w);

    if (w & 0xFF80FF80)
      break;
    *dest++ = (uint8_t) (w & 0xFFFF);
    *dest++ = (uint8_t) (w >> 16);
    src += 2;
  }
#endif

  while (src < srcend) {
    uint16_t w = *src++;

    if (w <= 0x7F) {
      *dest++ = (uint8_t) w;
      /* An ASCII byte is likely to begin a run of ASCII bytes.
         Falling back into the fast path really helps performance. */
      goto ascii;
    }
    else if (w <= 0x7FF) {
      *dest++ = (uint8_t) ((w >> 6) | 0xC0);
      *dest++ = (uint8_t) ((w & 0x3f) | 0x80);
    }
    else if (w < 0xD800 || w > 0xDBFF) {
      *dest++ = (uint8_t) ((w >> 12) | 0xE0);
      *dest++ = (uint8_t) (((w >> 6) & 0x3F) | 0x80);
      *dest++ = (uint8_t) ((w & 0x3F) | 0x80);
    } else {
      uint32_t c;

      if (src == srcend) {
        /* High surrogate with nothing after it: emit U+FFFD rather than
           reading the second half from beyond the input. */
        *dest++ = 0xEF;
        *dest++ = 0xBF;
        *dest++ = 0xBD;
        break;
      }

      c = ((((uint32_t) w) - 0xD800) << 10) +
          (((uint32_t) *src++) - 0xDC00) + 0x10000;
      *dest++ = (uint8_t) ((c >> 18) | 0xF0);
      *dest++ = (uint8_t) (((c >> 12) & 0x3F) | 0x80);
      *dest++ = (uint8_t) (((c >> 6) & 0x3F) | 0x80);
      *dest++ = (uint8_t) ((c & 0x3F) | 0x80);
    }
  }

  *destp = dest;
}