// Copyright (c) 2008-2009 Bjoern Hoehrmann
// Copyright (c) 2015, Ondrej Palkovsky
// Copyright (c) 2016, Winterland

#include <string.h>
#include <stdio.h>
#include <stdint.h>

// States of the UTF-8 decoding automaton that callers compare against.
// A state value is also the offset of that state's row inside the
// transition part of utf8d[] (see decode()).
#define UTF8_ACCEPT 0   // a complete code point has just been decoded
#define UTF8_REJECT 12  // the bytes seen so far are not valid UTF-8

// Lookup table driving the UTF-8 decoding automaton used by decode().
//
// Layout:
//   utf8d[0 .. 255]   character class (0..11) of each possible input byte
//   utf8d[256 .. 363] transition table: 9 states x 12 character classes;
//                     the next state is utf8d[256 + state + class], where
//                     state is one of 0, 12, 24, ..., 96 (already a row offset)
static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and is used to create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x00..0x1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x20..0x3f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x40..0x5f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x60..0x7f
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 0x80..0x9f
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // 0xa0..0xbf
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xc0..0xdf
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // 0xe0..0xff

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // Each line below holds two rows of 12 entries (two consecutive states).
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // states  0, 12
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // states 24, 36
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // states 48, 60
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // states 72, 84
  12,36,12,12,12,12,12,12,12,12,12,12,                                      // state  96
};

|
Packit |
9a2dfb |
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
|
Packit |
9a2dfb |
uint32_t type = utf8d[byte];
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
*codep = (*state != UTF8_ACCEPT) ?
|
|
Packit |
9a2dfb |
(byte & 0x3fu) | (*codep << 6) :
|
|
Packit |
9a2dfb |
(0xff >> type) & (byte);
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
*state = utf8d[256 + *state + type];
|
|
Packit |
9a2dfb |
return *state;
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
|
// Convert one hexadecimal digit character to its numeric value.
//
// c - character code to convert ('0'-'9', 'a'-'f' or 'A'-'F').
//
// Returns 0..15 for a valid hex digit, or 0xFFFF for any other input;
// callers use 0xFFFF as the "invalid digit" sentinel.
static inline uint16_t decode_hex(uint32_t c)
{
  if (c >= '0' && c <= '9') return (uint16_t) (c - '0');
  else if (c >= 'a' && c <= 'f') return (uint16_t) (c - 'a' + 10);
  else if (c >= 'A' && c <= 'F') return (uint16_t) (c - 'A' + 10);
  return 0xFFFF; // not a hexadecimal digit
}

|
Packit |
9a2dfb |
// Decode, return non-zero value on error
|
|
Packit |
9a2dfb |
int _js_decode_string(uint16_t *const dest, size_t *destoff,
|
|
Packit |
9a2dfb |
const uint8_t *s, const uint8_t *const srcend)
|
|
Packit |
9a2dfb |
{
|
|
Packit |
9a2dfb |
uint16_t *d = dest + *destoff;
|
|
Packit |
9a2dfb |
uint32_t state = 0;
|
|
Packit |
9a2dfb |
uint32_t codepoint;
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
uint8_t surrogate = 0;
|
|
Packit |
9a2dfb |
uint16_t temp_hex = 0;
|
|
Packit |
9a2dfb |
uint16_t unidata;
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
// Optimized version of dispatch when just an ASCII char is expected
|
|
Packit |
9a2dfb |
#define DISPATCH_ASCII(label) {\
|
|
Packit |
9a2dfb |
if (s >= srcend) {\
|
|
Packit |
9a2dfb |
return -1;\
|
|
Packit |
9a2dfb |
}\
|
|
Packit |
9a2dfb |
codepoint = *s++;\
|
|
Packit |
9a2dfb |
goto label;\
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
standard:
|
|
Packit |
9a2dfb |
// Test end of stream
|
|
Packit |
9a2dfb |
while (s < srcend) {
|
|
Packit |
9a2dfb |
if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
|
|
Packit |
9a2dfb |
if (state == UTF8_REJECT) { return -1; }
|
|
Packit |
9a2dfb |
continue;
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
if (codepoint == '\\')
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(backslash)
|
|
Packit |
9a2dfb |
else if (codepoint <= 0xffff)
|
|
Packit |
9a2dfb |
*d++ = (uint16_t) codepoint;
|
|
Packit |
9a2dfb |
else {
|
|
Packit |
9a2dfb |
*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
|
|
Packit |
9a2dfb |
*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
*destoff = d - dest;
|
|
Packit |
9a2dfb |
// Exit point
|
|
Packit |
9a2dfb |
return (state != UTF8_ACCEPT);
|
|
Packit |
9a2dfb |
backslash:
|
|
Packit |
9a2dfb |
switch (codepoint) {
|
|
Packit |
9a2dfb |
case '"':
|
|
Packit |
9a2dfb |
case '\\':
|
|
Packit |
9a2dfb |
case '/':
|
|
Packit |
9a2dfb |
*d++ = (uint16_t) codepoint;
|
|
Packit |
9a2dfb |
goto standard;
|
|
Packit |
9a2dfb |
break;
|
|
Packit |
9a2dfb |
case 'b': *d++ = '\b';goto standard;
|
|
Packit |
9a2dfb |
case 'f': *d++ = '\f';goto standard;
|
|
Packit |
9a2dfb |
case 'n': *d++ = '\n';goto standard;
|
|
Packit |
9a2dfb |
case 'r': *d++ = '\r';goto standard;
|
|
Packit |
9a2dfb |
case 't': *d++ = '\t';goto standard;
|
|
Packit |
9a2dfb |
case 'u': DISPATCH_ASCII(unicode1);;break;
|
|
Packit |
9a2dfb |
default:
|
|
Packit |
9a2dfb |
return -1;
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
unicode1:
|
|
Packit |
9a2dfb |
temp_hex = decode_hex(codepoint);
|
|
Packit |
9a2dfb |
if (temp_hex == 0xFFFF) { return -1; }
|
|
Packit |
9a2dfb |
else unidata = temp_hex << 12;
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(unicode2);
|
|
Packit |
9a2dfb |
unicode2:
|
|
Packit |
9a2dfb |
temp_hex = decode_hex(codepoint);
|
|
Packit |
9a2dfb |
if (temp_hex == 0xFFFF) { return -1; }
|
|
Packit |
9a2dfb |
else unidata |= temp_hex << 8;
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(unicode3);
|
|
Packit |
9a2dfb |
unicode3:
|
|
Packit |
9a2dfb |
temp_hex = decode_hex(codepoint);
|
|
Packit |
9a2dfb |
if (temp_hex == 0xFFFF) { return -1; }
|
|
Packit |
9a2dfb |
else unidata |= temp_hex << 4;
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(unicode4);
|
|
Packit |
9a2dfb |
unicode4:
|
|
Packit |
9a2dfb |
temp_hex = decode_hex(codepoint);
|
|
Packit |
9a2dfb |
if (temp_hex == 0xFFFF) { return -1; }
|
|
Packit |
9a2dfb |
else unidata |= temp_hex;
|
|
Packit |
9a2dfb |
*d++ = (uint16_t) unidata;
|
|
Packit |
9a2dfb |
|
|
Packit |
9a2dfb |
if (surrogate) {
|
|
Packit |
9a2dfb |
if (unidata < 0xDC00 || unidata > 0xDFFF) // is not low surrogate
|
|
Packit |
9a2dfb |
return -1;
|
|
Packit |
9a2dfb |
surrogate = 0;
|
|
Packit |
9a2dfb |
} else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate
|
|
Packit |
9a2dfb |
surrogate = 1;
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(surrogate1);
|
|
Packit |
9a2dfb |
} else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate
|
|
Packit |
9a2dfb |
return -1;
|
|
Packit |
9a2dfb |
}
|
|
Packit |
9a2dfb |
goto standard;
|
|
Packit |
9a2dfb |
surrogate1:
|
|
Packit |
9a2dfb |
if (codepoint != '\\') { return -1; }
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(surrogate2)
|
|
Packit |
9a2dfb |
surrogate2:
|
|
Packit |
9a2dfb |
if (codepoint != 'u') { return -1; }
|
|
Packit |
9a2dfb |
DISPATCH_ASCII(unicode1)
|
|
Packit |
9a2dfb |
}
|