dhodovsk / source-git / ghc-aeson

Forked from source-git/ghc-aeson 4 years ago
Clone

Blame cbits/unescape_string.c

// Copyright (c) 2008-2009 Bjoern Hoehrmann
// Copyright (c) 2015, Ondrej Palkovsky
// Copyright (c) 2016, Winterland
Packit 9a2dfb
#include <string.h>
Packit 9a2dfb
#include <stdio.h>
Packit 9a2dfb
#include <stdint.h>
Packit 9a2dfb
Packit 9a2dfb
Packit 9a2dfb
// States of the UTF-8 decoding automaton. A state value is the offset of
// that state's row inside the transition part of utf8d, hence a multiple
// of 12 (one entry per character class).
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const uint8_t utf8d[] = {
  // The first part of the table (256 entries) maps each byte value to a
  // character class (0..11). Grouping bytes into classes reduces the size
  // of the transition table; the class of a lead byte also selects the
  // bitmask used to extract its payload bits.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination of a
  // state of the automaton and a character class to the next state:
  // next = utf8d[256 + state + class]. Two states (2 x 12 entries) per line.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};
Packit 9a2dfb
Packit 9a2dfb
static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
Packit 9a2dfb
  uint32_t type = utf8d[byte];
Packit 9a2dfb
Packit 9a2dfb
  *codep = (*state != UTF8_ACCEPT) ?
Packit 9a2dfb
    (byte & 0x3fu) | (*codep << 6) :
Packit 9a2dfb
    (0xff >> type) & (byte);
Packit 9a2dfb
Packit 9a2dfb
  *state = utf8d[256 + *state + type];
Packit 9a2dfb
  return *state;
Packit 9a2dfb
}
Packit 9a2dfb
Packit 9a2dfb
// Convert one ASCII hexadecimal digit to its numeric value.
//
// c: the character to convert; '0'-'9', 'a'-'f' and 'A'-'F' are accepted.
//
// Returns 0..15 for a valid digit, or the sentinel 0xFFFF for any other
// input. Callers compare against the sentinel to reject malformed \u
// escapes, so the sentinel is a regular result for invalid input.
static inline uint16_t decode_hex(uint32_t c)
{
  if (c >= '0' && c <= '9') {
    return (uint16_t)(c - '0');
  }
  if (c >= 'a' && c <= 'f') {
    return (uint16_t)(c - 'a' + 10);
  }
  if (c >= 'A' && c <= 'F') {
    return (uint16_t)(c - 'A' + 10);
  }
  return 0xFFFF; // not a hexadecimal digit
}
Packit 9a2dfb
Packit 9a2dfb
// Decode, return non-zero value on error
Packit 9a2dfb
int _js_decode_string(uint16_t *const dest, size_t *destoff,
Packit 9a2dfb
                  const uint8_t *s, const uint8_t *const srcend)
Packit 9a2dfb
{
Packit 9a2dfb
  uint16_t *d = dest + *destoff;
Packit 9a2dfb
  uint32_t state = 0;
Packit 9a2dfb
  uint32_t codepoint;
Packit 9a2dfb
Packit 9a2dfb
  uint8_t surrogate = 0;
Packit 9a2dfb
  uint16_t temp_hex = 0;
Packit 9a2dfb
  uint16_t unidata;
Packit 9a2dfb
Packit 9a2dfb
  // Optimized version of dispatch when just an ASCII char is expected
Packit 9a2dfb
  #define DISPATCH_ASCII(label) {\
Packit 9a2dfb
    if (s >= srcend) {\
Packit 9a2dfb
      return -1;\
Packit 9a2dfb
    }\
Packit 9a2dfb
    codepoint = *s++;\
Packit 9a2dfb
    goto label;\
Packit 9a2dfb
  }
Packit 9a2dfb
Packit 9a2dfb
  standard:
Packit 9a2dfb
    // Test end of stream
Packit 9a2dfb
    while (s < srcend) {
Packit 9a2dfb
        if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
Packit 9a2dfb
          if (state == UTF8_REJECT) { return -1; }
Packit 9a2dfb
          continue;
Packit 9a2dfb
        }
Packit 9a2dfb
Packit 9a2dfb
        if (codepoint == '\\')
Packit 9a2dfb
          DISPATCH_ASCII(backslash)
Packit 9a2dfb
        else if (codepoint <= 0xffff)
Packit 9a2dfb
          *d++ = (uint16_t) codepoint;
Packit 9a2dfb
        else {
Packit 9a2dfb
          *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
Packit 9a2dfb
          *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
Packit 9a2dfb
        }
Packit 9a2dfb
    }
Packit 9a2dfb
    *destoff = d - dest;
Packit 9a2dfb
    // Exit point
Packit 9a2dfb
    return (state != UTF8_ACCEPT);
Packit 9a2dfb
  backslash:
Packit 9a2dfb
    switch (codepoint) {
Packit 9a2dfb
      case '"':
Packit 9a2dfb
      case '\\':
Packit 9a2dfb
      case '/':
Packit 9a2dfb
        *d++ = (uint16_t) codepoint;
Packit 9a2dfb
        goto standard;
Packit 9a2dfb
        break;
Packit 9a2dfb
      case 'b': *d++ = '\b';goto standard;
Packit 9a2dfb
      case 'f': *d++ = '\f';goto standard;
Packit 9a2dfb
      case 'n': *d++ = '\n';goto standard;
Packit 9a2dfb
      case 'r': *d++ = '\r';goto standard;
Packit 9a2dfb
      case 't': *d++ = '\t';goto standard;
Packit 9a2dfb
      case 'u': DISPATCH_ASCII(unicode1);;break;
Packit 9a2dfb
      default:
Packit 9a2dfb
        return -1;
Packit 9a2dfb
    }
Packit 9a2dfb
  unicode1:
Packit 9a2dfb
    temp_hex = decode_hex(codepoint);
Packit 9a2dfb
    if (temp_hex == 0xFFFF) { return -1; }
Packit 9a2dfb
    else unidata = temp_hex << 12;
Packit 9a2dfb
    DISPATCH_ASCII(unicode2);
Packit 9a2dfb
  unicode2:
Packit 9a2dfb
    temp_hex = decode_hex(codepoint);
Packit 9a2dfb
    if (temp_hex == 0xFFFF) { return -1; }
Packit 9a2dfb
    else unidata |= temp_hex << 8;
Packit 9a2dfb
    DISPATCH_ASCII(unicode3);
Packit 9a2dfb
  unicode3:
Packit 9a2dfb
    temp_hex = decode_hex(codepoint);
Packit 9a2dfb
    if (temp_hex == 0xFFFF) { return -1; }
Packit 9a2dfb
    else unidata |= temp_hex << 4;
Packit 9a2dfb
    DISPATCH_ASCII(unicode4);
Packit 9a2dfb
  unicode4:
Packit 9a2dfb
    temp_hex = decode_hex(codepoint);
Packit 9a2dfb
    if (temp_hex == 0xFFFF) { return -1; }
Packit 9a2dfb
    else unidata |= temp_hex;
Packit 9a2dfb
    *d++ = (uint16_t) unidata;
Packit 9a2dfb
Packit 9a2dfb
    if (surrogate) {
Packit 9a2dfb
      if (unidata < 0xDC00 || unidata > 0xDFFF) // is not low surrogate
Packit 9a2dfb
        return -1;
Packit 9a2dfb
      surrogate = 0;
Packit 9a2dfb
    } else if (unidata >= 0xD800 && unidata <= 0xDBFF ) { // is high surrogate
Packit 9a2dfb
        surrogate = 1;
Packit 9a2dfb
        DISPATCH_ASCII(surrogate1);
Packit 9a2dfb
    } else if (unidata >= 0xDC00 && unidata <= 0xDFFF) { // is low surrogate
Packit 9a2dfb
        return -1;
Packit 9a2dfb
    }
Packit 9a2dfb
    goto standard;
Packit 9a2dfb
  surrogate1:
Packit 9a2dfb
    if (codepoint != '\\') { return -1; }
Packit 9a2dfb
    DISPATCH_ASCII(surrogate2)
Packit 9a2dfb
  surrogate2:
Packit 9a2dfb
    if (codepoint != 'u') { return -1; }
Packit 9a2dfb
    DISPATCH_ASCII(unicode1)
Packit 9a2dfb
}