Blame cbits/utf8.c

Packit 1f51f5
#include <stdlib.h>
Packit 1f51f5
#include <stdint.h>
Packit 1f51f5
#include <assert.h>
Packit 1f51f5
Packit 1f51f5
#include "cmark_ctype.h"
Packit 1f51f5
#include "utf8.h"
Packit 1f51f5
Packit 1f51f5
// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
// the total number of bytes that sequence is expected to occupy, or 0 when
// the byte cannot start a sequence (continuation bytes and 0xF8..0xFF).
static const int8_t utf8proc_utf8class[256] = {
    // 0x00..0x7F: single-byte (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x80..0xBF: continuation bytes, never a valid lead
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0xC0..0xDF: two-byte lead
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0..0xEF: three-byte lead
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0..0xF7: four-byte lead; 0xF8..0xFF: invalid
    4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
Packit 1f51f5
Packit 1f51f5
// Append the three bytes EF BF BD (the UTF-8 form of U+FFFD, the
// replacement character) to `buf`.
static void encode_unknown(cmark_strbuf *buf) {
  static const uint8_t replacement_utf8[] = {0xEF, 0xBF, 0xBD};

  cmark_strbuf_put(buf, replacement_utf8, 3);
}
Packit 1f51f5
Packit 1f51f5
// Measure the UTF-8 sequence that starts at str[0].
//
// Returns:
//   0         when str_len is zero;
//   -1        when str[0] cannot start a sequence;
//   -str_len  when str_len is non-negative and the sequence would run past
//             the end of the buffer (a negative str_len skips this check);
//   -i        when the byte at index i is not a continuation byte;
//   otherwise the sequence length (1..4).
//
// Only the lead byte class and the continuation-byte shape are checked here;
// overlong forms and out-of-range values are not rejected by this function.
static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len) {
  int expected;
  int pos;

  if (str_len == 0)
    return 0;

  expected = utf8proc_utf8class[str[0]];
  if (expected == 0)
    return -1;

  if (str_len >= 0 && str_len < (bufsize_t)expected)
    return -str_len;

  pos = 1;
  while (pos < expected) {
    if ((str[pos] & 0xC0) != 0x80)
      return -pos;
    pos++;
  }

  return expected;
}
Packit 1f51f5
Packit 1f51f5
// Validate a single UTF-8 character according to RFC 3629.
Packit 1f51f5
static int utf8proc_valid(const uint8_t *str, bufsize_t str_len) {
Packit 1f51f5
  int length = utf8proc_utf8class[str[0]];
Packit 1f51f5
Packit 1f51f5
  if (!length)
Packit 1f51f5
    return -1;
Packit 1f51f5
Packit 1f51f5
  if ((bufsize_t)length > str_len)
Packit 1f51f5
    return -str_len;
Packit 1f51f5
Packit 1f51f5
  switch (length) {
Packit 1f51f5
  case 2:
Packit 1f51f5
    if ((str[1] & 0xC0) != 0x80)
Packit 1f51f5
      return -1;
Packit 1f51f5
    if (str[0] < 0xC2) {
Packit 1f51f5
      // Overlong
Packit 1f51f5
      return -length;
Packit 1f51f5
    }
Packit 1f51f5
    break;
Packit 1f51f5
Packit 1f51f5
  case 3:
Packit 1f51f5
    if ((str[1] & 0xC0) != 0x80)
Packit 1f51f5
      return -1;
Packit 1f51f5
    if ((str[2] & 0xC0) != 0x80)
Packit 1f51f5
      return -2;
Packit 1f51f5
    if (str[0] == 0xE0) {
Packit 1f51f5
      if (str[1] < 0xA0) {
Packit 1f51f5
        // Overlong
Packit 1f51f5
        return -length;
Packit 1f51f5
      }
Packit 1f51f5
    } else if (str[0] == 0xED) {
Packit 1f51f5
      if (str[1] >= 0xA0) {
Packit 1f51f5
        // Surrogate
Packit 1f51f5
        return -length;
Packit 1f51f5
      }
Packit 1f51f5
    }
Packit 1f51f5
    break;
Packit 1f51f5
Packit 1f51f5
  case 4:
Packit 1f51f5
    if ((str[1] & 0xC0) != 0x80)
Packit 1f51f5
      return -1;
Packit 1f51f5
    if ((str[2] & 0xC0) != 0x80)
Packit 1f51f5
      return -2;
Packit 1f51f5
    if ((str[3] & 0xC0) != 0x80)
Packit 1f51f5
      return -3;
Packit 1f51f5
    if (str[0] == 0xF0) {
Packit 1f51f5
      if (str[1] < 0x90) {
Packit 1f51f5
        // Overlong
Packit 1f51f5
        return -length;
Packit 1f51f5
      }
Packit 1f51f5
    } else if (str[0] >= 0xF4) {
Packit 1f51f5
      if (str[0] > 0xF4 || str[1] >= 0x90) {
Packit 1f51f5
        // Above 0x10FFFF
Packit 1f51f5
        return -length;
Packit 1f51f5
      }
Packit 1f51f5
    }
Packit 1f51f5
    break;
Packit 1f51f5
  }
Packit 1f51f5
Packit 1f51f5
  return length;
Packit 1f51f5
}
Packit 1f51f5
Packit 1f51f5
void cmark_utf8proc_check(cmark_strbuf *ob, const uint8_t *line,
Packit 1f51f5
                          bufsize_t size) {
Packit 1f51f5
  bufsize_t i = 0;
Packit 1f51f5
Packit 1f51f5
  while (i < size) {
Packit 1f51f5
    bufsize_t org = i;
Packit 1f51f5
    int charlen = 0;
Packit 1f51f5
Packit 1f51f5
    while (i < size) {
Packit 1f51f5
      if (line[i] < 0x80 && line[i] != 0) {
Packit 1f51f5
        i++;
Packit 1f51f5
      } else if (line[i] >= 0x80) {
Packit 1f51f5
        charlen = utf8proc_valid(line + i, size - i);
Packit 1f51f5
        if (charlen < 0) {
Packit 1f51f5
          charlen = -charlen;
Packit 1f51f5
          break;
Packit 1f51f5
        }
Packit 1f51f5
        i += charlen;
Packit 1f51f5
      } else if (line[i] == 0) {
Packit 1f51f5
        // ASCII NUL is technically valid but rejected
Packit 1f51f5
        // for security reasons.
Packit 1f51f5
        charlen = 1;
Packit 1f51f5
        break;
Packit 1f51f5
      }
Packit 1f51f5
    }
Packit 1f51f5
Packit 1f51f5
    if (i > org) {
Packit 1f51f5
      cmark_strbuf_put(ob, line + org, i - org);
Packit 1f51f5
    }
Packit 1f51f5
Packit 1f51f5
    if (i >= size) {
Packit 1f51f5
      break;
Packit 1f51f5
    } else {
Packit 1f51f5
      // Invalid UTF-8
Packit 1f51f5
      encode_unknown(ob);
Packit 1f51f5
      i += charlen;
Packit 1f51f5
    }
Packit 1f51f5
  }
Packit 1f51f5
}
Packit 1f51f5
Packit 1f51f5
// Decode the UTF-8 sequence at the start of `str` into a code point.
//
// On success the code point is stored in *dst and the number of bytes
// consumed (1..4) is returned. On any failure *dst is set to -1 and -1 is
// returned. Failure covers: an empty buffer, a malformed or truncated
// sequence as reported by utf8proc_charlen(), an overlong encoding, a
// surrogate (0xD800..0xDFFF), and a value of 0x110000 or above.
int cmark_utf8proc_iterate(const uint8_t *str, bufsize_t str_len,
                           int32_t *dst) {
  int32_t cp = -1;
  int nbytes;

  *dst = -1;

  nbytes = utf8proc_charlen(str, str_len);
  if (nbytes < 0)
    return -1;

  if (nbytes == 1) {
    cp = str[0];
  } else if (nbytes == 2) {
    cp = ((str[0] & 0x1F) << 6) | (str[1] & 0x3F);
    if (cp < 0x80)
      cp = -1; // overlong
  } else if (nbytes == 3) {
    cp = ((str[0] & 0x0F) << 12) | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F);
    if (cp < 0x800 || (cp >= 0xD800 && cp < 0xE000))
      cp = -1; // overlong or surrogate
  } else if (nbytes == 4) {
    cp = ((str[0] & 0x07) << 18) | ((str[1] & 0x3F) << 12) |
         ((str[2] & 0x3F) << 6) | (str[3] & 0x3F);
    if (cp < 0x10000 || cp >= 0x110000)
      cp = -1; // overlong or out of range
  }
  // nbytes == 0 (empty input) leaves cp at -1 and is reported as failure.

  if (cp < 0)
    return -1;

  *dst = cp;
  return nbytes;
}
Packit 1f51f5
Packit 1f51f5
// Append the UTF-8 encoding of code point `uc` to `buf`.
//
// Values that have no well-formed UTF-8 encoding are written as the
// replacement sequence produced by encode_unknown() instead:
//   - negative values (these also trip the assert in debug builds),
//   - surrogates 0xD800..0xDFFF,
//   - values of 0x110000 and above.
//
// U+FFFE and U+FFFF are encoded as ordinary three-byte sequences. Writing
// them as the single bytes 0xFE / 0xFF would put bytes into the buffer that
// can never occur in UTF-8 and that the decoder in this file rejects.
void cmark_utf8proc_encode_char(int32_t uc, cmark_strbuf *buf) {
  uint8_t dst[4];
  bufsize_t len = 0;

  assert(uc >= 0);

  // Reject everything that cannot be represented, so that the output always
  // round-trips through cmark_utf8proc_iterate(). The `uc < 0` test keeps
  // release builds (where the assert is compiled out) from emitting a
  // truncated byte.
  if (uc < 0 || (uc >= 0xD800 && uc < 0xE000) || uc >= 0x110000) {
    encode_unknown(buf);
    return;
  }

  if (uc < 0x80) {
    dst[0] = (uint8_t)(uc);
    len = 1;
  } else if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
    len = 2;
  } else if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
    len = 3;
  } else {
    dst[0] = (uint8_t)(0xF0 + (uc >> 18));
    dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
    len = 4;
  }

  cmark_strbuf_put(buf, dst, len);
}
Packit 1f51f5
Packit 1f51f5
void cmark_utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str,
Packit 1f51f5
                              bufsize_t len) {
Packit 1f51f5
  int32_t c;
Packit 1f51f5
Packit 1f51f5
#define bufpush(x) cmark_utf8proc_encode_char(x, dest)
Packit 1f51f5
Packit 1f51f5
  while (len > 0) {
Packit 1f51f5
    bufsize_t char_len = cmark_utf8proc_iterate(str, len, &c);
Packit 1f51f5
Packit 1f51f5
    if (char_len >= 0) {
Packit 1f51f5
#include "case_fold_switch.inc"
Packit 1f51f5
    } else {
Packit 1f51f5
      encode_unknown(dest);
Packit 1f51f5
      char_len = -char_len;
Packit 1f51f5
    }
Packit 1f51f5
Packit 1f51f5
    str += char_len;
Packit 1f51f5
    len -= char_len;
Packit 1f51f5
  }
Packit 1f51f5
}
Packit 1f51f5
Packit 1f51f5
// Returns 1 when `uc` is one of the whitespace code points listed below and
// 0 otherwise. Per the original note, the set is the Zs class plus LF, CR,
// TAB and FF.
int cmark_utf8proc_is_space(int32_t uc) {
  switch (uc) {
  case 0x0009: // TAB
  case 0x000A: // LF
  case 0x000C: // FF
  case 0x000D: // CR
  case 0x0020: // SPACE
  case 0x00A0:
  case 0x1680:
  case 0x202F:
  case 0x205F:
  case 0x3000:
    return 1;
  default:
    // Contiguous run U+2000..U+200A.
    return uc >= 0x2000 && uc <= 0x200A;
  }
}
Packit 1f51f5
Packit 1f51f5
// Returns 1 when `uc` is treated as punctuation and 0 otherwise.
//
// Code points below 128 are delegated to cmark_ispunct(); everything else is
// looked up in a sorted table of inclusive [first, last] ranges. Per the
// original note, the table is meant to match the P[cdefios] classes.
int cmark_utf8proc_is_punctuation(int32_t uc) {
  // Inclusive ranges in ascending order; single code points are stored as
  // {x, x}.
  static const int32_t ranges[][2] = {
      {161, 161},       {167, 167},       {171, 171},
      {182, 183},       {187, 187},       {191, 191},
      {894, 894},       {903, 903},       {1370, 1375},
      {1417, 1418},     {1470, 1470},     {1472, 1472},
      {1475, 1475},     {1478, 1478},     {1523, 1524},
      {1545, 1546},     {1548, 1549},     {1563, 1563},
      {1566, 1567},     {1642, 1645},     {1748, 1748},
      {1792, 1805},     {2039, 2041},     {2096, 2110},
      {2142, 2142},     {2404, 2405},     {2416, 2416},
      {2800, 2800},     {3572, 3572},     {3663, 3663},
      {3674, 3675},     {3844, 3858},     {3860, 3860},
      {3898, 3901},     {3973, 3973},     {4048, 4052},
      {4057, 4058},     {4170, 4175},     {4347, 4347},
      {4960, 4968},     {5120, 5120},     {5741, 5742},
      {5787, 5788},     {5867, 5869},     {5941, 5942},
      {6100, 6102},     {6104, 6106},     {6144, 6154},
      {6468, 6469},     {6686, 6687},     {6816, 6822},
      {6824, 6829},     {7002, 7008},     {7164, 7167},
      {7227, 7231},     {7294, 7295},     {7360, 7367},
      {7379, 7379},     {8208, 8231},     {8240, 8259},
      {8261, 8273},     {8275, 8286},     {8317, 8318},
      {8333, 8334},     {8968, 8971},     {9001, 9002},
      {10088, 10101},   {10181, 10182},   {10214, 10223},
      {10627, 10648},   {10712, 10715},   {10748, 10749},
      {11513, 11516},   {11518, 11519},   {11632, 11632},
      {11776, 11822},   {11824, 11842},   {12289, 12291},
      {12296, 12305},   {12308, 12319},   {12336, 12336},
      {12349, 12349},   {12448, 12448},   {12539, 12539},
      {42238, 42239},   {42509, 42511},   {42611, 42611},
      {42622, 42622},   {42738, 42743},   {43124, 43127},
      {43214, 43215},   {43256, 43258},   {43310, 43311},
      {43359, 43359},   {43457, 43469},   {43486, 43487},
      {43612, 43615},   {43742, 43743},   {43760, 43761},
      {44011, 44011},   {64830, 64831},   {65040, 65049},
      {65072, 65106},   {65108, 65121},   {65123, 65123},
      {65128, 65128},   {65130, 65131},   {65281, 65283},
      {65285, 65290},   {65292, 65295},   {65306, 65307},
      {65311, 65312},   {65339, 65341},   {65343, 65343},
      {65371, 65371},   {65373, 65373},   {65375, 65381},
      {65792, 65794},   {66463, 66463},   {66512, 66512},
      {66927, 66927},   {67671, 67671},   {67871, 67871},
      {67903, 67903},   {68176, 68184},   {68223, 68223},
      {68336, 68342},   {68409, 68415},   {68505, 68508},
      {69703, 69709},   {69819, 69820},   {69822, 69825},
      {69952, 69955},   {70004, 70005},   {70085, 70088},
      {70093, 70093},   {70200, 70205},   {70854, 70854},
      {71105, 71113},   {71233, 71235},   {74864, 74868},
      {92782, 92783},   {92917, 92917},   {92983, 92987},
      {92996, 92996},   {113823, 113823}};
  const int range_count = (int)(sizeof(ranges) / sizeof(ranges[0]));
  int k;

  if (uc < 128 && cmark_ispunct((char)uc))
    return 1;

  for (k = 0; k < range_count; k++) {
    if (uc < ranges[k][0])
      break; // table is sorted, so no later range can contain uc
    if (uc <= ranges[k][1])
      return 1;
  }

  return 0;
}