/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
#include <config.h>
#include <string.h>
#include <isc/utf8.h>
#include <isc/util.h>
/*
* UTF-8 is defined in "The Unicode Standard -- Version 4.0"
* Also see RFC 3629.
*
* Char. number range | UTF-8 octet sequence
* (hexadecimal) | (binary)
* --------------------+---------------------------------------------
* 0000 0000-0000 007F | 0xxxxxxx
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
/*
 * Check whether the 'len' octets starting at 'buf' form well-formed UTF-8.
 *
 * Each sequence must have a valid lead octet followed by the right number
 * of continuation octets (10xxxxxx), all within the buffer.  A sequence is
 * rejected if it is an overlong encoding (the code point would fit in a
 * shorter form), if it encodes a UTF-16 surrogate (U+D800..U+DFFF, which
 * RFC 3629 prohibits in UTF-8), or if it encodes a value above U+10FFFF.
 *
 * Requires:
 *	'buf' is not NULL.
 *
 * Returns:
 *	true if every octet belongs to a well-formed sequence (an empty
 *	buffer, len == 0, is considered valid); false otherwise.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* One octet: 0xxxxxxx (U+0000..U+007F). */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: 110xxxxx 10xxxxxx (U+0080..U+07FF). */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: the value fits in a single octet. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three octets: 1110xxxx 10xxxxxx 10xxxxxx (U+0800..U+FFFF). */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: the value fits in two octets or fewer. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * Surrogate code points are reserved for UTF-16 and
			 * must never appear in UTF-8 (RFC 3629, section 3).
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/*
		 * Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		 * (U+10000..U+10FFFF).
		 */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong, or beyond the last Unicode code point. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Anything else is malformed: a stray continuation octet,
		 * an invalid lead octet (0xf8..0xff), a lead octet followed
		 * by a non-continuation octet, or a sequence truncated by
		 * the end of the buffer.
		 */
		return (false);
	}

	return (true);
}
bool
/*
 * Report whether the buffer begins with the three-octet UTF-8 encoding
 * of the byte order mark (EF BB BF).
 *
 * Requires:
 *	'buf' is not NULL.
 *
 * Returns:
 *	true if 'len' is at least 3 and the first three octets of 'buf'
 *	are EF BB BF; false otherwise.
 */
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	/* Too short to hold the mark; do not read past the buffer. */
	if (len < 3U) {
		return (false);
	}

	return (memcmp(buf, "\xef\xbb\xbf", 3) == 0);
}