/* -*- mode: c; c-file-style: "openbsd" -*- */
/*
  Copyright (c) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/

#include <stddef.h>

/*
 * Validate a single UTF-8 character starting at @s.
 * The string must be null-terminated.
 *
 * If it's valid, return its length (1 thru 4).
 * If it's invalid or clipped, return 0.
 *
 * Note that the terminating NUL byte itself falls in the 00..7F range,
 * so calling this on an empty string returns 1.
 *
 * Each trailing byte is checked before the next one is read, and a NUL
 * byte never passes the continuation check, so the function does not
 * read beyond the string's terminator.
 *
 * This function implements the syntax given in RFC3629, which is
 * the same as that given in The Unicode Standard, Version 6.0.
 *
 * It has the following properties:
 *
 *  * All codepoints U+0000..U+10FFFF may be encoded,
 *    except for U+D800..U+DFFF, which are reserved
 *    for UTF-16 surrogate pair encoding.
 *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
 *    as they exceed the range of Unicode.
 *  * The sixty-six Unicode "non-characters" are permitted
 *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
 */
size_t
utf8_validate_cz(const char *s)
{
	/* Work on unsigned bytes so the range comparisons are well defined
	 * regardless of whether plain char is signed. */
	const unsigned char *p = (const unsigned char *)s;
	unsigned char c = p[0];

	if (c <= 0x7F) {		/* 00..7F */
		return 1;
	} else if (c <= 0xC1) {		/* 80..C1 */
		/* 80..BF are stray continuation bytes; C0..C1 could only
		 * start an overlong 2-byte sequence. */
		return 0;
	} else if (c <= 0xDF) {		/* C2..DF */
		/* Make sure subsequent byte is in the range 0x80..0xBF. */
		if ((p[1] & 0xC0) != 0x80)
			return 0;

		return 2;
	} else if (c <= 0xEF) {		/* E0..EF */
		/* Disallow overlong 3-byte sequence. */
		if (c == 0xE0 && p[1] < 0xA0)
			return 0;

		/* Disallow U+D800..U+DFFF. */
		if (c == 0xED && p[1] > 0x9F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF.
		 * p[2] is only read once p[1] is known not to be NUL. */
		if ((p[1] & 0xC0) != 0x80)
			return 0;
		if ((p[2] & 0xC0) != 0x80)
			return 0;

		return 3;
	} else if (c <= 0xF4) {		/* F0..F4 */
		/* Disallow overlong 4-byte sequence. */
		if (c == 0xF0 && p[1] < 0x90)
			return 0;

		/* Disallow codepoints beyond U+10FFFF. */
		if (c == 0xF4 && p[1] > 0x8F)
			return 0;

		/* Make sure subsequent bytes are in the range 0x80..0xBF.
		 * Each byte is validated before the following one is read. */
		if ((p[1] & 0xC0) != 0x80)
			return 0;
		if ((p[2] & 0xC0) != 0x80)
			return 0;
		if ((p[3] & 0xC0) != 0x80)
			return 0;

		return 4;
	} else {			/* F5..FF */
		/* Lead bytes that would encode values above U+10FFFF or
		 * sequences longer than 4 bytes. */
		return 0;
	}
}