Blame src/client/utf8.c

/* -*- mode: c; c-file-style: "openbsd" -*- */
/*
  Copyright (c) 2011 Joseph A. Adams (joeyadams3.14159@gmail.com)

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/

#include <stddef.h>
Packit e9ba0d
Packit e9ba0d
/*
 * Validate a single UTF-8 character starting at @s.
 * The string must be null-terminated; the terminator is what stops a
 * clipped multi-byte sequence from being read past the end of the buffer.
 *
 * If it's valid, return its length (1 thru 4).
 * If it's invalid or clipped, return 0.
 *
 * Note that a NUL byte at @s lies in the 00..7F range and is therefore
 * reported as a valid 1-byte character; callers that walk a string must
 * test for the terminator themselves.
 *
 * This function implements the syntax given in RFC3629, which is
 * the same as that given in The Unicode Standard, Version 6.0.
 *
 * It has the following properties:
 *
 *  * All codepoints U+0000..U+10FFFF may be encoded,
 *    except for U+D800..U+DFFF, which are reserved
 *    for UTF-16 surrogate pair encoding.
 *  * UTF-8 byte sequences longer than 4 bytes are not permitted,
 *    as they exceed the range of Unicode.
 *  * Overlong encodings (a codepoint written with more bytes than
 *    necessary) are rejected.
 *  * The sixty-six Unicode "non-characters" are permitted
 *    (namely, U+FDD0..U+FDEF, U+xxFFFE, and U+xxFFFF).
 */
size_t
utf8_validate_cz(const char *s)
{
        const unsigned char *p = (const unsigned char *)s;
        unsigned char lead = p[0];
        /* Permitted range of the second byte; narrowed for some leads. */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        size_t len;
        size_t i;

        if (lead <= 0x7F) {             /* 00..7F: ASCII (including NUL) */
                return 1;
        }
        if (lead <= 0xC1) {             /* 80..C1 */
                /* Stray continuation byte, or overlong 2-byte sequence. */
                return 0;
        }

        if (lead <= 0xDF) {             /* C2..DF */
                len = 2;
        } else if (lead <= 0xEF) {      /* E0..EF */
                len = 3;
                if (lead == 0xE0) {
                        /* Disallow overlong 3-byte sequence. */
                        lo = 0xA0;
                } else if (lead == 0xED) {
                        /* Disallow U+D800..U+DFFF. */
                        hi = 0x9F;
                }
        } else if (lead <= 0xF4) {      /* F0..F4 */
                len = 4;
                if (lead == 0xF0) {
                        /* Disallow overlong 4-byte sequence. */
                        lo = 0x90;
                } else if (lead == 0xF4) {
                        /* Disallow codepoints beyond U+10FFFF. */
                        hi = 0x8F;
                }
        } else {                        /* F5..FF */
                return 0;
        }

        /*
         * The second byte always exists because the lead byte was not NUL.
         * A terminator here is below `lo`, so a clipped sequence is
         * rejected before any later byte is touched.
         */
        if (p[1] < lo || p[1] > hi) {
                return 0;
        }

        /*
         * Remaining bytes must be in the range 0x80..0xBF.  Returning at
         * the first mismatch guarantees we never read beyond a terminator.
         */
        for (i = 2; i < len; i++) {
                if ((p[i] & 0xC0) != 0x80) {
                        return 0;
                }
        }

        return len;
}