// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // The cast makes the int -> uint16_t narrowing explicit; the value is 0xd7c0.
    const uint16_t LEAD_OFFSET         = static_cast<uint16_t>(LEAD_SURROGATE_MIN - (0x10000 >> 10));
    // Computed entirely in unsigned arithmetic (wraps modulo 2^32 to 0xfca02400 for a
    // 32-bit uint32_t) so that no signed value is ever left-shifted.
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (static_cast<uint32_t>(LEAD_SURROGATE_MIN) << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of oc as an unsigned octet, so that plain (possibly
    /// signed) char values can be compared and shifted safely.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// Returns true if oc is a UTF-8 trail (continuation) octet, i.e. its two
    /// top bits are 10.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    /// Returns true if cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// Returns true if cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// Returns true if cp lies anywhere in the surrogate range (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// Returns true if cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    /// Returns the length (1 to 4) of the sequence announced by the lead octet
    /// *lead_it, or 0 when that octet cannot start a sequence (a trail octet or
    /// a value of 0xf8 and above). lead_it is dereferenced, so it must point at
    /// a readable octet.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    /// Returns true if cp was decoded from a sequence of the given length although
    /// a shorter encoding exists for it. Code points of 0x10000 and above are never
    /// reported as overlong.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// Helper for get_sequence_x: advances it by one octet and checks the new position.
    /// Returns NOT_ENOUGH_ROOM if it reached end, INCOMPLETE_SEQUENCE if the octet
    /// now pointed at is not a trail octet, UTF8_OK otherwise.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    // The call is fully qualified, like every other internal call in this file, so
    // that argument-dependent lookup cannot select a foreign increase_safely.
    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = utf8::internal::increase_safely(IT, END); if (ret != UTF8_OK) return ret;}

    /// get_sequence_x functions decode utf-8 sequences of the length x.
    /// Each one expects it to point at the lead octet and, on success, leaves it
    /// on the LAST octet of the sequence (not one past it). On failure it is left
    /// wherever decoding stopped; the caller is responsible for restoring it.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 5 payload bits from the lead octet, 6 from the trail octet
        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 4 payload bits from the lead octet, 6 from each trail octet
        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // 3 payload bits from the lead octet, 6 from each trail octet
        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    /// Decodes and validates the sequence starting at it.
    /// On success returns UTF8_OK, stores the decoded value in code_point and moves
    /// it one past the last octet of the sequence. On failure returns the error,
    /// leaves code_point untouched and sets it back to its value on entry.
    /// An empty range (it == end) yields NOT_ENOUGH_ROOM.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        // sequence_length() below dereferences it, so an empty range must be
        // rejected first; otherwise the end iterator would be dereferenced.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                // it has not been moved yet, so there is nothing to restore
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
                break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
                break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    // get_sequence_x left it on the last octet; step past it
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Same as the three-argument validate_next, for callers that do not need
    /// the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        uint32_t ignored = 0;
        return utf8::internal::validate_next(it, end, ignored);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the start of the first sequence in [start, end)
    /// that fails validation, or end if every sequence validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // On failure validate_next leaves result at the offending sequence
            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// Returns true if find_invalid reports no invalid sequence in [start, end).
    /// An empty range is valid.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    /// Returns true if [it, end) begins with the three octets of bom.
    /// A range shorter than three octets yields false.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    /// Returns true if the octets starting at it match bom. There is no end
    /// iterator and therefore no bounds check: octets are read one by one until
    /// the first mismatch, so the caller must make sure they are readable.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (utf8::internal::mask8(*it++)) == bom[0] &&
            (utf8::internal::mask8(*it++)) == bom[1] &&
            (utf8::internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard