Blame src/utf8/core.h

Packit Service 7770af
// Copyright 2006 Nemanja Trifunovic
Packit Service 7770af
Packit Service 7770af
/*
Packit Service 7770af
Permission is hereby granted, free of charge, to any person or organization
Packit Service 7770af
obtaining a copy of the software and accompanying documentation covered by
Packit Service 7770af
this license (the "Software") to use, reproduce, display, distribute,
Packit Service 7770af
execute, and transmit the Software, and to prepare derivative works of the
Packit Service 7770af
Software, and to permit third-parties to whom the Software is furnished to
Packit Service 7770af
do so, all subject to the following:
Packit Service 7770af
Packit Service 7770af
The copyright notices in the Software and this entire statement, including
Packit Service 7770af
the above license grant, this restriction and the following disclaimer,
Packit Service 7770af
must be included in all copies of the Software, in whole or in part, and
Packit Service 7770af
all derivative works of the Software, unless such copies or derivative
Packit Service 7770af
works are solely in the form of machine-executable object code generated by
Packit Service 7770af
a source language processor.
Packit Service 7770af
Packit Service 7770af
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
Packit Service 7770af
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Packit Service 7770af
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
Packit Service 7770af
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
Packit Service 7770af
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
Packit Service 7770af
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
Packit Service 7770af
DEALINGS IN THE SOFTWARE.
Packit Service 7770af
*/
Packit Service 7770af
Packit Service 7770af
Packit Service 7770af
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit Service 7770af
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit Service 7770af
Packit Service 7770af
#include <iterator>
Packit Service 7770af
Packit Service 7770af
namespace utf8
{
    // Unsigned integer typedefs used throughout the library for 8-bit, 16-bit
    // and 32-bit quantities. They are plain typedefs of built-in types, so they
    // may need adjusting on a platform where those types have other widths.
    // The names deliberately match the ones from cstdint / boost/cstdint.
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // 0xd800 - 0x40 == 0xd7c0
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    // Evaluated in unsigned arithmetic, so the result wraps around
    // (0xfca02400 when unsigned int is 32 bits wide).
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Returns the low 8 bits of oc as an unsigned octet. Going through the
    /// mask keeps the result in 0..0xff even when octet_type is a signed char.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// True when oc has the bit pattern 10xxxxxx, i.e. it is a UTF-8
    /// continuation (trail) octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    /// True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is any surrogate (leading or trailing); the two ranges are
    /// adjacent, so a single range test covers both.
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    /// Returns the length (1..4) of the sequence announced by the lead octet
    /// that lead_it points to, or 0 when that octet is not a valid lead.
    /// The iterator is dereferenced unconditionally, so it must be dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)                // 0xxxxxxx
            return 1;
        else if ((lead >> 5) == 0x6)    // 110xxxxx
            return 2;
        else if ((lead >> 4) == 0xe)    // 1110xxxx
            return 3;
        else if ((lead >> 3) == 0x1e)   // 11110xxx
            return 4;
        else
            return 0;
    }

    /// True when cp was decoded from more octets than its value requires
    /// (length is the number of octets the sequence actually used).
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// Helper for get_sequence_x: advances it by one octet and checks what it
    /// lands on. Returns NOT_ENOUGH_ROOM if the advance reaches end,
    /// INCOMPLETE_SEQUENCE if the new octet is not a trail octet, else UTF8_OK.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    // Advances IT with increase_safely and returns from the enclosing function
    // on any error. Names are fully qualified, like every other internal call
    // in this file, so argument-dependent lookup cannot pick another overload.
    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf8::internal::utf_error ret = utf8::internal::increase_safely(IT, END); if (ret != utf8::internal::UTF8_OK) return ret;}

    /// get_sequence_x functions decode utf-8 sequences of the length x.
    /// On success, it is left on the LAST octet of the sequence (not one past
    /// it) and code_point holds the decoded value. On failure, it may have
    /// been advanced and code_point may hold a partial value.
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        // The masks strip the marker bits that were shifted along with the payload.
        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    /// Decodes and validates the sequence starting at it.
    /// On success returns UTF8_OK, stores the value in code_point and leaves
    /// it one past the sequence. On any failure returns the error, leaves
    /// code_point untouched and it at its original position.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        // An empty range has no lead octet to inspect. This must be checked
        // before sequence_length(), which dereferences the iterator.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
                break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
                break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    // get_sequence_x stops on the last octet; step past it.
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Overload for callers that only need the validation result, not the
    /// decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        uint32_t ignored = 0;
        return utf8::internal::validate_next(it, end, ignored);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the first octet of the first invalid sequence
    /// in [start, end), or end when every sequence validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            // validate_next advances result on success and leaves it in place on failure.
            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// True when [start, end) contains no invalid sequence (an empty range is valid).
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    /// True when [it, end) begins with the three octets of the byte order mark.
    /// Each octet is read only after checking that the range has not ended.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    /// True when the octets starting at it match the byte order mark.
    /// There is no end iterator, so up to three octets are read without any
    /// bounds check; starts_with_bom performs the same test with one.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (utf8::internal::mask8(*it++)) == bom[0] &&
            (utf8::internal::mask8(*it++)) == bom[1] &&
            (utf8::internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8
Packit Service 7770af
Packit Service 7770af
#endif // header guard
Packit Service 7770af
Packit Service 7770af