Blame src/utf8/checked.h

Packit Service 7770af
// Copyright 2006 Nemanja Trifunovic
Packit Service 7770af
Packit Service 7770af
/*
Packit Service 7770af
Permission is hereby granted, free of charge, to any person or organization
Packit Service 7770af
obtaining a copy of the software and accompanying documentation covered by
Packit Service 7770af
this license (the "Software") to use, reproduce, display, distribute,
Packit Service 7770af
execute, and transmit the Software, and to prepare derivative works of the
Packit Service 7770af
Software, and to permit third-parties to whom the Software is furnished to
Packit Service 7770af
do so, all subject to the following:
Packit Service 7770af
Packit Service 7770af
The copyright notices in the Software and this entire statement, including
Packit Service 7770af
the above license grant, this restriction and the following disclaimer,
Packit Service 7770af
must be included in all copies of the Software, in whole or in part, and
Packit Service 7770af
all derivative works of the Software, unless such copies or derivative
Packit Service 7770af
works are solely in the form of machine-executable object code generated by
Packit Service 7770af
a source language processor.
Packit Service 7770af
Packit Service 7770af
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
Packit Service 7770af
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Packit Service 7770af
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
Packit Service 7770af
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
Packit Service 7770af
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
Packit Service 7770af
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
Packit Service 7770af
DEALINGS IN THE SOFTWARE.
Packit Service 7770af
*/
Packit Service 7770af
Packit Service 7770af
Packit Service 7770af
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit Service 7770af
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit Service 7770af
Packit Service 7770af
#include "core.h"
#include <cstddef>
#include <iterator>
#include <stdexcept>
Packit Service 7770af
Packit Service 7770af
namespace utf8
Packit Service 7770af
{
Packit Service 7770af
    // Common base class of the exception types declared in this header.
    // It adds no members of its own to ::std::exception.
    class exception : public ::std::exception {};
Packit Service 7770af
Packit Service 7770af
    // Exceptions that may be thrown from the library functions.
Packit Service 7770af
    // Reports a rejected code point. The offending value is stored at
    // construction and can be read back through code_point().
    class invalid_code_point : public exception {
    public:
        invalid_code_point(uint32_t cp) : rejected(cp) {}

        // Fixed description of the error; never throws.
        virtual const char* what() const throw()
        {
            return "Invalid code point";
        }

        // The value that was handed to the constructor.
        uint32_t code_point() const
        {
            return rejected;
        }

    private:
        uint32_t rejected;
    };
Packit Service 7770af
Packit Service 7770af
    // Reports malformed UTF-8 input. The octet passed to the constructor is
    // stored and can be read back through utf8_octet().
    class invalid_utf8 : public exception {
    public:
        invalid_utf8 (uint8_t u) : octet(u) {}

        // Fixed description of the error; never throws.
        virtual const char* what() const throw()
        {
            return "Invalid UTF-8";
        }

        // The octet that was handed to the constructor.
        uint8_t utf8_octet() const
        {
            return octet;
        }

    private:
        uint8_t octet;
    };
Packit Service 7770af
Packit Service 7770af
    // Reports malformed UTF-16 input. The 16-bit unit passed to the
    // constructor is stored and can be read back through utf16_word().
    class invalid_utf16 : public exception {
    public:
        invalid_utf16 (uint16_t u) : word(u) {}

        // Fixed description of the error; never throws.
        virtual const char* what() const throw()
        {
            return "Invalid UTF-16";
        }

        // The 16-bit unit that was handed to the constructor.
        uint16_t utf16_word() const
        {
            return word;
        }

    private:
        uint16_t word;
    };
Packit Service 7770af
Packit Service 7770af
    // Thrown by the checked functions when the supplied range is too short
    // for the requested operation. Carries no data beyond its message.
    class not_enough_room : public exception {
    public:
        // Fixed description of the error; never throws.
        virtual const char* what() const throw()
        {
            return "Not enough space";
        }
    };
Packit Service 7770af
Packit Service 7770af
    /// The library API - functions intended to be called by the users
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    octet_iterator append(uint32_t cp, octet_iterator result)
Packit Service 7770af
    {
Packit Service 7770af
        if (!utf8::internal::is_code_point_valid(cp))
Packit Service 7770af
            throw invalid_code_point(cp);
Packit Service 7770af
Packit Service 7770af
        if (cp < 0x80)                        // one octet
Packit Service 7770af
            *(result++) = static_cast<uint8_t>(cp);
Packit Service 7770af
        else if (cp < 0x800) {                // two octets
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit Service 7770af
        }
Packit Service 7770af
        else if (cp < 0x10000) {              // three octets
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit Service 7770af
        }
Packit Service 7770af
        else {                                // four octets
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
Packit Service 7770af
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit Service 7770af
        }
Packit Service 7770af
        return result;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator, typename output_iterator>
Packit Service 7770af
    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Packit Service 7770af
    {
Packit Service 7770af
        while (start != end) {
Packit Service 7770af
            octet_iterator sequence_start = start;
Packit Service 7770af
            internal::utf_error err_code = utf8::internal::validate_next(start, end);
Packit Service 7770af
            switch (err_code) {
Packit Service 7770af
                case internal::UTF8_OK :
Packit Service 7770af
                    for (octet_iterator it = sequence_start; it != start; ++it)
Packit Service 7770af
                        *out++ = *it;
Packit Service 7770af
                    break;
Packit Service 7770af
                case internal::NOT_ENOUGH_ROOM:
Packit Service 7770af
                    throw not_enough_room();
Packit Service 7770af
                case internal::INVALID_LEAD:
Packit Service 7770af
                    out = utf8::append (replacement, out);
Packit Service 7770af
                    ++start;
Packit Service 7770af
                    break;
Packit Service 7770af
                case internal::INCOMPLETE_SEQUENCE:
Packit Service 7770af
                case internal::OVERLONG_SEQUENCE:
Packit Service 7770af
                case internal::INVALID_CODE_POINT:
Packit Service 7770af
                    out = utf8::append (replacement, out);
Packit Service 7770af
                    ++start;
Packit Service 7770af
                    // just one replacement mark for the sequence
Packit Service 7770af
                    while (start != end && utf8::internal::is_trail(*start))
Packit Service 7770af
                        ++start;
Packit Service 7770af
                    break;
Packit Service 7770af
            }
Packit Service 7770af
        }
Packit Service 7770af
        return out;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator, typename output_iterator>
Packit Service 7770af
    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
Packit Service 7770af
    {
Packit Service 7770af
        static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
Packit Service 7770af
        return utf8::replace_invalid(start, end, out, replacement_marker);
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    uint32_t next(octet_iterator& it, octet_iterator end)
Packit Service 7770af
    {
Packit Service 7770af
        uint32_t cp = 0;
Packit Service 7770af
        internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
Packit Service 7770af
        switch (err_code) {
Packit Service 7770af
            case internal::UTF8_OK :
Packit Service 7770af
                break;
Packit Service 7770af
            case internal::NOT_ENOUGH_ROOM :
Packit Service 7770af
                throw not_enough_room();
Packit Service 7770af
            case internal::INVALID_LEAD :
Packit Service 7770af
            case internal::INCOMPLETE_SEQUENCE :
Packit Service 7770af
            case internal::OVERLONG_SEQUENCE :
Packit Service 7770af
                throw invalid_utf8(*it);
Packit Service 7770af
            case internal::INVALID_CODE_POINT :
Packit Service 7770af
                throw invalid_code_point(cp);
Packit Service 7770af
        }
Packit Service 7770af
        return cp;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    uint32_t peek_next(octet_iterator it, octet_iterator end)
Packit Service 7770af
    {
Packit Service 7770af
        return utf8::next(it, end);
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    uint32_t prior(octet_iterator& it, octet_iterator start)
Packit Service 7770af
    {
Packit Service 7770af
        // can't do much if it == start
Packit Service 7770af
        if (it == start)
Packit Service 7770af
            throw not_enough_room();
Packit Service 7770af
Packit Service 7770af
        octet_iterator end = it;
Packit Service 7770af
        // Go back until we hit either a lead octet or start
Packit Service 7770af
        while (utf8::internal::is_trail(*(--it)))
Packit Service 7770af
            if (it == start)
Packit Service 7770af
                throw invalid_utf8(*it); // error - no lead byte in the sequence
Packit Service 7770af
        return utf8::peek_next(it, end);
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    /// Deprecated in versions that include "prior"
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
Packit Service 7770af
    {
Packit Service 7770af
        octet_iterator end = it;
Packit Service 7770af
        while (utf8::internal::is_trail(*(--it)))
Packit Service 7770af
            if (it == pass_start)
Packit Service 7770af
                throw invalid_utf8(*it); // error - no lead byte in the sequence
Packit Service 7770af
        octet_iterator temp = it;
Packit Service 7770af
        return utf8::next(temp, end);
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator, typename distance_type>
Packit Service 7770af
    void advance (octet_iterator& it, distance_type n, octet_iterator end)
Packit Service 7770af
    {
Packit Service 7770af
        for (distance_type i = 0; i < n; ++i)
Packit Service 7770af
            utf8::next(it, end);
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    typename std::iterator_traits<octet_iterator>::difference_type
Packit Service 7770af
    distance (octet_iterator first, octet_iterator last)
Packit Service 7770af
    {
Packit Service 7770af
        typename std::iterator_traits<octet_iterator>::difference_type dist;
Packit Service 7770af
        for (dist = 0; first < last; ++dist)
Packit Service 7770af
            utf8::next(first, last);
Packit Service 7770af
        return dist;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename u16bit_iterator, typename octet_iterator>
Packit Service 7770af
    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
Packit Service 7770af
    {
Packit Service 7770af
        while (start != end) {
Packit Service 7770af
            uint32_t cp = utf8::internal::mask16(*start++);
Packit Service 7770af
            // Take care of surrogate pairs first
Packit Service 7770af
            if (utf8::internal::is_lead_surrogate(cp)) {
Packit Service 7770af
                if (start != end) {
Packit Service 7770af
                    uint32_t trail_surrogate = utf8::internal::mask16(*start++);
Packit Service 7770af
                    if (utf8::internal::is_trail_surrogate(trail_surrogate))
Packit Service 7770af
                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
Packit Service 7770af
                    else
Packit Service 7770af
                        throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
Packit Service 7770af
                }
Packit Service 7770af
                else
Packit Service 7770af
                    throw invalid_utf16(static_cast<uint16_t>(cp));
Packit Service 7770af
Packit Service 7770af
            }
Packit Service 7770af
            // Lone trail surrogate
Packit Service 7770af
            else if (utf8::internal::is_trail_surrogate(cp))
Packit Service 7770af
                throw invalid_utf16(static_cast<uint16_t>(cp));
Packit Service 7770af
Packit Service 7770af
            result = utf8::append(cp, result);
Packit Service 7770af
        }
Packit Service 7770af
        return result;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename u16bit_iterator, typename octet_iterator>
Packit Service 7770af
    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
Packit Service 7770af
    {
Packit Service 7770af
        while (start != end) {
Packit Service 7770af
            uint32_t cp = utf8::next(start, end);
Packit Service 7770af
            if (cp > 0xffff) { //make a surrogate pair
Packit Service 7770af
                *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
Packit Service 7770af
                *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
Packit Service 7770af
            }
Packit Service 7770af
            else
Packit Service 7770af
                *result++ = static_cast<uint16_t>(cp);
Packit Service 7770af
        }
Packit Service 7770af
        return result;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator, typename u32bit_iterator>
Packit Service 7770af
    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
Packit Service 7770af
    {
Packit Service 7770af
        while (start != end)
Packit Service 7770af
            result = utf8::append(*(start++), result);
Packit Service 7770af
Packit Service 7770af
        return result;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    template <typename octet_iterator, typename u32bit_iterator>
Packit Service 7770af
    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
Packit Service 7770af
    {
Packit Service 7770af
        while (start != end)
Packit Service 7770af
            (*result++) = utf8::next(start, end);
Packit Service 7770af
Packit Service 7770af
        return result;
Packit Service 7770af
    }
Packit Service 7770af
Packit Service 7770af
    // The iterator class
Packit Service 7770af
    template <typename octet_iterator>
Packit Service 7770af
    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
Packit Service 7770af
      octet_iterator it;
Packit Service 7770af
      octet_iterator range_start;
Packit Service 7770af
      octet_iterator range_end;
Packit Service 7770af
      public:
Packit Service 7770af
      iterator () {}
Packit Service 7770af
      explicit iterator (const octet_iterator& octet_it,
Packit Service 7770af
                         const octet_iterator& range_start,
Packit Service 7770af
                         const octet_iterator& range_end) :
Packit Service 7770af
               it(octet_it), range_start(range_start), range_end(range_end)
Packit Service 7770af
      {
Packit Service 7770af
          if (it < range_start || it > range_end)
Packit Service 7770af
              throw std::out_of_range("Invalid utf-8 iterator position");
Packit Service 7770af
      }
Packit Service 7770af
      // the default "big three" are OK
Packit Service 7770af
      octet_iterator base () const { return it; }
Packit Service 7770af
      uint32_t operator * () const
Packit Service 7770af
      {
Packit Service 7770af
          octet_iterator temp = it;
Packit Service 7770af
          return utf8::next(temp, range_end);
Packit Service 7770af
      }
Packit Service 7770af
      bool operator == (const iterator& rhs) const
Packit Service 7770af
      {
Packit Service 7770af
          if (range_start != rhs.range_start || range_end != rhs.range_end)
Packit Service 7770af
              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
Packit Service 7770af
          return (it == rhs.it);
Packit Service 7770af
      }
Packit Service 7770af
      bool operator != (const iterator& rhs) const
Packit Service 7770af
      {
Packit Service 7770af
          return !(operator == (rhs));
Packit Service 7770af
      }
Packit Service 7770af
      iterator& operator ++ ()
Packit Service 7770af
      {
Packit Service 7770af
          utf8::next(it, range_end);
Packit Service 7770af
          return *this;
Packit Service 7770af
      }
Packit Service 7770af
      iterator operator ++ (int)
Packit Service 7770af
      {
Packit Service 7770af
          iterator temp = *this;
Packit Service 7770af
          utf8::next(it, range_end);
Packit Service 7770af
          return temp;
Packit Service 7770af
      }
Packit Service 7770af
      iterator& operator -- ()
Packit Service 7770af
      {
Packit Service 7770af
          utf8::prior(it, range_start);
Packit Service 7770af
          return *this;
Packit Service 7770af
      }
Packit Service 7770af
      iterator operator -- (int)
Packit Service 7770af
      {
Packit Service 7770af
          iterator temp = *this;
Packit Service 7770af
          utf8::prior(it, range_start);
Packit Service 7770af
          return temp;
Packit Service 7770af
      }
Packit Service 7770af
    }; // class iterator
Packit Service 7770af
Packit Service 7770af
} // namespace utf8
Packit Service 7770af
Packit Service 7770af
#endif //header guard
Packit Service 7770af
Packit Service 7770af