Blame src/utf8/checked.h

Packit bfcc33
// Copyright 2006 Nemanja Trifunovic
Packit bfcc33
Packit bfcc33
/*
Packit bfcc33
Permission is hereby granted, free of charge, to any person or organization
Packit bfcc33
obtaining a copy of the software and accompanying documentation covered by
Packit bfcc33
this license (the "Software") to use, reproduce, display, distribute,
Packit bfcc33
execute, and transmit the Software, and to prepare derivative works of the
Packit bfcc33
Software, and to permit third-parties to whom the Software is furnished to
Packit bfcc33
do so, all subject to the following:
Packit bfcc33
Packit bfcc33
The copyright notices in the Software and this entire statement, including
Packit bfcc33
the above license grant, this restriction and the following disclaimer,
Packit bfcc33
must be included in all copies of the Software, in whole or in part, and
Packit bfcc33
all derivative works of the Software, unless such copies or derivative
Packit bfcc33
works are solely in the form of machine-executable object code generated by
Packit bfcc33
a source language processor.
Packit bfcc33
Packit bfcc33
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
Packit bfcc33
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
Packit bfcc33
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
Packit bfcc33
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
Packit bfcc33
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
Packit bfcc33
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
Packit bfcc33
DEALINGS IN THE SOFTWARE.
Packit bfcc33
*/
Packit bfcc33
Packit bfcc33
Packit bfcc33
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit bfcc33
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
Packit bfcc33
Packit bfcc33
#include "core.h"
#include <cstddef>
#include <iterator>
#include <stdexcept>
Packit bfcc33
Packit bfcc33
namespace utf8
Packit bfcc33
{
Packit bfcc33
    // Root of the library-specific exception hierarchy. It adds nothing to
    // ::std::exception; it exists so that invalid_code_point, invalid_utf8,
    // invalid_utf16 and not_enough_room share one catchable base type.
    class exception : public ::std::exception
    {
    };
Packit bfcc33
Packit bfcc33
    // Exceptions that may be thrown from the library functions.
Packit bfcc33
    // Reports a 32-bit value that was rejected as a code point. The rejected
    // value is stored at construction and can be read back with code_point().
    class invalid_code_point : public exception {
    public:
        // Remembers the rejected value.
        invalid_code_point(uint32_t rejected) : rejected_cp(rejected) {}

        // Fixed, human-readable description of the error category.
        virtual const char* what() const throw()
        {
            return "Invalid code point";
        }

        // The value handed to the constructor.
        uint32_t code_point() const
        {
            return rejected_cp;
        }

    private:
        uint32_t rejected_cp;
    };
Packit bfcc33
Packit bfcc33
    // Reports a malformed UTF-8 sequence. The octet supplied by the thrower
    // is stored at construction and can be read back with utf8_octet().
    class invalid_utf8 : public exception {
    public:
        // Remembers the offending octet.
        invalid_utf8 (uint8_t octet) : bad_octet(octet) {}

        // Fixed, human-readable description of the error category.
        virtual const char* what() const throw()
        {
            return "Invalid UTF-8";
        }

        // The octet handed to the constructor.
        uint8_t utf8_octet() const
        {
            return bad_octet;
        }

    private:
        uint8_t bad_octet;
    };
Packit bfcc33
Packit bfcc33
    // Reports a malformed UTF-16 sequence. The 16-bit unit supplied by the
    // thrower is stored at construction and can be read back with utf16_word().
    class invalid_utf16 : public exception {
    public:
        // Remembers the offending 16-bit unit.
        invalid_utf16 (uint16_t word) : bad_word(word) {}

        // Fixed, human-readable description of the error category.
        virtual const char* what() const throw()
        {
            return "Invalid UTF-16";
        }

        // The 16-bit unit handed to the constructor.
        uint16_t utf16_word() const
        {
            return bad_word;
        }

    private:
        uint16_t bad_word;
    };
Packit bfcc33
Packit bfcc33
    // Thrown by the decoding functions in this header when the input range
    // ends before a code point could be read (validate_next reported
    // NOT_ENOUGH_ROOM, or prior() was called at the start of the range).
    // Carries no data beyond its message.
    class not_enough_room : public exception {
    public:
        // Fixed, human-readable description of the error category.
        virtual const char* what() const throw()
        {
            return "Not enough space";
        }
    };
Packit bfcc33
Packit bfcc33
    /// The library API - functions intended to be called by the users
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    octet_iterator append(uint32_t cp, octet_iterator result)
Packit bfcc33
    {
Packit bfcc33
        if (!utf8::internal::is_code_point_valid(cp))
Packit bfcc33
            throw invalid_code_point(cp);
Packit bfcc33
Packit bfcc33
        if (cp < 0x80)                        // one octet
Packit bfcc33
            *(result++) = static_cast<uint8_t>(cp);
Packit bfcc33
        else if (cp < 0x800) {                // two octets
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit bfcc33
        }
Packit bfcc33
        else if (cp < 0x10000) {              // three octets
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
Packit bfcc33
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit bfcc33
        }
Packit bfcc33
        else {                                // four octets
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
Packit bfcc33
            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
Packit bfcc33
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
Packit bfcc33
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
Packit bfcc33
        }
Packit bfcc33
        return result;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator, typename output_iterator>
Packit bfcc33
    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
Packit bfcc33
    {
Packit bfcc33
        while (start != end) {
Packit bfcc33
            octet_iterator sequence_start = start;
Packit bfcc33
            internal::utf_error err_code = utf8::internal::validate_next(start, end);
Packit bfcc33
            switch (err_code) {
Packit bfcc33
                case internal::UTF8_OK :
Packit bfcc33
                    for (octet_iterator it = sequence_start; it != start; ++it)
Packit bfcc33
                        *out++ = *it;
Packit bfcc33
                    break;
Packit bfcc33
                case internal::NOT_ENOUGH_ROOM:
Packit bfcc33
                    throw not_enough_room();
Packit bfcc33
                case internal::INVALID_LEAD:
Packit bfcc33
                    out = utf8::append (replacement, out);
Packit bfcc33
                    ++start;
Packit bfcc33
                    break;
Packit bfcc33
                case internal::INCOMPLETE_SEQUENCE:
Packit bfcc33
                case internal::OVERLONG_SEQUENCE:
Packit bfcc33
                case internal::INVALID_CODE_POINT:
Packit bfcc33
                    out = utf8::append (replacement, out);
Packit bfcc33
                    ++start;
Packit bfcc33
                    // just one replacement mark for the sequence
Packit bfcc33
                    while (start != end && utf8::internal::is_trail(*start))
Packit bfcc33
                        ++start;
Packit bfcc33
                    break;
Packit bfcc33
            }
Packit bfcc33
        }
Packit bfcc33
        return out;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator, typename output_iterator>
Packit bfcc33
    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
Packit bfcc33
    {
Packit bfcc33
        static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
Packit bfcc33
        return utf8::replace_invalid(start, end, out, replacement_marker);
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    uint32_t next(octet_iterator& it, octet_iterator end)
Packit bfcc33
    {
Packit bfcc33
        uint32_t cp = 0;
Packit bfcc33
        internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
Packit bfcc33
        switch (err_code) {
Packit bfcc33
            case internal::UTF8_OK :
Packit bfcc33
                break;
Packit bfcc33
            case internal::NOT_ENOUGH_ROOM :
Packit bfcc33
                throw not_enough_room();
Packit bfcc33
            case internal::INVALID_LEAD :
Packit bfcc33
            case internal::INCOMPLETE_SEQUENCE :
Packit bfcc33
            case internal::OVERLONG_SEQUENCE :
Packit bfcc33
                throw invalid_utf8(*it);
Packit bfcc33
            case internal::INVALID_CODE_POINT :
Packit bfcc33
                throw invalid_code_point(cp);
Packit bfcc33
        }
Packit bfcc33
        return cp;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    uint32_t peek_next(octet_iterator it, octet_iterator end)
Packit bfcc33
    {
Packit bfcc33
        return utf8::next(it, end);
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    uint32_t prior(octet_iterator& it, octet_iterator start)
Packit bfcc33
    {
Packit bfcc33
        // can't do much if it == start
Packit bfcc33
        if (it == start)
Packit bfcc33
            throw not_enough_room();
Packit bfcc33
Packit bfcc33
        octet_iterator end = it;
Packit bfcc33
        // Go back until we hit either a lead octet or start
Packit bfcc33
        while (utf8::internal::is_trail(*(--it)))
Packit bfcc33
            if (it == start)
Packit bfcc33
                throw invalid_utf8(*it); // error - no lead byte in the sequence
Packit bfcc33
        return utf8::peek_next(it, end);
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    /// Deprecated in versions that include "prior"
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
Packit bfcc33
    {
Packit bfcc33
        octet_iterator end = it;
Packit bfcc33
        while (utf8::internal::is_trail(*(--it)))
Packit bfcc33
            if (it == pass_start)
Packit bfcc33
                throw invalid_utf8(*it); // error - no lead byte in the sequence
Packit bfcc33
        octet_iterator temp = it;
Packit bfcc33
        return utf8::next(temp, end);
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator, typename distance_type>
Packit bfcc33
    void advance (octet_iterator& it, distance_type n, octet_iterator end)
Packit bfcc33
    {
Packit bfcc33
        for (distance_type i = 0; i < n; ++i)
Packit bfcc33
            utf8::next(it, end);
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    typename std::iterator_traits<octet_iterator>::difference_type
Packit bfcc33
    distance (octet_iterator first, octet_iterator last)
Packit bfcc33
    {
Packit bfcc33
        typename std::iterator_traits<octet_iterator>::difference_type dist;
Packit bfcc33
        for (dist = 0; first < last; ++dist)
Packit bfcc33
            utf8::next(first, last);
Packit bfcc33
        return dist;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename u16bit_iterator, typename octet_iterator>
Packit bfcc33
    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
Packit bfcc33
    {
Packit bfcc33
        while (start != end) {
Packit bfcc33
            uint32_t cp = utf8::internal::mask16(*start++);
Packit bfcc33
            // Take care of surrogate pairs first
Packit bfcc33
            if (utf8::internal::is_lead_surrogate(cp)) {
Packit bfcc33
                if (start != end) {
Packit bfcc33
                    uint32_t trail_surrogate = utf8::internal::mask16(*start++);
Packit bfcc33
                    if (utf8::internal::is_trail_surrogate(trail_surrogate))
Packit bfcc33
                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
Packit bfcc33
                    else
Packit bfcc33
                        throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
Packit bfcc33
                }
Packit bfcc33
                else
Packit bfcc33
                    throw invalid_utf16(static_cast<uint16_t>(cp));
Packit bfcc33
Packit bfcc33
            }
Packit bfcc33
            // Lone trail surrogate
Packit bfcc33
            else if (utf8::internal::is_trail_surrogate(cp))
Packit bfcc33
                throw invalid_utf16(static_cast<uint16_t>(cp));
Packit bfcc33
Packit bfcc33
            result = utf8::append(cp, result);
Packit bfcc33
        }
Packit bfcc33
        return result;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename u16bit_iterator, typename octet_iterator>
Packit bfcc33
    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
Packit bfcc33
    {
Packit bfcc33
        while (start != end) {
Packit bfcc33
            uint32_t cp = utf8::next(start, end);
Packit bfcc33
            if (cp > 0xffff) { //make a surrogate pair
Packit bfcc33
                *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
Packit bfcc33
                *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
Packit bfcc33
            }
Packit bfcc33
            else
Packit bfcc33
                *result++ = static_cast<uint16_t>(cp);
Packit bfcc33
        }
Packit bfcc33
        return result;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator, typename u32bit_iterator>
Packit bfcc33
    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
Packit bfcc33
    {
Packit bfcc33
        while (start != end)
Packit bfcc33
            result = utf8::append(*(start++), result);
Packit bfcc33
Packit bfcc33
        return result;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    template <typename octet_iterator, typename u32bit_iterator>
Packit bfcc33
    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
Packit bfcc33
    {
Packit bfcc33
        while (start != end)
Packit bfcc33
            (*result++) = utf8::next(start, end);
Packit bfcc33
Packit bfcc33
        return result;
Packit bfcc33
    }
Packit bfcc33
Packit bfcc33
    // The iterator class
Packit bfcc33
    template <typename octet_iterator>
Packit bfcc33
    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
Packit bfcc33
      octet_iterator it;
Packit bfcc33
      octet_iterator range_start;
Packit bfcc33
      octet_iterator range_end;
Packit bfcc33
      public:
Packit bfcc33
      iterator () {}
Packit bfcc33
      explicit iterator (const octet_iterator& octet_it,
Packit bfcc33
                         const octet_iterator& range_start,
Packit bfcc33
                         const octet_iterator& range_end) :
Packit bfcc33
               it(octet_it), range_start(range_start), range_end(range_end)
Packit bfcc33
      {
Packit bfcc33
          if (it < range_start || it > range_end)
Packit bfcc33
              throw std::out_of_range("Invalid utf-8 iterator position");
Packit bfcc33
      }
Packit bfcc33
      // the default "big three" are OK
Packit bfcc33
      octet_iterator base () const { return it; }
Packit bfcc33
      uint32_t operator * () const
Packit bfcc33
      {
Packit bfcc33
          octet_iterator temp = it;
Packit bfcc33
          return utf8::next(temp, range_end);
Packit bfcc33
      }
Packit bfcc33
      bool operator == (const iterator& rhs) const
Packit bfcc33
      {
Packit bfcc33
          if (range_start != rhs.range_start || range_end != rhs.range_end)
Packit bfcc33
              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
Packit bfcc33
          return (it == rhs.it);
Packit bfcc33
      }
Packit bfcc33
      bool operator != (const iterator& rhs) const
Packit bfcc33
      {
Packit bfcc33
          return !(operator == (rhs));
Packit bfcc33
      }
Packit bfcc33
      iterator& operator ++ ()
Packit bfcc33
      {
Packit bfcc33
          utf8::next(it, range_end);
Packit bfcc33
          return *this;
Packit bfcc33
      }
Packit bfcc33
      iterator operator ++ (int)
Packit bfcc33
      {
Packit bfcc33
          iterator temp = *this;
Packit bfcc33
          utf8::next(it, range_end);
Packit bfcc33
          return temp;
Packit bfcc33
      }
Packit bfcc33
      iterator& operator -- ()
Packit bfcc33
      {
Packit bfcc33
          utf8::prior(it, range_start);
Packit bfcc33
          return *this;
Packit bfcc33
      }
Packit bfcc33
      iterator operator -- (int)
Packit bfcc33
      {
Packit bfcc33
          iterator temp = *this;
Packit bfcc33
          utf8::prior(it, range_start);
Packit bfcc33
          return temp;
Packit bfcc33
      }
Packit bfcc33
    }; // class iterator
Packit bfcc33
Packit bfcc33
} // namespace utf8
Packit bfcc33
Packit bfcc33
#endif //header guard
Packit bfcc33
Packit bfcc33