|
Packit |
cc4c63 |
{-
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
Copyright (c) 2002, members of the Haskell Internationalisation Working
|
|
Packit |
cc4c63 |
Group All rights reserved.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
Redistribution and use in source and binary forms, with or without
|
|
Packit |
cc4c63 |
modification, are permitted provided that the following conditions are met:
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
* Redistributions of source code must retain the above copyright notice,
|
|
Packit |
cc4c63 |
this list of conditions and the following disclaimer.
|
|
Packit |
cc4c63 |
* Redistributions in binary form must reproduce the above copyright notice,
|
|
Packit |
cc4c63 |
this list of conditions and the following disclaimer in the
|
|
Packit |
cc4c63 |
documentation and/or other materials provided with the distribution.
|
|
Packit |
cc4c63 |
* Neither the name of the Haskell Internationalisation Working Group nor
|
|
Packit |
cc4c63 |
the names of its contributors may be used to endorse or promote products
|
|
Packit |
cc4c63 |
derived from this software without specific prior written permission.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
Packit |
cc4c63 |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
Packit |
cc4c63 |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
Packit |
cc4c63 |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
Packit |
cc4c63 |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
Packit |
cc4c63 |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
Packit |
cc4c63 |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
Packit |
cc4c63 |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
Packit |
cc4c63 |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
Packit |
cc4c63 |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
Packit |
cc4c63 |
POSSIBILITY OF SUCH DAMAGE.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
This module provides lazy stream encoding/decoding facilities for UTF-8,
|
|
Packit |
cc4c63 |
the Unicode Transformation Format with 8-bit words.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
2002-09-02 Sven Moritz Hallberg <pesco@gmx.de>
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-}
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
{-
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
2007-04-30 Henning Thielemann:
|
|
Packit |
cc4c63 |
Slight changes to make decode lazy.
|
|
Packit |
cc4c63 |
The calls of 'reverse' in the original version have broken laziness
|
|
Packit |
cc4c63 |
and thus had memory leaks.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-}
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
module Data.String.UTF8
|
|
Packit |
cc4c63 |
( encode
|
|
Packit |
cc4c63 |
, decode
|
|
Packit |
cc4c63 |
, decodeEmbedErrors
|
|
Packit |
cc4c63 |
, encodeOne
|
|
Packit |
cc4c63 |
, decodeOne
|
|
Packit |
cc4c63 |
, Error
|
|
Packit |
cc4c63 |
-- Haddock does not want to document signatures with private types
|
|
Packit |
cc4c63 |
-- these functions should be moved to a utility module
|
|
Packit |
cc4c63 |
) where
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
import Data.Char (ord, chr)
|
|
Packit |
cc4c63 |
import Data.Word (Word8, Word16, Word32)
|
|
Packit |
cc4c63 |
import Data.Bits (shiftL, shiftR, (.&.), (.|.))
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
import Data.List (unfoldr)
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- - UTF-8 in General -
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Adapted from the Unicode standard, version 3.2,
|
|
Packit |
cc4c63 |
-- Table 3.1 "UTF-8 Bit Distribution" (excluded are UTF-16 encodings):
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Scalar 1st Byte 2nd Byte 3rd Byte 4th Byte
|
|
Packit |
cc4c63 |
-- 000000000xxxxxxx 0xxxxxxx
|
|
Packit |
cc4c63 |
-- 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
-- zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
-- 000uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Also from the Unicode standard, version 3.2,
|
|
Packit |
cc4c63 |
-- Table 3.1B "Legal UTF-8 Byte Sequences":
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
|
|
Packit |
cc4c63 |
-- U+0000..U+007F 00..7F
|
|
Packit |
cc4c63 |
-- U+0080..U+07FF C2..DF 80..BF
|
|
Packit |
cc4c63 |
-- U+0800..U+0FFF E0 A0..BF 80..BF
|
|
Packit |
cc4c63 |
-- U+1000..U+CFFF E1..EC 80..BF 80..BF
|
|
Packit |
cc4c63 |
-- U+D000..U+D7FF ED 80..9F 80..BF
|
|
Packit |
cc4c63 |
-- U+D800..U+DFFF ill-formed
|
|
Packit |
cc4c63 |
-- U+E000..U+FFFF EE..EF 80..BF 80..BF
|
|
Packit |
cc4c63 |
-- U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
|
Packit |
cc4c63 |
-- U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
|
Packit |
cc4c63 |
-- U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- - Encoding Functions -
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Must the encoder ensure that no illegal byte sequences are output or
|
|
Packit |
cc4c63 |
-- can we trust the Haskell system to supply only legal values?
|
|
Packit |
cc4c63 |
-- For now I include error case for the surrogate values U+D800..U+DFFF and
|
|
Packit |
cc4c63 |
-- out-of-range scalars.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- The function is pretty much a transscript of table 3.1B with error checks.
|
|
Packit |
cc4c63 |
-- It dispatches the actual encoding to functions specific to the number of
|
|
Packit |
cc4c63 |
-- required bytes.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encodeOne :: Char -> [Word8]
|
|
Packit |
cc4c63 |
encodeOne c
|
|
Packit |
cc4c63 |
-- The report guarantees in (6.1.2) that this won't happen:
|
|
Packit |
cc4c63 |
-- | n < 0 = error "encodeUTF8: ord returned a negative value"
|
|
Packit |
cc4c63 |
| n < 0x0080 = encodeOne_onebyte n8
|
|
Packit |
cc4c63 |
| n < 0x0800 = encodeOne_twobyte n16
|
|
Packit |
cc4c63 |
| n < 0xD800 = encodeOne_threebyte n16
|
|
Packit |
cc4c63 |
| n < 0xE000 = error "encodeUTF8: ord returned a surrogate value"
|
|
Packit |
cc4c63 |
| n < 0x10000 = encodeOne_threebyte n16
|
|
Packit |
cc4c63 |
-- Haskell 98 only talks about 16 bit characters, but ghc handles 20.1.
|
|
Packit |
cc4c63 |
| n < 0x10FFFF = encodeOne_fourbyte n32
|
|
Packit |
cc4c63 |
| otherwise = error "encodeUTF8: ord returned a value above 0x10FFFF"
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
n = ord c :: Int
|
|
Packit |
cc4c63 |
n8 = fromIntegral n :: Word8
|
|
Packit |
cc4c63 |
n16 = fromIntegral n :: Word16
|
|
Packit |
cc4c63 |
n32 = fromIntegral n :: Word32
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- With the above, a stream decoder is trivial:
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encode :: [Char] -> [Word8]
|
|
Packit |
cc4c63 |
encode = concatMap encodeOne
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Now follow the individual encoders for certain numbers of bytes...
|
|
Packit |
cc4c63 |
-- _
|
|
Packit |
cc4c63 |
-- / | __ ___ __ __
|
|
Packit |
cc4c63 |
-- / ^| // /__/ // //
|
|
Packit |
cc4c63 |
-- /.==| \\ //_ // //
|
|
Packit |
cc4c63 |
-- It's // || // \_/_//_//_ and it's here to stay!
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encodeOne_onebyte :: Word8 -> [Word8]
|
|
Packit |
cc4c63 |
encodeOne_onebyte cp = [cp]
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 00000yyyyyxxxxxx -> 110yyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encodeOne_twobyte :: Word16 -> [Word8]
|
|
Packit |
cc4c63 |
encodeOne_twobyte cp = [(0xC0.|.ys), (0x80.|.xs)]
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys :: Word8
|
|
Packit |
cc4c63 |
ys = fromIntegral (shiftR cp 6)
|
|
Packit |
cc4c63 |
xs = (fromIntegral cp) .&. 0x3F
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- zzzzyyyyyyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encodeOne_threebyte :: Word16 -> [Word8]
|
|
Packit |
cc4c63 |
encodeOne_threebyte cp = [(0xE0.|.zs), (0x80.|.ys), (0x80.|.xs)]
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys, zs :: Word8
|
|
Packit |
cc4c63 |
xs = (fromIntegral cp) .&. 0x3F
|
|
Packit |
cc4c63 |
ys = (fromIntegral (shiftR cp 6)) .&. 0x3F
|
|
Packit |
cc4c63 |
zs = fromIntegral (shiftR cp 12)
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 000uuuzzzzzzyyyyyyxxxxxx -> 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
encodeOne_fourbyte :: Word32 -> [Word8]
|
|
Packit |
cc4c63 |
encodeOne_fourbyte cp = [0xF0.|.us, 0x80.|.zs, 0x80.|.ys, 0x80.|.xs]
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys, zs, us :: Word8
|
|
Packit |
cc4c63 |
xs = (fromIntegral cp) .&. 0x3F
|
|
Packit |
cc4c63 |
ys = (fromIntegral (shiftR cp 6)) .&. 0x3F
|
|
Packit |
cc4c63 |
zs = (fromIntegral (shiftR cp 12)) .&. 0x3F
|
|
Packit |
cc4c63 |
us = fromIntegral (shiftR cp 18)
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- - Decoding -
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- The decoding is a bit more involved. The byte sequence could contain all
|
|
Packit |
cc4c63 |
-- sorts of corruptions. The user must be able to either notice or ignore these
|
|
Packit |
cc4c63 |
-- errors.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- I will first look at the decoding of a single character. The process
|
|
Packit |
cc4c63 |
-- consumes a certain number of bytes from the input. It returns the
|
|
Packit |
cc4c63 |
-- remaining input and either an error and the index of its occurance in the
|
|
Packit |
cc4c63 |
-- byte sequence or the decoded character.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
data Error
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- The first byte in a sequence starts with either zero, two, three, or four
|
|
Packit |
cc4c63 |
-- ones and one zero to indicate the length of the sequence. If it doesn't,
|
|
Packit |
cc4c63 |
-- it is invalid. It is dropped and the next byte interpreted as the start
|
|
Packit |
cc4c63 |
-- of a new sequence.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
= InvalidFirstByte
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- All bytes in the sequence except the first match the bit pattern 10xxxxxx.
|
|
Packit |
cc4c63 |
-- If one doesn't, it is invalid. The sequence up to that point is dropped
|
|
Packit |
cc4c63 |
-- and the "invalid" byte interpreted as the start of a new sequence. The error
|
|
Packit |
cc4c63 |
-- includes the length of the partial sequence and the number of expected bytes.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
| InvalidLaterByte Int -- the byte at relative index n was invalid
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- If a sequence ends prematurely, it has been truncated. It dropped and
|
|
Packit |
cc4c63 |
-- decoding stops. The error reports the actual and expected lengths of the
|
|
Packit |
cc4c63 |
-- sequence.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
| Truncated Int Int -- only n of m expected bytes were present
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Some sequences would represent code points which would be encoded as a
|
|
Packit |
cc4c63 |
-- shorter sequence by a conformant encoder. Such non-shortest sequences are
|
|
Packit |
cc4c63 |
-- considered erroneous and dropped. The error reports the actual and
|
|
Packit |
cc4c63 |
-- expected number of bytes used.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
| NonShortest Int Int -- n instead of m bytes were used
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Unicode code points are in the range of [0..0x10FFFF]. Any values outside
|
|
Packit |
cc4c63 |
-- of those bounds are simply invalid.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
| ValueOutOfBounds
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- There is no such thing as "surrogate pairs" any more in UTF-8. The
|
|
Packit |
cc4c63 |
-- corresponding code points now form illegal byte sequences.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
| Surrogate
|
|
Packit |
cc4c63 |
deriving (Show, Eq)
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Second, third, and fourth bytes share the common requirement to start
|
|
Packit |
cc4c63 |
-- with the bit sequence 10. So, here's the function to check that property.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
first_bits_not_10 :: Word8 -> Bool
|
|
Packit |
cc4c63 |
first_bits_not_10 b
|
|
Packit |
cc4c63 |
| (b.&.0xC0) /= 0x80 = True
|
|
Packit |
cc4c63 |
| otherwise = False
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- Erm, OK, the single-character decoding function's return type is a bit
|
|
Packit |
cc4c63 |
-- longish. It is a tripel:
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- - The first component contains the decoded character or an error
|
|
Packit |
cc4c63 |
-- if the byte sequence was erroneous.
|
|
Packit |
cc4c63 |
-- - The second component contains the number of bytes that were consumed
|
|
Packit |
cc4c63 |
-- from the input.
|
|
Packit |
cc4c63 |
-- - The third component contains the remaining bytes of input.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeOne :: [Word8] -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
decodeOne bs@(b1:rest)
|
|
Packit |
cc4c63 |
| b1 < 0x80 = decodeOne_onebyte bs
|
|
Packit |
cc4c63 |
| b1 < 0xC0 = (Left InvalidFirstByte, 1, rest)
|
|
Packit |
cc4c63 |
| b1 < 0xE0 = decodeOne_twobyte bs
|
|
Packit |
cc4c63 |
| b1 < 0xF0 = decodeOne_threebyte bs
|
|
Packit |
cc4c63 |
| b1 < 0xF5 = decodeOne_fourbyte bs
|
|
Packit |
cc4c63 |
| otherwise = (Left ValueOutOfBounds, 1, rest)
|
|
Packit |
cc4c63 |
decodeOne [] = error "UTF8.decodeOne: No input"
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 0xxxxxxx -> 000000000xxxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeOne_onebyte :: [Word8] -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
decodeOne_onebyte (b:bs) = (Right (cpToChar b), 1, bs)
|
|
Packit |
cc4c63 |
decodeOne_onebyte[] = error "UTF8.decodeOne_onebyte: No input (can't happen)"
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
cpToChar :: Integral a => a -> Char
|
|
Packit |
cc4c63 |
cpToChar = chr . fromIntegral
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 110yyyyy 10xxxxxx -> 00000yyyyyxxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeOne_twobyte :: [Word8] -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
decodeOne_twobyte (_:[])
|
|
Packit |
cc4c63 |
= (Left (Truncated 1 2), 1, [])
|
|
Packit |
cc4c63 |
decodeOne_twobyte (b1:b2:bs)
|
|
Packit |
cc4c63 |
| b1 < 0xC2 = (Left (NonShortest 2 1), 2, bs)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b2 = (Left (InvalidLaterByte 1), 1, (b2:bs))
|
|
Packit |
cc4c63 |
| otherwise = (Right (cpToChar result), 2, bs)
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys, result :: Word32
|
|
Packit |
cc4c63 |
xs = fromIntegral (b2.&.0x3F)
|
|
Packit |
cc4c63 |
ys = fromIntegral (b1.&.0x1F)
|
|
Packit |
cc4c63 |
result = shiftL ys 6 .|. xs
|
|
Packit |
cc4c63 |
decodeOne_twobyte[] = error "UTF8.decodeOne_twobyte: No input (can't happen)"
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeOne_threebyte :: [Word8] -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
decodeOne_threebyte (_:[]) = threebyte_truncated 1
|
|
Packit |
cc4c63 |
decodeOne_threebyte (_:_:[]) = threebyte_truncated 2
|
|
Packit |
cc4c63 |
decodeOne_threebyte bs@(b1:b2:b3:rest)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b2
|
|
Packit |
cc4c63 |
= (Left (InvalidLaterByte 1), 1, drop 1 bs)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b3
|
|
Packit |
cc4c63 |
= (Left (InvalidLaterByte 2), 2, drop 2 bs)
|
|
Packit |
cc4c63 |
| result < 0x0080
|
|
Packit |
cc4c63 |
= (Left (NonShortest 3 1), 3, rest)
|
|
Packit |
cc4c63 |
| result < 0x0800
|
|
Packit |
cc4c63 |
= (Left (NonShortest 3 2), 3, rest)
|
|
Packit |
cc4c63 |
| result >= 0xD800 && result < 0xE000
|
|
Packit |
cc4c63 |
= (Left Surrogate, 3, rest)
|
|
Packit |
cc4c63 |
| otherwise
|
|
Packit |
cc4c63 |
= (Right (cpToChar result), 3, rest)
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys, zs, result :: Word32
|
|
Packit |
cc4c63 |
xs = fromIntegral (b3.&.0x3F)
|
|
Packit |
cc4c63 |
ys = fromIntegral (b2.&.0x3F)
|
|
Packit |
cc4c63 |
zs = fromIntegral (b1.&.0x0F)
|
|
Packit |
cc4c63 |
result = shiftL zs 12 .|. shiftL ys 6 .|. xs
|
|
Packit |
cc4c63 |
decodeOne_threebyte[]
|
|
Packit |
cc4c63 |
= error "UTF8.decodeOne_threebyte: No input (can't happen)"
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
threebyte_truncated :: Int -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
threebyte_truncated n = (Left (Truncated n 3), n, [])
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx -> 000uuuzzzzzzyyyyyyxxxxxx
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeOne_fourbyte :: [Word8] -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
decodeOne_fourbyte (_:[]) = fourbyte_truncated 1
|
|
Packit |
cc4c63 |
decodeOne_fourbyte (_:_:[]) = fourbyte_truncated 2
|
|
Packit |
cc4c63 |
decodeOne_fourbyte (_:_:_:[]) = fourbyte_truncated 3
|
|
Packit |
cc4c63 |
decodeOne_fourbyte bs@(b1:b2:b3:b4:rest)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b2
|
|
Packit |
cc4c63 |
= (Left (InvalidLaterByte 1), 1, drop 1 bs)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b3
|
|
Packit |
cc4c63 |
= (Left (InvalidLaterByte 2), 2, drop 2 bs)
|
|
Packit |
cc4c63 |
| first_bits_not_10 b4
|
|
Packit |
cc4c63 |
= (Left (InvalidLaterByte 3), 3, drop 3 bs)
|
|
Packit |
cc4c63 |
| result < 0x0080
|
|
Packit |
cc4c63 |
= (Left (NonShortest 4 1), 4, rest)
|
|
Packit |
cc4c63 |
| result < 0x0800
|
|
Packit |
cc4c63 |
= (Left (NonShortest 4 2), 4, rest)
|
|
Packit |
cc4c63 |
| result < 0x10000
|
|
Packit |
cc4c63 |
= (Left (NonShortest 4 3), 4, rest)
|
|
Packit |
cc4c63 |
| result > 0x10FFFF
|
|
Packit |
cc4c63 |
= (Left ValueOutOfBounds, 4, rest)
|
|
Packit |
cc4c63 |
| otherwise
|
|
Packit |
cc4c63 |
= (Right (cpToChar result), 4, rest)
|
|
Packit |
cc4c63 |
where
|
|
Packit |
cc4c63 |
xs, ys, zs, us, result :: Word32
|
|
Packit |
cc4c63 |
xs = fromIntegral (b4 .&. 0x3F)
|
|
Packit |
cc4c63 |
ys = fromIntegral (b3 .&. 0x3F)
|
|
Packit |
cc4c63 |
zs = fromIntegral (b2 .&. 0x3F)
|
|
Packit |
cc4c63 |
us = fromIntegral (b1 .&. 0x07)
|
|
Packit |
cc4c63 |
result = xs .|. shiftL ys 6 .|. shiftL zs 12 .|. shiftL us 18
|
|
Packit |
cc4c63 |
decodeOne_fourbyte[]
|
|
Packit |
cc4c63 |
= error "UTF8.decodeOne_fourbyte: No input (can't happen)"
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
fourbyte_truncated :: Int -> (Either Error Char, Int, [Word8])
|
|
Packit |
cc4c63 |
fourbyte_truncated n = (Left (Truncated n 4), n, [])
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- The decoder examines all input, recording decoded characters as well as
|
|
Packit |
cc4c63 |
-- error-index pairs along the way.
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decode :: [Word8] -> ([Char], [(Error,Int)])
|
|
Packit |
cc4c63 |
decode = swap . partitionEither . decodeEmbedErrors
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
decodeEmbedErrors :: [Word8] -> [Either (Error,Int) Char]
|
|
Packit |
cc4c63 |
decodeEmbedErrors =
|
|
Packit |
cc4c63 |
unfoldr (\(pos,xs) ->
|
|
Packit |
cc4c63 |
toMaybe
|
|
Packit |
cc4c63 |
(not $ null xs)
|
|
Packit |
cc4c63 |
(let (c,n,rest) = decodeOne xs
|
|
Packit |
cc4c63 |
in (either (\err -> Left (err,pos)) Right c,
|
|
Packit |
cc4c63 |
(pos+n,rest)))) .
|
|
Packit |
cc4c63 |
(,) 0
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
swap :: (a,b) -> (b,a)
|
|
Packit |
cc4c63 |
swap (x,y) = (y,x)
|
|
Packit |
cc4c63 |
{-# INLINE swap #-}
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
partitionEither :: [Either a b] -> ([a], [b])
|
|
Packit |
cc4c63 |
partitionEither =
|
|
Packit |
cc4c63 |
foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[])
|
|
Packit |
cc4c63 |
{-# INLINE partitionEither #-}
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
toMaybe :: Bool -> a -> Maybe a
|
|
Packit |
cc4c63 |
toMaybe False _ = Nothing
|
|
Packit |
cc4c63 |
toMaybe True x = Just x
|
|
Packit |
cc4c63 |
{-# INLINE toMaybe #-}
|
|
Packit |
cc4c63 |
|
|
Packit |
cc4c63 |
-- ------------------------------------------------------------
|