Blame src/Data/String/UTF8.hs

Packit cc4c63
{-
Packit cc4c63
Packit cc4c63
Copyright (c) 2002, members of the Haskell Internationalisation Working
Packit cc4c63
Group All rights reserved.
Packit cc4c63
Packit cc4c63
Redistribution and use in source and binary forms, with or without
Packit cc4c63
modification, are permitted provided that the following conditions are met:
Packit cc4c63
Packit cc4c63
* Redistributions of source code must retain the above copyright notice,
Packit cc4c63
   this list of conditions and the following disclaimer.
Packit cc4c63
* Redistributions in binary form must reproduce the above copyright notice,
Packit cc4c63
   this list of conditions and the following disclaimer in the
Packit cc4c63
   documentation and/or other materials provided with the distribution.
Packit cc4c63
* Neither the name of the Haskell Internationalisation Working Group nor
Packit cc4c63
   the names of its contributors may be used to endorse or promote products
Packit cc4c63
   derived from this software without specific prior written permission.
Packit cc4c63
Packit cc4c63
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
Packit cc4c63
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
Packit cc4c63
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Packit cc4c63
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
Packit cc4c63
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
Packit cc4c63
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
Packit cc4c63
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
Packit cc4c63
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
Packit cc4c63
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
Packit cc4c63
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
Packit cc4c63
POSSIBILITY OF SUCH DAMAGE.
Packit cc4c63
Packit cc4c63
This module provides lazy stream encoding/decoding facilities for UTF-8,
Packit cc4c63
the Unicode Transformation Format with 8-bit words.
Packit cc4c63
Packit cc4c63
2002-09-02  Sven Moritz Hallberg <pesco@gmx.de>
Packit cc4c63
Packit cc4c63
-}
Packit cc4c63
Packit cc4c63
{-
Packit cc4c63
Packit cc4c63
2007-04-30 Henning Thielemann:
Packit cc4c63
Slight changes to make decode lazy.
Packit cc4c63
The calls of 'reverse' in the original version have broken laziness
Packit cc4c63
and thus had memory leaks.
Packit cc4c63
Packit cc4c63
-}
Packit cc4c63
Packit cc4c63
module Data.String.UTF8
Packit cc4c63
  ( encode
Packit cc4c63
  , decode
Packit cc4c63
  , decodeEmbedErrors
Packit cc4c63
  , encodeOne
Packit cc4c63
  , decodeOne
Packit cc4c63
  , Error
Packit cc4c63
         -- Haddock does not want to document signatures with private types
Packit cc4c63
         -- these functions should be moved to a utility module
Packit cc4c63
  ) where
Packit cc4c63
Packit cc4c63
import Data.Char (ord, chr)
Packit cc4c63
import Data.Word (Word8, Word16, Word32)
Packit cc4c63
import Data.Bits (shiftL, shiftR, (.&.), (.|.))
Packit cc4c63
Packit cc4c63
import Data.List (unfoldr)
Packit cc4c63
Packit cc4c63
-- - UTF-8 in General -
Packit cc4c63
Packit cc4c63
-- Adapted from the Unicode standard, version 3.2,
Packit cc4c63
-- Table 3.1 "UTF-8 Bit Distribution" (excluded are UTF-16 encodings):
Packit cc4c63
Packit cc4c63
--   Scalar                    1st Byte  2nd Byte  3rd Byte  4th Byte
Packit cc4c63
--           000000000xxxxxxx  0xxxxxxx
Packit cc4c63
--           00000yyyyyxxxxxx  110yyyyy  10xxxxxx
Packit cc4c63
--           zzzzyyyyyyxxxxxx  1110zzzz  10yyyyyy  10xxxxxx
Packit cc4c63
--   000uuuzzzzzzyyyyyyxxxxxx  11110uuu  10zzzzzz  10yyyyyy  10xxxxxx
Packit cc4c63
Packit cc4c63
-- Also from the Unicode standard, version 3.2,
Packit cc4c63
-- Table 3.1B "Legal UTF-8 Byte Sequences":
Packit cc4c63
Packit cc4c63
--   Code Points         1st Byte  2nd Byte  3rd Byte  4th Byte
Packit cc4c63
--     U+0000..U+007F    00..7F
Packit cc4c63
--     U+0080..U+07FF    C2..DF    80..BF
Packit cc4c63
--     U+0800..U+0FFF    E0        A0..BF    80..BF
Packit cc4c63
--     U+1000..U+CFFF    E1..EC    80..BF    80..BF
Packit cc4c63
--     U+D000..U+D7FF    ED        80..9F    80..BF
Packit cc4c63
--     U+D800..U+DFFF    ill-formed
Packit cc4c63
--     U+E000..U+FFFF    EE..EF    80..BF    80..BF
Packit cc4c63
--    U+10000..U+3FFFF   F0        90..BF    80..BF    80..BF
Packit cc4c63
--    U+40000..U+FFFFF   F1..F3    80..BF    80..BF    80..BF
Packit cc4c63
--   U+100000..U+10FFFF  F4        80..8F    80..BF    80..BF
Packit cc4c63
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- - Encoding Functions -
Packit cc4c63
Packit cc4c63
-- Must the encoder ensure that no illegal byte sequences are output or
Packit cc4c63
-- can we trust the Haskell system to supply only legal values?
Packit cc4c63
-- For now I include error case for the surrogate values U+D800..U+DFFF and
Packit cc4c63
-- out-of-range scalars.
Packit cc4c63
Packit cc4c63
-- The function is pretty much a transscript of table 3.1B with error checks.
Packit cc4c63
-- It dispatches the actual encoding to functions specific to the number of
Packit cc4c63
-- required bytes.
Packit cc4c63
Packit cc4c63
encodeOne :: Char -> [Word8]
Packit cc4c63
encodeOne c
Packit cc4c63
    -- The report guarantees in (6.1.2) that this won't happen:
Packit cc4c63
    --   | n < 0       = error "encodeUTF8: ord returned a negative value"
Packit cc4c63
    | n < 0x0080  = encodeOne_onebyte n8
Packit cc4c63
    | n < 0x0800  = encodeOne_twobyte n16
Packit cc4c63
    | n < 0xD800  = encodeOne_threebyte n16
Packit cc4c63
    | n < 0xE000  = error "encodeUTF8: ord returned a surrogate value"
Packit cc4c63
    | n < 0x10000       = encodeOne_threebyte n16
Packit cc4c63
    -- Haskell 98 only talks about 16 bit characters, but ghc handles 20.1.
Packit cc4c63
    | n < 0x10FFFF      = encodeOne_fourbyte n32
Packit cc4c63
    | otherwise  = error "encodeUTF8: ord returned a value above 0x10FFFF"
Packit cc4c63
    where
Packit cc4c63
    n = ord c            :: Int
Packit cc4c63
    n8 = fromIntegral n  :: Word8
Packit cc4c63
    n16 = fromIntegral n :: Word16
Packit cc4c63
    n32 = fromIntegral n :: Word32
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- With the above, a stream decoder is trivial:
Packit cc4c63
Packit cc4c63
encode :: [Char] -> [Word8]
Packit cc4c63
encode = concatMap encodeOne
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- Now follow the individual encoders for certain numbers of bytes...
Packit cc4c63
--           _
Packit cc4c63
--          / |  __  ___  __ __
Packit cc4c63
--         / ^| //  /__/ // //
Packit cc4c63
--        /.==| \\ //_  // //
Packit cc4c63
-- It's  //  || // \_/_//_//_  and it's here to stay!
Packit cc4c63
Packit cc4c63
encodeOne_onebyte :: Word8 -> [Word8]
Packit cc4c63
encodeOne_onebyte cp = [cp]
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 00000yyyyyxxxxxx -> 110yyyyy 10xxxxxx
Packit cc4c63
Packit cc4c63
encodeOne_twobyte :: Word16 -> [Word8]
Packit cc4c63
encodeOne_twobyte cp = [(0xC0.|.ys), (0x80.|.xs)]
Packit cc4c63
    where
Packit cc4c63
    xs, ys :: Word8
Packit cc4c63
    ys = fromIntegral (shiftR cp 6)
Packit cc4c63
    xs = (fromIntegral cp) .&. 0x3F
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- zzzzyyyyyyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx
Packit cc4c63
Packit cc4c63
encodeOne_threebyte :: Word16 -> [Word8]
Packit cc4c63
encodeOne_threebyte cp = [(0xE0.|.zs), (0x80.|.ys), (0x80.|.xs)]
Packit cc4c63
    where
Packit cc4c63
    xs, ys, zs :: Word8
Packit cc4c63
    xs = (fromIntegral cp) .&. 0x3F
Packit cc4c63
    ys = (fromIntegral (shiftR cp 6)) .&. 0x3F
Packit cc4c63
    zs = fromIntegral (shiftR cp 12)
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 000uuuzzzzzzyyyyyyxxxxxx -> 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
Packit cc4c63
Packit cc4c63
encodeOne_fourbyte :: Word32 -> [Word8]
Packit cc4c63
encodeOne_fourbyte cp = [0xF0.|.us, 0x80.|.zs, 0x80.|.ys, 0x80.|.xs]
Packit cc4c63
    where
Packit cc4c63
    xs, ys, zs, us :: Word8
Packit cc4c63
    xs = (fromIntegral cp) .&. 0x3F
Packit cc4c63
    ys = (fromIntegral (shiftR cp 6)) .&. 0x3F
Packit cc4c63
    zs = (fromIntegral (shiftR cp 12)) .&. 0x3F
Packit cc4c63
    us = fromIntegral (shiftR cp 18)
Packit cc4c63
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- - Decoding -
Packit cc4c63
Packit cc4c63
-- The decoding is a bit more involved. The byte sequence could contain all
Packit cc4c63
-- sorts of corruptions. The user must be able to either notice or ignore these
Packit cc4c63
-- errors.
Packit cc4c63
Packit cc4c63
-- I will first look at the decoding of a single character. The process
Packit cc4c63
-- consumes a certain number of bytes from the input. It returns the
Packit cc4c63
-- remaining input and either an error and the index of its occurance in the
Packit cc4c63
-- byte sequence or the decoded character.
Packit cc4c63
Packit cc4c63
data Error
Packit cc4c63
Packit cc4c63
-- The first byte in a sequence starts with either zero, two, three, or four
Packit cc4c63
-- ones and one zero to indicate the length of the sequence. If it doesn't,
Packit cc4c63
-- it is invalid. It is dropped and the next byte interpreted as the start
Packit cc4c63
-- of a new sequence.
Packit cc4c63
Packit cc4c63
    = InvalidFirstByte
Packit cc4c63
Packit cc4c63
-- All bytes in the sequence except the first match the bit pattern 10xxxxxx.
Packit cc4c63
-- If one doesn't, it is invalid. The sequence up to that point is dropped
Packit cc4c63
-- and the "invalid" byte interpreted as the start of a new sequence. The error
Packit cc4c63
-- includes the length of the partial sequence and the number of expected bytes.
Packit cc4c63
Packit cc4c63
    | InvalidLaterByte Int      -- the byte at relative index n was invalid
Packit cc4c63
Packit cc4c63
-- If a sequence ends prematurely, it has been truncated. It dropped and
Packit cc4c63
-- decoding stops. The error reports the actual and expected lengths of the
Packit cc4c63
-- sequence.
Packit cc4c63
Packit cc4c63
    | Truncated Int Int         -- only n of m expected bytes were present
Packit cc4c63
Packit cc4c63
-- Some sequences would represent code points which would be encoded as a
Packit cc4c63
-- shorter sequence by a conformant encoder. Such non-shortest sequences are
Packit cc4c63
-- considered erroneous and dropped. The error reports the actual and
Packit cc4c63
-- expected number of bytes used.
Packit cc4c63
Packit cc4c63
    | NonShortest Int Int       -- n instead of m bytes were used
Packit cc4c63
Packit cc4c63
-- Unicode code points are in the range of [0..0x10FFFF]. Any values outside
Packit cc4c63
-- of those bounds are simply invalid.
Packit cc4c63
Packit cc4c63
    | ValueOutOfBounds
Packit cc4c63
Packit cc4c63
-- There is no such thing as "surrogate pairs" any more in UTF-8. The
Packit cc4c63
-- corresponding code points now form illegal byte sequences.
Packit cc4c63
Packit cc4c63
    | Surrogate
Packit cc4c63
      deriving (Show, Eq)
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- Second, third, and fourth bytes share the common requirement to start
Packit cc4c63
-- with the bit sequence 10. So, here's the function to check that property.
Packit cc4c63
Packit cc4c63
first_bits_not_10 :: Word8 -> Bool
Packit cc4c63
first_bits_not_10 b
Packit cc4c63
    | (b.&.0xC0) /= 0x80  = True
Packit cc4c63
    | otherwise           = False
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- Erm, OK, the single-character decoding function's return type is a bit
Packit cc4c63
-- longish. It is a tripel:
Packit cc4c63
Packit cc4c63
--  - The first component contains the decoded character or an error
Packit cc4c63
--    if the byte sequence was erroneous.
Packit cc4c63
--  - The second component contains the number of bytes that were consumed
Packit cc4c63
--    from the input.
Packit cc4c63
--  - The third component contains the remaining bytes of input.
Packit cc4c63
Packit cc4c63
decodeOne :: [Word8] -> (Either Error Char, Int, [Word8])
Packit cc4c63
decodeOne bs@(b1:rest)
Packit cc4c63
    | b1 < 0x80   = decodeOne_onebyte bs
Packit cc4c63
    | b1 < 0xC0   = (Left InvalidFirstByte, 1, rest)
Packit cc4c63
    | b1 < 0xE0   = decodeOne_twobyte bs
Packit cc4c63
    | b1 < 0xF0   = decodeOne_threebyte bs
Packit cc4c63
    | b1 < 0xF5   = decodeOne_fourbyte bs
Packit cc4c63
    | otherwise   = (Left ValueOutOfBounds, 1, rest)
Packit cc4c63
decodeOne [] = error "UTF8.decodeOne: No input"
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 0xxxxxxx -> 000000000xxxxxxx
Packit cc4c63
Packit cc4c63
decodeOne_onebyte :: [Word8] -> (Either Error Char, Int, [Word8])
Packit cc4c63
decodeOne_onebyte (b:bs) = (Right (cpToChar b), 1, bs)
Packit cc4c63
decodeOne_onebyte[] = error "UTF8.decodeOne_onebyte: No input (can't happen)"
Packit cc4c63
Packit cc4c63
cpToChar :: Integral a => a -> Char
Packit cc4c63
cpToChar = chr . fromIntegral
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 110yyyyy 10xxxxxx -> 00000yyyyyxxxxxx
Packit cc4c63
Packit cc4c63
decodeOne_twobyte :: [Word8] -> (Either Error Char, Int, [Word8])
Packit cc4c63
decodeOne_twobyte (_:[])
Packit cc4c63
    = (Left (Truncated 1 2), 1, [])
Packit cc4c63
decodeOne_twobyte (b1:b2:bs)
Packit cc4c63
    | b1 < 0xC2            = (Left (NonShortest 2 1), 2, bs)
Packit cc4c63
    | first_bits_not_10 b2 = (Left (InvalidLaterByte 1), 1, (b2:bs))
Packit cc4c63
    | otherwise            = (Right (cpToChar result), 2, bs)
Packit cc4c63
    where
Packit cc4c63
    xs, ys, result :: Word32
Packit cc4c63
    xs = fromIntegral (b2.&.0x3F)
Packit cc4c63
    ys = fromIntegral (b1.&.0x1F)
Packit cc4c63
    result = shiftL ys 6 .|. xs
Packit cc4c63
decodeOne_twobyte[] = error "UTF8.decodeOne_twobyte: No input (can't happen)"
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx
Packit cc4c63
Packit cc4c63
decodeOne_threebyte :: [Word8] -> (Either Error Char, Int, [Word8])
Packit cc4c63
decodeOne_threebyte (_:[])   = threebyte_truncated 1
Packit cc4c63
decodeOne_threebyte (_:_:[]) = threebyte_truncated 2
Packit cc4c63
decodeOne_threebyte bs@(b1:b2:b3:rest)
Packit cc4c63
    | first_bits_not_10 b2
Packit cc4c63
        = (Left (InvalidLaterByte 1), 1, drop 1 bs)
Packit cc4c63
    | first_bits_not_10 b3
Packit cc4c63
        = (Left (InvalidLaterByte 2), 2, drop 2 bs)
Packit cc4c63
    | result < 0x0080
Packit cc4c63
        = (Left (NonShortest 3 1), 3, rest)
Packit cc4c63
    | result < 0x0800
Packit cc4c63
        = (Left (NonShortest 3 2), 3, rest)
Packit cc4c63
    | result >= 0xD800 && result < 0xE000
Packit cc4c63
        = (Left Surrogate, 3, rest)
Packit cc4c63
    | otherwise
Packit cc4c63
        = (Right (cpToChar result), 3, rest)
Packit cc4c63
    where
Packit cc4c63
    xs, ys, zs, result :: Word32
Packit cc4c63
    xs = fromIntegral (b3.&.0x3F)
Packit cc4c63
    ys = fromIntegral (b2.&.0x3F)
Packit cc4c63
    zs = fromIntegral (b1.&.0x0F)
Packit cc4c63
    result = shiftL zs 12 .|. shiftL ys 6 .|. xs
Packit cc4c63
decodeOne_threebyte[]
Packit cc4c63
 = error "UTF8.decodeOne_threebyte: No input (can't happen)"
Packit cc4c63
Packit cc4c63
threebyte_truncated :: Int -> (Either Error Char, Int, [Word8])
Packit cc4c63
threebyte_truncated n = (Left (Truncated n 3), n, [])
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx -> 000uuuzzzzzzyyyyyyxxxxxx
Packit cc4c63
Packit cc4c63
decodeOne_fourbyte :: [Word8] -> (Either Error Char, Int, [Word8])
Packit cc4c63
decodeOne_fourbyte (_:[])     = fourbyte_truncated 1
Packit cc4c63
decodeOne_fourbyte (_:_:[])   = fourbyte_truncated 2
Packit cc4c63
decodeOne_fourbyte (_:_:_:[]) = fourbyte_truncated 3
Packit cc4c63
decodeOne_fourbyte bs@(b1:b2:b3:b4:rest)
Packit cc4c63
    | first_bits_not_10 b2
Packit cc4c63
        = (Left (InvalidLaterByte 1), 1, drop 1 bs)
Packit cc4c63
    | first_bits_not_10 b3
Packit cc4c63
        = (Left (InvalidLaterByte 2), 2, drop 2 bs)
Packit cc4c63
    | first_bits_not_10 b4
Packit cc4c63
        = (Left (InvalidLaterByte 3), 3, drop 3 bs)
Packit cc4c63
    | result < 0x0080
Packit cc4c63
        = (Left (NonShortest 4 1), 4, rest)
Packit cc4c63
    | result < 0x0800
Packit cc4c63
        = (Left (NonShortest 4 2), 4, rest)
Packit cc4c63
    | result < 0x10000
Packit cc4c63
        = (Left (NonShortest 4 3), 4, rest)
Packit cc4c63
    | result > 0x10FFFF
Packit cc4c63
        = (Left ValueOutOfBounds, 4, rest)
Packit cc4c63
    | otherwise
Packit cc4c63
        = (Right (cpToChar result), 4, rest)
Packit cc4c63
    where
Packit cc4c63
    xs, ys, zs, us, result :: Word32
Packit cc4c63
    xs = fromIntegral (b4 .&. 0x3F)
Packit cc4c63
    ys = fromIntegral (b3 .&. 0x3F)
Packit cc4c63
    zs = fromIntegral (b2 .&. 0x3F)
Packit cc4c63
    us = fromIntegral (b1 .&. 0x07)
Packit cc4c63
    result = xs .|. shiftL ys 6 .|. shiftL zs 12 .|. shiftL us 18
Packit cc4c63
decodeOne_fourbyte[]
Packit cc4c63
 = error "UTF8.decodeOne_fourbyte: No input (can't happen)"
Packit cc4c63
Packit cc4c63
fourbyte_truncated :: Int -> (Either Error Char, Int, [Word8])
Packit cc4c63
fourbyte_truncated n = (Left (Truncated n 4), n, [])
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- The decoder examines all input, recording decoded characters as well as
Packit cc4c63
-- error-index pairs along the way.
Packit cc4c63
Packit cc4c63
decode :: [Word8] -> ([Char], [(Error,Int)])
Packit cc4c63
decode = swap . partitionEither . decodeEmbedErrors
Packit cc4c63
Packit cc4c63
decodeEmbedErrors :: [Word8] -> [Either (Error,Int) Char]
Packit cc4c63
decodeEmbedErrors =
Packit cc4c63
   unfoldr (\(pos,xs) ->
Packit cc4c63
       toMaybe
Packit cc4c63
          (not $ null xs)
Packit cc4c63
          (let (c,n,rest) = decodeOne xs
Packit cc4c63
           in  (either (\err -> Left (err,pos)) Right c,
Packit cc4c63
                (pos+n,rest)))) .
Packit cc4c63
   (,) 0
Packit cc4c63
Packit cc4c63
swap :: (a,b) -> (b,a)
Packit cc4c63
swap (x,y) = (y,x)
Packit cc4c63
{-# INLINE swap #-}
Packit cc4c63
Packit cc4c63
partitionEither :: [Either a b] -> ([a], [b])
Packit cc4c63
partitionEither =
Packit cc4c63
   foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[])
Packit cc4c63
{-# INLINE partitionEither #-}
Packit cc4c63
Packit cc4c63
toMaybe :: Bool -> a -> Maybe a
Packit cc4c63
toMaybe False _ = Nothing
Packit cc4c63
toMaybe True  x = Just x
Packit cc4c63
{-# INLINE toMaybe #-}
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------