Blame src/Data/String/Unicode.hs

Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
{- |
Packit cc4c63
   Module     : Data.String.Unicode
Packit cc4c63
   Copyright  : Copyright (C) 2010- Uwe Schmidt
Packit cc4c63
   License    : MIT
Packit cc4c63
Packit cc4c63
   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
Packit cc4c63
   Stability  : stable
Packit cc4c63
   Portability: portable
Packit cc4c63
Packit cc4c63
   Unicode and UTF-8 Conversion Functions
Packit cc4c63
Packit cc4c63
-}
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
module Data.String.Unicode
Packit cc4c63
    (
Packit cc4c63
     -- * Unicode Type declarations
Packit cc4c63
     Unicode,
Packit cc4c63
     UString,
Packit cc4c63
     UTF8Char,
Packit cc4c63
     UTF8String,
Packit cc4c63
     UStringWithErrors,
Packit cc4c63
     DecodingFct,
Packit cc4c63
     DecodingFctEmbedErrors,
Packit cc4c63
Packit cc4c63
      utf8ToUnicode
Packit cc4c63
    , utf8ToUnicodeEmbedErrors
Packit cc4c63
    , latin1ToUnicode
Packit cc4c63
    , ucs2ToUnicode
Packit cc4c63
    , ucs2BigEndianToUnicode
Packit cc4c63
    , ucs2LittleEndianToUnicode
Packit cc4c63
    , utf16beToUnicode
Packit cc4c63
    , utf16leToUnicode
Packit cc4c63
Packit cc4c63
    , unicodeCharToUtf8
Packit cc4c63
Packit cc4c63
    , unicodeToUtf8
Packit cc4c63
    , unicodeToXmlEntity
Packit cc4c63
    , unicodeToLatin1
Packit cc4c63
    , unicodeRemoveNoneAscii
Packit cc4c63
    , unicodeRemoveNoneLatin1
Packit cc4c63
Packit cc4c63
    , intToCharRef
Packit cc4c63
    , intToCharRefHex
Packit cc4c63
    , intToHexString
Packit cc4c63
Packit cc4c63
    , getDecodingFct
Packit cc4c63
    , getDecodingFctEmbedErrors
Packit cc4c63
    , getOutputEncodingFct
Packit cc4c63
Packit cc4c63
    , normalizeNL
Packit cc4c63
    , guessEncoding
Packit cc4c63
Packit cc4c63
    , getOutputEncodingFct'
Packit cc4c63
    , unicodeCharToUtf8'
Packit cc4c63
    , unicodeCharToXmlEntity'
Packit cc4c63
    , unicodeCharToLatin1'
Packit cc4c63
    )
Packit cc4c63
where
Packit cc4c63
Packit cc4c63
import           Data.Char                         (toUpper)
Packit cc4c63
Packit cc4c63
import           Data.Char.IsoLatinTables
Packit cc4c63
import           Data.Char.Properties.XMLCharProps (isXml1ByteChar,
Packit cc4c63
                                                    isXmlLatin1Char)
Packit cc4c63
Packit cc4c63
import           Data.String.EncodingNames
Packit cc4c63
import           Data.String.UTF8Decoding          (decodeUtf8,
Packit cc4c63
                                                    decodeUtf8EmbedErrors)
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- | Unicode is represented as the Char type
Packit cc4c63
--   Precondition for this is the support of Unicode character range
Packit cc4c63
--   in the compiler (e.g. ghc but not hugs)
Packit cc4c63
Packit cc4c63
type Unicode    = Char
Packit cc4c63
Packit cc4c63
-- | the type for Unicode strings
Packit cc4c63
Packit cc4c63
type UString    = [Unicode]
Packit cc4c63
Packit cc4c63
-- | UTF-8 charachters are represented by the Char type
Packit cc4c63
Packit cc4c63
type UTF8Char   = Char
Packit cc4c63
Packit cc4c63
-- | UTF-8 strings are implemented as Haskell strings
Packit cc4c63
Packit cc4c63
type UTF8String = String
Packit cc4c63
Packit cc4c63
-- | Decoding function with a pair containing the result string and a list of decoding errors as result
Packit cc4c63
Packit cc4c63
type DecodingFct = String -> (UString, [String])
Packit cc4c63
Packit cc4c63
type UStringWithErrors = [Either String Char]
Packit cc4c63
Packit cc4c63
-- | Decoding function where decoding errors are interleaved with decoded characters
Packit cc4c63
Packit cc4c63
type DecodingFctEmbedErrors = String -> UStringWithErrors
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- conversion from Unicode strings (UString) to UTF8 encoded strings.
Packit cc4c63
Packit cc4c63
unicodeToUtf8           :: UString -> UTF8String
Packit cc4c63
unicodeToUtf8           = concatMap unicodeCharToUtf8
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- conversion from Unicode (Char) to a UTF8 encoded string.
Packit cc4c63
Packit cc4c63
unicodeCharToUtf8       :: Unicode -> UTF8String
Packit cc4c63
unicodeCharToUtf8 c
Packit cc4c63
    | i >= 0          && i <= 0x0000007F        -- 1 byte UTF8 (7 bits)
Packit cc4c63
        = [ toEnum i ]
Packit cc4c63
    | i >= 0x00000080 && i <= 0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
Packit cc4c63
        = [ toEnum (0xC0 + i `div` 0x40)
Packit cc4c63
          , toEnum (0x80 + i                  `mod` 0x40)
Packit cc4c63
          ]
Packit cc4c63
    | i >= 0x00000800 && i <= 0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
Packit cc4c63
        = [ toEnum (0xE0 +  i `div`   0x1000)
Packit cc4c63
          , toEnum (0x80 + (i `div`     0x40) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 +  i                 `mod` 0x40)
Packit cc4c63
          ]
Packit cc4c63
    | i >= 0x00010000 && i <= 0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits)
Packit cc4c63
        = [ toEnum (0xF0 +  i `div`    0x40000)
Packit cc4c63
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 +  i                   `mod` 0x40)
Packit cc4c63
          ]
Packit cc4c63
    | i >= 0x00200000 && i <= 0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits)
Packit cc4c63
        = [ toEnum (0xF8 +  i `div`  0x1000000)
Packit cc4c63
          , toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 +  i                   `mod` 0x40)
Packit cc4c63
          ]
Packit cc4c63
    | i >= 0x04000000 && i <= 0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits)
Packit cc4c63
        = [ toEnum (0xFC +  i `div` 0x40000000)
Packit cc4c63
          , toEnum (0x80 + (i `div`  0x1000000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
Packit cc4c63
          , toEnum (0x80 +  i                   `mod` 0x40)
Packit cc4c63
          ]
Packit cc4c63
    | otherwise                                 -- other values not supported
Packit cc4c63
        = error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
Packit cc4c63
    where
Packit cc4c63
    i = fromEnum c
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- code conversion from latin1 to Unicode
Packit cc4c63
Packit cc4c63
latin1ToUnicode :: String -> UString
Packit cc4c63
latin1ToUnicode = id
Packit cc4c63
Packit cc4c63
latinToUnicode  :: [(Char, Char)] -> String -> UString
Packit cc4c63
latinToUnicode tt
Packit cc4c63
    = map charToUni
Packit cc4c63
    where
Packit cc4c63
    charToUni c =
Packit cc4c63
       foldr (\(src,dst) r ->
Packit cc4c63
          case compare c src of
Packit cc4c63
             EQ -> dst
Packit cc4c63
             LT -> c {- not found in table -}
Packit cc4c63
             GT -> r) c tt
Packit cc4c63
Packit cc4c63
-- | conversion from ASCII to unicode with check for legal ASCII char set
Packit cc4c63
--
Packit cc4c63
-- Structure of decoding function copied from 'Data.Char.UTF8.decode'.
Packit cc4c63
Packit cc4c63
decodeAscii     :: DecodingFct
Packit cc4c63
decodeAscii
Packit cc4c63
    = swap . partitionEither . decodeAsciiEmbedErrors
Packit cc4c63
Packit cc4c63
decodeAsciiEmbedErrors  :: String -> UStringWithErrors
Packit cc4c63
decodeAsciiEmbedErrors str
Packit cc4c63
    = map (\(c,pos) -> if isValid c
Packit cc4c63
                         then Right c
Packit cc4c63
                         else Left (toErrStr c pos)) posStr
Packit cc4c63
    where
Packit cc4c63
    posStr = zip str [(0::Int)..]
Packit cc4c63
    toErrStr errChr pos
Packit cc4c63
        = " at input position " ++ show pos ++ ": none ASCII char " ++ show errChr
Packit cc4c63
    isValid x = x < '\x80'
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UCS-2 big endian to Unicode conversion
Packit cc4c63
Packit cc4c63
ucs2BigEndianToUnicode  :: String -> UString
Packit cc4c63
Packit cc4c63
ucs2BigEndianToUnicode (b : l : r)
Packit cc4c63
    = toEnum (fromEnum b * 256 + fromEnum l) : ucs2BigEndianToUnicode r
Packit cc4c63
Packit cc4c63
ucs2BigEndianToUnicode []
Packit cc4c63
    = []
Packit cc4c63
Packit cc4c63
ucs2BigEndianToUnicode _
Packit cc4c63
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
Packit cc4c63
                                        -- is ignored (garbage in, garbage out)
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UCS-2 little endian to Unicode conversion
Packit cc4c63
Packit cc4c63
ucs2LittleEndianToUnicode       :: String -> UString
Packit cc4c63
Packit cc4c63
ucs2LittleEndianToUnicode (l : b : r)
Packit cc4c63
    = toEnum (fromEnum b * 256 + fromEnum l) : ucs2LittleEndianToUnicode r
Packit cc4c63
Packit cc4c63
ucs2LittleEndianToUnicode []
Packit cc4c63
    = []
Packit cc4c63
Packit cc4c63
ucs2LittleEndianToUnicode [_]
Packit cc4c63
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
Packit cc4c63
                                        -- is ignored
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UCS-2 to UTF-8 conversion with byte order mark analysis
Packit cc4c63
Packit cc4c63
ucs2ToUnicode           :: String -> UString
Packit cc4c63
Packit cc4c63
ucs2ToUnicode ('\xFE':'\xFF':s)         -- 2 byte mark for big endian encoding
Packit cc4c63
    = ucs2BigEndianToUnicode s
Packit cc4c63
Packit cc4c63
ucs2ToUnicode ('\xFF':'\xFE':s)         -- 2 byte mark for little endian encoding
Packit cc4c63
    = ucs2LittleEndianToUnicode s
Packit cc4c63
Packit cc4c63
ucs2ToUnicode s
Packit cc4c63
    = ucs2BigEndianToUnicode s          -- default: big endian
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1
Packit cc4c63
Packit cc4c63
utf8ToUnicode           :: DecodingFct
Packit cc4c63
Packit cc4c63
utf8ToUnicode ('\xEF':'\xBB':'\xBF':s)  -- remove byte order mark ( XML standard F.1 )
Packit cc4c63
    = decodeUtf8 s
Packit cc4c63
Packit cc4c63
utf8ToUnicode s
Packit cc4c63
    = decodeUtf8 s
Packit cc4c63
Packit cc4c63
utf8ToUnicodeEmbedErrors        :: DecodingFctEmbedErrors
Packit cc4c63
Packit cc4c63
utf8ToUnicodeEmbedErrors ('\xEF':'\xBB':'\xBF':s)       -- remove byte order mark ( XML standard F.1 )
Packit cc4c63
    = decodeUtf8EmbedErrors s
Packit cc4c63
Packit cc4c63
utf8ToUnicodeEmbedErrors s
Packit cc4c63
    = decodeUtf8EmbedErrors s
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UTF-16 big endian to UTF-8 conversion with removal of byte order mark
Packit cc4c63
Packit cc4c63
utf16beToUnicode                :: String -> UString
Packit cc4c63
Packit cc4c63
utf16beToUnicode ('\xFE':'\xFF':s)              -- remove byte order mark
Packit cc4c63
    = ucs2BigEndianToUnicode s
Packit cc4c63
Packit cc4c63
utf16beToUnicode s
Packit cc4c63
    = ucs2BigEndianToUnicode s
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- UTF-16 little endian to UTF-8 conversion with removal of byte order mark
Packit cc4c63
Packit cc4c63
utf16leToUnicode                :: String -> UString
Packit cc4c63
Packit cc4c63
utf16leToUnicode ('\xFF':'\xFE':s)              -- remove byte order mark
Packit cc4c63
    = ucs2LittleEndianToUnicode s
Packit cc4c63
Packit cc4c63
utf16leToUnicode s
Packit cc4c63
    = ucs2LittleEndianToUnicode s
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- substitute all Unicode characters, that are not legal 1-byte
Packit cc4c63
-- UTF-8 XML characters by a character reference.
Packit cc4c63
--
Packit cc4c63
-- This function can be used to translate all text nodes and
Packit cc4c63
-- attribute values into pure ascii.
Packit cc4c63
--
Packit cc4c63
-- see also : 'unicodeToLatin1'
Packit cc4c63
Packit cc4c63
unicodeToXmlEntity      :: UString -> String
Packit cc4c63
unicodeToXmlEntity
Packit cc4c63
    = escape isXml1ByteChar (intToCharRef . fromEnum)
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- substitute all Unicode characters, that are not legal latin1
Packit cc4c63
-- UTF-8 XML characters by a character reference.
Packit cc4c63
--
Packit cc4c63
-- This function can be used to translate all text nodes and
Packit cc4c63
-- attribute values into ISO latin1.
Packit cc4c63
--
Packit cc4c63
-- see also : 'unicodeToXmlEntity'
Packit cc4c63
Packit cc4c63
unicodeToLatin1 :: UString -> String
Packit cc4c63
unicodeToLatin1
Packit cc4c63
    = escape isXmlLatin1Char (intToCharRef . fromEnum)
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- substitute selected characters
Packit cc4c63
-- The @check@ function returns 'True' whenever a character needs to substitution
Packit cc4c63
-- The function @esc@ computes a substitute.
Packit cc4c63
Packit cc4c63
escape :: (Unicode -> Bool) -> (Unicode -> String) -> UString -> String
Packit cc4c63
escape check esc =
Packit cc4c63
    concatMap (\uc -> if check uc then [uc] else esc uc)
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- removes all non ascii chars, may be used to transform
Packit cc4c63
-- a document into a pure ascii representation by removing
Packit cc4c63
-- all non ascii chars from tag and attibute names
Packit cc4c63
--
Packit cc4c63
-- see also : 'unicodeRemoveNoneLatin1', 'unicodeToXmlEntity'
Packit cc4c63
Packit cc4c63
unicodeRemoveNoneAscii  :: UString -> String
Packit cc4c63
unicodeRemoveNoneAscii
Packit cc4c63
    = filter isXml1ByteChar
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- removes all non latin1 chars, may be used to transform
Packit cc4c63
-- a document into a pure ascii representation by removing
Packit cc4c63
-- all non ascii chars from tag and attibute names
Packit cc4c63
--
Packit cc4c63
-- see also : 'unicodeRemoveNoneAscii', 'unicodeToLatin1'
Packit cc4c63
Packit cc4c63
unicodeRemoveNoneLatin1 :: UString -> String
Packit cc4c63
unicodeRemoveNoneLatin1
Packit cc4c63
    = filter isXmlLatin1Char
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- convert an Unicode into a XML character reference.
Packit cc4c63
--
Packit cc4c63
-- see also : 'intToCharRefHex'
Packit cc4c63
Packit cc4c63
intToCharRef            :: Int -> String
Packit cc4c63
intToCharRef i
Packit cc4c63
    = "&#" ++ show i ++ ";"
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- convert an Unicode into a XML hexadecimal character reference.
Packit cc4c63
--
Packit cc4c63
-- see also: 'intToCharRef'
Packit cc4c63
Packit cc4c63
intToCharRefHex         :: Int -> String
Packit cc4c63
intToCharRefHex i
Packit cc4c63
    = "&#x" ++ h2 ++ ";"
Packit cc4c63
      where
Packit cc4c63
      h1 = intToHexString i
Packit cc4c63
      h2 = if length h1 `mod` 2 == 1
Packit cc4c63
           then '0': h1
Packit cc4c63
           else h1
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
intToHexString          :: Int -> String
Packit cc4c63
intToHexString i
Packit cc4c63
    | i == 0
Packit cc4c63
        = "0"
Packit cc4c63
    | i > 0
Packit cc4c63
        = intToStr i
Packit cc4c63
    | otherwise
Packit cc4c63
        = error ("intToHexString: negative argument " ++ show i)
Packit cc4c63
    where
Packit cc4c63
    intToStr 0  = ""
Packit cc4c63
    intToStr i' = intToStr (i' `div` 16) ++ [fourBitsToChar (i' `mod` 16)]
Packit cc4c63
Packit cc4c63
fourBitsToChar          :: Int -> Char
Packit cc4c63
fourBitsToChar i        = "0123456789ABCDEF" !! i
Packit cc4c63
{-# INLINE fourBitsToChar #-}
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
--
Packit cc4c63
-- | White Space (XML Standard 2.3) and
Packit cc4c63
-- end of line handling (2.11)
Packit cc4c63
--
Packit cc4c63
-- \#x0D and \#x0D\#x0A are mapped to \#x0A
Packit cc4c63
Packit cc4c63
normalizeNL     :: String -> String
Packit cc4c63
normalizeNL ('\r' : '\n' : rest)        = '\n' : normalizeNL rest
Packit cc4c63
normalizeNL ('\r' : rest)               = '\n' : normalizeNL rest
Packit cc4c63
normalizeNL (c : rest)                  = c    : normalizeNL rest
Packit cc4c63
normalizeNL []                          = []
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the table of supported character encoding schemes and the associated
Packit cc4c63
-- conversion functions into Unicode:q
Packit cc4c63
Packit cc4c63
{-
Packit cc4c63
This table could be derived from decodingTableEither,
Packit cc4c63
but this way it is certainly more efficient.
Packit cc4c63
-}
Packit cc4c63
Packit cc4c63
decodingTable   :: [(String, DecodingFct)]
Packit cc4c63
decodingTable
Packit cc4c63
    = [ (utf8,          utf8ToUnicode                           )
Packit cc4c63
      , (isoLatin1,     liftDecFct latin1ToUnicode              )
Packit cc4c63
      , (usAscii,       decodeAscii                             )
Packit cc4c63
      , (ucs2,          liftDecFct ucs2ToUnicode                )
Packit cc4c63
      , (utf16,         liftDecFct ucs2ToUnicode                )
Packit cc4c63
      , (utf16be,       liftDecFct utf16beToUnicode             )
Packit cc4c63
      , (utf16le,       liftDecFct utf16leToUnicode             )
Packit cc4c63
      , (iso8859_2,     liftDecFct (latinToUnicode iso_8859_2)  )
Packit cc4c63
      , (iso8859_3,     liftDecFct (latinToUnicode iso_8859_3)  )
Packit cc4c63
      , (iso8859_4,     liftDecFct (latinToUnicode iso_8859_4)  )
Packit cc4c63
      , (iso8859_5,     liftDecFct (latinToUnicode iso_8859_5)  )
Packit cc4c63
      , (iso8859_6,     liftDecFct (latinToUnicode iso_8859_6)  )
Packit cc4c63
      , (iso8859_7,     liftDecFct (latinToUnicode iso_8859_7)  )
Packit cc4c63
      , (iso8859_8,     liftDecFct (latinToUnicode iso_8859_8)  )
Packit cc4c63
      , (iso8859_9,     liftDecFct (latinToUnicode iso_8859_9)  )
Packit cc4c63
      , (iso8859_10,    liftDecFct (latinToUnicode iso_8859_10) )
Packit cc4c63
      , (iso8859_11,    liftDecFct (latinToUnicode iso_8859_11) )
Packit cc4c63
      , (iso8859_13,    liftDecFct (latinToUnicode iso_8859_13) )
Packit cc4c63
      , (iso8859_14,    liftDecFct (latinToUnicode iso_8859_14) )
Packit cc4c63
      , (iso8859_15,    liftDecFct (latinToUnicode iso_8859_15) )
Packit cc4c63
      , (iso8859_16,    liftDecFct (latinToUnicode iso_8859_16) )
Packit cc4c63
      , (unicodeString, liftDecFct id                           )
Packit cc4c63
      , ("",            liftDecFct id                           )       -- default
Packit cc4c63
      ]
Packit cc4c63
    where
Packit cc4c63
    liftDecFct df = \ s -> (df s, [])
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the lookup function for selecting the decoding function
Packit cc4c63
Packit cc4c63
getDecodingFct          :: String -> Maybe DecodingFct
Packit cc4c63
getDecodingFct enc
Packit cc4c63
    = lookup (map toUpper enc) decodingTable
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- Similar to 'decodingTable' but it embeds errors
Packit cc4c63
-- in the string of decoded characters.
Packit cc4c63
Packit cc4c63
decodingTableEmbedErrors        :: [(String, DecodingFctEmbedErrors)]
Packit cc4c63
decodingTableEmbedErrors
Packit cc4c63
    = [ (utf8,          utf8ToUnicodeEmbedErrors                )
Packit cc4c63
      , (isoLatin1,     liftDecFct latin1ToUnicode              )
Packit cc4c63
      , (usAscii,       decodeAsciiEmbedErrors                  )
Packit cc4c63
      , (ucs2,          liftDecFct ucs2ToUnicode                )
Packit cc4c63
      , (utf16,         liftDecFct ucs2ToUnicode                )
Packit cc4c63
      , (utf16be,       liftDecFct utf16beToUnicode             )
Packit cc4c63
      , (utf16le,       liftDecFct utf16leToUnicode             )
Packit cc4c63
      , (iso8859_2,     liftDecFct (latinToUnicode iso_8859_2)  )
Packit cc4c63
      , (iso8859_3,     liftDecFct (latinToUnicode iso_8859_3)  )
Packit cc4c63
      , (iso8859_4,     liftDecFct (latinToUnicode iso_8859_4)  )
Packit cc4c63
      , (iso8859_5,     liftDecFct (latinToUnicode iso_8859_5)  )
Packit cc4c63
      , (iso8859_6,     liftDecFct (latinToUnicode iso_8859_6)  )
Packit cc4c63
      , (iso8859_7,     liftDecFct (latinToUnicode iso_8859_7)  )
Packit cc4c63
      , (iso8859_8,     liftDecFct (latinToUnicode iso_8859_8)  )
Packit cc4c63
      , (iso8859_9,     liftDecFct (latinToUnicode iso_8859_9)  )
Packit cc4c63
      , (iso8859_10,    liftDecFct (latinToUnicode iso_8859_10) )
Packit cc4c63
      , (iso8859_11,    liftDecFct (latinToUnicode iso_8859_11) )
Packit cc4c63
      , (iso8859_13,    liftDecFct (latinToUnicode iso_8859_13) )
Packit cc4c63
      , (iso8859_14,    liftDecFct (latinToUnicode iso_8859_14) )
Packit cc4c63
      , (iso8859_15,    liftDecFct (latinToUnicode iso_8859_15) )
Packit cc4c63
      , (iso8859_16,    liftDecFct (latinToUnicode iso_8859_16) )
Packit cc4c63
      , (unicodeString, liftDecFct id                           )
Packit cc4c63
      , ("",            liftDecFct id                           )       -- default
Packit cc4c63
      ]
Packit cc4c63
    where
Packit cc4c63
    liftDecFct df = map Right . df
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the lookup function for selecting the decoding function
Packit cc4c63
Packit cc4c63
getDecodingFctEmbedErrors       :: String -> Maybe DecodingFctEmbedErrors
Packit cc4c63
getDecodingFctEmbedErrors enc
Packit cc4c63
    = lookup (map toUpper enc) decodingTableEmbedErrors
Packit cc4c63
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the table of supported output encoding schemes and the associated
Packit cc4c63
-- conversion functions from Unicode
Packit cc4c63
Packit cc4c63
outputEncodingTable     :: [(String, (UString -> String))]
Packit cc4c63
outputEncodingTable
Packit cc4c63
    = [ (utf8,          unicodeToUtf8           )
Packit cc4c63
      , (isoLatin1,     unicodeToLatin1         )
Packit cc4c63
      , (usAscii,       unicodeToXmlEntity      )
Packit cc4c63
      , (unicodeString, id                      )
Packit cc4c63
      , ("",            unicodeToUtf8           )       -- default
Packit cc4c63
      ]
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the lookup function for selecting the encoding function
Packit cc4c63
Packit cc4c63
getOutputEncodingFct            :: String -> Maybe (String -> UString)
Packit cc4c63
getOutputEncodingFct enc
Packit cc4c63
    = lookup (map toUpper enc) outputEncodingTable
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
--
Packit cc4c63
Packit cc4c63
guessEncoding           :: String -> String
Packit cc4c63
Packit cc4c63
guessEncoding ('\xFF':'\xFE':'\x00':'\x00':_)   = "UCS-4LE"             -- with byte order mark
Packit cc4c63
guessEncoding ('\xFF':'\xFE':_)                 = "UTF-16LE"            -- with byte order mark
Packit cc4c63
Packit cc4c63
guessEncoding ('\xFE':'\xFF':'\x00':'\x00':_)   = "UCS-4-3421"          -- with byte order mark
Packit cc4c63
guessEncoding ('\xFE':'\xFF':_)                 = "UTF-16BE"            -- with byte order mark
Packit cc4c63
Packit cc4c63
guessEncoding ('\xEF':'\xBB':'\xBF':_)          = utf8                  -- with byte order mark
Packit cc4c63
Packit cc4c63
guessEncoding ('\x00':'\x00':'\xFE':'\xFF':_)   = "UCS-4BE"             -- with byte order mark
Packit cc4c63
guessEncoding ('\x00':'\x00':'\xFF':'\xFE':_)   = "UCS-4-2143"          -- with byte order mark
Packit cc4c63
Packit cc4c63
guessEncoding ('\x00':'\x00':'\x00':'\x3C':_)   = "UCS-4BE"             -- "<" of "
Packit cc4c63
guessEncoding ('\x3C':'\x00':'\x00':'\x00':_)   = "UCS-4LE"             -- "<" of "
Packit cc4c63
guessEncoding ('\x00':'\x00':'\x3C':'\x00':_)   = "UCS-4-2143"          -- "<" of "
Packit cc4c63
guessEncoding ('\x00':'\x3C':'\x00':'\x00':_)   = "UCS-4-3412"          -- "<" of "
Packit cc4c63
Packit cc4c63
guessEncoding ('\x00':'\x3C':'\x00':'\x3F':_)   = "UTF-16BE"            -- "
Packit cc4c63
guessEncoding ('\x3C':'\x00':'\x3F':'\x00':_)   = "UTF-16LE"            -- "
Packit cc4c63
Packit cc4c63
guessEncoding ('\x4C':'\x6F':'\xA7':'\x94':_)   = "EBCDIC"              -- "
Packit cc4c63
Packit cc4c63
guessEncoding _                                 = ""                    -- no guess
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
swap :: (a,b) -> (b,a)
Packit cc4c63
swap (x,y) = (y,x)
Packit cc4c63
{-# INLINE swap #-}
Packit cc4c63
Packit cc4c63
partitionEither :: [Either a b] -> ([a], [b])
Packit cc4c63
partitionEither =
Packit cc4c63
   foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[])
Packit cc4c63
{-# INLINE partitionEither #-}
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- output encoding for bytestrings
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the table of supported output encoding schemes and the associated
Packit cc4c63
-- conversion functions from Unicode
Packit cc4c63
Packit cc4c63
type StringFct          = String -> String
Packit cc4c63
Packit cc4c63
outputEncodingTable'     :: [(String, (Char -> StringFct))]
Packit cc4c63
outputEncodingTable'
Packit cc4c63
    = [ (utf8,          unicodeCharToUtf8'           )
Packit cc4c63
      , (isoLatin1,     unicodeCharToLatin1'         )
Packit cc4c63
      , (usAscii,       unicodeCharToXmlEntity'      )
Packit cc4c63
      , ("",            unicodeCharToUtf8'           )       -- default
Packit cc4c63
      ]
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- the lookup function for selecting the encoding function
Packit cc4c63
Packit cc4c63
getOutputEncodingFct'            :: String -> Maybe (Char -> StringFct)
Packit cc4c63
getOutputEncodingFct' enc
Packit cc4c63
    = lookup (map toUpper enc) outputEncodingTable'
Packit cc4c63
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- conversion from Unicode (Char) to a UTF8 encoded string.
Packit cc4c63
Packit cc4c63
unicodeCharToUtf8'      :: Char -> StringFct
Packit cc4c63
unicodeCharToUtf8' c
Packit cc4c63
    | i >= 0          && i <= 0x0000007F        -- 1 byte UTF8 (7 bits)
Packit cc4c63
        = (c :)
Packit cc4c63
Packit cc4c63
    | i >= 0x00000080 && i <= 0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
Packit cc4c63
        = ((toEnum (0xC0 + i `div` 0x40)                   ) :) .
Packit cc4c63
          ((toEnum (0x80 + i                    `mod` 0x40)) :)
Packit cc4c63
Packit cc4c63
    | i >= 0x00000800 && i <= 0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
Packit cc4c63
        = ((toEnum (0xE0 +  i `div`     0x1000)            ) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)
Packit cc4c63
Packit cc4c63
    | i >= 0x00010000 && i <= 0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits) -- extension to encode 21 bit values
Packit cc4c63
        = ((toEnum (0xF0 +  i `div`    0x40000)            ) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)
Packit cc4c63
Packit cc4c63
    | i >= 0x00200000 && i <= 0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits) -- extension to encode 26 bit values
Packit cc4c63
        = ((toEnum (0xF8 +  i `div`  0x1000000)            ) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)
Packit cc4c63
Packit cc4c63
    | i >= 0x04000000 && i <= 0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits) -- extension to encode 31 bit values
Packit cc4c63
        = ((toEnum (0xFC +  i `div` 0x40000000)            ) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`  0x1000000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
Packit cc4c63
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)
Packit cc4c63
Packit cc4c63
    | otherwise                                 -- other values not supported
Packit cc4c63
        = error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
Packit cc4c63
    where
Packit cc4c63
    i = fromEnum c
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- substitute all Unicode characters, that are not legal 1-byte
Packit cc4c63
-- UTF-8 XML characters by a character reference.
Packit cc4c63
Packit cc4c63
unicodeCharToXmlEntity' :: Char -> StringFct
Packit cc4c63
unicodeCharToXmlEntity' c
Packit cc4c63
    | isXml1ByteChar c  = (c :)
Packit cc4c63
    | otherwise         = ((intToCharRef . fromEnum $ c) ++)
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63
-- |
Packit cc4c63
-- substitute all Unicode characters, that are not legal latin1
Packit cc4c63
-- UTF-8 XML characters by a character reference.
Packit cc4c63
Packit cc4c63
unicodeCharToLatin1'    :: Char -> StringFct
Packit cc4c63
unicodeCharToLatin1' c
Packit cc4c63
    | isXmlLatin1Char c = (c :)
Packit cc4c63
    | otherwise         = ((intToCharRef . fromEnum $ c) ++)
Packit cc4c63
Packit cc4c63
-- ------------------------------------------------------------
Packit cc4c63
Packit cc4c63