Blame Codec/Archive/Tar/Index/StringTable.hs

Packit 8cecbd
{-# LANGUAGE CPP, BangPatterns, PatternGuards, DeriveDataTypeable #-}
Packit 8cecbd
Packit 8cecbd
module Codec.Archive.Tar.Index.StringTable (
Packit 8cecbd
Packit 8cecbd
    StringTable,
Packit 8cecbd
    lookup,
Packit 8cecbd
    index,
Packit 8cecbd
    construct,
Packit 8cecbd
Packit 8cecbd
    StringTableBuilder,
Packit 8cecbd
    empty,
Packit 8cecbd
    insert,
Packit 8cecbd
    inserts,
Packit 8cecbd
    finalise,
Packit 8cecbd
    unfinalise,
Packit 8cecbd
Packit 8cecbd
    serialise,
Packit 8cecbd
    serialiseSize,
Packit 8cecbd
    deserialiseV1,
Packit 8cecbd
    deserialiseV2,
Packit 8cecbd
Packit 8cecbd
#ifdef TESTS
Packit 8cecbd
    prop_valid,
Packit 8cecbd
    prop_sorted,
Packit 8cecbd
    prop_finalise_unfinalise,
Packit 8cecbd
    prop_serialise_deserialise,
Packit 8cecbd
    prop_serialiseSize,
Packit 8cecbd
#endif
Packit 8cecbd
 ) where
Packit 8cecbd
Packit 8cecbd
import Data.Typeable (Typeable)
Packit 8cecbd
Packit 8cecbd
import Prelude   hiding (lookup, id)
Packit 8cecbd
import Data.List hiding (lookup, insert)
Packit 8cecbd
import Data.Function (on)
Packit 8cecbd
import Data.Word (Word32)
Packit 8cecbd
import Data.Int  (Int32)
Packit 8cecbd
import Data.Bits
Packit 8cecbd
import Data.Monoid (Monoid(..))
Packit 8cecbd
#if (MIN_VERSION_base(4,5,0))
Packit 8cecbd
import Data.Monoid ((<>))
Packit 8cecbd
#endif
Packit 8cecbd
import Control.Exception (assert)
Packit 8cecbd
Packit 8cecbd
import qualified Data.Array.Unboxed as A
Packit 8cecbd
import           Data.Array.Unboxed ((!))
Packit 8cecbd
#if MIN_VERSION_containers(0,5,0)
Packit 8cecbd
import qualified Data.Map.Strict        as Map
Packit 8cecbd
import           Data.Map.Strict (Map)
Packit 8cecbd
#else
Packit 8cecbd
import qualified Data.Map               as Map
Packit 8cecbd
import           Data.Map (Map)
Packit 8cecbd
#endif
Packit 8cecbd
import qualified Data.ByteString        as BS
Packit 8cecbd
import qualified Data.ByteString.Unsafe as BS
Packit 8cecbd
import qualified Data.ByteString.Lazy   as LBS
Packit 8cecbd
#if MIN_VERSION_bytestring(0,10,2) || defined(MIN_VERSION_bytestring_builder)
Packit 8cecbd
import Data.ByteString.Builder          as BS
Packit 8cecbd
import Data.ByteString.Builder.Extra    as BS (byteStringCopy)
Packit 8cecbd
#else
Packit 8cecbd
import Data.ByteString.Lazy.Builder     as BS
Packit 8cecbd
import Data.ByteString.Lazy.Builder.Extras as BS (byteStringCopy)
Packit 8cecbd
#endif
Packit 8cecbd
Packit 8cecbd
Packit 8cecbd
-- | An effecient mapping from strings to a dense set of integers.
Packit 8cecbd
--
Packit 8cecbd
data StringTable id = StringTable
Packit 8cecbd
         {-# UNPACK #-} !BS.ByteString           -- all strings concatenated
Packit 8cecbd
         {-# UNPACK #-} !(A.UArray Int32 Word32) -- string offset table
Packit 8cecbd
         {-# UNPACK #-} !(A.UArray Int32 Int32)  -- string index to id table
Packit 8cecbd
         {-# UNPACK #-} !(A.UArray Int32 Int32)  -- string id to index table
Packit 8cecbd
  deriving (Show, Typeable)
Packit 8cecbd
Packit 8cecbd
instance (Eq id, Enum id) => Eq (StringTable id) where
Packit 8cecbd
  tbl1 == tbl2 = unfinalise tbl1 == unfinalise tbl2
Packit 8cecbd
Packit 8cecbd
-- | Look up a string in the token table. If the string is present, return
Packit 8cecbd
-- its corresponding index.
Packit 8cecbd
--
Packit 8cecbd
lookup :: Enum id => StringTable id -> BS.ByteString -> Maybe id
Packit 8cecbd
lookup (StringTable bs offsets ids _ixs) str =
Packit 8cecbd
    binarySearch 0 (topBound-1) str
Packit 8cecbd
  where
Packit 8cecbd
    (0, topBound) = A.bounds offsets
Packit 8cecbd
Packit 8cecbd
    binarySearch !a !b !key
Packit 8cecbd
      | a > b     = Nothing
Packit 8cecbd
      | otherwise = case compare key (index' bs offsets mid) of
Packit 8cecbd
          LT -> binarySearch a (mid-1) key
Packit 8cecbd
          EQ -> Just $! toEnum (fromIntegral (ids ! mid))
Packit 8cecbd
          GT -> binarySearch (mid+1) b key
Packit 8cecbd
      where mid = (a + b) `div` 2
Packit 8cecbd
Packit 8cecbd
index' :: BS.ByteString -> A.UArray Int32 Word32 -> Int32 -> BS.ByteString
Packit 8cecbd
index' bs offsets i = BS.unsafeTake len . BS.unsafeDrop start $ bs
Packit 8cecbd
  where
Packit 8cecbd
    start, end, len :: Int
Packit 8cecbd
    start = fromIntegral (offsets ! i)
Packit 8cecbd
    end   = fromIntegral (offsets ! (i+1))
Packit 8cecbd
    len   = end - start
Packit 8cecbd
Packit 8cecbd
Packit 8cecbd
-- | Given the index of a string in the table, return the string.
Packit 8cecbd
--
Packit 8cecbd
index :: Enum id => StringTable id -> id -> BS.ByteString
Packit 8cecbd
index (StringTable bs offsets _ids ixs) =
Packit 8cecbd
    index' bs offsets . (ixs !) . fromIntegral . fromEnum
Packit 8cecbd
Packit 8cecbd
Packit 8cecbd
-- | Given a list of strings, construct a 'StringTable' mapping those strings
Packit 8cecbd
-- to a dense set of integers. Also return the ids for all the strings used
Packit 8cecbd
-- in the construction.
Packit 8cecbd
--
Packit 8cecbd
construct :: Enum id => [BS.ByteString] -> StringTable id
Packit 8cecbd
construct = finalise . foldl' (\tbl s -> fst (insert s tbl)) empty
Packit 8cecbd
Packit 8cecbd
Packit 8cecbd
data StringTableBuilder id = StringTableBuilder
Packit 8cecbd
                                              !(Map BS.ByteString id)
Packit 8cecbd
                               {-# UNPACK #-} !Word32
Packit 8cecbd
  deriving (Eq, Show, Typeable)
Packit 8cecbd
Packit 8cecbd
empty :: StringTableBuilder id
Packit 8cecbd
empty = StringTableBuilder Map.empty 0
Packit 8cecbd
Packit 8cecbd
insert :: Enum id => BS.ByteString -> StringTableBuilder id -> (StringTableBuilder id, id)
Packit 8cecbd
insert str builder@(StringTableBuilder smap nextid) =
Packit 8cecbd
    case Map.lookup str smap of
Packit 8cecbd
      Just id -> (builder, id)
Packit 8cecbd
      Nothing -> let !id   = toEnum (fromIntegral nextid)
Packit 8cecbd
                     !smap' = Map.insert str id smap
Packit 8cecbd
                   in (StringTableBuilder smap' (nextid+1), id)
Packit 8cecbd
Packit 8cecbd
inserts :: Enum id => [BS.ByteString] -> StringTableBuilder id -> (StringTableBuilder id, [id])
Packit 8cecbd
inserts bss builder = mapAccumL (flip insert) builder bss
Packit 8cecbd
Packit 8cecbd
finalise :: Enum id => StringTableBuilder id -> StringTable id
Packit 8cecbd
finalise (StringTableBuilder smap _) =
Packit 8cecbd
    (StringTable strs offsets ids ixs)
Packit 8cecbd
  where
Packit 8cecbd
    strs    = BS.concat (Map.keys smap)
Packit 8cecbd
    offsets = A.listArray (0, fromIntegral (Map.size smap))
Packit 8cecbd
            . scanl (\off str -> off + fromIntegral (BS.length str)) 0
Packit 8cecbd
            $ Map.keys smap
Packit 8cecbd
    ids     = A.listArray (0, fromIntegral (Map.size smap) - 1)
Packit 8cecbd
            . map (fromIntegral . fromEnum)
Packit 8cecbd
            $ Map.elems smap
Packit 8cecbd
    ixs     = A.array (A.bounds ids) [ (id,ix) | (ix,id) <- A.assocs ids ]
Packit 8cecbd
Packit 8cecbd
unfinalise :: Enum id => StringTable id -> StringTableBuilder id
Packit 8cecbd
unfinalise (StringTable strs offsets ids _) =
Packit 8cecbd
    StringTableBuilder smap nextid
Packit 8cecbd
  where
Packit 8cecbd
    smap   = Map.fromAscList
Packit 8cecbd
               [ (index' strs offsets ix, toEnum (fromIntegral (ids ! ix)))
Packit 8cecbd
               | ix <- [0..h] ]
Packit 8cecbd
    (0,h)  = A.bounds ids
Packit 8cecbd
    nextid = fromIntegral (h+1)
Packit 8cecbd
Packit 8cecbd
Packit 8cecbd
-------------------------
Packit 8cecbd
-- (de)serialisation
Packit 8cecbd
--
Packit 8cecbd
Packit 8cecbd
serialise :: StringTable id -> BS.Builder
Packit 8cecbd
serialise (StringTable strs offs ids ixs) =
Packit 8cecbd
      let (_, !ixEnd) = A.bounds offs in
Packit 8cecbd
Packit 8cecbd
      BS.word32BE (fromIntegral (BS.length strs))
Packit 8cecbd
   <> BS.word32BE (fromIntegral ixEnd + 1)
Packit 8cecbd
   <> BS.byteStringCopy strs
Packit 8cecbd
   <> foldr (\n r -> BS.word32BE n <> r) mempty (A.elems offs)
Packit 8cecbd
   <> foldr (\n r -> BS.int32BE  n <> r) mempty (A.elems ids)
Packit 8cecbd
   <> foldr (\n r -> BS.int32BE  n <> r) mempty (A.elems ixs)
Packit 8cecbd
Packit 8cecbd
serialiseSize :: StringTable id -> Int
Packit 8cecbd
serialiseSize (StringTable strs offs _ids _ixs) =
Packit 8cecbd
    let (_, !ixEnd) = A.bounds offs
Packit 8cecbd
     in 4 * 2
Packit 8cecbd
      + BS.length strs
Packit 8cecbd
      + 4 * (fromIntegral ixEnd + 1)
Packit 8cecbd
      + 8 *  fromIntegral ixEnd
Packit 8cecbd
Packit 8cecbd
deserialiseV1 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
Packit 8cecbd
deserialiseV1 bs
Packit 8cecbd
  | BS.length bs >= 8
Packit 8cecbd
  , let lenStrs = fromIntegral (readWord32BE bs 0)
Packit 8cecbd
        lenArr  = fromIntegral (readWord32BE bs 4)
Packit 8cecbd
        lenTotal= 8 + lenStrs + 4 * lenArr
Packit 8cecbd
  , BS.length bs >= lenTotal
Packit 8cecbd
  , let strs = BS.take lenStrs (BS.drop 8 bs)
Packit 8cecbd
        arr  = A.array (0, fromIntegral lenArr - 1)
Packit 8cecbd
                       [ (i, readWord32BE bs off)
Packit 8cecbd
                       | (i, off) <- zip [0 .. fromIntegral lenArr - 1]
Packit 8cecbd
                                         [offArrS,offArrS+4 .. offArrE]
Packit 8cecbd
                       ]
Packit 8cecbd
        ids  = A.array (0, fromIntegral lenArr - 1)
Packit 8cecbd
                       [ (i,i) | i <- [0 .. fromIntegral lenArr - 1] ]
Packit 8cecbd
        ixs  = ids -- two identity mappings
Packit 8cecbd
        offArrS = 8 + lenStrs
Packit 8cecbd
        offArrE = offArrS + 4 * lenArr - 1
Packit 8cecbd
        !stringTable = StringTable strs arr ids ixs
Packit 8cecbd
        !bs'         = BS.drop lenTotal bs
Packit 8cecbd
  = Just (stringTable, bs')
Packit 8cecbd
Packit 8cecbd
  | otherwise
Packit 8cecbd
  = Nothing
Packit 8cecbd
Packit 8cecbd
deserialiseV2 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
Packit 8cecbd
deserialiseV2 bs
Packit 8cecbd
  | BS.length bs >= 8
Packit 8cecbd
  , let lenStrs = fromIntegral (readWord32BE bs 0)
Packit 8cecbd
        lenArr  = fromIntegral (readWord32BE bs 4)
Packit 8cecbd
        lenTotal= 8                   -- the two length prefixes
Packit 8cecbd
                + lenStrs
Packit 8cecbd
                + 4 * lenArr
Packit 8cecbd
                +(4 * (lenArr - 1)) * 2 -- offsets array is 1 longer
Packit 8cecbd
  , BS.length bs >= lenTotal
Packit 8cecbd
  , let strs = BS.take lenStrs (BS.drop 8 bs)
Packit 8cecbd
        offs = A.listArray (0, fromIntegral lenArr - 1)
Packit 8cecbd
                           [ readWord32BE bs off
Packit 8cecbd
                           | off <- offsets offsOff ]
Packit 8cecbd
        -- the second two arrays are 1 shorter
Packit 8cecbd
        ids  = A.listArray (0, fromIntegral lenArr - 2)
Packit 8cecbd
                           [ readInt32BE bs off
Packit 8cecbd
                           | off <- offsets idsOff ]
Packit 8cecbd
        ixs  = A.listArray (0, fromIntegral lenArr - 2)
Packit 8cecbd
                           [ readInt32BE bs off
Packit 8cecbd
                           | off <- offsets ixsOff ]
Packit 8cecbd
        offsOff = 8 + lenStrs
Packit 8cecbd
        idsOff  = offsOff + 4 * lenArr
Packit 8cecbd
        ixsOff  = idsOff  + 4 * (lenArr-1)
Packit 8cecbd
        offsets from = [from,from+4 .. from + 4 * (lenArr - 1)]
Packit 8cecbd
        !stringTable = StringTable strs offs ids ixs
Packit 8cecbd
        !bs'         = BS.drop lenTotal bs
Packit 8cecbd
  = Just (stringTable, bs')
Packit 8cecbd
Packit 8cecbd
  | otherwise
Packit 8cecbd
  = Nothing
Packit 8cecbd
Packit 8cecbd
readInt32BE :: BS.ByteString -> Int -> Int32
Packit 8cecbd
readInt32BE bs i = fromIntegral (readWord32BE bs i)
Packit 8cecbd
Packit 8cecbd
readWord32BE :: BS.ByteString -> Int -> Word32
Packit 8cecbd
readWord32BE bs i =
Packit 8cecbd
    assert (i >= 0 && i+3 <= BS.length bs - 1) $
Packit 8cecbd
    fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
Packit 8cecbd
  + fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
Packit 8cecbd
  + fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
Packit 8cecbd
  + fromIntegral (BS.unsafeIndex bs (i + 3))
Packit 8cecbd
Packit 8cecbd
#ifdef TESTS
Packit 8cecbd
Packit 8cecbd
prop_valid :: [BS.ByteString] -> Bool
Packit 8cecbd
prop_valid strs =
Packit 8cecbd
     all lookupIndex (enumStrings tbl)
Packit 8cecbd
  && all indexLookup (enumIds tbl)
Packit 8cecbd
Packit 8cecbd
  where
Packit 8cecbd
    tbl :: StringTable Int
Packit 8cecbd
    tbl = construct strs
Packit 8cecbd
Packit 8cecbd
    lookupIndex str = index tbl ident == str
Packit 8cecbd
      where Just ident = lookup tbl str
Packit 8cecbd
Packit 8cecbd
    indexLookup ident = lookup tbl str == Just ident
Packit 8cecbd
      where str       = index tbl ident
Packit 8cecbd
Packit 8cecbd
-- this is important so we can use Map.fromAscList
Packit 8cecbd
prop_sorted :: [BS.ByteString] -> Bool
Packit 8cecbd
prop_sorted strings =
Packit 8cecbd
    isSorted [ index' strs offsets ix
Packit 8cecbd
             | ix <- A.range (A.bounds ids) ]
Packit 8cecbd
  where
Packit 8cecbd
    _tbl :: StringTable Int
Packit 8cecbd
    _tbl@(StringTable strs offsets ids _ixs) = construct strings
Packit 8cecbd
    isSorted xs = and (zipWith (<) xs (tail xs))
Packit 8cecbd
Packit 8cecbd
prop_finalise_unfinalise :: [BS.ByteString] -> Bool
Packit 8cecbd
prop_finalise_unfinalise strs =
Packit 8cecbd
    builder == unfinalise (finalise builder)
Packit 8cecbd
  where
Packit 8cecbd
    builder :: StringTableBuilder Int
Packit 8cecbd
    builder = foldl' (\tbl s -> fst (insert s tbl)) empty strs
Packit 8cecbd
Packit 8cecbd
prop_serialise_deserialise :: [BS.ByteString] -> Bool
Packit 8cecbd
prop_serialise_deserialise strs =
Packit 8cecbd
    Just (strtable, BS.empty) == (deserialiseV2
Packit 8cecbd
                                . toStrict . BS.toLazyByteString
Packit 8cecbd
                                . serialise) strtable
Packit 8cecbd
  where
Packit 8cecbd
    strtable :: StringTable Int
Packit 8cecbd
    strtable = construct strs
Packit 8cecbd
Packit 8cecbd
prop_serialiseSize :: [BS.ByteString] -> Bool
Packit 8cecbd
prop_serialiseSize strs =
Packit 8cecbd
    (fromIntegral . LBS.length . BS.toLazyByteString . serialise) strtable
Packit 8cecbd
 == serialiseSize strtable
Packit 8cecbd
  where
Packit 8cecbd
    strtable :: StringTable Int
Packit 8cecbd
    strtable = construct strs
Packit 8cecbd
Packit 8cecbd
enumStrings :: Enum id => StringTable id -> [BS.ByteString]
Packit 8cecbd
enumStrings (StringTable bs offsets _ _) = map (index' bs offsets) [0..h-1]
Packit 8cecbd
  where (0,h) = A.bounds offsets
Packit 8cecbd
Packit 8cecbd
enumIds :: Enum id => StringTable id -> [id]
Packit 8cecbd
enumIds (StringTable _ offsets _ _) = [toEnum 0 .. toEnum (fromIntegral (h-1))]
Packit 8cecbd
  where (0,h) = A.bounds offsets
Packit 8cecbd
Packit 8cecbd
toStrict :: LBS.ByteString -> BS.ByteString
Packit 8cecbd
#if MIN_VERSION_bytestring(0,10,0)
Packit 8cecbd
toStrict = LBS.toStrict
Packit 8cecbd
#else
Packit 8cecbd
toStrict = BS.concat . LBS.toChunks
Packit 8cecbd
#endif
Packit 8cecbd
Packit 8cecbd
#endif
Packit 8cecbd
Packit 8cecbd
#if !(MIN_VERSION_base(4,5,0))
Packit 8cecbd
(<>) :: Monoid m => m -> m -> m
Packit 8cecbd
(<>) = mappend
Packit 8cecbd
#endif