|
Packit |
8cecbd |
{-# LANGUAGE CPP, BangPatterns, PatternGuards, DeriveDataTypeable #-}
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
module Codec.Archive.Tar.Index.StringTable (
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
StringTable,
|
|
Packit |
8cecbd |
lookup,
|
|
Packit |
8cecbd |
index,
|
|
Packit |
8cecbd |
construct,
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
StringTableBuilder,
|
|
Packit |
8cecbd |
empty,
|
|
Packit |
8cecbd |
insert,
|
|
Packit |
8cecbd |
inserts,
|
|
Packit |
8cecbd |
finalise,
|
|
Packit |
8cecbd |
unfinalise,
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
serialise,
|
|
Packit |
8cecbd |
serialiseSize,
|
|
Packit |
8cecbd |
deserialiseV1,
|
|
Packit |
8cecbd |
deserialiseV2,
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
#ifdef TESTS
|
|
Packit |
8cecbd |
prop_valid,
|
|
Packit |
8cecbd |
prop_sorted,
|
|
Packit |
8cecbd |
prop_finalise_unfinalise,
|
|
Packit |
8cecbd |
prop_serialise_deserialise,
|
|
Packit |
8cecbd |
prop_serialiseSize,
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
) where
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
import Data.Typeable (Typeable)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
import Prelude hiding (lookup, id)
|
|
Packit |
8cecbd |
import Data.List hiding (lookup, insert)
|
|
Packit |
8cecbd |
import Data.Function (on)
|
|
Packit |
8cecbd |
import Data.Word (Word32)
|
|
Packit |
8cecbd |
import Data.Int (Int32)
|
|
Packit |
8cecbd |
import Data.Bits
|
|
Packit |
8cecbd |
import Data.Monoid (Monoid(..))
|
|
Packit |
8cecbd |
#if (MIN_VERSION_base(4,5,0))
|
|
Packit |
8cecbd |
import Data.Monoid ((<>))
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
import Control.Exception (assert)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
import qualified Data.Array.Unboxed as A
|
|
Packit |
8cecbd |
import Data.Array.Unboxed ((!))
|
|
Packit |
8cecbd |
#if MIN_VERSION_containers(0,5,0)
|
|
Packit |
8cecbd |
import qualified Data.Map.Strict as Map
|
|
Packit |
8cecbd |
import Data.Map.Strict (Map)
|
|
Packit |
8cecbd |
#else
|
|
Packit |
8cecbd |
import qualified Data.Map as Map
|
|
Packit |
8cecbd |
import Data.Map (Map)
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
import qualified Data.ByteString as BS
|
|
Packit |
8cecbd |
import qualified Data.ByteString.Unsafe as BS
|
|
Packit |
8cecbd |
import qualified Data.ByteString.Lazy as LBS
|
|
Packit |
8cecbd |
#if MIN_VERSION_bytestring(0,10,2) || defined(MIN_VERSION_bytestring_builder)
|
|
Packit |
8cecbd |
import Data.ByteString.Builder as BS
|
|
Packit |
8cecbd |
import Data.ByteString.Builder.Extra as BS (byteStringCopy)
|
|
Packit |
8cecbd |
#else
|
|
Packit |
8cecbd |
import Data.ByteString.Lazy.Builder as BS
|
|
Packit |
8cecbd |
import Data.ByteString.Lazy.Builder.Extras as BS (byteStringCopy)
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-- | An effecient mapping from strings to a dense set of integers.
|
|
Packit |
8cecbd |
--
|
|
Packit |
8cecbd |
data StringTable id = StringTable
|
|
Packit |
8cecbd |
{-# UNPACK #-} !BS.ByteString -- all strings concatenated
|
|
Packit |
8cecbd |
{-# UNPACK #-} !(A.UArray Int32 Word32) -- string offset table
|
|
Packit |
8cecbd |
{-# UNPACK #-} !(A.UArray Int32 Int32) -- string index to id table
|
|
Packit |
8cecbd |
{-# UNPACK #-} !(A.UArray Int32 Int32) -- string id to index table
|
|
Packit |
8cecbd |
deriving (Show, Typeable)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
instance (Eq id, Enum id) => Eq (StringTable id) where
|
|
Packit |
8cecbd |
tbl1 == tbl2 = unfinalise tbl1 == unfinalise tbl2
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-- | Look up a string in the token table. If the string is present, return
|
|
Packit |
8cecbd |
-- its corresponding index.
|
|
Packit |
8cecbd |
--
|
|
Packit |
8cecbd |
lookup :: Enum id => StringTable id -> BS.ByteString -> Maybe id
|
|
Packit |
8cecbd |
lookup (StringTable bs offsets ids _ixs) str =
|
|
Packit |
8cecbd |
binarySearch 0 (topBound-1) str
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
(0, topBound) = A.bounds offsets
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
binarySearch !a !b !key
|
|
Packit |
8cecbd |
| a > b = Nothing
|
|
Packit |
8cecbd |
| otherwise = case compare key (index' bs offsets mid) of
|
|
Packit |
8cecbd |
LT -> binarySearch a (mid-1) key
|
|
Packit |
8cecbd |
EQ -> Just $! toEnum (fromIntegral (ids ! mid))
|
|
Packit |
8cecbd |
GT -> binarySearch (mid+1) b key
|
|
Packit |
8cecbd |
where mid = (a + b) `div` 2
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
index' :: BS.ByteString -> A.UArray Int32 Word32 -> Int32 -> BS.ByteString
|
|
Packit |
8cecbd |
index' bs offsets i = BS.unsafeTake len . BS.unsafeDrop start $ bs
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
start, end, len :: Int
|
|
Packit |
8cecbd |
start = fromIntegral (offsets ! i)
|
|
Packit |
8cecbd |
end = fromIntegral (offsets ! (i+1))
|
|
Packit |
8cecbd |
len = end - start
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-- | Given the index of a string in the table, return the string.
|
|
Packit |
8cecbd |
--
|
|
Packit |
8cecbd |
index :: Enum id => StringTable id -> id -> BS.ByteString
|
|
Packit |
8cecbd |
index (StringTable bs offsets _ids ixs) =
|
|
Packit |
8cecbd |
index' bs offsets . (ixs !) . fromIntegral . fromEnum
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-- | Given a list of strings, construct a 'StringTable' mapping those strings
|
|
Packit |
8cecbd |
-- to a dense set of integers. Also return the ids for all the strings used
|
|
Packit |
8cecbd |
-- in the construction.
|
|
Packit |
8cecbd |
--
|
|
Packit |
8cecbd |
construct :: Enum id => [BS.ByteString] -> StringTable id
|
|
Packit |
8cecbd |
construct = finalise . foldl' (\tbl s -> fst (insert s tbl)) empty
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
data StringTableBuilder id = StringTableBuilder
|
|
Packit |
8cecbd |
!(Map BS.ByteString id)
|
|
Packit |
8cecbd |
{-# UNPACK #-} !Word32
|
|
Packit |
8cecbd |
deriving (Eq, Show, Typeable)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
empty :: StringTableBuilder id
|
|
Packit |
8cecbd |
empty = StringTableBuilder Map.empty 0
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
insert :: Enum id => BS.ByteString -> StringTableBuilder id -> (StringTableBuilder id, id)
|
|
Packit |
8cecbd |
insert str builder@(StringTableBuilder smap nextid) =
|
|
Packit |
8cecbd |
case Map.lookup str smap of
|
|
Packit |
8cecbd |
Just id -> (builder, id)
|
|
Packit |
8cecbd |
Nothing -> let !id = toEnum (fromIntegral nextid)
|
|
Packit |
8cecbd |
!smap' = Map.insert str id smap
|
|
Packit |
8cecbd |
in (StringTableBuilder smap' (nextid+1), id)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
inserts :: Enum id => [BS.ByteString] -> StringTableBuilder id -> (StringTableBuilder id, [id])
|
|
Packit |
8cecbd |
inserts bss builder = mapAccumL (flip insert) builder bss
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
finalise :: Enum id => StringTableBuilder id -> StringTable id
|
|
Packit |
8cecbd |
finalise (StringTableBuilder smap _) =
|
|
Packit |
8cecbd |
(StringTable strs offsets ids ixs)
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
strs = BS.concat (Map.keys smap)
|
|
Packit |
8cecbd |
offsets = A.listArray (0, fromIntegral (Map.size smap))
|
|
Packit |
8cecbd |
. scanl (\off str -> off + fromIntegral (BS.length str)) 0
|
|
Packit |
8cecbd |
$ Map.keys smap
|
|
Packit |
8cecbd |
ids = A.listArray (0, fromIntegral (Map.size smap) - 1)
|
|
Packit |
8cecbd |
. map (fromIntegral . fromEnum)
|
|
Packit |
8cecbd |
$ Map.elems smap
|
|
Packit |
8cecbd |
ixs = A.array (A.bounds ids) [ (id,ix) | (ix,id) <- A.assocs ids ]
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
unfinalise :: Enum id => StringTable id -> StringTableBuilder id
|
|
Packit |
8cecbd |
unfinalise (StringTable strs offsets ids _) =
|
|
Packit |
8cecbd |
StringTableBuilder smap nextid
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
smap = Map.fromAscList
|
|
Packit |
8cecbd |
[ (index' strs offsets ix, toEnum (fromIntegral (ids ! ix)))
|
|
Packit |
8cecbd |
| ix <- [0..h] ]
|
|
Packit |
8cecbd |
(0,h) = A.bounds ids
|
|
Packit |
8cecbd |
nextid = fromIntegral (h+1)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-------------------------
|
|
Packit |
8cecbd |
-- (de)serialisation
|
|
Packit |
8cecbd |
--
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
serialise :: StringTable id -> BS.Builder
|
|
Packit |
8cecbd |
serialise (StringTable strs offs ids ixs) =
|
|
Packit |
8cecbd |
let (_, !ixEnd) = A.bounds offs in
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
BS.word32BE (fromIntegral (BS.length strs))
|
|
Packit |
8cecbd |
<> BS.word32BE (fromIntegral ixEnd + 1)
|
|
Packit |
8cecbd |
<> BS.byteStringCopy strs
|
|
Packit |
8cecbd |
<> foldr (\n r -> BS.word32BE n <> r) mempty (A.elems offs)
|
|
Packit |
8cecbd |
<> foldr (\n r -> BS.int32BE n <> r) mempty (A.elems ids)
|
|
Packit |
8cecbd |
<> foldr (\n r -> BS.int32BE n <> r) mempty (A.elems ixs)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
serialiseSize :: StringTable id -> Int
|
|
Packit |
8cecbd |
serialiseSize (StringTable strs offs _ids _ixs) =
|
|
Packit |
8cecbd |
let (_, !ixEnd) = A.bounds offs
|
|
Packit |
8cecbd |
in 4 * 2
|
|
Packit |
8cecbd |
+ BS.length strs
|
|
Packit |
8cecbd |
+ 4 * (fromIntegral ixEnd + 1)
|
|
Packit |
8cecbd |
+ 8 * fromIntegral ixEnd
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
deserialiseV1 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
|
|
Packit |
8cecbd |
deserialiseV1 bs
|
|
Packit |
8cecbd |
| BS.length bs >= 8
|
|
Packit |
8cecbd |
, let lenStrs = fromIntegral (readWord32BE bs 0)
|
|
Packit |
8cecbd |
lenArr = fromIntegral (readWord32BE bs 4)
|
|
Packit |
8cecbd |
lenTotal= 8 + lenStrs + 4 * lenArr
|
|
Packit |
8cecbd |
, BS.length bs >= lenTotal
|
|
Packit |
8cecbd |
, let strs = BS.take lenStrs (BS.drop 8 bs)
|
|
Packit |
8cecbd |
arr = A.array (0, fromIntegral lenArr - 1)
|
|
Packit |
8cecbd |
[ (i, readWord32BE bs off)
|
|
Packit |
8cecbd |
| (i, off) <- zip [0 .. fromIntegral lenArr - 1]
|
|
Packit |
8cecbd |
[offArrS,offArrS+4 .. offArrE]
|
|
Packit |
8cecbd |
]
|
|
Packit |
8cecbd |
ids = A.array (0, fromIntegral lenArr - 1)
|
|
Packit |
8cecbd |
[ (i,i) | i <- [0 .. fromIntegral lenArr - 1] ]
|
|
Packit |
8cecbd |
ixs = ids -- two identity mappings
|
|
Packit |
8cecbd |
offArrS = 8 + lenStrs
|
|
Packit |
8cecbd |
offArrE = offArrS + 4 * lenArr - 1
|
|
Packit |
8cecbd |
!stringTable = StringTable strs arr ids ixs
|
|
Packit |
8cecbd |
!bs' = BS.drop lenTotal bs
|
|
Packit |
8cecbd |
= Just (stringTable, bs')
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
| otherwise
|
|
Packit |
8cecbd |
= Nothing
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
deserialiseV2 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString)
|
|
Packit |
8cecbd |
deserialiseV2 bs
|
|
Packit |
8cecbd |
| BS.length bs >= 8
|
|
Packit |
8cecbd |
, let lenStrs = fromIntegral (readWord32BE bs 0)
|
|
Packit |
8cecbd |
lenArr = fromIntegral (readWord32BE bs 4)
|
|
Packit |
8cecbd |
lenTotal= 8 -- the two length prefixes
|
|
Packit |
8cecbd |
+ lenStrs
|
|
Packit |
8cecbd |
+ 4 * lenArr
|
|
Packit |
8cecbd |
+(4 * (lenArr - 1)) * 2 -- offsets array is 1 longer
|
|
Packit |
8cecbd |
, BS.length bs >= lenTotal
|
|
Packit |
8cecbd |
, let strs = BS.take lenStrs (BS.drop 8 bs)
|
|
Packit |
8cecbd |
offs = A.listArray (0, fromIntegral lenArr - 1)
|
|
Packit |
8cecbd |
[ readWord32BE bs off
|
|
Packit |
8cecbd |
| off <- offsets offsOff ]
|
|
Packit |
8cecbd |
-- the second two arrays are 1 shorter
|
|
Packit |
8cecbd |
ids = A.listArray (0, fromIntegral lenArr - 2)
|
|
Packit |
8cecbd |
[ readInt32BE bs off
|
|
Packit |
8cecbd |
| off <- offsets idsOff ]
|
|
Packit |
8cecbd |
ixs = A.listArray (0, fromIntegral lenArr - 2)
|
|
Packit |
8cecbd |
[ readInt32BE bs off
|
|
Packit |
8cecbd |
| off <- offsets ixsOff ]
|
|
Packit |
8cecbd |
offsOff = 8 + lenStrs
|
|
Packit |
8cecbd |
idsOff = offsOff + 4 * lenArr
|
|
Packit |
8cecbd |
ixsOff = idsOff + 4 * (lenArr-1)
|
|
Packit |
8cecbd |
offsets from = [from,from+4 .. from + 4 * (lenArr - 1)]
|
|
Packit |
8cecbd |
!stringTable = StringTable strs offs ids ixs
|
|
Packit |
8cecbd |
!bs' = BS.drop lenTotal bs
|
|
Packit |
8cecbd |
= Just (stringTable, bs')
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
| otherwise
|
|
Packit |
8cecbd |
= Nothing
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
readInt32BE :: BS.ByteString -> Int -> Int32
|
|
Packit |
8cecbd |
readInt32BE bs i = fromIntegral (readWord32BE bs i)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
readWord32BE :: BS.ByteString -> Int -> Word32
|
|
Packit |
8cecbd |
readWord32BE bs i =
|
|
Packit |
8cecbd |
assert (i >= 0 && i+3 <= BS.length bs - 1) $
|
|
Packit |
8cecbd |
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
|
|
Packit |
8cecbd |
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
|
|
Packit |
8cecbd |
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
|
|
Packit |
8cecbd |
+ fromIntegral (BS.unsafeIndex bs (i + 3))
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
#ifdef TESTS
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
prop_valid :: [BS.ByteString] -> Bool
|
|
Packit |
8cecbd |
prop_valid strs =
|
|
Packit |
8cecbd |
all lookupIndex (enumStrings tbl)
|
|
Packit |
8cecbd |
&& all indexLookup (enumIds tbl)
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
tbl :: StringTable Int
|
|
Packit |
8cecbd |
tbl = construct strs
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
lookupIndex str = index tbl ident == str
|
|
Packit |
8cecbd |
where Just ident = lookup tbl str
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
indexLookup ident = lookup tbl str == Just ident
|
|
Packit |
8cecbd |
where str = index tbl ident
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
-- this is important so we can use Map.fromAscList
|
|
Packit |
8cecbd |
prop_sorted :: [BS.ByteString] -> Bool
|
|
Packit |
8cecbd |
prop_sorted strings =
|
|
Packit |
8cecbd |
isSorted [ index' strs offsets ix
|
|
Packit |
8cecbd |
| ix <- A.range (A.bounds ids) ]
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
_tbl :: StringTable Int
|
|
Packit |
8cecbd |
_tbl@(StringTable strs offsets ids _ixs) = construct strings
|
|
Packit |
8cecbd |
isSorted xs = and (zipWith (<) xs (tail xs))
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
prop_finalise_unfinalise :: [BS.ByteString] -> Bool
|
|
Packit |
8cecbd |
prop_finalise_unfinalise strs =
|
|
Packit |
8cecbd |
builder == unfinalise (finalise builder)
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
builder :: StringTableBuilder Int
|
|
Packit |
8cecbd |
builder = foldl' (\tbl s -> fst (insert s tbl)) empty strs
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
prop_serialise_deserialise :: [BS.ByteString] -> Bool
|
|
Packit |
8cecbd |
prop_serialise_deserialise strs =
|
|
Packit |
8cecbd |
Just (strtable, BS.empty) == (deserialiseV2
|
|
Packit |
8cecbd |
. toStrict . BS.toLazyByteString
|
|
Packit |
8cecbd |
. serialise) strtable
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
strtable :: StringTable Int
|
|
Packit |
8cecbd |
strtable = construct strs
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
prop_serialiseSize :: [BS.ByteString] -> Bool
|
|
Packit |
8cecbd |
prop_serialiseSize strs =
|
|
Packit |
8cecbd |
(fromIntegral . LBS.length . BS.toLazyByteString . serialise) strtable
|
|
Packit |
8cecbd |
== serialiseSize strtable
|
|
Packit |
8cecbd |
where
|
|
Packit |
8cecbd |
strtable :: StringTable Int
|
|
Packit |
8cecbd |
strtable = construct strs
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
enumStrings :: Enum id => StringTable id -> [BS.ByteString]
|
|
Packit |
8cecbd |
enumStrings (StringTable bs offsets _ _) = map (index' bs offsets) [0..h-1]
|
|
Packit |
8cecbd |
where (0,h) = A.bounds offsets
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
enumIds :: Enum id => StringTable id -> [id]
|
|
Packit |
8cecbd |
enumIds (StringTable _ offsets _ _) = [toEnum 0 .. toEnum (fromIntegral (h-1))]
|
|
Packit |
8cecbd |
where (0,h) = A.bounds offsets
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
toStrict :: LBS.ByteString -> BS.ByteString
|
|
Packit |
8cecbd |
#if MIN_VERSION_bytestring(0,10,0)
|
|
Packit |
8cecbd |
toStrict = LBS.toStrict
|
|
Packit |
8cecbd |
#else
|
|
Packit |
8cecbd |
toStrict = BS.concat . LBS.toChunks
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
#endif
|
|
Packit |
8cecbd |
|
|
Packit |
8cecbd |
#if !(MIN_VERSION_base(4,5,0))
|
|
Packit |
8cecbd |
(<>) :: Monoid m => m -> m -> m
|
|
Packit |
8cecbd |
(<>) = mappend
|
|
Packit |
8cecbd |
#endif
|