diff --git a/Codec/Archive/Tar.hs b/Codec/Archive/Tar.hs new file mode 100644 index 0000000..20e6658 --- /dev/null +++ b/Codec/Archive/Tar.hs @@ -0,0 +1,277 @@ +{-# LANGUAGE CPP #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2012 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +-- Reading, writing and manipulating \"@.tar@\" archive files. +-- +-- This module uses common names and so is designed to be imported qualified: +-- +-- > import qualified Codec.Archive.Tar as Tar +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar ( + + -- | Tar archive files are used to store a collection of other files in a + -- single file. They consists of a sequence of entries. Each entry describes + -- a file or directory (or some other special kind of file). The entry stores + -- a little bit of meta-data, in particular the file or directory name. + -- + -- Unlike some other archive formats, a tar file contains no index. The + -- information about each entry is stored next to the entry. Because of this, + -- tar files are almost always processed linearly rather than in a + -- random-access fashion. + -- + -- The functions in this package are designed for working on tar files + -- linearly and lazily. This makes it possible to do many operations in + -- constant space rather than having to load the entire archive into memory. + -- + -- It can read and write standard POSIX tar files and also the GNU and old + -- Unix V7 tar formats. The convenience functions that are provided in the + -- "Codec.Archive.Tar.Entry" module for creating archive entries are + -- primarily designed for standard portable archives. If you need to + -- construct GNU format archives or exactly preserve file ownership and + -- permissions then you will need to write some extra helper functions. + -- + -- This module contains just the simple high level operations without + -- exposing the all the details of tar files. If you need to inspect tar + -- entries in more detail or construct them directly then you also need + -- the module "Codec.Archive.Tar.Entry". + + -- * High level \"all in one\" operations + create, + extract, + append, + + -- * Notes + -- ** Compressed tar archives + -- | Tar files are commonly used in conjunction with gzip compression, as in + -- \"@.tar.gz@\" or \"@.tar.bz2@\" files. This module does not directly + -- handle compressed tar files however they can be handled easily by + -- composing functions from this module and the modules + -- @Codec.Compression.GZip@ or @Codec.Compression.BZip@ + -- (see @zlib@ or @bzlib@ packages). + -- + -- Creating a compressed \"@.tar.gz@\" file is just a minor variation on the + -- 'create' function, but where throw compression into the pipeline: + -- + -- > BS.writeFile tar . GZip.compress . Tar.write =<< Tar.pack base dir + -- + -- Similarly, extracting a compressed \"@.tar.gz@\" is just a minor variation + -- on the 'extract' function where we use decompression in the pipeline: + -- + -- > Tar.unpack dir . Tar.read . GZip.decompress =<< BS.readFile tar + -- + + -- ** Security + -- | This is pretty important. A maliciously constructed tar archives could + -- contain entries that specify bad file names. It could specify absolute + -- file names like \"@\/etc\/passwd@\" or relative files outside of the + -- archive like \"..\/..\/..\/something\". This security problem is commonly + -- called a \"directory traversal vulnerability\". Historically, such + -- vulnerabilities have been common in packages handling tar archives. + -- + -- The 'extract' and 'unpack' functions check for bad file names. See the + -- 'checkSecurity' function for more details. If you need to do any custom + -- unpacking then you should use this. + + -- ** Tarbombs + -- | A \"tarbomb\" is a @.tar@ file where not all entries are in a + -- subdirectory but instead files extract into the top level directory. The + -- 'extract' function does not check for these however if you want to do + -- that you can use the 'checkTarbomb' function like so: + -- + -- > Tar.unpack dir . Tar.checkTarbomb expectedDir + -- > . Tar.read =<< BS.readFile tar + -- + -- In this case extraction will fail if any file is outside of @expectedDir@. + + -- * Converting between internal and external representation + -- | Note, you cannot expect @write . read@ to give exactly the same output + -- as input. You can expect the information to be preserved exactly however. + -- This is because 'read' accepts common format variations while 'write' + -- produces the standard format. + read, + write, + + -- * Packing and unpacking files to\/from internal representation + -- | These functions are for packing and unpacking portable archives. They + -- are not suitable in cases where it is important to preserve file ownership + -- and permissions or to archive special files like named pipes and Unix + -- device files. + pack, + unpack, + + -- * Types + -- ** Tar entry type + -- | This module provides only very simple and limited read-only access to + -- the 'Entry' type. If you need access to the details or if you need to + -- construct your own entries then also import "Codec.Archive.Tar.Entry". + Entry, + entryPath, + entryContent, + EntryContent(..), + + -- ** Sequences of tar entries + Entries(..), + mapEntries, + mapEntriesNoFail, + foldEntries, + foldlEntries, + unfoldEntries, + + -- * Error handling + -- | Reading tar files can fail if the data does not match the tar file + -- format correctly. + -- + -- The style of error handling by returning structured errors. The pure + -- functions in the library do not throw exceptions, they return the errors + -- as data. The IO actions in the library can throw exceptions, in particular + -- the 'unpack' action does this. All the error types used are an instance of + -- the standard 'Exception' class so it is possible to 'throw' and 'catch' + -- them. + + -- ** Errors from reading tar files + FormatError(..), + +#ifdef TESTS + prop_write_read_ustar, + prop_write_read_gnu, + prop_write_read_v7, +#endif + ) where + +import Codec.Archive.Tar.Types + +import Codec.Archive.Tar.Read +import Codec.Archive.Tar.Write + +import Codec.Archive.Tar.Pack +import Codec.Archive.Tar.Unpack +import Codec.Archive.Tar.Index (hSeekEndEntryOffset) + +import Codec.Archive.Tar.Check + +import Control.Exception (Exception, throw, catch) +import qualified Data.ByteString.Lazy as BS +import System.IO (withFile, IOMode(..)) +import Prelude hiding (read) + +-- | Create a new @\".tar\"@ file from a directory of files. +-- +-- It is equivalent to calling the standard @tar@ program like so: +-- +-- @$ tar -f tarball.tar -C base -c dir@ +-- +-- This assumes a directory @.\/base\/dir@ with files inside, eg +-- @.\/base\/dir\/foo.txt@. The file names inside the resulting tar file will be +-- relative to @dir@, eg @dir\/foo.txt@. +-- +-- This is a high level \"all in one\" operation. Since you may need variations +-- on this function it is instructive to see how it is written. It is just: +-- +-- > BS.writeFile tar . Tar.write =<< Tar.pack base paths +-- +-- Notes: +-- +-- The files and directories must not change during this operation or the +-- result is not well defined. +-- +-- The intention of this function is to create tarballs that are portable +-- between systems. It is /not/ suitable for doing file system backups because +-- file ownership and permissions are not fully preserved. File ownership is +-- not preserved at all. File permissions are set to simple portable values: +-- +-- * @rw-r--r--@ for normal files +-- +-- * @rwxr-xr-x@ for executable files +-- +-- * @rwxr-xr-x@ for directories +-- +create :: FilePath -- ^ Path of the \".tar\" file to write. + -> FilePath -- ^ Base directory + -> [FilePath] -- ^ Files and directories to archive, relative to base dir + -> IO () +create tar base paths = BS.writeFile tar . write =<< pack base paths + +-- | Extract all the files contained in a @\".tar\"@ file. +-- +-- It is equivalent to calling the standard @tar@ program like so: +-- +-- @$ tar -x -f tarball.tar -C dir@ +-- +-- So for example if the @tarball.tar@ file contains @foo\/bar.txt@ then this +-- will extract it to @dir\/foo\/bar.txt@. +-- +-- This is a high level \"all in one\" operation. Since you may need variations +-- on this function it is instructive to see how it is written. It is just: +-- +-- > Tar.unpack dir . Tar.read =<< BS.readFile tar +-- +-- Notes: +-- +-- Extracting can fail for a number of reasons. The tarball may be incorrectly +-- formatted. There may be IO or permission errors. In such cases an exception +-- will be thrown and extraction will not continue. +-- +-- Since the extraction may fail part way through it is not atomic. For this +-- reason you may want to extract into an empty directory and, if the +-- extraction fails, recursively delete the directory. +-- +-- Security: only files inside the target directory will be written. Tarballs +-- containing entries that point outside of the tarball (either absolute paths +-- or relative paths) will be caught and an exception will be thrown. +-- +extract :: FilePath -- ^ Destination directory + -> FilePath -- ^ Tarball + -> IO () +extract dir tar = unpack dir . read =<< BS.readFile tar + +-- | Append new entries to a @\".tar\"@ file from a directory of files. +-- +-- This is much like 'create', except that all the entries are added to the +-- end of an existing tar file. Or if the file does not already exists then +-- it behaves the same as 'create'. +-- +append :: FilePath -- ^ Path of the \".tar\" file to write. + -> FilePath -- ^ Base directory + -> [FilePath] -- ^ Files and directories to archive, relative to base dir + -> IO () +append tar base paths = + withFile tar ReadWriteMode $ \hnd -> do + _ <- hSeekEndEntryOffset hnd Nothing + BS.hPut hnd . write =<< pack base paths + +------------------------- +-- Correctness properties +-- + +#ifdef TESTS + +prop_write_read_ustar :: [Entry] -> Bool +prop_write_read_ustar entries = + foldr Next Done entries' == read (write entries') + where + entries' = [ e { entryFormat = UstarFormat } | e <- entries ] + +prop_write_read_gnu :: [Entry] -> Bool +prop_write_read_gnu entries = + foldr Next Done entries' == read (write entries') + where + entries' = [ e { entryFormat = GnuFormat } | e <- entries ] + +prop_write_read_v7 :: [Entry] -> Bool +prop_write_read_v7 entries = + foldr Next Done entries' == read (write entries') + where + entries' = [ limitToV7FormatCompat e { entryFormat = V7Format } + | e <- entries ] + +#endif diff --git a/Codec/Archive/Tar/Check.hs b/Codec/Archive/Tar/Check.hs new file mode 100644 index 0000000..27724f2 --- /dev/null +++ b/Codec/Archive/Tar/Check.hs @@ -0,0 +1,247 @@ +{-# LANGUAGE DeriveDataTypeable #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar +-- Copyright : (c) 2008-2012 Duncan Coutts +-- 2011 Max Bolingbroke +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +-- Perform various checks on tar file entries. +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Check ( + + -- * Security + checkSecurity, + FileNameError(..), + + -- * Tarbombs + checkTarbomb, + TarBombError(..), + + -- * Portability + checkPortability, + PortabilityError(..), + PortabilityPlatform, + ) where + +import Codec.Archive.Tar.Types + +import Data.Typeable (Typeable) +import Control.Exception (Exception) +import Control.Monad (MonadPlus(mplus)) +import qualified System.FilePath as FilePath.Native + ( splitDirectories, isAbsolute, isValid ) + +import qualified System.FilePath.Windows as FilePath.Windows +import qualified System.FilePath.Posix as FilePath.Posix + + +-------------------------- +-- Security +-- + +-- | This function checks a sequence of tar entries for file name security +-- problems. It checks that: +-- +-- * file paths are not absolute +-- +-- * file paths do not contain any path components that are \"@..@\" +-- +-- * file names are valid +-- +-- These checks are from the perspective of the current OS. That means we check +-- for \"@C:\blah@\" files on Windows and \"\/blah\" files on Unix. For archive +-- entry types 'HardLink' and 'SymbolicLink' the same checks are done for the +-- link target. A failure in any entry terminates the sequence of entries with +-- an error. +-- +checkSecurity :: Entries e -> Entries (Either e FileNameError) +checkSecurity = checkEntries checkEntrySecurity + +checkEntrySecurity :: Entry -> Maybe FileNameError +checkEntrySecurity entry = case entryContent entry of + HardLink link -> check (entryPath entry) + `mplus` check (fromLinkTarget link) + SymbolicLink link -> check (entryPath entry) + `mplus` check (fromLinkTarget link) + _ -> check (entryPath entry) + + where + check name + | FilePath.Native.isAbsolute name + = Just $ AbsoluteFileName name + + | not (FilePath.Native.isValid name) + = Just $ InvalidFileName name + + | any (=="..") (FilePath.Native.splitDirectories name) + = Just $ InvalidFileName name + + | otherwise = Nothing + +-- | Errors arising from tar file names being in some way invalid or dangerous +data FileNameError + = InvalidFileName FilePath + | AbsoluteFileName FilePath + deriving (Typeable) + +instance Show FileNameError where + show = showFileNameError Nothing + +instance Exception FileNameError + +showFileNameError :: Maybe PortabilityPlatform -> FileNameError -> String +showFileNameError mb_plat err = case err of + InvalidFileName path -> "Invalid" ++ plat ++ " file name in tar archive: " ++ show path + AbsoluteFileName path -> "Absolute" ++ plat ++ " file name in tar archive: " ++ show path + where plat = maybe "" (' ':) mb_plat + + +-------------------------- +-- Tarbombs +-- + +-- | This function checks a sequence of tar entries for being a \"tar bomb\". +-- This means that the tar file does not follow the standard convention that +-- all entries are within a single subdirectory, e.g. a file \"foo.tar\" would +-- usually have all entries within the \"foo/\" subdirectory. +-- +-- Given the expected subdirectory, this function checks all entries are within +-- that subdirectroy. +-- +-- Note: This check must be used in conjunction with 'checkSecurity' +-- (or 'checkPortability'). +-- +checkTarbomb :: FilePath -> Entries e -> Entries (Either e TarBombError) +checkTarbomb expectedTopDir = checkEntries (checkEntryTarbomb expectedTopDir) + +checkEntryTarbomb :: FilePath -> Entry -> Maybe TarBombError +checkEntryTarbomb _ entry | nonFilesystemEntry = Nothing + where + -- Ignore some special entries we will not unpack anyway + nonFilesystemEntry = + case entryContent entry of + OtherEntryType 'g' _ _ -> True --PAX global header + OtherEntryType 'x' _ _ -> True --PAX individual header + _ -> False + +checkEntryTarbomb expectedTopDir entry = + case FilePath.Native.splitDirectories (entryPath entry) of + (topDir:_) | topDir == expectedTopDir -> Nothing + _ -> Just $ TarBombError expectedTopDir + +-- | An error that occurs if a tar file is a \"tar bomb\" that would extract +-- files outside of the intended directory. +data TarBombError = TarBombError FilePath + deriving (Typeable) + +instance Exception TarBombError + +instance Show TarBombError where + show (TarBombError expectedTopDir) + = "File in tar archive is not in the expected directory " ++ show expectedTopDir + + +-------------------------- +-- Portability +-- + +-- | This function checks a sequence of tar entries for a number of portability +-- issues. It will complain if: +-- +-- * The old \"Unix V7\" or \"gnu\" formats are used. For maximum portability +-- only the POSIX standard \"ustar\" format should be used. +-- +-- * A non-portable entry type is used. Only ordinary files, hard links, +-- symlinks and directories are portable. Device files, pipes and others are +-- not portable between all common operating systems. +-- +-- * Non-ASCII characters are used in file names. There is no agreed portable +-- convention for Unicode or other extended character sets in file names in +-- tar archives. +-- +-- * File names that would not be portable to both Unix and Windows. This check +-- includes characters that are valid in both systems and the \'/\' vs \'\\\' +-- directory separator conventions. +-- +checkPortability :: Entries e -> Entries (Either e PortabilityError) +checkPortability = checkEntries checkEntryPortability + +checkEntryPortability :: Entry -> Maybe PortabilityError +checkEntryPortability entry + | entryFormat entry `elem` [V7Format, GnuFormat] + = Just $ NonPortableFormat (entryFormat entry) + + | not (portableFileType (entryContent entry)) + = Just NonPortableFileType + + | not (all portableChar posixPath) + = Just $ NonPortableEntryNameChar posixPath + + | not (FilePath.Posix.isValid posixPath) + = Just $ NonPortableFileName "unix" (InvalidFileName posixPath) + | not (FilePath.Windows.isValid windowsPath) + = Just $ NonPortableFileName "windows" (InvalidFileName windowsPath) + + | FilePath.Posix.isAbsolute posixPath + = Just $ NonPortableFileName "unix" (AbsoluteFileName posixPath) + | FilePath.Windows.isAbsolute windowsPath + = Just $ NonPortableFileName "windows" (AbsoluteFileName windowsPath) + + | any (=="..") (FilePath.Posix.splitDirectories posixPath) + = Just $ NonPortableFileName "unix" (InvalidFileName posixPath) + | any (=="..") (FilePath.Windows.splitDirectories windowsPath) + = Just $ NonPortableFileName "windows" (InvalidFileName windowsPath) + + | otherwise = Nothing + + where + tarPath = entryTarPath entry + posixPath = fromTarPathToPosixPath tarPath + windowsPath = fromTarPathToWindowsPath tarPath + + portableFileType ftype = case ftype of + NormalFile {} -> True + HardLink {} -> True + SymbolicLink {} -> True + Directory -> True + _ -> False + + portableChar c = c <= '\127' + +-- | Portability problems in a tar archive +data PortabilityError + = NonPortableFormat Format + | NonPortableFileType + | NonPortableEntryNameChar FilePath + | NonPortableFileName PortabilityPlatform FileNameError + deriving (Typeable) + +-- | The name of a platform that portability issues arise from +type PortabilityPlatform = String + +instance Exception PortabilityError + +instance Show PortabilityError where + show (NonPortableFormat format) = "Archive is in the " ++ fmt ++ " format" + where fmt = case format of V7Format -> "old Unix V7 tar" + UstarFormat -> "ustar" -- I never generate this but a user might + GnuFormat -> "GNU tar" + show NonPortableFileType = "Non-portable file type in archive" + show (NonPortableEntryNameChar posixPath) + = "Non-portable character in archive entry name: " ++ show posixPath + show (NonPortableFileName platform err) + = showFileNameError (Just platform) err + + +-------------------------- +-- Utils +-- + +checkEntries :: (Entry -> Maybe e') -> Entries e -> Entries (Either e e') +checkEntries checkEntry = + mapEntries (\entry -> maybe (Right entry) Left (checkEntry entry)) diff --git a/Codec/Archive/Tar/Entry.hs b/Codec/Archive/Tar/Entry.hs new file mode 100644 index 0000000..3f19021 --- /dev/null +++ b/Codec/Archive/Tar/Entry.hs @@ -0,0 +1,80 @@ +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar.Entry +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +-- Types and functions to manipulate tar entries. +-- +-- While the "Codec.Archive.Tar" module provides only the simple high level +-- API, this module provides full access to the details of tar entries. This +-- lets you inspect all the meta-data, construct entries and handle error cases +-- more precisely. +-- +-- This module uses common names and so is designed to be imported qualified: +-- +-- > import qualified Codec.Archive.Tar as Tar +-- > import qualified Codec.Archive.Tar.Entry as Tar +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Entry ( + + -- * Tar entry and associated types + Entry(..), + --TODO: should be the following with the Entry constructor not exported, + -- but haddock cannot document that properly + -- see http://trac.haskell.org/haddock/ticket/3 + --Entry(filePath, fileMode, ownerId, groupId, fileSize, modTime, + -- fileType, linkTarget, headerExt, fileContent), + entryPath, + EntryContent(..), + Ownership(..), + + FileSize, + Permissions, + EpochTime, + DevMajor, + DevMinor, + TypeCode, + Format(..), + + -- * Constructing simple entry values + simpleEntry, + fileEntry, + directoryEntry, + + -- * Standard file permissions + -- | For maximum portability when constructing archives use only these file + -- permissions. + ordinaryFilePermissions, + executableFilePermissions, + directoryPermissions, + + -- * Constructing entries from disk files + packFileEntry, + packDirectoryEntry, + getDirectoryContentsRecursive, + + -- * TarPath type + TarPath, + toTarPath, + fromTarPath, + fromTarPathToPosixPath, + fromTarPathToWindowsPath, + + -- * LinkTarget type + LinkTarget, + toLinkTarget, + fromLinkTarget, + fromLinkTargetToPosixPath, + fromLinkTargetToWindowsPath, + + ) where + +import Codec.Archive.Tar.Types +import Codec.Archive.Tar.Pack diff --git a/Codec/Archive/Tar/Index.hs b/Codec/Archive/Tar/Index.hs new file mode 100644 index 0000000..df44c7a --- /dev/null +++ b/Codec/Archive/Tar/Index.hs @@ -0,0 +1,825 @@ +{-# LANGUAGE CPP, BangPatterns, PatternGuards #-} +{-# LANGUAGE GeneralizedNewtypeDeriving, DeriveDataTypeable #-} + +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar.Index +-- Copyright : (c) 2010-2015 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +-- Random access to the content of a @.tar@ archive. +-- +-- This module uses common names and so is designed to be imported qualified: +-- +-- > import qualified Codec.Archive.Tar.Index as TarIndex +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Index ( + + -- | The @tar@ format does not contain an index of files within the + -- archive. Normally, @tar@ file have to be processed linearly. It is + -- sometimes useful however to be able to get random access to files + -- within the archive. + -- + -- This module provides an index of a @tar@ file. A linear pass of the + -- @tar@ file is needed to 'build' the 'TarIndex', but thereafter you can + -- 'lookup' paths in the @tar@ file, and then use 'hReadEntry' to + -- seek to the right part of the file and read the entry. + -- + -- An index cannot be used to lookup 'Directory' entries in a tar file; + -- instead, you will get 'TarDir' entry listing all the entries in the + -- directory. + + -- * Index type + TarIndex, + + -- * Index lookup + lookup, + TarIndexEntry(..), + toList, + + -- ** I\/O operations + TarEntryOffset, + hReadEntry, + hReadEntryHeader, + + -- * Index construction + build, + -- ** Incremental construction + -- $incremental-construction + IndexBuilder, + empty, + addNextEntry, + skipNextEntry, + finalise, + unfinalise, + + -- * Serialising indexes + serialise, + deserialise, + + -- * Lower level operations with offsets and I\/O on tar files + hReadEntryHeaderOrEof, + hSeekEntryOffset, + hSeekEntryContentOffset, + hSeekEndEntryOffset, + nextEntryOffset, + indexEndEntryOffset, + indexNextEntryOffset, + + -- * Deprecated aliases + emptyIndex, + finaliseIndex, + +#ifdef TESTS + prop_lookup, + prop_toList, + prop_valid, + prop_serialise_deserialise, + prop_serialiseSize, + prop_index_matches_tar, + prop_finalise_unfinalise, +#endif + ) where + +import Data.Typeable (Typeable) + +import Codec.Archive.Tar.Types as Tar +import Codec.Archive.Tar.Read as Tar +import qualified Codec.Archive.Tar.Index.StringTable as StringTable +import Codec.Archive.Tar.Index.StringTable (StringTable, StringTableBuilder) +import qualified Codec.Archive.Tar.Index.IntTrie as IntTrie +import Codec.Archive.Tar.Index.IntTrie (IntTrie, IntTrieBuilder) + +import qualified System.FilePath.Posix as FilePath +import Data.Monoid (Monoid(..)) +#if (MIN_VERSION_base(4,5,0)) +import Data.Monoid ((<>)) +#endif +import Data.Word +import Data.Int +import Data.Bits +import qualified Data.Array.Unboxed as A +import Prelude hiding (lookup) +import System.IO +import Control.Exception (assert, throwIO) +import Control.DeepSeq + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BS.Char8 +import qualified Data.ByteString.Lazy as LBS +import qualified Data.ByteString.Unsafe as BS +#if MIN_VERSION_bytestring(0,10,2) || defined(MIN_VERSION_bytestring_builder) +import Data.ByteString.Builder as BS +import Data.ByteString.Builder.Extra as BS (toLazyByteStringWith, + untrimmedStrategy) +#else +import Data.ByteString.Lazy.Builder as BS +import Data.ByteString.Lazy.Builder.Extras as BS (toLazyByteStringWith, + untrimmedStrategy) +#endif + +#ifdef TESTS +import qualified Prelude +import Test.QuickCheck +import Test.QuickCheck.Property (ioProperty) +import Control.Applicative ((<$>), (<*>)) +import Control.Monad (unless) +import Data.List (nub, sort, sortBy, stripPrefix, isPrefixOf) +import Data.Maybe +import Data.Function (on) +import Control.Exception (SomeException, try) +import Codec.Archive.Tar.Write as Tar +import qualified Data.ByteString.Handle as HBS +#endif + + +-- | An index of the entries in a tar file. +-- +-- This index type is designed to be quite compact and suitable to store either +-- on disk or in memory. +-- +data TarIndex = TarIndex + + -- As an example of how the mapping works, consider these example files: + -- "foo/bar.hs" at offset 0 + -- "foo/baz.hs" at offset 1024 + -- + -- We split the paths into components and enumerate them. + -- { "foo" -> TokenId 0, "bar.hs" -> TokenId 1, "baz.hs" -> TokenId 2 } + -- + -- We convert paths into sequences of 'TokenId's, i.e. + -- "foo/bar.hs" becomes [PathComponentId 0, PathComponentId 1] + -- "foo/baz.hs" becomes [PathComponentId 0, PathComponentId 2] + -- + -- We use a trie mapping sequences of 'PathComponentId's to the entry offset: + -- { [PathComponentId 0, PathComponentId 1] -> offset 0 + -- , [PathComponentId 0, PathComponentId 2] -> offset 1024 } + + -- The mapping of filepath components as strings to ids. + {-# UNPACK #-} !(StringTable PathComponentId) + + -- Mapping of sequences of filepath component ids to tar entry offsets. + {-# UNPACK #-} !(IntTrie PathComponentId TarEntryOffset) + + -- The offset immediatly after the last entry, where we would append any + -- additional entries. + {-# UNPACK #-} !TarEntryOffset + + deriving (Eq, Show, Typeable) + +instance NFData TarIndex where + rnf (TarIndex _ _ _) = () -- fully strict by construction + +-- | The result of 'lookup' in a 'TarIndex'. It can either be a file directly, +-- or a directory entry containing further entries (and all subdirectories +-- recursively). Note that the subtrees are constructed lazily, so it's +-- cheaper if you don't look at them. +-- +data TarIndexEntry = TarFileEntry {-# UNPACK #-} !TarEntryOffset + | TarDir [(FilePath, TarIndexEntry)] + deriving (Show, Typeable) + + +newtype PathComponentId = PathComponentId Int + deriving (Eq, Ord, Enum, Show, Typeable) + +-- | An offset within a tar file. Use 'hReadEntry', 'hReadEntryHeader' or +-- 'hSeekEntryOffset'. +-- +-- This is actually a tar \"record\" number, not a byte offset. +-- +type TarEntryOffset = Word32 + + +-- | Look up a given filepath in the 'TarIndex'. It may return a 'TarFileEntry' +-- containing the 'TarEntryOffset' of the file within the tar file, or if +-- the filepath identifies a directory then it returns a 'TarDir' containing +-- the list of files within that directory. +-- +-- Given the 'TarEntryOffset' you can then use one of the I\/O operations: +-- +-- * 'hReadEntry' to read the whole entry; +-- +-- * 'hReadEntryHeader' to read just the file metadata (e.g. its length); +-- +lookup :: TarIndex -> FilePath -> Maybe TarIndexEntry +lookup (TarIndex pathTable pathTrie _) path = do + fpath <- toComponentIds pathTable path + tentry <- IntTrie.lookup pathTrie fpath + return (mkIndexEntry tentry) + where + mkIndexEntry (IntTrie.Entry offset) = TarFileEntry offset + mkIndexEntry (IntTrie.Completions entries) = + TarDir [ (fromComponentId pathTable key, mkIndexEntry entry) + | (key, entry) <- entries ] + + +toComponentIds :: StringTable PathComponentId -> FilePath -> Maybe [PathComponentId] +toComponentIds table = + lookupComponents [] + . filter (/= BS.Char8.singleton '.') + . splitDirectories + . BS.Char8.pack + where + lookupComponents cs' [] = Just (reverse cs') + lookupComponents cs' (c:cs) = case StringTable.lookup table c of + Nothing -> Nothing + Just cid -> lookupComponents (cid:cs') cs + +fromComponentId :: StringTable PathComponentId -> PathComponentId -> FilePath +fromComponentId table = BS.Char8.unpack . StringTable.index table + +-- | All the files in the index with their corresponding 'TarEntryOffset's. +-- +-- Note that the files are in no special order. If you intend to read all or +-- most files then is is recommended to sort by the 'TarEntryOffset'. +-- +toList :: TarIndex -> [(FilePath, TarEntryOffset)] +toList (TarIndex pathTable pathTrie _) = + [ (path, off) + | (cids, off) <- IntTrie.toList pathTrie + , let path = FilePath.joinPath (map (fromComponentId pathTable) cids) ] + + +-- | Build a 'TarIndex' from a sequence of tar 'Entries'. The 'Entries' are +-- assumed to start at offset @0@ within a file. +-- +build :: Entries e -> Either e TarIndex +build = go empty + where + go !builder (Next e es) = go (addNextEntry e builder) es + go !builder Done = Right $! finalise builder + go !_ (Fail err) = Left err + + +-- $incremental-construction +-- If you need more control than 'build' then you can construct the index +-- in an acumulator style using the 'IndexBuilder' and operations. +-- +-- Start with 'empty' and use 'addNextEntry' (or 'skipNextEntry') for +-- each 'Entry' in the tar file in order. Every entry must added or skipped in +-- order, otherwise the resulting 'TarIndex' will report the wrong +-- 'TarEntryOffset's. At the end use 'finalise' to get the 'TarIndex'. +-- +-- For example, 'build' is simply: +-- +-- > build = go empty +-- > where +-- > go !builder (Next e es) = go (addNextEntry e builder) es +-- > go !builder Done = Right $! finalise builder +-- > go !_ (Fail err) = Left err + + +-- | The intermediate type used for incremental construction of a 'TarIndex'. +-- +data IndexBuilder + = IndexBuilder !(StringTableBuilder PathComponentId) + !(IntTrieBuilder PathComponentId TarEntryOffset) + {-# UNPACK #-} !TarEntryOffset + deriving (Eq, Show) + +instance NFData IndexBuilder where + rnf (IndexBuilder _ _ _) = () -- fully strict by construction + +-- | The initial empty 'IndexBuilder'. +-- +empty :: IndexBuilder +empty = IndexBuilder StringTable.empty IntTrie.empty 0 + +emptyIndex :: IndexBuilder +emptyIndex = empty +{-# DEPRECATED emptyIndex "Use TarIndex.empty" #-} + +-- | Add the next 'Entry' into the 'IndexBuilder'. +-- +addNextEntry :: Entry -> IndexBuilder -> IndexBuilder +addNextEntry entry (IndexBuilder stbl itrie nextOffset) = + IndexBuilder stbl' itrie' + (nextEntryOffset entry nextOffset) + where + !entrypath = splitTarPath (entryTarPath entry) + (stbl', cids) = StringTable.inserts entrypath stbl + itrie' = IntTrie.insert cids nextOffset itrie + +-- | Use this function if you want to skip some entries and not add them to the +-- final 'TarIndex'. +-- +skipNextEntry :: Entry -> IndexBuilder -> IndexBuilder +skipNextEntry entry (IndexBuilder stbl itrie nextOffset) = + IndexBuilder stbl itrie (nextEntryOffset entry nextOffset) + +-- | Finish accumulating 'Entry' information and build the compact 'TarIndex' +-- lookup structure. +-- +finalise :: IndexBuilder -> TarIndex +finalise (IndexBuilder stbl itrie finalOffset) = + TarIndex pathTable pathTrie finalOffset + where + pathTable = StringTable.finalise stbl + pathTrie = IntTrie.finalise itrie + +finaliseIndex :: IndexBuilder -> TarIndex +finaliseIndex = finalise +{-# DEPRECATED finaliseIndex "Use TarIndex.finalise" #-} + +-- | This is the offset immediately following the entry most recently added +-- to the 'IndexBuilder'. You might use this if you need to know the offsets +-- but don't want to use the 'TarIndex' lookup structure. +-- Use with 'hSeekEntryOffset'. See also 'nextEntryOffset'. +-- +indexNextEntryOffset :: IndexBuilder -> TarEntryOffset +indexNextEntryOffset (IndexBuilder _ _ off) = off + +-- | This is the offset immediately following the last entry in the tar file. +-- This can be useful to append further entries into the tar file. +-- Use with 'hSeekEntryOffset', or just use 'hSeekEndEntryOffset' directly. +-- +indexEndEntryOffset :: TarIndex -> TarEntryOffset +indexEndEntryOffset (TarIndex _ _ off) = off + +-- | Calculate the 'TarEntryOffset' of the next entry, given the size and +-- offset of the current entry. +-- +-- This is much like using 'skipNextEntry' and 'indexNextEntryOffset', but without +-- using an 'IndexBuilder'. +-- +nextEntryOffset :: Entry -> TarEntryOffset -> TarEntryOffset +nextEntryOffset entry offset = + offset + + 1 + + case entryContent entry of + NormalFile _ size -> blocks size + OtherEntryType _ _ size -> blocks size + _ -> 0 + where + -- NOTE: to avoid underflow, do the (fromIntegral :: Int64 -> Word32) last + blocks :: Int64 -> TarEntryOffset + blocks size = fromIntegral (1 + (size - 1) `div` 512) + +type FilePathBS = BS.ByteString + +splitTarPath :: TarPath -> [FilePathBS] +splitTarPath (TarPath name prefix) = + splitDirectories prefix ++ splitDirectories name + +splitDirectories :: FilePathBS -> [FilePathBS] +splitDirectories bs = + case BS.Char8.split '/' bs of + c:cs | BS.null c -> BS.Char8.singleton '/' : filter (not . BS.null) cs + cs -> filter (not . BS.null) cs + + +------------------------- +-- Resume building an existing index +-- + +-- | Resume building an existing index +-- +-- A 'TarIndex' is optimized for a highly compact and efficient in-memory +-- representation. This, however, makes it read-only. If you have an existing +-- 'TarIndex' for a large file, and want to add to it, you can translate the +-- 'TarIndex' back to an 'IndexBuilder'. Be aware that this is a relatively +-- costly operation (linear in the size of the 'TarIndex'), though still +-- faster than starting again from scratch. +-- +-- This is the left inverse to 'finalise' (modulo ordering). +-- +unfinalise :: TarIndex -> IndexBuilder +unfinalise (TarIndex pathTable pathTrie finalOffset) = + IndexBuilder (StringTable.unfinalise pathTable) + (IntTrie.unfinalise pathTrie) + finalOffset + + +------------------------- +-- I/O operations +-- + +-- | Reads an entire 'Entry' at the given 'TarEntryOffset' in the tar file. +-- The 'Handle' must be open for reading and be seekable. +-- +-- This reads the whole entry into memory strictly, not incrementally. For more +-- control, use 'hReadEntryHeader' and then read the entry content manually. +-- +hReadEntry :: Handle -> TarEntryOffset -> IO Entry +hReadEntry hnd off = do + entry <- hReadEntryHeader hnd off + case entryContent entry of + NormalFile _ size -> do body <- LBS.hGet hnd (fromIntegral size) + return entry { + entryContent = NormalFile body size + } + OtherEntryType c _ size -> do body <- LBS.hGet hnd (fromIntegral size) + return entry { + entryContent = OtherEntryType c body size + } + _ -> return entry + +-- | Read the header for a 'Entry' at the given 'TarEntryOffset' in the tar +-- file. The 'entryContent' will contain the correct metadata but an empty file +-- content. The 'Handle' must be open for reading and be seekable. +-- +-- The 'Handle' position is advanced to the beginning of the entry content (if +-- any). You must check the 'entryContent' to see if the entry is of type +-- 'NormalFile'. If it is, the 'NormalFile' gives the content length and you +-- are free to read this much data from the 'Handle'. +-- +-- > entry <- Tar.hReadEntryHeader hnd +-- > case Tar.entryContent entry of +-- > Tar.NormalFile _ size -> do content <- BS.hGet hnd size +-- > ... +-- +-- Of course you don't have to read it all in one go (as 'hReadEntry' does), +-- you can use any appropriate method to read it incrementally. +-- +-- In addition to I\/O errors, this can throw a 'FormatError' if the offset is +-- wrong, or if the file is not valid tar format. +-- +-- There is also the lower level operation 'hSeekEntryOffset'. +-- +hReadEntryHeader :: Handle -> TarEntryOffset -> IO Entry +hReadEntryHeader hnd blockOff = do + hSeekEntryOffset hnd blockOff + header <- LBS.hGet hnd 512 + case Tar.read header of + Tar.Next entry _ -> return entry + Tar.Fail e -> throwIO e + Tar.Done -> fail "hReadEntryHeader: impossible" + +-- | Set the 'Handle' position to the position corresponding to the given +-- 'TarEntryOffset'. +-- +-- This position is where the entry metadata can be read. If you already know +-- the entry has a body (and perhaps know it's length), you may wish to seek to +-- the body content directly using 'hSeekEntryContentOffset'. +-- +hSeekEntryOffset :: Handle -> TarEntryOffset -> IO () +hSeekEntryOffset hnd blockOff = + hSeek hnd AbsoluteSeek (fromIntegral blockOff * 512) + +-- | Set the 'Handle' position to the entry content position corresponding to +-- the given 'TarEntryOffset'. +-- +-- This position is where the entry content can be read using ordinary I\/O +-- operations (though you have to know in advance how big the entry content +-- is). This is /only valid/ if you /already know/ the entry has a body (i.e. +-- is a normal file). +-- +hSeekEntryContentOffset :: Handle -> TarEntryOffset -> IO () +hSeekEntryContentOffset hnd blockOff = + hSeekEntryOffset hnd (blockOff + 1) + +-- | This is a low level variant on 'hReadEntryHeader', that can be used to +-- iterate through a tar file, entry by entry. +-- +-- It has a few differences compared to 'hReadEntryHeader': +-- +-- * It returns an indication when the end of the tar file is reached. +-- +-- * It /does not/ move the 'Handle' position to the beginning of the entry +-- content. +-- +-- * It returns the 'TarEntryOffset' of the next entry. +-- +-- After this action, the 'Handle' position is not in any useful place. If +-- you want to skip to the next entry, take the 'TarEntryOffset' returned and +-- use 'hReadEntryHeaderOrEof' again. Or if having inspected the 'Entry' +-- header you want to read the entry content (if it has one) then use +-- 'hSeekEntryContentOffset' on the original input 'TarEntryOffset'. +-- +hReadEntryHeaderOrEof :: Handle -> TarEntryOffset + -> IO (Maybe (Entry, TarEntryOffset)) +hReadEntryHeaderOrEof hnd blockOff = do + hSeekEntryOffset hnd blockOff + header <- LBS.hGet hnd 1024 + case Tar.read header of + Tar.Next entry _ -> let !blockOff' = nextEntryOffset entry blockOff + in return (Just (entry, blockOff')) + Tar.Done -> return Nothing + Tar.Fail e -> throwIO e + +-- | Seek to the end of a tar file, to the position where new entries can +-- be appended, and return that 'TarEntryOffset'. +-- +-- If you have a valid 'TarIndex' for this tar file then you should supply it +-- because it allows seeking directly to the correct location. +-- +-- If you do not have an index, then this becomes an expensive linear +-- operation because we have to read each tar entry header from the beginning +-- to find the location immediately after the last entry (this is because tar +-- files have a variable length trailer and we cannot reliably find that by +-- starting at the end). In this mode, it will fail with an exception if the +-- file is not in fact in the tar format. +-- +hSeekEndEntryOffset :: Handle -> Maybe TarIndex -> IO TarEntryOffset +hSeekEndEntryOffset hnd (Just index) = do + let offset = indexEndEntryOffset index + hSeekEntryOffset hnd offset + return offset + +hSeekEndEntryOffset hnd Nothing = do + size <- hFileSize hnd + if size == 0 + then return 0 + else seekToEnd 0 + where + seekToEnd offset = do + mbe <- hReadEntryHeaderOrEof hnd offset + case mbe of + Nothing -> do hSeekEntryOffset hnd offset + return offset + Just (_, offset') -> seekToEnd offset' + +------------------------- +-- (de)serialisation +-- + +-- | The 'TarIndex' is compact in memory, and it has a similarly compact +-- external representation. +-- +serialise :: TarIndex -> BS.ByteString +serialise = toStrict . serialiseLBS + +-- we keep this version around just so we can check we got the size right. +serialiseLBS :: TarIndex -> LBS.ByteString +serialiseLBS index = + BS.toLazyByteStringWith + (BS.untrimmedStrategy (serialiseSize index) 512) LBS.empty + (serialiseBuilder index) + +serialiseSize :: TarIndex -> Int +serialiseSize (TarIndex stringTable intTrie _) = + StringTable.serialiseSize stringTable + + IntTrie.serialiseSize intTrie + + 8 + +serialiseBuilder :: TarIndex -> BS.Builder +serialiseBuilder (TarIndex stringTable intTrie finalOffset) = + BS.word32BE 2 -- format version + <> BS.word32BE finalOffset + <> StringTable.serialise stringTable + <> IntTrie.serialise intTrie + +-- | Read the external representation back into a 'TarIndex'. +-- +deserialise :: BS.ByteString -> Maybe (TarIndex, BS.ByteString) +deserialise bs + | BS.length bs < 8 + = Nothing + + | let ver = readWord32BE bs 0 + , ver == 1 + = do let !finalOffset = readWord32BE bs 4 + (stringTable, bs') <- StringTable.deserialiseV1 (BS.drop 8 bs) + (intTrie, bs'') <- IntTrie.deserialise bs' + return (TarIndex stringTable intTrie finalOffset, bs'') + + | let ver = readWord32BE bs 0 + , ver == 2 + = do let !finalOffset = readWord32BE bs 4 + (stringTable, bs') <- StringTable.deserialiseV2 (BS.drop 8 bs) + (intTrie, bs'') <- IntTrie.deserialise bs' + return (TarIndex stringTable intTrie finalOffset, bs'') + + | otherwise = Nothing + +readWord32BE :: BS.ByteString -> Int -> Word32 +readWord32BE bs i = + assert (i >= 0 && i+3 <= BS.length bs - 1) $ + fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24 + + fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16 + + fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8 + + fromIntegral (BS.unsafeIndex bs (i + 3)) + + +------------------------- +-- Test properties +-- + +#ifdef TESTS + +-- Not quite the properties of a finite mapping because we also have lookups +-- that result in completions. + +prop_lookup :: ValidPaths -> NonEmptyFilePath -> Bool +prop_lookup (ValidPaths paths) (NonEmptyFilePath p) = + case (lookup index p, Prelude.lookup p paths) of + (Nothing, Nothing) -> True + (Just (TarFileEntry offset), Just (_,offset')) -> offset == offset' + (Just (TarDir entries), Nothing) -> sort (nub (map fst entries)) + == sort (nub completions) + _ -> False + where + index = construct paths + completions = [ head (FilePath.splitDirectories completion) + | (path,_) <- paths + , completion <- maybeToList $ stripPrefix (p ++ "/") path ] + +prop_toList :: ValidPaths -> Bool +prop_toList (ValidPaths paths) = + sort (toList index) + == sort [ (path, off) | (path, (_sz, off)) <- paths ] + where + index = construct paths + +prop_valid :: ValidPaths -> Bool +prop_valid (ValidPaths paths) + | not $ StringTable.prop_valid pathbits = error "TarIndex: bad string table" + | not $ IntTrie.prop_lookup intpaths = error "TarIndex: bad int trie" + | not $ IntTrie.prop_completions intpaths = error "TarIndex: bad int trie" + | not $ prop' = error "TarIndex: bad prop" + | otherwise = True + + where + index@(TarIndex pathTable _ _) = construct paths + + pathbits = concatMap (map BS.Char8.pack . FilePath.splitDirectories . fst) + paths + intpaths = [ (cids, offset) + | (path, (_size, offset)) <- paths + , let Just cids = toComponentIds pathTable path ] + prop' = flip all paths $ \(file, (_size, offset)) -> + case lookup index file of + Just (TarFileEntry offset') -> offset' == offset + _ -> False + +prop_serialise_deserialise :: ValidPaths -> Bool +prop_serialise_deserialise (ValidPaths paths) = + Just (index, BS.empty) == (deserialise . serialise) index + where + index = construct paths + +prop_serialiseSize :: ValidPaths -> Bool +prop_serialiseSize (ValidPaths paths) = + case (LBS.toChunks . serialiseLBS) index of + [c1] -> BS.length c1 == serialiseSize index + _ -> False + where + index = construct paths + +newtype NonEmptyFilePath = NonEmptyFilePath FilePath deriving Show + +instance Arbitrary NonEmptyFilePath where + arbitrary = NonEmptyFilePath . FilePath.joinPath + <$> listOf1 (elements ["a", "b", "c", "d"]) + +newtype ValidPaths = ValidPaths [(FilePath, (Int64, TarEntryOffset))] deriving Show + +instance Arbitrary ValidPaths where + arbitrary = do + paths <- makeNoPrefix <$> listOf arbitraryPath + sizes <- vectorOf (length paths) (getNonNegative <$> arbitrary) + let offsets = scanl (\o sz -> o + 1 + blocks sz) 0 sizes + return (ValidPaths (zip paths (zip sizes offsets))) + where + arbitraryPath = FilePath.joinPath + <$> listOf1 (elements ["a", "b", "c", "d"]) + makeNoPrefix [] = [] + makeNoPrefix (k:ks) + | all (not . isPrefixOfOther k) ks + = k : makeNoPrefix ks + | otherwise = makeNoPrefix ks + + isPrefixOfOther a b = a `isPrefixOf` b || b `isPrefixOf` a + + blocks :: Int64 -> TarEntryOffset + blocks size = fromIntegral (1 + ((size - 1) `div` 512)) + +-- Helper for bulk construction. +construct :: [(FilePath, (Int64, TarEntryOffset))] -> TarIndex +construct = + either (\_ -> undefined) id + . build + . foldr (\(path, (size, _off)) es -> Next (testEntry path size) es) Done + +example0 :: Entries () +example0 = + testEntry "foo-1.0/foo-1.0.cabal" 1500 -- at block 0 + `Next` testEntry "foo-1.0/LICENSE" 2000 -- at block 4 + `Next` testEntry "foo-1.0/Data/Foo.hs" 1000 -- at block 9 + `Next` Done + +example1 :: Entries () +example1 = + Next (testEntry "./" 1500) Done <> example0 + +testEntry :: FilePath -> Int64 -> Entry +testEntry name size = simpleEntry path (NormalFile mempty size) + where + Right path = toTarPath False name + +-- | Simple tar archive containing regular files only +data SimpleTarArchive = SimpleTarArchive { + simpleTarEntries :: Tar.Entries () + , simpleTarRaw :: [(FilePath, LBS.ByteString)] + , simpleTarBS :: LBS.ByteString + } + +instance Show SimpleTarArchive where + show = show . simpleTarRaw + +prop_index_matches_tar :: SimpleTarArchive -> Property +prop_index_matches_tar sta = + ioProperty (try go >>= either (\e -> throwIO (e :: SomeException)) + (\_ -> return True)) + where + go :: IO () + go = do + h <- HBS.readHandle True (simpleTarBS sta) + goEntries h 0 (simpleTarEntries sta) + + goEntries :: Handle -> TarEntryOffset -> Tar.Entries () -> IO () + goEntries _ _ Tar.Done = + return () + goEntries _ _ (Tar.Fail _) = + throwIO (userError "Fail entry in SimpleTarArchive") + goEntries h offset (Tar.Next e es) = do + goEntry h offset e + goEntries h (nextEntryOffset e offset) es + + goEntry :: Handle -> TarEntryOffset -> Tar.Entry -> IO () + goEntry h offset e = do + e' <- hReadEntry h offset + case (Tar.entryContent e, Tar.entryContent e') of + (Tar.NormalFile bs sz, Tar.NormalFile bs' sz') -> + unless (sz == sz' && bs == bs') $ + throwIO $ userError "Entry mismatch" + _otherwise -> + throwIO $ userError "unexpected entry types" + +instance Arbitrary SimpleTarArchive where + arbitrary = do + numEntries <- sized $ \n -> choose (0, n) + rawEntries <- mkRaw numEntries + let entries = mkList rawEntries + return SimpleTarArchive { + simpleTarEntries = mkEntries entries + , simpleTarRaw = rawEntries + , simpleTarBS = Tar.write entries + } + where + mkRaw :: Int -> Gen [(FilePath, LBS.ByteString)] + mkRaw 0 = return [] + mkRaw n = do + -- Pick a size around 0, 1, or 2 block boundaries + sz <- sized $ \n -> elements (take n fileSizes) + bs <- LBS.pack `fmap` vectorOf sz arbitrary + es <- mkRaw (n - 1) + return $ ("file" ++ show n, bs) : es + + mkList :: [(FilePath, LBS.ByteString)] -> [Tar.Entry] + mkList [] = [] + mkList ((fp, bs):es) = entry : mkList es + where + Right path = toTarPath False fp + entry = simpleEntry path content + content = NormalFile bs (LBS.length bs) + + mkEntries :: [Tar.Entry] -> Tar.Entries () + mkEntries [] = Tar.Done + mkEntries (e:es) = Tar.Next e (mkEntries es) + + -- Sizes around 0, 1, and 2 block boundaries + fileSizes :: [Int] + fileSizes = [ + 0 , 1 , 2 + , 510 , 511 , 512 , 513 , 514 + , 1022 , 1023 , 1024 , 1025 , 1026 + ] + +-- | 'IndexBuilder' constructed from a 'SimpleIndex' +newtype SimpleIndexBuilder = SimpleIndexBuilder IndexBuilder + deriving Show + +instance Arbitrary SimpleIndexBuilder where + arbitrary = SimpleIndexBuilder . build' . simpleTarEntries <$> arbitrary + where + -- like 'build', but don't finalize + build' :: Show e => Entries e -> IndexBuilder + build' = go empty + where + go !builder (Next e es) = go (addNextEntry e builder) es + go !builder Done = builder + go !_ (Fail err) = error (show err) + +prop_finalise_unfinalise :: SimpleIndexBuilder -> Bool +prop_finalise_unfinalise (SimpleIndexBuilder index) = + unfinalise (finalise index) == index + +#endif + +toStrict :: LBS.ByteString -> BS.ByteString +#if MIN_VERSION_bytestring(0,10,0) +toStrict = LBS.toStrict +#else +toStrict = BS.concat . LBS.toChunks +#endif + +#if !(MIN_VERSION_base(4,5,0)) +(<>) :: Monoid m => m -> m -> m +(<>) = mappend +#endif diff --git a/Codec/Archive/Tar/Index/IntTrie.hs b/Codec/Archive/Tar/Index/IntTrie.hs new file mode 100644 index 0000000..4e8fe8f --- /dev/null +++ b/Codec/Archive/Tar/Index/IntTrie.hs @@ -0,0 +1,607 @@ +{-# LANGUAGE CPP, BangPatterns, PatternGuards #-} +{-# LANGUAGE DeriveDataTypeable, ScopedTypeVariables #-} + +module Codec.Archive.Tar.Index.IntTrie ( + + IntTrie, + construct, + toList, + + IntTrieBuilder, + empty, + insert, + finalise, + unfinalise, + + lookup, + TrieLookup(..), + + serialise, + serialiseSize, + deserialise, + +#ifdef TESTS + test1, test2, test3, + ValidPaths(..), + prop_lookup, + prop_completions, + prop_lookup_mono, + prop_completions_mono, + prop_construct_toList, + prop_finalise_unfinalise, + prop_serialise_deserialise, + prop_serialiseSize, +#endif + ) where + +import Prelude hiding (lookup) + +import Data.Typeable (Typeable) + +import qualified Data.Array.Unboxed as A +import Data.Array.IArray ((!)) +import qualified Data.Bits as Bits +import Data.Word (Word32) +import Data.Bits +import Data.Monoid (Monoid(..)) +#if (MIN_VERSION_base(4,5,0)) +import Data.Monoid ((<>)) +#endif +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS +import qualified Data.ByteString.Unsafe as BS +#if MIN_VERSION_bytestring(0,10,2) || defined(MIN_VERSION_bytestring_builder) +import Data.ByteString.Builder as BS +#else +import Data.ByteString.Lazy.Builder as BS +#endif +import Control.Exception (assert) +#if MIN_VERSION_containers(0,5,0) +import qualified Data.Map.Strict as Map +import qualified Data.IntMap.Strict as IntMap +import Data.IntMap.Strict (IntMap) +#else +import qualified Data.Map as Map +import qualified Data.IntMap as IntMap +import Data.IntMap (IntMap) +#endif + +import Data.List hiding (lookup, insert) +import Data.Function (on) + +#ifdef TESTS +import Test.QuickCheck +import Control.Applicative ((<$>), (<*>)) +#endif + + +-- | A compact mapping from sequences of nats to nats. +-- +-- NOTE: The tries in this module have values /only/ at the leaves (which +-- correspond to files), they do not have values at the branch points (which +-- correspond to directories). +newtype IntTrie k v = IntTrie (A.UArray Word32 Word32) + deriving (Eq, Show, Typeable) + + +-- Compact, read-only implementation of a trie. It's intended for use with file +-- paths, but we do that via string ids. + +#ifdef TESTS +-- Example mapping: +-- +example0 :: [(FilePath, Int)] +example0 = + [("foo-1.0/foo-1.0.cabal", 512) -- tar block 1 + ,("foo-1.0/LICENSE", 2048) -- tar block 4 + ,("foo-1.0/Data/Foo.hs", 4096)] -- tar block 8 + +-- After converting path components to integers this becomes: +-- +example1 :: [([Word32], Word32)] +example1 = + [([1,2], 512) + ,([1,3], 2048) + ,([1,4,5], 4096)] + +-- As a trie this looks like: + +-- [ (1, *) ] +-- | +-- [ (2, 512), (3, 1024), (4, *) ] +-- | +-- [ (5, 4096) ] + +-- We use an intermediate trie representation + +mktrie :: [(Int, TrieNode k v)] -> IntTrieBuilder k v +mkleaf :: (Enum k, Enum v) => k -> v -> (Int, TrieNode k v) +mknode :: Enum k => k -> IntTrieBuilder k v -> (Int, TrieNode k v) + +mktrie = IntTrieBuilder . IntMap.fromList +mkleaf k v = (fromEnum k, TrieLeaf (enumToWord32 v)) +mknode k t = (fromEnum k, TrieNode t) + +example2 :: IntTrieBuilder Word32 Word32 +example2 = mktrie [ mknode 1 t1 ] + where + t1 = mktrie [ mkleaf 2 512, mkleaf 3 2048, mknode 4 t2 ] + t2 = mktrie [ mkleaf 5 4096 ] + + +example2' :: IntTrieBuilder Word32 Word32 +example2' = mktrie [ mknode 0 t1 ] + where + t1 = mktrie [ mknode 3 t2 ] + t2 = mktrie [ mknode 1 t3, mknode 2 t4 ] + t3 = mktrie [ mkleaf 4 10608 ] + t4 = mktrie [ mkleaf 4 10612 ] +{- +0: [1,N0,3] + + 3: [1,N3,6] + + 6: [2,N1,N2,11,12] + + 11: [1,4,10608] + 14: [1,4,10612] +-} + +example2'' :: IntTrieBuilder Word32 Word32 +example2'' = mktrie [ mknode 1 t1, mknode 2 t2 ] + where + t1 = mktrie [ mkleaf 4 10608 ] + t2 = mktrie [ mkleaf 4 10612 ] + +example2''' :: IntTrieBuilder Word32 Word32 +example2''' = mktrie [ mknode 0 t3 ] + where + t3 = mktrie [ mknode 4 t8, mknode 6 t11 ] + t8 = mktrie [ mknode 1 t14 ] + t11 = mktrie [ mkleaf 5 10605 ] + t14 = mktrie [ mknode 2 t19, mknode 3 t22 ] + t19 = mktrie [ mkleaf 7 10608 ] + t22 = mktrie [ mkleaf 7 10612 ] +{- + 0: [1,N0,3] + 3: [2,N4,N6,8,11] + 8: [1,N1,11] +11: [1,5,10605] +14: [2,N2,N3,16,19] +19: [1,7,10608] +22: [1,7,10612] +-} + +-- We convert from the 'Paths' to the 'IntTrieBuilder' using 'inserts': +-- +test1 = example2 == inserts example1 empty +#endif + +-- Each node has a size and a sequence of keys followed by an equal length +-- sequnce of corresponding entries. Since we're going to flatten this into +-- a single array then we will need to replace the trie structure with pointers +-- represented as array offsets. + +-- Each node is a pair of arrays, one of keys and one of Either value pointer. +-- We need to distinguish values from internal pointers. We use a tag bit: +-- +tagLeaf, tagNode, untag :: Word32 -> Word32 +tagLeaf = id +tagNode = flip Bits.setBit 31 +untag = flip Bits.clearBit 31 + +isNode :: Word32 -> Bool +isNode = flip Bits.testBit 31 + +-- So the overall array form of the above trie is: +-- +-- offset: 0 1 2 3 4 5 6 7 8 9 10 11 12 +-- array: [ 1 | N1 | 3 ][ 3 | 2, 3, N4 | 512, 2048, 10 ][ 1 | 5 | 4096 ] +-- \__/ \___/ + +#ifdef TESTS +example3 :: [Word32] +example3 = + [1, tagNode 1, + 3, + 3, tagLeaf 2, tagLeaf 3, tagNode 4, + 512, 2048, 10, + 1, tagLeaf 5, + 4096 + ] + +-- We get the array form by using flattenTrie: + +test2 = example3 == flattenTrie example2 + +example4 :: IntTrie Int Int +example4 = IntTrie (mkArray example3) + +mkArray :: [Word32] -> A.UArray Word32 Word32 +mkArray xs = A.listArray (0, fromIntegral (length xs) - 1) xs + +test3 = case lookup example4 [1] of + Just (Completions [(2,_),(3,_),(4,_)]) -> True + _ -> False + +test1, test2, test3 :: Bool +#endif + +------------------------------------- +-- Decoding the trie array form +-- + +completionsFrom :: (Enum k, Enum v) => IntTrie k v -> Word32 -> Completions k v +completionsFrom trie@(IntTrie arr) nodeOff = + [ (word32ToEnum (untag key), next) + | keyOff <- [keysStart..keysEnd] + , let key = arr ! keyOff + entry = arr ! (keyOff + nodeSize) + next | isNode key = Completions (completionsFrom trie entry) + | otherwise = Entry (word32ToEnum entry) + ] + where + nodeSize = arr ! nodeOff + keysStart = nodeOff + 1 + keysEnd = nodeOff + nodeSize + +-- | Convert the trie to a list +-- +-- This is the left inverse to 'construct' (modulo ordering). +toList :: forall k v. (Enum k, Enum v) => IntTrie k v -> [([k], v)] +toList = concatMap (aux []) . (`completionsFrom` 0) + where + aux :: [k] -> (k, TrieLookup k v) -> [([k], v)] + aux ks (k, Entry v) = [(reverse (k:ks), v)] + aux ks (k, Completions cs) = concatMap (aux (k:ks)) cs + +------------------------------------- +-- Toplevel trie array construction +-- + +-- So constructing the 'IntTrie' as a whole is just a matter of stringing +-- together all the bits + +-- | Build an 'IntTrie' from a bunch of (key, value) pairs, where the keys +-- are sequences. +-- +construct :: (Enum k, Enum v) => [([k], v)] -> IntTrie k v +construct = finalise . flip inserts empty + + +--------------------------------- +-- Looking up in the trie array +-- + +data TrieLookup k v = Entry !v | Completions (Completions k v) deriving Show +type Completions k v = [(k, TrieLookup k v)] + +lookup :: forall k v. (Enum k, Enum v) => IntTrie k v -> [k] -> Maybe (TrieLookup k v) +lookup trie@(IntTrie arr) = go 0 + where + go :: Word32 -> [k] -> Maybe (TrieLookup k v) + go nodeOff [] = Just (completions nodeOff) + go nodeOff (k:ks) = case search nodeOff (tagLeaf k') of + Just entryOff + | null ks -> Just (entry entryOff) + | otherwise -> Nothing + Nothing -> case search nodeOff (tagNode k') of + Nothing -> Nothing + Just entryOff -> go (arr ! entryOff) ks + where + k' = enumToWord32 k + + entry entryOff = Entry (word32ToEnum (arr ! entryOff)) + completions nodeOff = Completions (completionsFrom trie nodeOff) + + search :: Word32 -> Word32 -> Maybe Word32 + search nodeOff key = fmap (+nodeSize) (bsearch keysStart keysEnd key) + where + nodeSize = arr ! nodeOff + keysStart = nodeOff + 1 + keysEnd = nodeOff + nodeSize + + bsearch :: Word32 -> Word32 -> Word32 -> Maybe Word32 + bsearch a b key + | a > b = Nothing + | otherwise = case compare key (arr ! mid) of + LT -> bsearch a (mid-1) key + EQ -> Just mid + GT -> bsearch (mid+1) b key + where mid = (a + b) `div` 2 + + +enumToWord32 :: Enum n => n -> Word32 +enumToWord32 = fromIntegral . fromEnum + +word32ToEnum :: Enum n => Word32 -> n +word32ToEnum = toEnum . fromIntegral + + +------------------------- +-- Building Tries +-- + +newtype IntTrieBuilder k v = IntTrieBuilder (IntMap (TrieNode k v)) + deriving (Show, Eq) + +data TrieNode k v = TrieLeaf {-# UNPACK #-} !Word32 + | TrieNode !(IntTrieBuilder k v) + deriving (Show, Eq) + +empty :: IntTrieBuilder k v +empty = IntTrieBuilder IntMap.empty + +insert :: (Enum k, Enum v) => [k] -> v + -> IntTrieBuilder k v -> IntTrieBuilder k v +insert [] _v t = t +insert (k:ks) v t = insertTrie (fromEnum k) (map fromEnum ks) (enumToWord32 v) t + +insertTrie :: Int -> [Int] -> Word32 + -> IntTrieBuilder k v -> IntTrieBuilder k v +insertTrie k ks v (IntTrieBuilder t) = + IntTrieBuilder $ + IntMap.alter (\t' -> Just $! maybe (freshTrieNode ks v) + (insertTrieNode ks v) t') + k t + +insertTrieNode :: [Int] -> Word32 -> TrieNode k v -> TrieNode k v +insertTrieNode [] v _ = TrieLeaf v +insertTrieNode (k:ks) v (TrieLeaf _) = TrieNode (freshTrie k ks v) +insertTrieNode (k:ks) v (TrieNode t) = TrieNode (insertTrie k ks v t) + +freshTrie :: Int -> [Int] -> Word32 -> IntTrieBuilder k v +freshTrie k [] v = + IntTrieBuilder (IntMap.singleton k (TrieLeaf v)) +freshTrie k (k':ks) v = + IntTrieBuilder (IntMap.singleton k (TrieNode (freshTrie k' ks v))) + +freshTrieNode :: [Int] -> Word32 -> TrieNode k v +freshTrieNode [] v = TrieLeaf v +freshTrieNode (k:ks) v = TrieNode (freshTrie k ks v) + +inserts :: (Enum k, Enum v) => [([k], v)] + -> IntTrieBuilder k v -> IntTrieBuilder k v +inserts kvs t = foldl' (\t' (ks, v) -> insert ks v t') t kvs + +finalise :: IntTrieBuilder k v -> IntTrie k v +finalise trie = + IntTrie $ + A.listArray (0, fromIntegral (flatTrieLength trie) - 1) + (flattenTrie trie) + +unfinalise :: (Enum k, Enum v) => IntTrie k v -> IntTrieBuilder k v +unfinalise trie = + go (completionsFrom trie 0) + where + go kns = + IntTrieBuilder $ + IntMap.fromList + [ (fromEnum k, t) + | (k, n) <- kns + , let t = case n of + Entry v -> TrieLeaf (enumToWord32 v) + Completions kns' -> TrieNode (go kns') + ] + +--------------------------------- +-- Flattening Tries +-- + +type Offset = Int + +flatTrieLength :: IntTrieBuilder k v -> Int +flatTrieLength (IntTrieBuilder tns) = + 1 + + 2 * IntMap.size tns + + sum [ flatTrieLength n | TrieNode n <- IntMap.elems tns ] + +-- This is a breadth-first traversal. We keep a list of the tries that we are +-- to write out next. Each of these have an offset allocated to them at the +-- time we put them into the list. We keep a running offset so we know where +-- to allocate next. +-- +flattenTrie :: IntTrieBuilder k v -> [Word32] +flattenTrie trie = go (queue [trie]) (size trie) + where + size (IntTrieBuilder tns) = 1 + 2 * IntMap.size tns + + go :: Q (IntTrieBuilder k v) -> Offset -> [Word32] + go todo !offset = + case dequeue todo of + Nothing -> [] + Just (IntTrieBuilder tnodes, tries) -> + flat ++ go tries' offset' + where + !count = IntMap.size tnodes + flat = fromIntegral count + : Map.keys keysValues + ++ Map.elems keysValues + (!offset', !keysValues, !tries') = +#if MIN_VERSION_containers(0,4,2) + IntMap.foldlWithKey' accumNodes + (offset, Map.empty, tries) + tnodes +#else + foldl' (\a (k,v) -> accumNodes a k v) + (offset, Map.empty, tries) + (IntMap.toList tnodes) +#endif + + accumNodes :: (Offset, Map.Map Word32 Word32, Q (IntTrieBuilder k v)) + -> Int -> TrieNode k v + -> (Offset, Map.Map Word32 Word32, Q (IntTrieBuilder k v)) + accumNodes (!off, !kvs, !tries) !k (TrieLeaf v) = + (off, kvs', tries) + where + kvs' = Map.insert (tagLeaf (int2Word32 k)) v kvs + + accumNodes (!off, !kvs, !tries) !k (TrieNode t) = + (off + size t, kvs', tries') + where + kvs' = Map.insert (tagNode (int2Word32 k)) (int2Word32 off) kvs + tries' = enqueue tries t + +data Q a = Q [a] [a] + +queue :: [a] -> Q a +queue xs = Q xs [] + +enqueue :: Q a -> a -> Q a +enqueue (Q front back) x = Q front (x : back) + +dequeue :: Q a -> Maybe (a, Q a) +dequeue (Q (x:xs) back) = Just (x, Q xs back) +dequeue (Q [] back) = case reverse back of + x:xs -> Just (x, Q xs []) + [] -> Nothing + +int2Word32 :: Int -> Word32 +int2Word32 = fromIntegral + + +------------------------- +-- (de)serialisation +-- + +serialise :: IntTrie k v -> BS.Builder +serialise (IntTrie arr) = + let (_, !ixEnd) = A.bounds arr in + BS.word32BE (ixEnd+1) + <> foldr (\n r -> BS.word32BE n <> r) mempty (A.elems arr) + +serialiseSize :: IntTrie k v -> Int +serialiseSize (IntTrie arr) = + let (_, ixEnd) = A.bounds arr in + 4 + + 4 * (fromIntegral ixEnd + 1) + +deserialise :: BS.ByteString -> Maybe (IntTrie k v, BS.ByteString) +deserialise bs + | BS.length bs >= 4 + , let lenArr = readWord32BE bs 0 + lenTotal = 4 + 4 * fromIntegral lenArr + , BS.length bs >= 4 + 4 * fromIntegral lenArr + , let !arr = A.array (0, lenArr-1) + [ (i, readWord32BE bs off) + | (i, off) <- zip [0..lenArr-1] [4,8 .. lenTotal - 4] ] + !bs' = BS.drop lenTotal bs + = Just (IntTrie arr, bs') + + | otherwise + = Nothing + +readWord32BE :: BS.ByteString -> Int -> Word32 +readWord32BE bs i = + assert (i >= 0 && i+3 <= BS.length bs - 1) $ + fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24 + + fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16 + + fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8 + + fromIntegral (BS.unsafeIndex bs (i + 3)) + + +------------------------- +-- Correctness property +-- + +#ifdef TESTS + +prop_lookup :: (Ord k, Enum k, Eq v, Enum v, Show k, Show v) + => [([k], v)] -> Bool +prop_lookup paths = + flip all paths $ \(key, value) -> + case lookup trie key of + Just (Entry value') | value' == value -> True + Just (Entry value') -> error $ "IntTrie: " ++ show (key, value, value') + Nothing -> error $ "IntTrie: didn't find " ++ show key + Just (Completions xs) -> error $ "IntTrie: " ++ show xs + + where + trie = construct paths + +prop_completions :: forall k v. (Ord k, Enum k, Eq v, Enum v) => [([k], v)] -> Bool +prop_completions paths = + inserts paths empty + == convertCompletions (completionsFrom (construct paths) 0) + where + convertCompletions :: Ord k => Completions k v -> IntTrieBuilder k v + convertCompletions kls = + IntTrieBuilder $ + IntMap.fromList + [ case l of + Entry v -> mkleaf k v + Completions kls' -> mknode k (convertCompletions kls') + | (k, l) <- sortBy (compare `on` fst) kls ] + + +prop_lookup_mono :: ValidPaths -> Bool +prop_lookup_mono (ValidPaths paths) = prop_lookup paths + +prop_completions_mono :: ValidPaths -> Bool +prop_completions_mono (ValidPaths paths) = prop_completions paths + +prop_construct_toList :: ValidPaths -> Bool +prop_construct_toList (ValidPaths paths) = + sortBy (compare `on` fst) (toList (construct paths)) + == sortBy (compare `on` fst) paths + +prop_finalise_unfinalise :: ValidPaths -> Bool +prop_finalise_unfinalise (ValidPaths paths) = + builder == unfinalise (finalise builder) + where + builder :: IntTrieBuilder Char Char + builder = inserts paths empty + +prop_serialise_deserialise :: ValidPaths -> Bool +prop_serialise_deserialise (ValidPaths paths) = + Just (trie, BS.empty) == (deserialise + . toStrict . BS.toLazyByteString + . serialise) trie + where + trie :: IntTrie Char Char + trie = construct paths + +prop_serialiseSize :: ValidPaths -> Bool +prop_serialiseSize (ValidPaths paths) = + (fromIntegral . LBS.length . BS.toLazyByteString . serialise) trie + == serialiseSize trie + where + trie :: IntTrie Char Char + trie = construct paths + +newtype ValidPaths = ValidPaths [([Char], Char)] deriving Show + +instance Arbitrary ValidPaths where + arbitrary = + ValidPaths . makeNoPrefix <$> listOf ((,) <$> listOf1 arbitrary <*> arbitrary) + where + makeNoPrefix [] = [] + makeNoPrefix ((k,v):kvs) + | all (\(k', _) -> not (isPrefixOfOther k k')) kvs + = (k,v) : makeNoPrefix kvs + | otherwise = makeNoPrefix kvs + + shrink (ValidPaths kvs) = + map ValidPaths . filter noPrefix . filter nonEmpty . shrink $ kvs + where + noPrefix [] = True + noPrefix ((k,_):kvs') = all (\(k', _) -> not (isPrefixOfOther k k')) kvs' + && noPrefix kvs' + nonEmpty = all (not . null . fst) + +isPrefixOfOther a b = a `isPrefixOf` b || b `isPrefixOf` a + +toStrict :: LBS.ByteString -> BS.ByteString +#if MIN_VERSION_bytestring(0,10,0) +toStrict = LBS.toStrict +#else +toStrict = BS.concat . LBS.toChunks +#endif + +#endif + +#if !(MIN_VERSION_base(4,5,0)) +(<>) :: Monoid m => m -> m -> m +(<>) = mappend +#endif + diff --git a/Codec/Archive/Tar/Index/StringTable.hs b/Codec/Archive/Tar/Index/StringTable.hs new file mode 100644 index 0000000..688bd80 --- /dev/null +++ b/Codec/Archive/Tar/Index/StringTable.hs @@ -0,0 +1,324 @@ +{-# LANGUAGE CPP, BangPatterns, PatternGuards, DeriveDataTypeable #-} + +module Codec.Archive.Tar.Index.StringTable ( + + StringTable, + lookup, + index, + construct, + + StringTableBuilder, + empty, + insert, + inserts, + finalise, + unfinalise, + + serialise, + serialiseSize, + deserialiseV1, + deserialiseV2, + +#ifdef TESTS + prop_valid, + prop_sorted, + prop_finalise_unfinalise, + prop_serialise_deserialise, + prop_serialiseSize, +#endif + ) where + +import Data.Typeable (Typeable) + +import Prelude hiding (lookup, id) +import Data.List hiding (lookup, insert) +import Data.Function (on) +import Data.Word (Word32) +import Data.Int (Int32) +import Data.Bits +import Data.Monoid (Monoid(..)) +#if (MIN_VERSION_base(4,5,0)) +import Data.Monoid ((<>)) +#endif +import Control.Exception (assert) + +import qualified Data.Array.Unboxed as A +import Data.Array.Unboxed ((!)) +#if MIN_VERSION_containers(0,5,0) +import qualified Data.Map.Strict as Map +import Data.Map.Strict (Map) +#else +import qualified Data.Map as Map +import Data.Map (Map) +#endif +import qualified Data.ByteString as BS +import qualified Data.ByteString.Unsafe as BS +import qualified Data.ByteString.Lazy as LBS +#if MIN_VERSION_bytestring(0,10,2) || defined(MIN_VERSION_bytestring_builder) +import Data.ByteString.Builder as BS +import Data.ByteString.Builder.Extra as BS (byteStringCopy) +#else +import Data.ByteString.Lazy.Builder as BS +import Data.ByteString.Lazy.Builder.Extras as BS (byteStringCopy) +#endif + + +-- | An effecient mapping from strings to a dense set of integers. +-- +data StringTable id = StringTable + {-# UNPACK #-} !BS.ByteString -- all strings concatenated + {-# UNPACK #-} !(A.UArray Int32 Word32) -- string offset table + {-# UNPACK #-} !(A.UArray Int32 Int32) -- string index to id table + {-# UNPACK #-} !(A.UArray Int32 Int32) -- string id to index table + deriving (Show, Typeable) + +instance (Eq id, Enum id) => Eq (StringTable id) where + tbl1 == tbl2 = unfinalise tbl1 == unfinalise tbl2 + +-- | Look up a string in the token table. If the string is present, return +-- its corresponding index. +-- +lookup :: Enum id => StringTable id -> BS.ByteString -> Maybe id +lookup (StringTable bs offsets ids _ixs) str = + binarySearch 0 (topBound-1) str + where + (0, topBound) = A.bounds offsets + + binarySearch !a !b !key + | a > b = Nothing + | otherwise = case compare key (index' bs offsets mid) of + LT -> binarySearch a (mid-1) key + EQ -> Just $! toEnum (fromIntegral (ids ! mid)) + GT -> binarySearch (mid+1) b key + where mid = (a + b) `div` 2 + +index' :: BS.ByteString -> A.UArray Int32 Word32 -> Int32 -> BS.ByteString +index' bs offsets i = BS.unsafeTake len . BS.unsafeDrop start $ bs + where + start, end, len :: Int + start = fromIntegral (offsets ! i) + end = fromIntegral (offsets ! (i+1)) + len = end - start + + +-- | Given the index of a string in the table, return the string. +-- +index :: Enum id => StringTable id -> id -> BS.ByteString +index (StringTable bs offsets _ids ixs) = + index' bs offsets . (ixs !) . fromIntegral . fromEnum + + +-- | Given a list of strings, construct a 'StringTable' mapping those strings +-- to a dense set of integers. Also return the ids for all the strings used +-- in the construction. +-- +construct :: Enum id => [BS.ByteString] -> StringTable id +construct = finalise . foldl' (\tbl s -> fst (insert s tbl)) empty + + +data StringTableBuilder id = StringTableBuilder + !(Map BS.ByteString id) + {-# UNPACK #-} !Word32 + deriving (Eq, Show, Typeable) + +empty :: StringTableBuilder id +empty = StringTableBuilder Map.empty 0 + +insert :: Enum id => BS.ByteString -> StringTableBuilder id -> (StringTableBuilder id, id) +insert str builder@(StringTableBuilder smap nextid) = + case Map.lookup str smap of + Just id -> (builder, id) + Nothing -> let !id = toEnum (fromIntegral nextid) + !smap' = Map.insert str id smap + in (StringTableBuilder smap' (nextid+1), id) + +inserts :: Enum id => [BS.ByteString] -> StringTableBuilder id -> (StringTableBuilder id, [id]) +inserts bss builder = mapAccumL (flip insert) builder bss + +finalise :: Enum id => StringTableBuilder id -> StringTable id +finalise (StringTableBuilder smap _) = + (StringTable strs offsets ids ixs) + where + strs = BS.concat (Map.keys smap) + offsets = A.listArray (0, fromIntegral (Map.size smap)) + . scanl (\off str -> off + fromIntegral (BS.length str)) 0 + $ Map.keys smap + ids = A.listArray (0, fromIntegral (Map.size smap) - 1) + . map (fromIntegral . fromEnum) + $ Map.elems smap + ixs = A.array (A.bounds ids) [ (id,ix) | (ix,id) <- A.assocs ids ] + +unfinalise :: Enum id => StringTable id -> StringTableBuilder id +unfinalise (StringTable strs offsets ids _) = + StringTableBuilder smap nextid + where + smap = Map.fromAscList + [ (index' strs offsets ix, toEnum (fromIntegral (ids ! ix))) + | ix <- [0..h] ] + (0,h) = A.bounds ids + nextid = fromIntegral (h+1) + + +------------------------- +-- (de)serialisation +-- + +serialise :: StringTable id -> BS.Builder +serialise (StringTable strs offs ids ixs) = + let (_, !ixEnd) = A.bounds offs in + + BS.word32BE (fromIntegral (BS.length strs)) + <> BS.word32BE (fromIntegral ixEnd + 1) + <> BS.byteStringCopy strs + <> foldr (\n r -> BS.word32BE n <> r) mempty (A.elems offs) + <> foldr (\n r -> BS.int32BE n <> r) mempty (A.elems ids) + <> foldr (\n r -> BS.int32BE n <> r) mempty (A.elems ixs) + +serialiseSize :: StringTable id -> Int +serialiseSize (StringTable strs offs _ids _ixs) = + let (_, !ixEnd) = A.bounds offs + in 4 * 2 + + BS.length strs + + 4 * (fromIntegral ixEnd + 1) + + 8 * fromIntegral ixEnd + +deserialiseV1 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString) +deserialiseV1 bs + | BS.length bs >= 8 + , let lenStrs = fromIntegral (readWord32BE bs 0) + lenArr = fromIntegral (readWord32BE bs 4) + lenTotal= 8 + lenStrs + 4 * lenArr + , BS.length bs >= lenTotal + , let strs = BS.take lenStrs (BS.drop 8 bs) + arr = A.array (0, fromIntegral lenArr - 1) + [ (i, readWord32BE bs off) + | (i, off) <- zip [0 .. fromIntegral lenArr - 1] + [offArrS,offArrS+4 .. offArrE] + ] + ids = A.array (0, fromIntegral lenArr - 1) + [ (i,i) | i <- [0 .. fromIntegral lenArr - 1] ] + ixs = ids -- two identity mappings + offArrS = 8 + lenStrs + offArrE = offArrS + 4 * lenArr - 1 + !stringTable = StringTable strs arr ids ixs + !bs' = BS.drop lenTotal bs + = Just (stringTable, bs') + + | otherwise + = Nothing + +deserialiseV2 :: BS.ByteString -> Maybe (StringTable id, BS.ByteString) +deserialiseV2 bs + | BS.length bs >= 8 + , let lenStrs = fromIntegral (readWord32BE bs 0) + lenArr = fromIntegral (readWord32BE bs 4) + lenTotal= 8 -- the two length prefixes + + lenStrs + + 4 * lenArr + +(4 * (lenArr - 1)) * 2 -- offsets array is 1 longer + , BS.length bs >= lenTotal + , let strs = BS.take lenStrs (BS.drop 8 bs) + offs = A.listArray (0, fromIntegral lenArr - 1) + [ readWord32BE bs off + | off <- offsets offsOff ] + -- the second two arrays are 1 shorter + ids = A.listArray (0, fromIntegral lenArr - 2) + [ readInt32BE bs off + | off <- offsets idsOff ] + ixs = A.listArray (0, fromIntegral lenArr - 2) + [ readInt32BE bs off + | off <- offsets ixsOff ] + offsOff = 8 + lenStrs + idsOff = offsOff + 4 * lenArr + ixsOff = idsOff + 4 * (lenArr-1) + offsets from = [from,from+4 .. from + 4 * (lenArr - 1)] + !stringTable = StringTable strs offs ids ixs + !bs' = BS.drop lenTotal bs + = Just (stringTable, bs') + + | otherwise + = Nothing + +readInt32BE :: BS.ByteString -> Int -> Int32 +readInt32BE bs i = fromIntegral (readWord32BE bs i) + +readWord32BE :: BS.ByteString -> Int -> Word32 +readWord32BE bs i = + assert (i >= 0 && i+3 <= BS.length bs - 1) $ + fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24 + + fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16 + + fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8 + + fromIntegral (BS.unsafeIndex bs (i + 3)) + +#ifdef TESTS + +prop_valid :: [BS.ByteString] -> Bool +prop_valid strs = + all lookupIndex (enumStrings tbl) + && all indexLookup (enumIds tbl) + + where + tbl :: StringTable Int + tbl = construct strs + + lookupIndex str = index tbl ident == str + where Just ident = lookup tbl str + + indexLookup ident = lookup tbl str == Just ident + where str = index tbl ident + +-- this is important so we can use Map.fromAscList +prop_sorted :: [BS.ByteString] -> Bool +prop_sorted strings = + isSorted [ index' strs offsets ix + | ix <- A.range (A.bounds ids) ] + where + _tbl :: StringTable Int + _tbl@(StringTable strs offsets ids _ixs) = construct strings + isSorted xs = and (zipWith (<) xs (tail xs)) + +prop_finalise_unfinalise :: [BS.ByteString] -> Bool +prop_finalise_unfinalise strs = + builder == unfinalise (finalise builder) + where + builder :: StringTableBuilder Int + builder = foldl' (\tbl s -> fst (insert s tbl)) empty strs + +prop_serialise_deserialise :: [BS.ByteString] -> Bool +prop_serialise_deserialise strs = + Just (strtable, BS.empty) == (deserialiseV2 + . toStrict . BS.toLazyByteString + . serialise) strtable + where + strtable :: StringTable Int + strtable = construct strs + +prop_serialiseSize :: [BS.ByteString] -> Bool +prop_serialiseSize strs = + (fromIntegral . LBS.length . BS.toLazyByteString . serialise) strtable + == serialiseSize strtable + where + strtable :: StringTable Int + strtable = construct strs + +enumStrings :: Enum id => StringTable id -> [BS.ByteString] +enumStrings (StringTable bs offsets _ _) = map (index' bs offsets) [0..h-1] + where (0,h) = A.bounds offsets + +enumIds :: Enum id => StringTable id -> [id] +enumIds (StringTable _ offsets _ _) = [toEnum 0 .. toEnum (fromIntegral (h-1))] + where (0,h) = A.bounds offsets + +toStrict :: LBS.ByteString -> BS.ByteString +#if MIN_VERSION_bytestring(0,10,0) +toStrict = LBS.toStrict +#else +toStrict = BS.concat . LBS.toChunks +#endif + +#endif + +#if !(MIN_VERSION_base(4,5,0)) +(<>) :: Monoid m => m -> m -> m +(<>) = mappend +#endif diff --git a/Codec/Archive/Tar/Pack.hs b/Codec/Archive/Tar/Pack.hs new file mode 100644 index 0000000..2057203 --- /dev/null +++ b/Codec/Archive/Tar/Pack.hs @@ -0,0 +1,189 @@ +{-# LANGUAGE CPP #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009, 2012, 2016 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Pack ( + pack, + packFileEntry, + packDirectoryEntry, + + getDirectoryContentsRecursive, + ) where + +import Codec.Archive.Tar.Types + +import qualified Data.ByteString.Lazy as BS +import System.FilePath + ( () ) +import qualified System.FilePath as FilePath.Native + ( addTrailingPathSeparator, hasTrailingPathSeparator ) +import System.Directory + ( getDirectoryContents, doesDirectoryExist, getModificationTime + , Permissions(..), getPermissions ) +#if MIN_VERSION_directory(1,2,0) +-- The directory package switched to the new time package +import Data.Time.Clock + ( UTCTime ) +import Data.Time.Clock.POSIX + ( utcTimeToPOSIXSeconds ) +#else +import System.Time + ( ClockTime(..) ) +#endif +import System.IO + ( IOMode(ReadMode), openBinaryFile, hFileSize ) +import System.IO.Unsafe (unsafeInterleaveIO) + +-- | Creates a tar archive from a list of directory or files. Any directories +-- specified will have their contents included recursively. Paths in the +-- archive will be relative to the given base directory. +-- +-- This is a portable implementation of packing suitable for portable archives. +-- In particular it only constructs 'NormalFile' and 'Directory' entries. Hard +-- links and symbolic links are treated like ordinary files. It cannot be used +-- to pack directories containing recursive symbolic links. Special files like +-- FIFOs (named pipes), sockets or device files will also cause problems. +-- +-- An exception will be thrown for any file names that are too long to +-- represent as a 'TarPath'. +-- +-- * This function returns results lazily. Subdirectories are scanned +-- and files are read one by one as the list of entries is consumed. +-- +pack :: FilePath -- ^ Base directory + -> [FilePath] -- ^ Files and directories to pack, relative to the base dir + -> IO [Entry] +pack baseDir paths0 = preparePaths baseDir paths0 >>= packPaths baseDir + +preparePaths :: FilePath -> [FilePath] -> IO [FilePath] +preparePaths baseDir paths = + fmap concat $ interleave + [ do isDir <- doesDirectoryExist (baseDir path) + if isDir + then do entries <- getDirectoryContentsRecursive (baseDir path) + let entries' = map (path ) entries + dir = FilePath.Native.addTrailingPathSeparator path + if null path then return entries' + else return (dir : entries') + else return [path] + | path <- paths ] + +packPaths :: FilePath -> [FilePath] -> IO [Entry] +packPaths baseDir paths = + interleave + [ do tarpath <- either fail return (toTarPath isDir relpath) + if isDir then packDirectoryEntry filepath tarpath + else packFileEntry filepath tarpath + | relpath <- paths + , let isDir = FilePath.Native.hasTrailingPathSeparator filepath + filepath = baseDir relpath ] + +interleave :: [IO a] -> IO [a] +interleave = unsafeInterleaveIO . go + where + go [] = return [] + go (x:xs) = do + x' <- x + xs' <- interleave xs + return (x':xs') + +-- | Construct a tar 'Entry' based on a local file. +-- +-- This sets the entry size, the data contained in the file and the file's +-- modification time. If the file is executable then that information is also +-- preserved. File ownership and detailed permissions are not preserved. +-- +-- * The file contents is read lazily. +-- +packFileEntry :: FilePath -- ^ Full path to find the file on the local disk + -> TarPath -- ^ Path to use for the tar Entry in the archive + -> IO Entry +packFileEntry filepath tarpath = do + mtime <- getModTime filepath + perms <- getPermissions filepath + file <- openBinaryFile filepath ReadMode + size <- hFileSize file + content <- BS.hGetContents file + return (simpleEntry tarpath (NormalFile content (fromIntegral size))) { + entryPermissions = if executable perms then executableFilePermissions + else ordinaryFilePermissions, + entryTime = mtime + } + +-- | Construct a tar 'Entry' based on a local directory (but not its contents). +-- +-- The only attribute of the directory that is used is its modification time. +-- Directory ownership and detailed permissions are not preserved. +-- +packDirectoryEntry :: FilePath -- ^ Full path to find the file on the local disk + -> TarPath -- ^ Path to use for the tar Entry in the archive + -> IO Entry +packDirectoryEntry filepath tarpath = do + mtime <- getModTime filepath + return (directoryEntry tarpath) { + entryTime = mtime + } + +-- | This is a utility function, much like 'getDirectoryContents'. The +-- difference is that it includes the contents of subdirectories. +-- +-- The paths returned are all relative to the top directory. Directory paths +-- are distinguishable by having a trailing path separator +-- (see 'FilePath.Native.hasTrailingPathSeparator'). +-- +-- All directories are listed before the files that they contain. Amongst the +-- contents of a directory, subdirectories are listed after normal files. The +-- overall result is that files within a directory will be together in a single +-- contiguous group. This tends to improve file layout and IO performance when +-- creating or extracting tar archives. +-- +-- * This function returns results lazily. Subdirectories are not scanned +-- until the files entries in the parent directory have been consumed. +-- +getDirectoryContentsRecursive :: FilePath -> IO [FilePath] +getDirectoryContentsRecursive dir0 = + fmap tail (recurseDirectories dir0 [""]) + +recurseDirectories :: FilePath -> [FilePath] -> IO [FilePath] +recurseDirectories _ [] = return [] +recurseDirectories base (dir:dirs) = unsafeInterleaveIO $ do + (files, dirs') <- collect [] [] =<< getDirectoryContents (base dir) + + files' <- recurseDirectories base (dirs' ++ dirs) + return (dir : files ++ files') + + where + collect files dirs' [] = return (reverse files, reverse dirs') + collect files dirs' (entry:entries) | ignore entry + = collect files dirs' entries + collect files dirs' (entry:entries) = do + let dirEntry = dir entry + dirEntry' = FilePath.Native.addTrailingPathSeparator dirEntry + isDirectory <- doesDirectoryExist (base dirEntry) + if isDirectory + then collect files (dirEntry':dirs') entries + else collect (dirEntry:files) dirs' entries + + ignore ['.'] = True + ignore ['.', '.'] = True + ignore _ = False + +getModTime :: FilePath -> IO EpochTime +getModTime path = do +#if MIN_VERSION_directory(1,2,0) + -- The directory package switched to the new time package + t <- getModificationTime path + return . floor . utcTimeToPOSIXSeconds $ t +#else + (TOD s _) <- getModificationTime path + return $! fromIntegral s +#endif diff --git a/Codec/Archive/Tar/Read.hs b/Codec/Archive/Tar/Read.hs new file mode 100644 index 0000000..1b7667a --- /dev/null +++ b/Codec/Archive/Tar/Read.hs @@ -0,0 +1,273 @@ +{-# LANGUAGE CPP, DeriveDataTypeable, BangPatterns #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar.Read +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009 Duncan Coutts, +-- 2011 Max Bolingbroke +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Read (read, FormatError(..)) where + +import Codec.Archive.Tar.Types + +import Data.Char (ord) +import Data.Int (Int64) +import Data.Bits (Bits(shiftL)) +import Control.Exception (Exception(..)) +import Data.Typeable (Typeable) +import Control.Applicative +import Control.Monad +import Control.DeepSeq + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BS.Char8 +import qualified Data.ByteString.Unsafe as BS +import qualified Data.ByteString.Lazy as LBS + +import Prelude hiding (read) + +#if !MIN_VERSION_bytestring(0,10,0) +import Data.Monoid (Monoid(..)) +import qualified Data.ByteString.Lazy.Internal as LBS +#endif + +-- | Errors that can be encountered when parsing a Tar archive. +data FormatError + = TruncatedArchive + | ShortTrailer + | BadTrailer + | TrailingJunk + | ChecksumIncorrect + | NotTarFormat + | UnrecognisedTarFormat + | HeaderBadNumericEncoding +#if MIN_VERSION_base(4,8,0) + deriving (Eq, Show, Typeable) + +instance Exception FormatError where + displayException TruncatedArchive = "truncated tar archive" + displayException ShortTrailer = "short tar trailer" + displayException BadTrailer = "bad tar trailer" + displayException TrailingJunk = "tar file has trailing junk" + displayException ChecksumIncorrect = "tar checksum error" + displayException NotTarFormat = "data is not in tar format" + displayException UnrecognisedTarFormat = "tar entry not in a recognised format" + displayException HeaderBadNumericEncoding = "tar header is malformed (bad numeric encoding)" +#else + deriving (Eq, Typeable) + +instance Show FormatError where + show TruncatedArchive = "truncated tar archive" + show ShortTrailer = "short tar trailer" + show BadTrailer = "bad tar trailer" + show TrailingJunk = "tar file has trailing junk" + show ChecksumIncorrect = "tar checksum error" + show NotTarFormat = "data is not in tar format" + show UnrecognisedTarFormat = "tar entry not in a recognised format" + show HeaderBadNumericEncoding = "tar header is malformed (bad numeric encoding)" + +instance Exception FormatError +#endif + +instance NFData FormatError where + rnf !_ = () -- enumerations are fully strict by construction + +-- | Convert a data stream in the tar file format into an internal data +-- structure. Decoding errors are reported by the 'Fail' constructor of the +-- 'Entries' type. +-- +-- * The conversion is done lazily. +-- +read :: LBS.ByteString -> Entries FormatError +read = unfoldEntries getEntry + +getEntry :: LBS.ByteString -> Either FormatError (Maybe (Entry, LBS.ByteString)) +getEntry bs + | BS.length header < 512 = Left TruncatedArchive + + -- Tar files end with at least two blocks of all '0'. Checking this serves + -- two purposes. It checks the format but also forces the tail of the data + -- which is necessary to close the file if it came from a lazily read file. + | LBS.head bs == 0 = case LBS.splitAt 1024 bs of + (end, trailing) + | LBS.length end /= 1024 -> Left ShortTrailer + | not (LBS.all (== 0) end) -> Left BadTrailer + | not (LBS.all (== 0) trailing) -> Left TrailingJunk + | otherwise -> Right Nothing + + | otherwise = partial $ do + + case (chksum_, format_) of + (Ok chksum, _ ) | correctChecksum header chksum -> return () + (Ok _, Ok _) -> Error ChecksumIncorrect + _ -> Error NotTarFormat + + -- These fields are partial, have to check them + format <- format_; mode <- mode_; + uid <- uid_; gid <- gid_; + size <- size_; mtime <- mtime_; + devmajor <- devmajor_; devminor <- devminor_; + + let content = LBS.take size (LBS.drop 512 bs) + padding = (512 - size) `mod` 512 + bs' = LBS.drop (512 + size + padding) bs + + entry = Entry { + entryTarPath = TarPath name prefix, + entryContent = case typecode of + '\0' -> NormalFile content size + '0' -> NormalFile content size + '1' -> HardLink (LinkTarget linkname) + '2' -> SymbolicLink (LinkTarget linkname) + _ | format == V7Format + -> OtherEntryType typecode content size + '3' -> CharacterDevice devmajor devminor + '4' -> BlockDevice devmajor devminor + '5' -> Directory + '6' -> NamedPipe + '7' -> NormalFile content size + _ -> OtherEntryType typecode content size, + entryPermissions = mode, + entryOwnership = Ownership (BS.Char8.unpack uname) + (BS.Char8.unpack gname) uid gid, + entryTime = mtime, + entryFormat = format + } + + return (Just (entry, bs')) + + where +#if MIN_VERSION_bytestring(0,10,0) + header = LBS.toStrict (LBS.take 512 bs) +#else + header = toStrict (LBS.take 512 bs) + toStrict = LBS.foldrChunks mappend mempty +#endif + + name = getString 0 100 header + mode_ = getOct 100 8 header + uid_ = getOct 108 8 header + gid_ = getOct 116 8 header + size_ = getOct 124 12 header + mtime_ = getOct 136 12 header + chksum_ = getOct 148 8 header + typecode = getByte 156 header + linkname = getString 157 100 header + magic = getChars 257 8 header + uname = getString 265 32 header + gname = getString 297 32 header + devmajor_ = getOct 329 8 header + devminor_ = getOct 337 8 header + prefix = getString 345 155 header +-- trailing = getBytes 500 12 header + + format_ + | magic == ustarMagic = return UstarFormat + | magic == gnuMagic = return GnuFormat + | magic == v7Magic = return V7Format + | otherwise = Error UnrecognisedTarFormat + +v7Magic, ustarMagic, gnuMagic :: BS.ByteString +v7Magic = BS.Char8.pack "\0\0\0\0\0\0\0\0" +ustarMagic = BS.Char8.pack "ustar\NUL00" +gnuMagic = BS.Char8.pack "ustar \NUL" + +correctChecksum :: BS.ByteString -> Int -> Bool +correctChecksum header checksum = checksum == checksum' + where + -- sum of all 512 bytes in the header block, + -- treating each byte as an 8-bit unsigned value + sumchars = BS.foldl' (\x y -> x + fromIntegral y) 0 + -- treating the 8 bytes of chksum as blank characters. + checksum' = sumchars (BS.take 148 header) + + 256 -- 256 = sumchars (BS.Char8.replicate 8 ' ') + + sumchars (BS.drop 156 header) + +-- * TAR format primitive input + +{-# SPECIALISE getOct :: Int -> Int -> BS.ByteString -> Partial FormatError Int #-} +{-# SPECIALISE getOct :: Int -> Int -> BS.ByteString -> Partial FormatError Int64 #-} +getOct :: (Integral a, Bits a) => Int -> Int -> BS.ByteString -> Partial FormatError a +getOct off len = parseOct + . BS.Char8.takeWhile (\c -> c /= '\NUL' && c /= ' ') + . BS.Char8.dropWhile (== ' ') + . getBytes off len + where + parseOct s | BS.null s = return 0 + -- As a star extension, octal fields can hold a base-256 value if the high + -- bit of the initial character is set. The initial character can be: + -- 0x80 ==> trailing characters hold a positive base-256 value + -- 0xFF ==> trailing characters hold a negative base-256 value + -- + -- In both cases, there won't be a trailing NUL/space. + -- + -- GNU tar seems to contain a half-implementation of code that deals with + -- extra bits in the first character, but I don't think it works and the + -- docs I can find on star seem to suggest that these will always be 0, + -- which is what I will assume. + parseOct s | BS.head s == 128 = return (readBytes (BS.tail s)) + | BS.head s == 255 = return (negate (readBytes (BS.tail s))) + parseOct s = case readOct s of + Just x -> return x + Nothing -> Error HeaderBadNumericEncoding + + readBytes :: (Integral a, Bits a) => BS.ByteString -> a + readBytes = BS.foldl' (\acc x -> acc `shiftL` 8 + fromIntegral x) 0 + +getBytes :: Int -> Int -> BS.ByteString -> BS.ByteString +getBytes off len = BS.take len . BS.drop off + +getByte :: Int -> BS.ByteString -> Char +getByte off bs = BS.Char8.index bs off + +getChars :: Int -> Int -> BS.ByteString -> BS.ByteString +getChars off len = getBytes off len + +getString :: Int -> Int -> BS.ByteString -> BS.ByteString +getString off len = BS.copy . BS.Char8.takeWhile (/='\0') . getBytes off len + +-- These days we'd just use Either, but in older versions of base there was no +-- Monad instance for Either, it was in mtl with an anoying Error constraint. +-- +data Partial e a = Error e | Ok a + +partial :: Partial e a -> Either e a +partial (Error msg) = Left msg +partial (Ok x) = Right x + +instance Functor (Partial e) where + fmap = liftM + +instance Applicative (Partial e) where + pure = Ok + (<*>) = ap + +instance Monad (Partial e) where + return = pure + Error m >>= _ = Error m + Ok x >>= k = k x + fail = error "fail @(Partial e)" + +{-# SPECIALISE readOct :: BS.ByteString -> Maybe Int #-} +{-# SPECIALISE readOct :: BS.ByteString -> Maybe Int64 #-} +readOct :: Integral n => BS.ByteString -> Maybe n +readOct bs0 = case go 0 0 bs0 of + -1 -> Nothing + n -> Just n + where + go :: Integral n => Int -> n -> BS.ByteString -> n + go !i !n !bs + | BS.null bs = if i == 0 then -1 else n + | otherwise = + case BS.unsafeHead bs of + w | w >= 0x30 + && w <= 0x39 -> go (i+1) + (n * 8 + (fromIntegral w - 0x30)) + (BS.unsafeTail bs) + | otherwise -> -1 diff --git a/Codec/Archive/Tar/Types.hs b/Codec/Archive/Tar/Types.hs new file mode 100644 index 0000000..0689be0 --- /dev/null +++ b/Codec/Archive/Tar/Types.hs @@ -0,0 +1,697 @@ +{-# LANGUAGE CPP, GeneralizedNewtypeDeriving, BangPatterns #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar.Types +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009 Duncan Coutts +-- 2011 Max Bolingbroke +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +-- Types to represent the content of @.tar@ archives. +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Types ( + + Entry(..), + entryPath, + EntryContent(..), + FileSize, + Permissions, + Ownership(..), + EpochTime, + TypeCode, + DevMajor, + DevMinor, + Format(..), + + simpleEntry, + fileEntry, + directoryEntry, + + ordinaryFilePermissions, + executableFilePermissions, + directoryPermissions, + + TarPath(..), + toTarPath, + fromTarPath, + fromTarPathToPosixPath, + fromTarPathToWindowsPath, + + LinkTarget(..), + toLinkTarget, + fromLinkTarget, + fromLinkTargetToPosixPath, + fromLinkTargetToWindowsPath, + + Entries(..), + mapEntries, + mapEntriesNoFail, + foldEntries, + foldlEntries, + unfoldEntries, + +#ifdef TESTS + limitToV7FormatCompat +#endif + ) where + +import Data.Int (Int64) +import Data.Monoid (Monoid(..)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BS.Char8 +import qualified Data.ByteString.Lazy as LBS +import Control.DeepSeq + +import qualified System.FilePath as FilePath.Native + ( joinPath, splitDirectories, addTrailingPathSeparator ) +import qualified System.FilePath.Posix as FilePath.Posix + ( joinPath, splitPath, splitDirectories, hasTrailingPathSeparator + , addTrailingPathSeparator ) +import qualified System.FilePath.Windows as FilePath.Windows + ( joinPath, addTrailingPathSeparator ) +import System.Posix.Types + ( FileMode ) + +#ifdef TESTS +import Test.QuickCheck +import Control.Applicative ((<$>), pure, (<*>)) +#endif + + +type FileSize = Int64 +-- | The number of seconds since the UNIX epoch +type EpochTime = Int64 +type DevMajor = Int +type DevMinor = Int +type TypeCode = Char +type Permissions = FileMode + +-- | Tar archive entry. +-- +data Entry = Entry { + + -- | The path of the file or directory within the archive. This is in a + -- tar-specific form. Use 'entryPath' to get a native 'FilePath'. + entryTarPath :: {-# UNPACK #-} !TarPath, + + -- | The real content of the entry. For 'NormalFile' this includes the + -- file data. An entry usually contains a 'NormalFile' or a 'Directory'. + entryContent :: !EntryContent, + + -- | File permissions (Unix style file mode). + entryPermissions :: {-# UNPACK #-} !Permissions, + + -- | The user and group to which this file belongs. + entryOwnership :: {-# UNPACK #-} !Ownership, + + -- | The time the file was last modified. + entryTime :: {-# UNPACK #-} !EpochTime, + + -- | The tar format the archive is using. + entryFormat :: !Format + } + deriving (Eq, Show) + +-- | Native 'FilePath' of the file or directory within the archive. +-- +entryPath :: Entry -> FilePath +entryPath = fromTarPath . entryTarPath + +-- | The content of a tar archive entry, which depends on the type of entry. +-- +-- Portable archives should contain only 'NormalFile' and 'Directory'. +-- +data EntryContent = NormalFile LBS.ByteString {-# UNPACK #-} !FileSize + | Directory + | SymbolicLink !LinkTarget + | HardLink !LinkTarget + | CharacterDevice {-# UNPACK #-} !DevMajor + {-# UNPACK #-} !DevMinor + | BlockDevice {-# UNPACK #-} !DevMajor + {-# UNPACK #-} !DevMinor + | NamedPipe + | OtherEntryType {-# UNPACK #-} !TypeCode LBS.ByteString + {-# UNPACK #-} !FileSize + deriving (Eq, Ord, Show) + +data Ownership = Ownership { + -- | The owner user name. Should be set to @\"\"@ if unknown. + ownerName :: String, + + -- | The owner group name. Should be set to @\"\"@ if unknown. + groupName :: String, + + -- | Numeric owner user id. Should be set to @0@ if unknown. + ownerId :: {-# UNPACK #-} !Int, + + -- | Numeric owner group id. Should be set to @0@ if unknown. + groupId :: {-# UNPACK #-} !Int + } + deriving (Eq, Ord, Show) + +-- | There have been a number of extensions to the tar file format over the +-- years. They all share the basic entry fields and put more meta-data in +-- different extended headers. +-- +data Format = + + -- | This is the classic Unix V7 tar format. It does not support owner and + -- group names, just numeric Ids. It also does not support device numbers. + V7Format + + -- | The \"USTAR\" format is an extension of the classic V7 format. It was + -- later standardised by POSIX. It has some restrictions but is the most + -- portable format. + -- + | UstarFormat + + -- | The GNU tar implementation also extends the classic V7 format, though + -- in a slightly different way from the USTAR format. In general for new + -- archives the standard USTAR/POSIX should be used. + -- + | GnuFormat + deriving (Eq, Ord, Show) + +instance NFData Entry where + rnf (Entry _ c _ _ _ _) = rnf c + +instance NFData EntryContent where + rnf x = case x of + NormalFile c _ -> rnflbs c + OtherEntryType _ c _ -> rnflbs c + _ -> seq x () + where +#if MIN_VERSION_bytestring(0,10,0) + rnflbs = rnf +#else + rnflbs = foldr (\ !_bs r -> r) () . LBS.toChunks +#endif + +instance NFData Ownership where + rnf (Ownership o g _ _) = rnf o `seq` rnf g + +-- | @rw-r--r--@ for normal files +ordinaryFilePermissions :: Permissions +ordinaryFilePermissions = 0o0644 + +-- | @rwxr-xr-x@ for executable files +executableFilePermissions :: Permissions +executableFilePermissions = 0o0755 + +-- | @rwxr-xr-x@ for directories +directoryPermissions :: Permissions +directoryPermissions = 0o0755 + +-- | An 'Entry' with all default values except for the file name and type. It +-- uses the portable USTAR/POSIX format (see 'UstarHeader'). +-- +-- You can use this as a basis and override specific fields, eg: +-- +-- > (emptyEntry name HardLink) { linkTarget = target } +-- +simpleEntry :: TarPath -> EntryContent -> Entry +simpleEntry tarpath content = Entry { + entryTarPath = tarpath, + entryContent = content, + entryPermissions = case content of + Directory -> directoryPermissions + _ -> ordinaryFilePermissions, + entryOwnership = Ownership "" "" 0 0, + entryTime = 0, + entryFormat = UstarFormat + } + +-- | A tar 'Entry' for a file. +-- +-- Entry fields such as file permissions and ownership have default values. +-- +-- You can use this as a basis and override specific fields. For example if you +-- need an executable file you could use: +-- +-- > (fileEntry name content) { fileMode = executableFileMode } +-- +fileEntry :: TarPath -> LBS.ByteString -> Entry +fileEntry name fileContent = + simpleEntry name (NormalFile fileContent (LBS.length fileContent)) + +-- | A tar 'Entry' for a directory. +-- +-- Entry fields such as file permissions and ownership have default values. +-- +directoryEntry :: TarPath -> Entry +directoryEntry name = simpleEntry name Directory + +-- +-- * Tar paths +-- + +-- | The classic tar format allowed just 100 characters for the file name. The +-- USTAR format extended this with an extra 155 characters, however it uses a +-- complex method of splitting the name between the two sections. +-- +-- Instead of just putting any overflow into the extended area, it uses the +-- extended area as a prefix. The aggravating insane bit however is that the +-- prefix (if any) must only contain a directory prefix. That is the split +-- between the two areas must be on a directory separator boundary. So there is +-- no simple calculation to work out if a file name is too long. Instead we +-- have to try to find a valid split that makes the name fit in the two areas. +-- +-- The rationale presumably was to make it a bit more compatible with old tar +-- programs that only understand the classic format. A classic tar would be +-- able to extract the file name and possibly some dir prefix, but not the +-- full dir prefix. So the files would end up in the wrong place, but that's +-- probably better than ending up with the wrong names too. +-- +-- So it's understandable but rather annoying. +-- +-- * Tar paths use Posix format (ie @\'/\'@ directory separators), irrespective +-- of the local path conventions. +-- +-- * The directory separator between the prefix and name is /not/ stored. +-- +data TarPath = TarPath {-# UNPACK #-} !BS.ByteString -- path name, 100 characters max. + {-# UNPACK #-} !BS.ByteString -- path prefix, 155 characters max. + deriving (Eq, Ord) + +instance NFData TarPath where + rnf (TarPath _ _) = () -- fully strict by construction + +instance Show TarPath where + show = show . fromTarPath + +-- | Convert a 'TarPath' to a native 'FilePath'. +-- +-- The native 'FilePath' will use the native directory separator but it is not +-- otherwise checked for validity or sanity. In particular: +-- +-- * The tar path may be invalid as a native path, eg the file name @\"nul\"@ +-- is not valid on Windows. +-- +-- * The tar path may be an absolute path or may contain @\"..\"@ components. +-- For security reasons this should not usually be allowed, but it is your +-- responsibility to check for these conditions (eg using 'checkSecurity'). +-- +fromTarPath :: TarPath -> FilePath +fromTarPath (TarPath namebs prefixbs) = adjustDirectory $ + FilePath.Native.joinPath $ FilePath.Posix.splitDirectories prefix + ++ FilePath.Posix.splitDirectories name + where + name = BS.Char8.unpack namebs + prefix = BS.Char8.unpack prefixbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator name + = FilePath.Native.addTrailingPathSeparator + | otherwise = id + +-- | Convert a 'TarPath' to a Unix\/Posix 'FilePath'. +-- +-- The difference compared to 'fromTarPath' is that it always returns a Unix +-- style path irrespective of the current operating system. +-- +-- This is useful to check how a 'TarPath' would be interpreted on a specific +-- operating system, eg to perform portability checks. +-- +fromTarPathToPosixPath :: TarPath -> FilePath +fromTarPathToPosixPath (TarPath namebs prefixbs) = adjustDirectory $ + FilePath.Posix.joinPath $ FilePath.Posix.splitDirectories prefix + ++ FilePath.Posix.splitDirectories name + where + name = BS.Char8.unpack namebs + prefix = BS.Char8.unpack prefixbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator name + = FilePath.Posix.addTrailingPathSeparator + | otherwise = id + +-- | Convert a 'TarPath' to a Windows 'FilePath'. +-- +-- The only difference compared to 'fromTarPath' is that it always returns a +-- Windows style path irrespective of the current operating system. +-- +-- This is useful to check how a 'TarPath' would be interpreted on a specific +-- operating system, eg to perform portability checks. +-- +fromTarPathToWindowsPath :: TarPath -> FilePath +fromTarPathToWindowsPath (TarPath namebs prefixbs) = adjustDirectory $ + FilePath.Windows.joinPath $ FilePath.Posix.splitDirectories prefix + ++ FilePath.Posix.splitDirectories name + where + name = BS.Char8.unpack namebs + prefix = BS.Char8.unpack prefixbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator name + = FilePath.Windows.addTrailingPathSeparator + | otherwise = id + +-- | Convert a native 'FilePath' to a 'TarPath'. +-- +-- The conversion may fail if the 'FilePath' is too long. See 'TarPath' for a +-- description of the problem with splitting long 'FilePath's. +-- +toTarPath :: Bool -- ^ Is the path for a directory? This is needed because for + -- directories a 'TarPath' must always use a trailing @\/@. + -> FilePath -> Either String TarPath +toTarPath isDir = splitLongPath + . addTrailingSep + . FilePath.Posix.joinPath + . FilePath.Native.splitDirectories + where + addTrailingSep | isDir = FilePath.Posix.addTrailingPathSeparator + | otherwise = id + +-- | Take a sanitised path, split on directory separators and try to pack it +-- into the 155 + 100 tar file name format. +-- +-- The strategy is this: take the name-directory components in reverse order +-- and try to fit as many components into the 100 long name area as possible. +-- If all the remaining components fit in the 155 name area then we win. +-- +splitLongPath :: FilePath -> Either String TarPath +splitLongPath path = + case packName nameMax (reverse (FilePath.Posix.splitPath path)) of + Left err -> Left err + Right (name, []) -> Right $! TarPath (BS.Char8.pack name) + BS.empty + Right (name, first:rest) -> case packName prefixMax remainder of + Left err -> Left err + Right (_ , (_:_)) -> Left "File name too long (cannot split)" + Right (prefix, []) -> Right $! TarPath (BS.Char8.pack name) + (BS.Char8.pack prefix) + where + -- drop the '/' between the name and prefix: + remainder = init first : rest + + where + nameMax, prefixMax :: Int + nameMax = 100 + prefixMax = 155 + + packName _ [] = Left "File name empty" + packName maxLen (c:cs) + | n > maxLen = Left "File name too long" + | otherwise = Right (packName' maxLen n [c] cs) + where n = length c + + packName' maxLen n ok (c:cs) + | n' <= maxLen = packName' maxLen n' (c:ok) cs + where n' = n + length c + packName' _ _ ok cs = (FilePath.Posix.joinPath ok, cs) + +-- | The tar format allows just 100 ASCII characters for the 'SymbolicLink' and +-- 'HardLink' entry types. +-- +newtype LinkTarget = LinkTarget BS.ByteString + deriving (Eq, Ord, Show) + +instance NFData LinkTarget where +#if MIN_VERSION_bytestring(0,10,0) + rnf (LinkTarget bs) = rnf bs +#else + rnf (LinkTarget !_bs) = () +#endif + +-- | Convert a native 'FilePath' to a tar 'LinkTarget'. This may fail if the +-- string is longer than 100 characters or if it contains non-portable +-- characters. +-- +toLinkTarget :: FilePath -> Maybe LinkTarget +toLinkTarget path | length path <= 100 = Just $! LinkTarget (BS.Char8.pack path) + | otherwise = Nothing + +-- | Convert a tar 'LinkTarget' to a native 'FilePath'. +-- +fromLinkTarget :: LinkTarget -> FilePath +fromLinkTarget (LinkTarget pathbs) = adjustDirectory $ + FilePath.Native.joinPath $ FilePath.Posix.splitDirectories path + where + path = BS.Char8.unpack pathbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator path + = FilePath.Native.addTrailingPathSeparator + | otherwise = id + +-- | Convert a tar 'LinkTarget' to a Unix/Posix 'FilePath'. +-- +fromLinkTargetToPosixPath :: LinkTarget -> FilePath +fromLinkTargetToPosixPath (LinkTarget pathbs) = adjustDirectory $ + FilePath.Posix.joinPath $ FilePath.Posix.splitDirectories path + where + path = BS.Char8.unpack pathbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator path + = FilePath.Native.addTrailingPathSeparator + | otherwise = id + +-- | Convert a tar 'LinkTarget' to a Windows 'FilePath'. +-- +fromLinkTargetToWindowsPath :: LinkTarget -> FilePath +fromLinkTargetToWindowsPath (LinkTarget pathbs) = adjustDirectory $ + FilePath.Windows.joinPath $ FilePath.Posix.splitDirectories path + where + path = BS.Char8.unpack pathbs + adjustDirectory | FilePath.Posix.hasTrailingPathSeparator path + = FilePath.Windows.addTrailingPathSeparator + | otherwise = id + +-- +-- * Entries type +-- + +-- | A tar archive is a sequence of entries. +-- +-- The point of this type as opposed to just using a list is that it makes the +-- failure case explicit. We need this because the sequence of entries we get +-- from reading a tarball can include errors. +-- +-- It is a concrete data type so you can manipulate it directly but it is often +-- clearer to use the provided functions for mapping, folding and unfolding. +-- +-- Converting from a list can be done with just @foldr Next Done@. Converting +-- back into a list can be done with 'foldEntries' however in that case you +-- must be prepared to handle the 'Fail' case inherent in the 'Entries' type. +-- +-- The 'Monoid' instance lets you concatenate archives or append entries to an +-- archive. +-- +data Entries e = Next Entry (Entries e) + | Done + | Fail e + deriving (Eq, Show) + +infixr 5 `Next` + +-- | This is like the standard 'unfoldr' function on lists, but for 'Entries'. +-- It includes failure as an extra possibility that the stepper function may +-- return. +-- +-- It can be used to generate 'Entries' from some other type. For example it is +-- used internally to lazily unfold entries from a 'LBS.ByteString'. +-- +unfoldEntries :: (a -> Either e (Maybe (Entry, a))) -> a -> Entries e +unfoldEntries f = unfold + where + unfold x = case f x of + Left err -> Fail err + Right Nothing -> Done + Right (Just (e, x')) -> Next e (unfold x') + +-- | This is like the standard 'foldr' function on lists, but for 'Entries'. +-- Compared to 'foldr' it takes an extra function to account for the +-- possibility of failure. +-- +-- This is used to consume a sequence of entries. For example it could be used +-- to scan a tarball for problems or to collect an index of the contents. +-- +foldEntries :: (Entry -> a -> a) -> a -> (e -> a) -> Entries e -> a +foldEntries next done fail' = fold + where + fold (Next e es) = next e (fold es) + fold Done = done + fold (Fail err) = fail' err + +-- | A 'foldl'-like function on Entries. It either returns the final +-- accumulator result, or the failure along with the intermediate accumulator +-- value. +-- +foldlEntries :: (a -> Entry -> a) -> a -> Entries e -> Either (e, a) a +foldlEntries f z = go z + where + go !acc (Next e es) = go (f acc e) es + go !acc Done = Right acc + go !acc (Fail err) = Left (err, acc) + +-- | This is like the standard 'map' function on lists, but for 'Entries'. It +-- includes failure as a extra possible outcome of the mapping function. +-- +-- If your mapping function cannot fail it may be more convenient to use +-- 'mapEntriesNoFail' +mapEntries :: (Entry -> Either e' Entry) -> Entries e -> Entries (Either e e') +mapEntries f = + foldEntries (\entry rest -> either (Fail . Right) (flip Next rest) (f entry)) Done (Fail . Left) + +-- | Like 'mapEntries' but the mapping function itself cannot fail. +-- +mapEntriesNoFail :: (Entry -> Entry) -> Entries e -> Entries e +mapEntriesNoFail f = + foldEntries (\entry -> Next (f entry)) Done Fail + +instance Monoid (Entries e) where + mempty = Done + mappend a b = foldEntries Next b Fail a + +instance Functor Entries where + fmap f = foldEntries Next Done (Fail . f) + +instance NFData e => NFData (Entries e) where + rnf (Next e es) = rnf e `seq` rnf es + rnf Done = () + rnf (Fail e) = rnf e + + +------------------------- +-- QuickCheck instances +-- + +#ifdef TESTS + +instance Arbitrary Entry where + arbitrary = Entry <$> arbitrary <*> arbitrary <*> arbitraryPermissions + <*> arbitrary <*> arbitraryEpochTime <*> arbitrary + where + arbitraryPermissions :: Gen Permissions + arbitraryPermissions = fromIntegral <$> (arbitraryOctal 7 :: Gen Int) + + arbitraryEpochTime :: Gen EpochTime + arbitraryEpochTime = fromIntegral <$> (arbitraryOctal 11 :: Gen Int) + + shrink (Entry path content perms author time format) = + [ Entry path' content' perms author' time' format + | (path', content', author', time') <- + shrink (path, content, author, time) ] + ++ [ Entry path content perms' author time format + | perms' <- shrinkIntegral perms ] + +instance Arbitrary TarPath where + arbitrary = either error id + . toTarPath False + . FilePath.Posix.joinPath + <$> listOf1ToN (255 `div` 5) + (elements (map (replicate 4) "abcd")) + + shrink = map (either error id . toTarPath False) + . map FilePath.Posix.joinPath + . filter (not . null) + . shrinkList shrinkNothing + . FilePath.Posix.splitPath + . fromTarPathToPosixPath + +instance Arbitrary LinkTarget where + arbitrary = maybe (error "link target too large") id + . toLinkTarget + . FilePath.Native.joinPath + <$> listOf1ToN (100 `div` 5) + (elements (map (replicate 4) "abcd")) + + shrink = map (maybe (error "link target too large") id . toLinkTarget) + . map FilePath.Posix.joinPath + . filter (not . null) + . shrinkList shrinkNothing + . FilePath.Posix.splitPath + . fromLinkTargetToPosixPath + + +listOf1ToN :: Int -> Gen a -> Gen [a] +listOf1ToN n g = sized $ \sz -> do + n <- choose (1, min n (max 1 sz)) + vectorOf n g + +listOf0ToN :: Int -> Gen a -> Gen [a] +listOf0ToN n g = sized $ \sz -> do + n <- choose (0, min n sz) + vectorOf n g + +instance Arbitrary EntryContent where + arbitrary = + frequency + [ (16, do bs <- arbitrary; + return (NormalFile bs (LBS.length bs))) + , (2, pure Directory) + , (1, SymbolicLink <$> arbitrary) + , (1, HardLink <$> arbitrary) + , (1, CharacterDevice <$> arbitraryOctal 7 <*> arbitraryOctal 7) + , (1, BlockDevice <$> arbitraryOctal 7 <*> arbitraryOctal 7) + , (1, pure NamedPipe) + , (1, do c <- elements (['A'..'Z']++['a'..'z']) + bs <- arbitrary; + return (OtherEntryType c bs (LBS.length bs))) + ] + + shrink (NormalFile bs _) = [ NormalFile bs' (LBS.length bs') + | bs' <- shrink bs ] + shrink Directory = [] + shrink (SymbolicLink link) = [ SymbolicLink link' | link' <- shrink link ] + shrink (HardLink link) = [ HardLink link' | link' <- shrink link ] + shrink (CharacterDevice ma mi) = [ CharacterDevice ma' mi' + | (ma', mi') <- shrink (ma, mi) ] + shrink (BlockDevice ma mi) = [ BlockDevice ma' mi' + | (ma', mi') <- shrink (ma, mi) ] + shrink NamedPipe = [] + shrink (OtherEntryType c bs _) = [ OtherEntryType c bs' (LBS.length bs') + | bs' <- shrink bs ] + +instance Arbitrary LBS.ByteString where + arbitrary = fmap LBS.pack arbitrary + shrink = map LBS.pack . shrink . LBS.unpack + +instance Arbitrary BS.ByteString where + arbitrary = fmap BS.pack arbitrary + shrink = map BS.pack . shrink . BS.unpack + +instance Arbitrary Ownership where + arbitrary = Ownership <$> name <*> name + <*> idno <*> idno + where + name = listOf0ToN 32 (arbitrary `suchThat` (/= '\0')) + idno = arbitraryOctal 7 + + shrink (Ownership oname gname oid gid) = + [ Ownership oname' gname' oid' gid' + | (oname', gname', oid', gid') <- shrink (oname, gname, oid, gid) ] + +instance Arbitrary Format where + arbitrary = elements [V7Format, UstarFormat, GnuFormat] + + +--arbitraryOctal :: (Integral n, Random n) => Int -> Gen n +arbitraryOctal n = + oneof [ pure 0 + , choose (0, upperBound) + , pure upperBound + ] + where + upperBound = 8^n-1 + +-- For QC tests it's useful to have a way to limit the info to that which can +-- be expressed in the old V7 format +limitToV7FormatCompat :: Entry -> Entry +limitToV7FormatCompat entry@Entry { entryFormat = V7Format } = + entry { + entryContent = case entryContent entry of + CharacterDevice _ _ -> OtherEntryType '3' LBS.empty 0 + BlockDevice _ _ -> OtherEntryType '4' LBS.empty 0 + Directory -> OtherEntryType '5' LBS.empty 0 + NamedPipe -> OtherEntryType '6' LBS.empty 0 + other -> other, + + entryOwnership = (entryOwnership entry) { + groupName = "", + ownerName = "" + }, + + entryTarPath = let TarPath name _prefix = entryTarPath entry + in TarPath name BS.empty + } +limitToV7FormatCompat entry = entry + +#endif + diff --git a/Codec/Archive/Tar/Unpack.hs b/Codec/Archive/Tar/Unpack.hs new file mode 100644 index 0000000..38bda1b --- /dev/null +++ b/Codec/Archive/Tar/Unpack.hs @@ -0,0 +1,124 @@ +{-# LANGUAGE CPP #-} +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009, 2012, 2016 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Unpack ( + unpack, + ) where + +import Codec.Archive.Tar.Types +import Codec.Archive.Tar.Check + +import qualified Data.ByteString.Lazy as BS +import System.FilePath + ( () ) +import qualified System.FilePath as FilePath.Native + ( takeDirectory ) +import System.Directory + ( createDirectoryIfMissing, copyFile ) +import Control.Exception + ( Exception, throwIO ) +#if MIN_VERSION_directory(1,2,3) +import System.Directory + ( setModificationTime ) +import Data.Time.Clock.POSIX + ( posixSecondsToUTCTime ) +import Control.Exception as Exception + ( catch ) +import System.IO.Error + ( isPermissionError ) +#endif + + +-- | Create local files and directories based on the entries of a tar archive. +-- +-- This is a portable implementation of unpacking suitable for portable +-- archives. It handles 'NormalFile' and 'Directory' entries and has simulated +-- support for 'SymbolicLink' and 'HardLink' entries. Links are implemented by +-- copying the target file. This therefore works on Windows as well as Unix. +-- All other entry types are ignored, that is they are not unpacked and no +-- exception is raised. +-- +-- If the 'Entries' ends in an error then it is raised an an exception. Any +-- files or directories that have been unpacked before the error was +-- encountered will not be deleted. For this reason you may want to unpack +-- into an empty directory so that you can easily clean up if unpacking fails +-- part-way. +-- +-- On its own, this function only checks for security (using 'checkSecurity'). +-- You can do other checks by applying checking functions to the 'Entries' that +-- you pass to this function. For example: +-- +-- > unpack dir (checkTarbomb expectedDir entries) +-- +-- If you care about the priority of the reported errors then you may want to +-- use 'checkSecurity' before 'checkTarbomb' or other checks. +-- +unpack :: Exception e => FilePath -> Entries e -> IO () +unpack baseDir entries = unpackEntries [] (checkSecurity entries) + >>= emulateLinks + + where + -- We're relying here on 'checkSecurity' to make sure we're not scribbling + -- files all over the place. + + unpackEntries _ (Fail err) = either throwIO throwIO err + unpackEntries links Done = return links + unpackEntries links (Next entry es) = case entryContent entry of + NormalFile file _ -> extractFile path file mtime + >> unpackEntries links es + Directory -> extractDir path mtime + >> unpackEntries links es + HardLink link -> (unpackEntries $! saveLink path link links) es + SymbolicLink link -> (unpackEntries $! saveLink path link links) es + _ -> unpackEntries links es --ignore other file types + where + path = entryPath entry + mtime = entryTime entry + + extractFile path content mtime = do + -- Note that tar archives do not make sure each directory is created + -- before files they contain, indeed we may have to create several + -- levels of directory. + createDirectoryIfMissing True absDir + BS.writeFile absPath content + setModTime absPath mtime + where + absDir = baseDir FilePath.Native.takeDirectory path + absPath = baseDir path + + extractDir path mtime = do + createDirectoryIfMissing True absPath + setModTime absPath mtime + where + absPath = baseDir path + + saveLink path link links = seq (length path) + $ seq (length link') + $ (path, link'):links + where link' = fromLinkTarget link + + emulateLinks = mapM_ $ \(relPath, relLinkTarget) -> + let absPath = baseDir relPath + absTarget = FilePath.Native.takeDirectory absPath relLinkTarget + in copyFile absTarget absPath + +setModTime :: FilePath -> EpochTime -> IO () +#if MIN_VERSION_directory(1,2,3) +-- functionality only supported as of directory-1.2.3.x +setModTime path t = + setModificationTime path (posixSecondsToUTCTime (fromIntegral t)) + `Exception.catch` \e -> + if isPermissionError e then return () else throwIO e +#else +setModTime _path _t = return () +#endif diff --git a/Codec/Archive/Tar/Write.hs b/Codec/Archive/Tar/Write.hs new file mode 100644 index 0000000..e522539 --- /dev/null +++ b/Codec/Archive/Tar/Write.hs @@ -0,0 +1,143 @@ +----------------------------------------------------------------------------- +-- | +-- Module : Codec.Archive.Tar.Write +-- Copyright : (c) 2007 Bjorn Bringert, +-- 2008 Andrea Vezzosi, +-- 2008-2009 Duncan Coutts +-- License : BSD3 +-- +-- Maintainer : duncan@community.haskell.org +-- Portability : portable +-- +----------------------------------------------------------------------------- +module Codec.Archive.Tar.Write (write) where + +import Codec.Archive.Tar.Types + +import Data.Char (ord) +import Data.List (foldl') +import Data.Monoid (mempty) +import Numeric (showOct) + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BS.Char8 +import qualified Data.ByteString.Lazy as LBS +import qualified Data.ByteString.Lazy.Char8 as LBS.Char8 + + +-- | Create the external representation of a tar archive by serialising a list +-- of tar entries. +-- +-- * The conversion is done lazily. +-- +write :: [Entry] -> LBS.ByteString +write es = LBS.concat $ map putEntry es ++ [LBS.replicate (512*2) 0] + +putEntry :: Entry -> LBS.ByteString +putEntry entry = case entryContent entry of + NormalFile content size -> LBS.concat [ header, content, padding size ] + OtherEntryType _ content size -> LBS.concat [ header, content, padding size ] + _ -> header + where + header = putHeader entry + padding size = LBS.replicate paddingSize 0 + where paddingSize = fromIntegral (negate size `mod` 512) + +putHeader :: Entry -> LBS.ByteString +putHeader entry = + LBS.Char8.pack + $ take 148 block + ++ putOct 7 checksum + ++ ' ' : drop 156 block +-- ++ putOct 8 checksum +-- ++ drop 156 block + where + block = putHeaderNoChkSum entry + checksum = foldl' (\x y -> x + ord y) 0 block + +putHeaderNoChkSum :: Entry -> String +putHeaderNoChkSum Entry { + entryTarPath = TarPath name prefix, + entryContent = content, + entryPermissions = permissions, + entryOwnership = ownership, + entryTime = modTime, + entryFormat = format + } = + + concat + [ putBString 100 $ name + , putOct 8 $ permissions + , putOct 8 $ ownerId ownership + , putOct 8 $ groupId ownership + , putOct 12 $ contentSize + , putOct 12 $ modTime + , fill 8 $ ' ' -- dummy checksum + , putChar8 $ typeCode + , putBString 100 $ linkTarget + ] ++ + case format of + V7Format -> + fill 255 '\NUL' + UstarFormat -> concat + [ putBString 8 $ ustarMagic + , putString 32 $ ownerName ownership + , putString 32 $ groupName ownership + , putOct 8 $ deviceMajor + , putOct 8 $ deviceMinor + , putBString 155 $ prefix + , fill 12 $ '\NUL' + ] + GnuFormat -> concat + [ putBString 8 $ gnuMagic + , putString 32 $ ownerName ownership + , putString 32 $ groupName ownership + , putGnuDev 8 $ deviceMajor + , putGnuDev 8 $ deviceMinor + , putBString 155 $ prefix + , fill 12 $ '\NUL' + ] + where + (typeCode, contentSize, linkTarget, + deviceMajor, deviceMinor) = case content of + NormalFile _ size -> ('0' , size, mempty, 0, 0) + Directory -> ('5' , 0, mempty, 0, 0) + SymbolicLink (LinkTarget link) -> ('2' , 0, link, 0, 0) + HardLink (LinkTarget link) -> ('1' , 0, link, 0, 0) + CharacterDevice major minor -> ('3' , 0, mempty, major, minor) + BlockDevice major minor -> ('4' , 0, mempty, major, minor) + NamedPipe -> ('6' , 0, mempty, 0, 0) + OtherEntryType code _ size -> (code, size, mempty, 0, 0) + + putGnuDev w n = case content of + CharacterDevice _ _ -> putOct w n + BlockDevice _ _ -> putOct w n + _ -> replicate w '\NUL' + +ustarMagic, gnuMagic :: BS.ByteString +ustarMagic = BS.Char8.pack "ustar\NUL00" +gnuMagic = BS.Char8.pack "ustar \NUL" + +-- * TAR format primitive output + +type FieldWidth = Int + +putBString :: FieldWidth -> BS.ByteString -> String +putBString n s = BS.Char8.unpack (BS.take n s) ++ fill (n - BS.length s) '\NUL' + +putString :: FieldWidth -> String -> String +putString n s = take n s ++ fill (n - length s) '\NUL' + +--TODO: check integer widths, eg for large file sizes +putOct :: (Integral a, Show a) => FieldWidth -> a -> String +putOct n x = + let octStr = take (n-1) $ showOct x "" + in fill (n - length octStr - 1) '0' + ++ octStr + ++ putChar8 '\NUL' + +putChar8 :: Char -> String +putChar8 c = [c] + +fill :: FieldWidth -> Char -> String +fill n c = replicate n c diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3a7b8df --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2007 Björn Bringert, + 2008-2015 Duncan Coutts, + 2011 Max Bolingbroke +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +- Neither the names of the copyright owners nor the names of the + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Setup.lhs b/Setup.lhs new file mode 100644 index 0000000..37d1ded --- /dev/null +++ b/Setup.lhs @@ -0,0 +1,4 @@ +> import Distribution.Simple + +> main :: IO () +> main = defaultMain diff --git a/bench/Main.hs b/bench/Main.hs new file mode 100644 index 0000000..c136b64 --- /dev/null +++ b/bench/Main.hs @@ -0,0 +1,45 @@ +module Main where + +import qualified Codec.Archive.Tar as Tar +import qualified Codec.Archive.Tar.Index as TarIndex + +import qualified Data.ByteString.Lazy as BS +import Control.Exception + +import Criterion +import Criterion.Main + +main = defaultMain benchmarks + +benchmarks :: [Benchmark] +benchmarks = + [ env loadTarFile $ \tarfile -> + bench "read" (nf Tar.read tarfile) + + , env loadTarEntriesList $ \entries -> + bench "write" (nf Tar.write entries) + + , env loadTarEntries $ \entries -> + bench "index build" (nf TarIndex.build entries) + + , env loadTarIndex $ \entries -> + bench "index rebuild" (nf (TarIndex.finalise . TarIndex.unfinalise) entries) + ] + +loadTarFile :: IO BS.ByteString +loadTarFile = + BS.readFile "01-index.tar" + +loadTarEntries :: IO (Tar.Entries Tar.FormatError) +loadTarEntries = + fmap Tar.read loadTarFile + +loadTarEntriesList :: IO [Tar.Entry] +loadTarEntriesList = + fmap (Tar.foldEntries (:) [] throw) loadTarEntries + +loadTarIndex :: IO TarIndex.TarIndex +loadTarIndex = + fmap (either throw id . TarIndex.build) + loadTarEntries + diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..320806d --- /dev/null +++ b/changelog.md @@ -0,0 +1,73 @@ +0.5.0.3 Duncan Coutts May 2016 + + * Fix tarbomb logic to ignore special PAX entries. Was breaking many + valid tarballs. https://github.com/haskell/cabal/issues/3390 + +0.5.0.2 Duncan Coutts April 2016 + + * Fix compatability when using ghc-7.4.x and directory >= 1.2.3 + +0.5.0.1 Duncan Coutts January 2016 + + * Fix compatability with directory-1.2.3+ + +0.5.0.0 Duncan Coutts January 2016 + + * Work with old version of bytestring (using bytestring-builder package). + * Builds with GHC 6.10 -- 8.0. + * Change type of Index.serialise to be simply strict bytestring. + * Preserve file timestamps on unpack (with directory-1.2.3+) + +0.4.5.0 Duncan Coutts January 2016 + + * Revert accidental minor API change in 0.4.x series (the type of the + owner and group name strings). The 0.4.3.0 and 0.4.4.0 releases + contained the accidental API change. + * Add a handy foldlEntries function + +0.4.4.0 Duncan Coutts January 2016 + + * Build and warning fixes for GHC 7.10 and 8.0 + * New Index module function `toList` to get all index entries + +0.4.3.0 Duncan Coutts January 2016 + + * New Index function `unfinalise` to extend existing index + * 9x faster reading + * 9x faster index construction + * 24x faster index extension + * More compact entry types, using ByteStrings + * More Eq and Show instances + * Greater QC test coverage + * Fix minor bug in reading non-standard v7 format entries + +0.4.2.2 Edsko de Vries October 2015 + + * Fix bug in Index + +0.4.2.1 Duncan Coutts July 2015 + + * Fix tests for the Index modules (the code was right) + +0.4.2.0 Duncan Coutts July 2015 + + * New Index module for random access to tar file contents + * New lower level tar file I/O actions + * New tarball file 'append' action + +0.4.1.0 Duncan Coutts January 2015 + + * Build with GHC 7.10 + * Switch from old-time to time package + * Added more instance for Entries type + +0.4.0.1 Duncan Coutts October 2012 + + * fixes to work with directory 1.2 + * More Eq/Ord instances + +0.4.0.0 Duncan Coutts February 2012 + + * More explicit error types and error handling + * Support star base-256 number format + * Improved API documentation diff --git a/tar.cabal b/tar.cabal new file mode 100644 index 0000000..d227c8c --- /dev/null +++ b/tar.cabal @@ -0,0 +1,128 @@ +name: tar +version: 0.5.0.3 +license: BSD3 +license-file: LICENSE +author: Duncan Coutts + Bjorn Bringert +maintainer: Duncan Coutts +bug-reports: https://github.com/haskell/tar/issues +copyright: 2007 Bjorn Bringert + 2008-2016 Duncan Coutts +category: Codec +synopsis: Reading, writing and manipulating ".tar" archive files. +description: This library is for working with \"@.tar@\" archive files. It + can read and write a range of common variations of archive + format including V7, POSIX USTAR and GNU formats. + . + It provides support for packing and unpacking portable + archives. This makes it suitable for distribution but not + backup because details like file ownership and exact + permissions are not preserved. + . + It also provides features for random access to archive + content using an index. +build-type: Simple +cabal-version: >=1.8 +extra-source-files: changelog.md +tested-with: GHC==6.10.4, GHC==6.12.3, GHC==7.0.4, GHC==7.2.2, GHC==7.4.2, + GHC==7.6.3, GHC==7.8.4, GHC==7.10.2, GHC==8.1 + +source-repository head + type: git + location: https://github.com/haskell/tar.git + +flag old-time + default: False + +flag old-bytestring + default: False + +library + build-depends: base == 4.*, + filepath, + directory, + array, + containers >= 0.2, + deepseq >= 1.1 && < 1.5 + if flag(old-time) + build-depends: directory < 1.2, old-time + else + build-depends: directory >= 1.2, time + + if flag(old-bytestring) + build-depends: bytestring-builder, bytestring >= 0.9 && <0.10 + else + build-depends: bytestring >= 0.10 + + exposed-modules: + Codec.Archive.Tar + Codec.Archive.Tar.Entry + Codec.Archive.Tar.Check + Codec.Archive.Tar.Index + + other-modules: + Codec.Archive.Tar.Types + Codec.Archive.Tar.Read + Codec.Archive.Tar.Write + Codec.Archive.Tar.Pack + Codec.Archive.Tar.Unpack + Codec.Archive.Tar.Index.StringTable + Codec.Archive.Tar.Index.IntTrie + + other-extensions: + CPP, BangPatterns, + DeriveDataTypeable, ScopedTypeVariables + + ghc-options: -Wall -fno-warn-unused-imports + +test-suite properties + type: exitcode-stdio-1.0 + build-depends: base, + filepath, + array, + containers, + deepseq, + bytestring-handle, + QuickCheck == 2.*, + tasty >= 0.10 && <0.12, + tasty-quickcheck == 0.8.* + + if flag(old-time) + build-depends: directory < 1.2, old-time + else + build-depends: directory >= 1.2, time + + if flag(old-bytestring) + build-depends: bytestring-builder, bytestring >= 0.9 && <0.10 + else + build-depends: bytestring >= 0.10 + + hs-source-dirs: . test + + main-is: test/Properties.hs + cpp-options: -DTESTS + + other-modules: + Codec.Archive.Tar.Index + Codec.Archive.Tar.Index.StringTable + Codec.Archive.Tar.Index.IntTrie + + other-extensions: + CPP, BangPatterns, + DeriveDataTypeable, ScopedTypeVariables + + ghc-options: -fno-ignore-asserts + +benchmark bench + type: exitcode-stdio-1.0 + hs-source-dirs: . bench + main-is: bench/Main.hs + build-depends: base, + bytestring, + filepath, directory, + array, + containers, + deepseq, + old-time, time, + criterion >= 1.0 + diff --git a/test/Properties.hs b/test/Properties.hs new file mode 100644 index 0000000..7546a83 --- /dev/null +++ b/test/Properties.hs @@ -0,0 +1,54 @@ +module Main where + +import qualified Codec.Archive.Tar.Index as Index +import qualified Codec.Archive.Tar.Index.IntTrie as IntTrie +import qualified Codec.Archive.Tar.Index.StringTable as StringTable +import qualified Codec.Archive.Tar as Tar + +import qualified Data.ByteString as BS + +import Test.Tasty +import Test.Tasty.QuickCheck + +main :: IO () +main = + defaultMain $ + testGroup "tar tests" [ + + testGroup "write/read" [ + testProperty "ustar format" Tar.prop_write_read_ustar, + testProperty "gnu format" Tar.prop_write_read_gnu, + testProperty "v7 format" Tar.prop_write_read_v7 + ] + + , testGroup "string table" [ + testProperty "construction" StringTable.prop_valid, + testProperty "sorted" StringTable.prop_sorted, + testProperty "serialise" StringTable.prop_serialise_deserialise, + testProperty "size" StringTable.prop_serialiseSize, + testProperty "unfinalise" StringTable.prop_finalise_unfinalise + ] + + , testGroup "int trie" [ + testProperty "unit 1" IntTrie.test1, + testProperty "unit 2" IntTrie.test2, + testProperty "unit 3" IntTrie.test3, + testProperty "lookups" IntTrie.prop_lookup_mono, + testProperty "completions" IntTrie.prop_completions_mono, + testProperty "toList" IntTrie.prop_construct_toList, + testProperty "serialise" IntTrie.prop_serialise_deserialise, + testProperty "size" IntTrie.prop_serialiseSize, + testProperty "unfinalise" IntTrie.prop_finalise_unfinalise + ] + + , testGroup "index" [ + testProperty "lookup" Index.prop_lookup, + testProperty "valid" Index.prop_valid, + testProperty "toList" Index.prop_toList, + testProperty "serialise" Index.prop_serialise_deserialise, + testProperty "size" Index.prop_serialiseSize, + testProperty "matches tar" Index.prop_index_matches_tar, + testProperty "unfinalise" Index.prop_finalise_unfinalise + ] + ] +