Blame libdjvu/BSByteStream.h

Packit df99a1
//C-  -*- C++ -*-
Packit df99a1
//C- -------------------------------------------------------------------
Packit df99a1
//C- DjVuLibre-3.5
Packit df99a1
//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
Packit df99a1
//C- Copyright (c) 2001  AT&T
Packit df99a1
//C-
Packit df99a1
//C- This software is subject to, and may be distributed under, the
Packit df99a1
//C- GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- or (at your option) any later version. The license should have
Packit df99a1
//C- accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C-
Packit df99a1
//C- This program is distributed in the hope that it will be useful,
Packit df99a1
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit df99a1
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit df99a1
//C- GNU General Public License for more details.
Packit df99a1
//C- 
Packit df99a1
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
Packit df99a1
//C- Lizardtech Software.  Lizardtech Software has authorized us to
Packit df99a1
//C- replace the original DjVu(r) Reference Library notice by the following
Packit df99a1
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
Packit df99a1
//C-
Packit df99a1
//C-  ------------------------------------------------------------------
Packit df99a1
//C- | DjVu (r) Reference Library (v. 3.5)
Packit df99a1
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
Packit df99a1
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
Packit df99a1
//C- | 6,058,214 and patents pending.
Packit df99a1
//C- |
Packit df99a1
//C- | This software is subject to, and may be distributed under, the
Packit df99a1
//C- | GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- | or (at your option) any later version. The license should have
Packit df99a1
//C- | accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- | from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C- |
Packit df99a1
//C- | The computer code originally released by LizardTech under this
Packit df99a1
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
Packit df99a1
//C- | ORIGINAL CODE."  Subject to any third party intellectual property
Packit df99a1
//C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
Packit df99a1
//C- | non-exclusive license to make, use, sell, or otherwise dispose of 
Packit df99a1
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
Packit df99a1
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
Packit df99a1
//C- | General Public License.   This grant only confers the right to 
Packit df99a1
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
Packit df99a1
//C- | the extent such infringement is reasonably necessary to enable 
Packit df99a1
//C- | recipient to make, have made, practice, sell, or otherwise dispose 
Packit df99a1
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
Packit df99a1
//C- | any greater extent that may be necessary to utilize further 
Packit df99a1
//C- | modifications or combinations.
Packit df99a1
//C- |
Packit df99a1
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
Packit df99a1
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
Packit df99a1
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
Packit df99a1
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Packit df99a1
//C- +------------------------------------------------------------------
Packit df99a1
Packit df99a1
#ifndef _BSBYTESTREAM_H
Packit df99a1
#define _BSBYTESTREAM_H
Packit df99a1
#ifdef HAVE_CONFIG_H
Packit df99a1
#include "config.h"
Packit df99a1
#endif
Packit df99a1
#if NEED_GNUG_PRAGMAS
Packit df99a1
# pragma interface
Packit df99a1
#endif
Packit df99a1
Packit df99a1
/** @name BSByteStream.h
Packit df99a1
    
Packit df99a1
    Files #"BSByteStream.h"# and #"BSByteStream.cpp"# implement a very compact
Packit df99a1
    general purpose compressor based on the Burrows-Wheeler transform.  The
Packit df99a1
    utility program \Ref{bzz} provides a front-end for this class. Although
Packit df99a1
    this compression model is not currently used in DjVu files, it may be used
Packit df99a1
    in the future for encoding textual data chunks.
Packit df99a1
Packit df99a1
    {\bf Algorithms} --- The Burrows-Wheeler transform (also named Block-Sorting)
Packit df99a1
    is performed using a combination of the Karp-Miller-Rosenberg and the
Packit df99a1
    Bentley-Sedgewick algorithms. This is comparable to (Sadakane, DCC 98)
Packit df99a1
    with a slightly more flexible ranking scheme. Symbols are then ordered
Packit df99a1
    according to a running estimate of their occurrence frequencies.  The
Packit df99a1
    symbol ranks are then coded using a simple fixed tree and the
Packit df99a1
    \Ref{ZPCodec} binary adaptive coder.
Packit df99a1
Packit df99a1
    {\bf Performances} --- The basic algorithm is mostly similar to those
Packit df99a1
    implemented in well known compressors like #bzip# or #bzip2#
Packit df99a1
    (\URL{http://www.muraroa.demon.co.uk}).  The adaptive binary coder however
Packit df99a1
    generates small differences. The adaptation noise may cost up to 5\% in
Packit df99a1
    file size, but this penalty is usually offset by the benefits of
Packit df99a1
    adaptation.  This is good when processing large and highly structured
Packit df99a1
    files like spreadsheet files.  Compression and decompression speed is
Packit df99a1
    about twice slower than #bzip2# but the sorting algorithms is more
Packit df99a1
    robust. Unlike #bzip2# (as of August 1998), this code can compress half a
Packit df99a1
    megabyte of "abababab...." in bounded time.
Packit df99a1
    
Packit df99a1
    Here are some comparative results (in bits per character) obtained on the
Packit df99a1
    Canterbury Corpus (\URL{http://corpus.canterbury.ac.nz}) as of August
Packit df99a1
    1998. The BSByteStream performance on the single spreadsheet file #Excl#
Packit df99a1
    moves #bzz#'s weighted average ahead of much more sophisticated methods,
Packit df99a1
    like Suzanne Bunton's #fsmxBest# system
Packit df99a1
    \URL{http://corpus.canterbury.ac.nz/methodinfo/fsmx.html}.  This result
Packit df99a1
    will not last very long.
Packit df99a1
Packit df99a1
    {\footnotesize
Packit df99a1
    \begin{tabular}{lccccccccccccc}
Packit df99a1
      & text & fax & Csrc & Excl & SPRC & tech 
Packit df99a1
      & poem & html & lisp & man & play & Weighted & Average \\
Packit df99a1
      compress 
Packit df99a1
      & 3.27 & 0.97 & 3.56 & 2.41 & 4.21 & 3.06 
Packit df99a1
      & 3.38 & 3.68 & 3.90 & 4.43 & 3.51 
Packit df99a1
      & 2.55 & 3.31 \\
Packit df99a1
      gzip -9
Packit df99a1
      & 2.85 & 0.82 & 2.24 & 1.63 & 2.67 & 2.71 
Packit df99a1
      & 3.23 & 2.59 & 2.65 & 3.31 & 3.12 
Packit df99a1
      & 2.08 & 2.53 \\  
Packit df99a1
      bzip2 -9
Packit df99a1
      & 2.27 & 0.78 & 2.18 & 1.01 & 2.70 & 2.02 
Packit df99a1
      & 2.42 & 2.48 & 2.79 & 3.33 & 2.53 
Packit df99a1
      & 1.54 & 2.23 \\
Packit df99a1
      ppmd
Packit df99a1
      & 2.31 & 0.99 & 2.11 & 1.08 & 2.68 & 2.19 
Packit df99a1
      & 2.48 & 2.38 & 2.43 & 3.00 & 2.53 
Packit df99a1
      & 1.65 & 2.20 \\
Packit df99a1
      fsmx
Packit df99a1
      & {\bf 2.10} & 0.79 & {\bf 1.89} & 1.48 & {\bf 2.52} & {\bf 1.84} 
Packit df99a1
      & {\bf 2.21} & {\bf 2.24} & {\bf 2.29} & {\bf 2.91} & {\bf 2.35} 
Packit df99a1
      & 1.63 & {\bf 2.06} \\
Packit df99a1
      {\bf bzz}
Packit df99a1
      & 2.25 & {\bf 0.76} & 2.13 & {\bf 0.78} & 2.67 & 2.00
Packit df99a1
      & 2.40 & 2.52 & 2.60 & 3.19 & 2.52 
Packit df99a1
      & {\bf 1.44} & 2.16
Packit df99a1
    \end{tabular}
Packit df99a1
    }
Packit df99a1
Packit df99a1
    Note that the DjVu people have several entries in this table.  Program
Packit df99a1
    #compress# was written some time ago by Joe Orost
Packit df99a1
    (\URL{http://www.research.att.com/info/orost}). The #ppmc# method, (a
Packit df99a1
    precursor of #ppmd#) was created by Paul Howard
Packit df99a1
    (\URL{http://www.research.att.com/info/pgh}). The #bzz# program is just
Packit df99a1
    below your eyes.
Packit df99a1
Packit df99a1
    @author
Packit df99a1
    L\'eon Bottou <leonb@research.att.com> -- Initial implementation\\
Packit df99a1
    Andrei Erofeev <eaf@geocities.com> -- Improved Block Sorting algorithm.
Packit df99a1
    @memo
Packit df99a1
    Simple Burrows-Wheeler general purpose compressor.
Packit df99a1
*/
Packit df99a1
//@{
Packit df99a1
Packit df99a1
Packit df99a1
#include "ByteStream.h"
Packit df99a1
#include "GException.h"
Packit df99a1
#include "ZPCodec.h"
Packit df99a1
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
namespace DJVU {
Packit df99a1
# ifdef NOT_DEFINED // Just to fool emacs c++ mode
Packit df99a1
}
Packit df99a1
#endif
Packit df99a1
#endif
Packit df99a1
Packit df99a1
Packit df99a1
/** Performs bzz compression/decompression.
Packit df99a1
    
Packit df99a1
    Class #BSByteStream# defines a \Ref{ByteStream} which transparently
Packit df99a1
    performs the BZZ compression/decompression. The constructor of class
Packit df99a1
    \Ref{BSByteStream} takes another \Ref{ByteStream} as argument.  Any data
Packit df99a1
    written to the BSByteStream is compressed and written to this second
Packit df99a1
    ByteStream. Any data read from the BSByteStream is internally generated by
Packit df99a1
    decompressing data read from the second ByteStream.
Packit df99a1
Packit df99a1
    Program \Ref{bzz} demonstrates how to use this class.  All the hard work
Packit df99a1
    is achieved by a simple ByteStream to ByteStream copy, as shown below.
Packit df99a1
    \begin{verbatim}
Packit df99a1
      GP<ByteStream> in=ByteStream::create(infile,"rb");
Packit df99a1
      GP<ByteStream> out=ByteStream::create(outfile,"wb");
Packit df99a1
      if (encoding) {
Packit df99a1
          BSByteStream bsb(out, blocksize);
Packit df99a1
          bsb.copy(*in);
Packit df99a1
      } else {
Packit df99a1
          BSByteStream bsb(in);
Packit df99a1
          out->copy(bsb);
Packit df99a1
      }
Packit df99a1
    \end{verbatim}
Packit df99a1
    Due to the block oriented nature of the Burrows-Wheeler transform, there
Packit df99a1
    is a very significant latency between the data input and the data output.
Packit df99a1
    You can use function #flush# to force data output at the expense of
Packit df99a1
    compression efficiency.
Packit df99a1
Packit df99a1
    You should never directly access a ByteStream object connected to a valid
Packit df99a1
    BSByteStream object. The ByteStream object can be accessed again after the
Packit df99a1
    destruction of the BSByteStream object.  Note that the encoder always
Packit df99a1
    flushes its internal buffers and writes a few final code bytes when the
Packit df99a1
    BSByteStream object is destroyed.  Note also that the decoder often reads
Packit df99a1
    a few bytes beyond the last code byte written by the encoder.  This lag
Packit df99a1
    means that you must reposition the ByteStream after the destruction of the
Packit df99a1
    BSByteStream object and before re-using the ByteStream object (see
Packit df99a1
    \Ref{IFFByteStream}.)
Packit df99a1
*/
Packit df99a1
class DJVUAPI BSByteStream : public ByteStream
Packit df99a1
{
Packit df99a1
public:
Packit df99a1
// Limits on block sizes
Packit df99a1
  enum { MINBLOCK=10, MAXBLOCK=4096 };
Packit df99a1
Packit df99a1
// Sorting tresholds
Packit df99a1
  enum { FREQMAX=4, CTXIDS=3 };
Packit df99a1
Packit df99a1
  class Decode;
Packit df99a1
  class Encode;
Packit df99a1
protected:
Packit df99a1
  BSByteStream(GP<ByteStream> bs);
Packit df99a1
Packit df99a1
public:
Packit df99a1
  /** Creates a BSByteStream.
Packit df99a1
      The BSByteStream will be used for decompressing data.
Packit df99a1
      \begin{description}
Packit df99a1
      \item[Decompression]
Packit df99a1
      The BSByteStream is created and the decompressor initializes.  Chunks of
Packit df99a1
      data will be read from ByteStream #bs# and decompressed into an internal
Packit df99a1
      buffer. Function #read# can be used to access the decompressed data.
Packit df99a1
      \end{description} */
Packit df99a1
  static GP<ByteStream> create(GP<ByteStream> bs);
Packit df99a1
Packit df99a1
  /** Constructs a BSByteStream.
Packit df99a1
      The BSByteStream will be used for compressing data.
Packit df99a1
      \begin{description}
Packit df99a1
      \item[Compression]
Packit df99a1
      Set #blocksize# to a positive number smaller than 4096 to 
Packit df99a1
      initialize the compressor.  Data written to the BSByteStream will be
Packit df99a1
      accumulated into an internal buffer.  The buffered data will be
Packit df99a1
      compressed and written to ByteStream #bs# whenever the buffer sizes
Packit df99a1
      reaches the maximum value specified by argument #blocksize# (in
Packit df99a1
      kilobytes).  Using a larger block size usually increases the compression
Packit df99a1
      ratio at the expense of computation time.  There is no need however to
Packit df99a1
      specify a block size larger than the total number of bytes to compress.
Packit df99a1
      Setting #blocksize# to #1024# is a good starting point.  A minimal block
Packit df99a1
      size of 10 is silently enforced.
Packit df99a1
      \end{description} */
Packit df99a1
  static GP<ByteStream> create(GP<ByteStream> bs, const int blocksize);
Packit df99a1
Packit df99a1
  // ByteStream Interface
Packit df99a1
  ~BSByteStream();
Packit df99a1
  virtual long tell(void) const;
Packit df99a1
  virtual void flush(void) = 0;
Packit df99a1
protected:
Packit df99a1
  // Data
Packit df99a1
  long            offset;
Packit df99a1
  int             bptr;
Packit df99a1
  unsigned int    blocksize;
Packit df99a1
  int             size;
Packit df99a1
  ByteStream *bs;
Packit df99a1
  GP<ByteStream> gbs;
Packit df99a1
  unsigned char  *data;
Packit df99a1
  GPBuffer<unsigned char> gdata;
Packit df99a1
  // Coder
Packit df99a1
  GP<ZPCodec> gzp;
Packit df99a1
  BitContext ctx[300];
Packit df99a1
private:  
Packit df99a1
  // Cancel C++ default stuff
Packit df99a1
  BSByteStream(const BSByteStream &);
Packit df99a1
  BSByteStream & operator=(const BSByteStream &);
Packit df99a1
  BSByteStream(ByteStream *);
Packit df99a1
  BSByteStream(ByteStream *, int);
Packit df99a1
};
Packit df99a1
Packit df99a1
//@}
Packit df99a1
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
}
Packit df99a1
# ifndef NOT_USING_DJVU_NAMESPACE
Packit df99a1
using namespace DJVU;
Packit df99a1
# endif
Packit df99a1
#endif
Packit df99a1
#endif