Blame libdjvu/DjVuText.h

Packit df99a1
//C-  -*- C++ -*-
Packit df99a1
//C- -------------------------------------------------------------------
Packit df99a1
//C- DjVuLibre-3.5
Packit df99a1
//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
Packit df99a1
//C- Copyright (c) 2001  AT&T
Packit df99a1
//C-
Packit df99a1
//C- This software is subject to, and may be distributed under, the
Packit df99a1
//C- GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- or (at your option) any later version. The license should have
Packit df99a1
//C- accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C-
Packit df99a1
//C- This program is distributed in the hope that it will be useful,
Packit df99a1
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit df99a1
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit df99a1
//C- GNU General Public License for more details.
Packit df99a1
//C- 
Packit df99a1
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
Packit df99a1
//C- Lizardtech Software.  Lizardtech Software has authorized us to
Packit df99a1
//C- replace the original DjVu(r) Reference Library notice by the following
Packit df99a1
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
Packit df99a1
//C-
Packit df99a1
//C-  ------------------------------------------------------------------
Packit df99a1
//C- | DjVu (r) Reference Library (v. 3.5)
Packit df99a1
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
Packit df99a1
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
Packit df99a1
//C- | 6,058,214 and patents pending.
Packit df99a1
//C- |
Packit df99a1
//C- | This software is subject to, and may be distributed under, the
Packit df99a1
//C- | GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- | or (at your option) any later version. The license should have
Packit df99a1
//C- | accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- | from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C- |
Packit df99a1
//C- | The computer code originally released by LizardTech under this
Packit df99a1
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
Packit df99a1
//C- | ORIGINAL CODE."  Subject to any third party intellectual property
Packit df99a1
//C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
Packit df99a1
//C- | non-exclusive license to make, use, sell, or otherwise dispose of 
Packit df99a1
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
Packit df99a1
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
Packit df99a1
//C- | General Public License.   This grant only confers the right to 
Packit df99a1
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
Packit df99a1
//C- | the extent such infringement is reasonably necessary to enable 
Packit df99a1
//C- | recipient to make, have made, practice, sell, or otherwise dispose 
Packit df99a1
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
Packit df99a1
//C- | any greater extent that may be necessary to utilize further 
Packit df99a1
//C- | modifications or combinations.
Packit df99a1
//C- |
Packit df99a1
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
Packit df99a1
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
Packit df99a1
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
Packit df99a1
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Packit df99a1
//C- +------------------------------------------------------------------
Packit df99a1
Packit df99a1
#ifndef _DJVUTEXT_H
Packit df99a1
#define _DJVUTEXT_H
Packit df99a1
#ifdef HAVE_CONFIG_H
Packit df99a1
#include "config.h"
Packit df99a1
#endif
Packit df99a1
#if NEED_GNUG_PRAGMAS
Packit df99a1
# pragma interface
Packit df99a1
#endif
Packit df99a1
Packit df99a1
Packit df99a1
Packit df99a1
/** @name DjVuText.h
Packit df99a1
Packit df99a1
    Files #"DjVuText.h"# and #"DjVuText.cpp"# implement the mechanism for
Packit df99a1
    text in DjVuImages.
Packit df99a1
Packit df99a1
    This file implements the hidden text data understood by the DjVu plugins 
Packit df99a1
    and encoders.
Packit df99a1
Packit df99a1
Packit df99a1
    The text data is stored as the contents of #TXT*# chunks.
Packit df99a1
Packit df99a1
    Contents of the #FORM:TEXT# should be passed to \Ref{DjVuText::decode}()
Packit df99a1
    for parsing, which initializes \Ref{DjVuText::txt} 
Packit df99a1
    and fills it with decoded data. 
Packit df99a1
    @memo Implements support for DjVuImage hidden text.
Packit df99a1
    @author Andrei Erofeev <eaf@geocities.com>
Packit df99a1
*/
Packit df99a1
//@{
Packit df99a1
Packit df99a1
Packit df99a1
#include "GMapAreas.h"
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
namespace DJVU {
Packit df99a1
# ifdef NOT_DEFINED // Just to fool emacs c++ mode
Packit df99a1
}
Packit df99a1
#endif
Packit df99a1
#endif
Packit df99a1
Packit df99a1
Packit df99a1
class ByteStream;
Packit df99a1
Packit df99a1
// -------- DJVUTXT --------
Packit df99a1
Packit df99a1
/** Description of the text contained in a DjVu page.  This class contains the
Packit df99a1
    textual data for the page.  It describes the text as a hierarchy of zones
Packit df99a1
    corresponding to page, column, region, paragraph, lines, words, etc...
Packit df99a1
    The piece of text associated with each zone is represented by an offset
Packit df99a1
    and a length describing a segment of a global UTF8 encoded string.  */
Packit df99a1
Packit df99a1
class DJVUAPI DjVuTXT : public GPEnabled
Packit df99a1
{
Packit df99a1
protected:
Packit df99a1
  /// Protected constructor; instances are obtained through create().
  DjVuTXT(void) {}
Packit df99a1
public:
Packit df99a1
  /// Default creator: returns a newly allocated DjVuTXT held by a GP pointer.
Packit df99a1
  static GP<DjVuTXT> create(void) {return new DjVuTXT();}
Packit df99a1
Packit df99a1
  /** These constants are used to tell what a zone describes.
Packit df99a1
      This can be useful for a copy/paste application.
Packit df99a1
      The deeper we go into the hierarchy, the higher the constant
      (#PAGE# is 1, #CHARACTER# is 7). */
Packit df99a1
  enum ZoneType { PAGE=1, COLUMN=2, REGION=3, PARAGRAPH=4, 
Packit df99a1
                  LINE=5, WORD=6, CHARACTER=7 };
Packit df99a1
  /** Data structure representing document textual components.
Packit df99a1
      The text structure is represented by a hierarchy of rectangular zones. */
Packit df99a1
  class DJVUAPI Zone 
Packit df99a1
  {
Packit df99a1
  public:
Packit df99a1
    /// Default constructor.
    Zone();
Packit df99a1
    /** Type of the zone (one of the #ZoneType# constants). */
Packit df99a1
    enum ZoneType ztype;
Packit df99a1
    /** Rectangle spanned by the zone. */
Packit df99a1
    GRect rect;
Packit df99a1
    /** Position of the zone text in string #textUTF8#. */
Packit df99a1
    int text_start;
Packit df99a1
    /** Length of the zone text in string #textUTF8#. */
Packit df99a1
    int text_length;
Packit df99a1
    /** List of child zones. */
Packit df99a1
    GList<Zone> children;
Packit df99a1
    /** Appends another subzone inside this zone.  The new zone is initialized
Packit df99a1
        with an empty rectangle, empty text, and has the same type as this
Packit df99a1
        zone.  Returns a pointer to the new subzone. */
Packit df99a1
    Zone *append_child();
Packit df99a1
    /** Find the text range indicated by the given box.  The range is
        reported through the references #string_start# and #string_end#. */
Packit df99a1
    void get_text_with_rect(const GRect &box, 
Packit df99a1
                            int &string_start,int &string_end ) const;
Packit df99a1
    /** Find the zones used by the specified string and append them to the list. */
Packit df99a1
    void find_zones(GList<Zone *> &list, 
Packit df99a1
                    const int string_start, const int string_end) const;
Packit df99a1
    /** Finds the smallest rectangles and appends them to the list. */
Packit df99a1
    void get_smallest(GList<GRect> &list) const;
Packit df99a1
    /** Finds the smallest rectangles and appends them to the list after
Packit df99a1
        padding the smallest unit to fit width or height for the parent rectangle
Packit df99a1
        and adding the specified number of pixels. */
Packit df99a1
    void get_smallest(GList<GRect> &list,const int padding) const;
Packit df99a1
    /// Find out this Zone's parent (returns #zone_parent#).
Packit df99a1
    const Zone *get_parent(void) const;
Packit df99a1
  private:
Packit df99a1
    // DjVuTXT is a friend, so it can reach the private members declared below.
    friend class DjVuTXT;
Packit df99a1
    /// Parent zone pointer returned by get_parent().
    const Zone *zone_parent;
Packit df99a1
    // Private helpers; only their declarations appear in this header.
    void cleartext();
Packit df99a1
    void normtext(const char *instr, GUTF8String &outstr);
Packit df99a1
    unsigned int memuse() const;
Packit df99a1
    static const int version;
Packit df99a1
    // Both #parent# and #prev# default to a null pointer.
    void encode(const GP<ByteStream> &bs, 
Packit df99a1
                const Zone * parent=0, const Zone * prev=0) const;
Packit df99a1
    // Both #parent# and #prev# default to a null pointer.
    void decode(const GP<ByteStream> &bs, int maxtext,
Packit df99a1
                const Zone * parent=0, const Zone * prev=0);
Packit df99a1
  };
Packit df99a1
  /** Textual data for this page.
Packit df99a1
      The content of this string is encoded using the UTF8 code.
Packit df99a1
      This code corresponds to ASCII for the first 128 characters (codes 0 to 127).
Packit df99a1
      Columns, regions, paragraphs and lines are delimited by the following
Packit df99a1
      control characters:
Packit df99a1
      \begin{tabular}{lll}
Packit df99a1
        {\bf Name} & {\bf Octal} & {\bf Ascii name} \\\hline\\
Packit df99a1
        {\tt DjVuTXT::end_of_column}     & 013 & VT, Vertical Tab \\
Packit df99a1
        {\tt DjVuTXT::end_of_region}     & 035 & GS, Group Separator \\
Packit df99a1
        {\tt DjVuTXT::end_of_paragraph}  & 037 & US, Unit Separator \\
Packit df99a1
        {\tt DjVuTXT::end_of_line}       & 012 & LF: Line Feed
Packit df99a1
      \end{tabular} */
Packit df99a1
  GUTF8String textUTF8;
Packit df99a1
  static const char end_of_column    ;      // VT: Vertical Tab
Packit df99a1
  static const char end_of_region    ;      // GS: Group Separator
Packit df99a1
  static const char end_of_paragraph ;      // US: Unit Separator
Packit df99a1
  static const char end_of_line      ;      // LF: Line Feed
Packit df99a1
  /** Main zone in the document.
Packit df99a1
      This zone represents the page. */
Packit df99a1
  Zone page_zone;
Packit df99a1
  /** Tests whether there is a meaningful zone hierarchy. */
Packit df99a1
  int has_valid_zones() const;
Packit df99a1
  /** Normalize textual data.  Assuming that a zone hierarchy has been built
Packit df99a1
      and represents the reading order, this function reorganizes the string
Packit df99a1
      #textUTF8# by gathering the highest level text available in the zone
Packit df99a1
      hierarchy.  The text offsets and lengths are recomputed for all the
Packit df99a1
      zones in the hierarchy. Separators are inserted where appropriate. */
Packit df99a1
  void normalize_text();
Packit df99a1
  /** Encode data for a TXT chunk. */
Packit df99a1
  void encode(const GP<ByteStream> &bs) const;
Packit df99a1
  /** Decode data from a TXT chunk. */
Packit df99a1
  void decode(const GP<ByteStream> &bs);
Packit df99a1
  /** Returns a copy of this object. */
Packit df99a1
  GP<DjVuTXT> copy(void) const;
Packit df99a1
  /// Write XML formatted text.
Packit df99a1
  void writeText(ByteStream &bs,const int height) const;
Packit df99a1
  /// Get XML formatted text.
Packit df99a1
  GUTF8String get_xmlText(const int height) const;
Packit df99a1
  /** Find the text specified by the rectangle; returns a list of zone pointers. */
Packit df99a1
  GList<Zone*> find_text_in_rect(GRect target_rect, GUTF8String &text) const;
Packit df99a1
  /** Find the text specified by the rectangle; returns a list of rectangles.
      #padding# defaults to 0. */
Packit df99a1
  GList<GRect> find_text_with_rect(const GRect &box, GUTF8String &text, const int padding=0) const;
Packit df99a1
  /** Get all zones of zone type zone_type under node parent.
Packit df99a1
      zone_list contains the return value. */
Packit df99a1
  void get_zones(int zone_type, const Zone *parent, GList<Zone *> & zone_list) const;
Packit df99a1
  /** Returns the number of bytes needed by this data structure. It's
Packit df99a1
      used by caching routines to estimate the size of a \Ref{DjVuImage}. */
Packit df99a1
  unsigned int get_memory_usage() const;
Packit df99a1
};
Packit df99a1
Packit df99a1
/** Returns the parent pointer stored in #zone_parent#.  The pointer is
    handed back as stored, without any check. */
inline const DjVuTXT::Zone *
Packit df99a1
DjVuTXT::Zone::get_parent(void) const
Packit df99a1
{
Packit df99a1
  return zone_parent;
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
/** Holder for the text data of a #FORM:TEXT#.  The data itself lives in the
    \Ref{DjVuTXT} object referenced by the public member #txt#. */
class DJVUAPI DjVuText : public GPEnabled
Packit df99a1
{
Packit df99a1
protected:
Packit df99a1
   /// Protected constructor; instances are obtained through create().
   DjVuText(void) {}
Packit df99a1
public:
Packit df99a1
   /// Default creator: returns a newly allocated DjVuText held by a GP pointer.
Packit df99a1
   static GP<DjVuText> create(void) {return new DjVuText();}
Packit df99a1
Packit df99a1
      /** Decodes a sequence of text chunks and merges contents of every
Packit df99a1
          chunk with previously decoded information. This function
Packit df99a1
          should be called right after applying \Ref{IFFByteStream::get_chunk}()
Packit df99a1
          to data from #FORM:TEXT#. */
Packit df99a1
   void decode(const GP<ByteStream> &bs);
Packit df99a1
Packit df99a1
      /** Encodes the text data back into a sequence of chunks to be put
Packit df99a1
          inside a #FORM:TEXT#. */
Packit df99a1
   void	encode(const GP<ByteStream> &bs);
Packit df99a1
Packit df99a1
      /// Returns a copy of this object.
Packit df99a1
   GP<DjVuText>	copy(void) const;
Packit df99a1
Packit df99a1
      /** Returns the number of bytes needed by this data structure. It's
Packit df99a1
          used by caching routines to estimate the size of a \Ref{DjVuImage}.
          The result is 0 when #txt# is null. */
Packit df99a1
   inline unsigned int get_memory_usage() const;
Packit df99a1
Packit df99a1
   /// Write XML formatted text.
Packit df99a1
   void writeText(ByteStream &bs,const int height) const;
Packit df99a1
Packit df99a1
   /// Get XML formatted text.
Packit df99a1
   GUTF8String get_xmlText(const int height) const;
Packit df99a1
Packit df99a1
   /// The text data.  May be null: get_memory_usage() tests it before use.
   GP<DjVuTXT>  txt;
Packit df99a1
private: // dummy stuff
Packit df99a1
   // NOTE(review): these private overloads presumably exist so that calls
   // passing a raw ByteStream pointer are rejected -- confirm against the
   // implementation file.
   static void decode(ByteStream *);
Packit df99a1
   static void	encode(ByteStream *);
Packit df99a1
};
Packit df99a1
Packit df99a1
//@}
Packit df99a1
Packit df99a1
/** Returns the memory usage reported by #txt#, or 0 when #txt# is null. */
inline unsigned int
Packit df99a1
DjVuText::get_memory_usage() const
Packit df99a1
{
Packit df99a1
  // Test #txt# first: it is only dereferenced when it is non-null.
  return (txt)?(txt->get_memory_usage()):0;
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
// ----- THE END
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
}
Packit df99a1
# ifndef NOT_USING_DJVU_NAMESPACE
Packit df99a1
using namespace DJVU;
Packit df99a1
# endif
Packit df99a1
#endif
Packit df99a1
#endif
Packit df99a1
Packit df99a1