Blame libdjvu/DjVuText.cpp

Packit df99a1
//C-  -*- C++ -*-
Packit df99a1
//C- -------------------------------------------------------------------
Packit df99a1
//C- DjVuLibre-3.5
Packit df99a1
//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
Packit df99a1
//C- Copyright (c) 2001  AT&T
Packit df99a1
//C-
Packit df99a1
//C- This software is subject to, and may be distributed under, the
Packit df99a1
//C- GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- or (at your option) any later version. The license should have
Packit df99a1
//C- accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C-
Packit df99a1
//C- This program is distributed in the hope that it will be useful,
Packit df99a1
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit df99a1
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit df99a1
//C- GNU General Public License for more details.
Packit df99a1
//C- 
Packit df99a1
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
Packit df99a1
//C- Lizardtech Software.  Lizardtech Software has authorized us to
Packit df99a1
//C- replace the original DjVu(r) Reference Library notice by the following
Packit df99a1
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
Packit df99a1
//C-
Packit df99a1
//C-  ------------------------------------------------------------------
Packit df99a1
//C- | DjVu (r) Reference Library (v. 3.5)
Packit df99a1
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
Packit df99a1
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
Packit df99a1
//C- | 6,058,214 and patents pending.
Packit df99a1
//C- |
Packit df99a1
//C- | This software is subject to, and may be distributed under, the
Packit df99a1
//C- | GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- | or (at your option) any later version. The license should have
Packit df99a1
//C- | accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- | from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C- |
Packit df99a1
//C- | The computer code originally released by LizardTech under this
Packit df99a1
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
Packit df99a1
//C- | ORIGINAL CODE."  Subject to any third party intellectual property
Packit df99a1
//C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
Packit df99a1
//C- | non-exclusive license to make, use, sell, or otherwise dispose of 
Packit df99a1
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
Packit df99a1
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
Packit df99a1
//C- | General Public License.   This grant only confers the right to 
Packit df99a1
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
Packit df99a1
//C- | the extent such infringement is reasonably necessary to enable 
Packit df99a1
//C- | recipient to make, have made, practice, sell, or otherwise dispose 
Packit df99a1
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
Packit df99a1
//C- | any greater extent that may be necessary to utilize further 
Packit df99a1
//C- | modifications or combinations.
Packit df99a1
//C- |
Packit df99a1
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
Packit df99a1
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
Packit df99a1
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
Packit df99a1
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Packit df99a1
//C- +------------------------------------------------------------------
Packit df99a1
Packit df99a1
#ifdef HAVE_CONFIG_H
Packit df99a1
# include "config.h"
Packit df99a1
#endif
Packit df99a1
#if NEED_GNUG_PRAGMAS
Packit df99a1
# pragma implementation
Packit df99a1
#endif
Packit df99a1
Packit df99a1
#include "DjVuText.h"
Packit df99a1
#include "IFFByteStream.h"
Packit df99a1
#include "BSByteStream.h"
Packit df99a1
#include "debug.h"
Packit df99a1
#include <ctype.h>
Packit df99a1
Packit df99a1
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
namespace DJVU {
Packit df99a1
# ifdef NOT_DEFINED // Just to fool emacs c++ mode
Packit df99a1
}
Packit df99a1
#endif
Packit df99a1
#endif
Packit df99a1
Packit df99a1
Packit df99a1
Packit df99a1
#ifdef min
Packit df99a1
#undef min
Packit df99a1
#endif
Packit df99a1
// File-local replacement for any `min` macro undefined just above.
// Returns a when a<b, otherwise b (so b is returned when both compare equal).
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b) { return (a<b)?a:b; }
Packit df99a1
Packit df99a1
//***************************************************************************
Packit df99a1
//******************************** DjVuTXT **********************************
Packit df99a1
//***************************************************************************
Packit df99a1
Packit df99a1
// Separator characters that Zone::normtext() appends after the text of a
// zone of the corresponding type.
const char DjVuTXT::end_of_column    = '\v';     // VT: Vertical Tab     (octal 013)
const char DjVuTXT::end_of_region    = '\x1D';   // GS: Group Separator  (octal 035)
const char DjVuTXT::end_of_paragraph = '\x1F';   // US: Unit Separator   (octal 037)
const char DjVuTXT::end_of_line      = '\n';     // LF: Line Feed        (octal 012)

// Version byte written ahead of the encoded zone tree and checked on decode.
const int DjVuTXT::Zone::version  = 1;
Packit df99a1
Packit df99a1
// Default constructor: a PAGE zone covering no text and having no parent.
DjVuTXT::Zone::Zone()
  : ztype(DjVuTXT::PAGE),
    text_start(0),
    text_length(0),
    zone_parent(0)
{}
Packit df99a1
Packit df99a1
// Appends a new child zone to this zone and returns a pointer to it.
// The child starts with this zone's type, an empty text range, and its
// zone_parent set to this zone.
DjVuTXT::Zone *
DjVuTXT::Zone::append_child()
{
  Zone fresh;
  fresh.zone_parent = this;
  fresh.ztype = ztype;
  fresh.text_length = 0;
  fresh.text_start = 0;
  children.append(fresh);
  // Return the address of the element now stored at the tail of the list,
  // not of the local template.
  Zone &stored = children[children.lastpos()];
  return &stored;
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::cleartext()
Packit df99a1
{
Packit df99a1
  text_start = 0;
Packit df99a1
  text_length = 0;
Packit df99a1
  for (GPosition i=children; i; ++i)
Packit df99a1
    children[i].cleartext();
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::normtext(const char *instr, GUTF8String &outstr)
Packit df99a1
{
Packit df99a1
  if (text_length == 0)
Packit df99a1
    {
Packit df99a1
      // Descend collecting text below
Packit df99a1
      text_start = outstr.length();
Packit df99a1
      for (GPosition i=children; i; ++i)
Packit df99a1
        children[i].normtext(instr, outstr);
Packit df99a1
      text_length = outstr.length() - text_start;
Packit df99a1
      // Ignore empty zones
Packit df99a1
      if (text_length == 0)
Packit df99a1
        return;
Packit df99a1
    }
Packit df99a1
  else
Packit df99a1
    {
Packit df99a1
      // Collect text at this level
Packit df99a1
      int new_start = outstr.length();
Packit df99a1
      outstr = outstr + GUTF8String(instr+text_start, text_length);
Packit df99a1
      text_start = new_start;
Packit df99a1
      // Clear textual information on lower level nodes
Packit df99a1
      for (GPosition i=children; i; ++i)
Packit df99a1
        children[i].cleartext();
Packit df99a1
    }
Packit df99a1
  // Determine standard separator
Packit df99a1
  char sep;
Packit df99a1
  switch (ztype)
Packit df99a1
    {
Packit df99a1
    case COLUMN:
Packit df99a1
      sep = end_of_column; break;
Packit df99a1
    case REGION:
Packit df99a1
      sep = end_of_region; break;
Packit df99a1
    case PARAGRAPH: 
Packit df99a1
      sep = end_of_paragraph; break;
Packit df99a1
    case LINE:
Packit df99a1
      sep = end_of_line; break;
Packit df99a1
    case WORD:
Packit df99a1
      sep = ' '; break;
Packit df99a1
    default:
Packit df99a1
      return;
Packit df99a1
    }
Packit df99a1
  // Add separator if not present yet.
Packit df99a1
  if (outstr[text_start+text_length-1] != sep)
Packit df99a1
    {
Packit df99a1
      outstr = outstr + GUTF8String(&sep, 1);
Packit df99a1
      text_length += 1;
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
unsigned int 
Packit df99a1
DjVuTXT::Zone::memuse() const
Packit df99a1
{
Packit df99a1
  int memuse = sizeof(*this);
Packit df99a1
  for (GPosition i=children; i; ++i)
Packit df99a1
    memuse += children[i].memuse();
Packit df99a1
  return memuse;
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
#ifndef NEED_DECODER_ONLY
Packit df99a1
void 
Packit df99a1
DjVuTXT::Zone::encode(
Packit df99a1
  const GP<ByteStream> &gbs, const Zone * parent, const Zone * prev) const
Packit df99a1
{
Packit df99a1
  ByteStream &bs=*gbs;
Packit df99a1
  // Encode type
Packit df99a1
  bs.write8(ztype);
Packit df99a1
  
Packit df99a1
  // Modify text_start and bounding rectangle based on the context
Packit df99a1
  // (whether there is a previous non-zero same-level-child or parent)
Packit df99a1
  int start=text_start;
Packit df99a1
  int x=rect.xmin, y=rect.ymin;
Packit df99a1
  int width=rect.width(), height=rect.height();
Packit df99a1
  if (prev)
Packit df99a1
  {
Packit df99a1
    if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE)
Packit df99a1
    {
Packit df99a1
      // Encode offset from the lower left corner of the previous
Packit df99a1
      // child in the coord system in that corner with x to the
Packit df99a1
      // right and y down
Packit df99a1
      x=x-prev->rect.xmin;
Packit df99a1
      y=prev->rect.ymin-(y+height);
Packit df99a1
    } else // Either COLUMN or WORD or CHARACTER
Packit df99a1
    {
Packit df99a1
      // Encode offset from the lower right corner of the previous
Packit df99a1
      // child in the coord system in that corner with x to the
Packit df99a1
      // right and y up
Packit df99a1
      x=x-prev->rect.xmax;
Packit df99a1
      y=y-prev->rect.ymin;
Packit df99a1
    }
Packit df99a1
    start-=prev->text_start+prev->text_length;
Packit df99a1
  } else if (parent)
Packit df99a1
  {
Packit df99a1
    // Encode offset from the upper left corner of the parent
Packit df99a1
    // in the coord system in that corner with x to the right and y down
Packit df99a1
    x=x-parent->rect.xmin;
Packit df99a1
    y=parent->rect.ymax-(y+height);
Packit df99a1
    start-=parent->text_start;
Packit df99a1
  }
Packit df99a1
  // Encode rectangle
Packit df99a1
  bs.write16(0x8000+x);
Packit df99a1
  bs.write16(0x8000+y);
Packit df99a1
  bs.write16(0x8000+width);
Packit df99a1
  bs.write16(0x8000+height);
Packit df99a1
  // Encode text info
Packit df99a1
  bs.write16(0x8000+start);
Packit df99a1
  bs.write24(text_length);
Packit df99a1
  // Encode number of children
Packit df99a1
  bs.write24(children.size());
Packit df99a1
  
Packit df99a1
  const Zone * prev_child=0;
Packit df99a1
  // Encode all children
Packit df99a1
  for (GPosition i=children; i; ++i)
Packit df99a1
  {
Packit df99a1
    children[i].encode(gbs, this, prev_child);
Packit df99a1
    prev_child=&children[i];
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
#endif
Packit df99a1
Packit df99a1
void 
Packit df99a1
DjVuTXT::Zone::decode(const GP<ByteStream> &gbs, int maxtext,
Packit df99a1
		      const Zone * parent, const Zone * prev)
Packit df99a1
{
Packit df99a1
  ByteStream &bs=*gbs;
Packit df99a1
  // Decode type
Packit df99a1
  ztype = (ZoneType) bs.read8();
Packit df99a1
  if ( ztype<PAGE || ztype>CHARACTER )
Packit df99a1
    G_THROW( ERR_MSG("DjVuText.corrupt_text") );
Packit df99a1
Packit df99a1
  // Decode coordinates
Packit df99a1
  int x=(int) bs.read16()-0x8000;
Packit df99a1
  int y=(int) bs.read16()-0x8000;
Packit df99a1
  int width=(int) bs.read16()-0x8000;
Packit df99a1
  int height=(int) bs.read16()-0x8000;
Packit df99a1
Packit df99a1
  // Decode text info
Packit df99a1
  text_start = (int) bs.read16()-0x8000;
Packit df99a1
//  int start=text_start;
Packit df99a1
  text_length = bs.read24();
Packit df99a1
  if (prev)
Packit df99a1
  {
Packit df99a1
    if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE)
Packit df99a1
    {
Packit df99a1
      x=x+prev->rect.xmin;
Packit df99a1
      y=prev->rect.ymin-(y+height);
Packit df99a1
    } else // Either COLUMN or WORD or CHARACTER
Packit df99a1
    {
Packit df99a1
      x=x+prev->rect.xmax;
Packit df99a1
      y=y+prev->rect.ymin;
Packit df99a1
    }
Packit df99a1
    text_start+=prev->text_start+prev->text_length;
Packit df99a1
  } else if (parent)
Packit df99a1
  {
Packit df99a1
    x=x+parent->rect.xmin;
Packit df99a1
    y=parent->rect.ymax-(y+height);
Packit df99a1
    text_start+=parent->text_start;
Packit df99a1
  }
Packit df99a1
  rect=GRect(x, y, width, height);
Packit df99a1
  // Get children size
Packit df99a1
  int size = bs.read24();
Packit df99a1
Packit df99a1
  // Checks
Packit df99a1
  if (rect.isempty() || text_start<0 || text_start+text_length>maxtext )
Packit df99a1
    G_THROW( ERR_MSG("DjVuText.corrupt_text") );
Packit df99a1
Packit df99a1
  // Process children
Packit df99a1
  const Zone * prev_child=0;
Packit df99a1
  children.empty();
Packit df99a1
  while (size-- > 0) 
Packit df99a1
  {
Packit df99a1
    Zone *z = append_child();
Packit df99a1
    z->decode(gbs, maxtext, this, prev_child);
Packit df99a1
    prev_child=z;
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
void 
Packit df99a1
DjVuTXT::normalize_text()
Packit df99a1
{
Packit df99a1
  GUTF8String newtextUTF8;
Packit df99a1
  page_zone.normtext( (const char*)textUTF8, newtextUTF8 );
Packit df99a1
  textUTF8 = newtextUTF8;
Packit df99a1
}
Packit df99a1
Packit df99a1
int 
Packit df99a1
DjVuTXT::has_valid_zones() const
Packit df99a1
{
Packit df99a1
  if (!textUTF8)
Packit df99a1
    return false;
Packit df99a1
  if (page_zone.rect.isempty()) 
Packit df99a1
    return false;
Packit df99a1
  return true;
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
#ifndef NEED_DECODER_ONLY
Packit df99a1
void 
Packit df99a1
DjVuTXT::encode(const GP<ByteStream> &gbs) const
Packit df99a1
{
Packit df99a1
  ByteStream &bs=*gbs;
Packit df99a1
  if (! textUTF8 )
Packit df99a1
    G_THROW( ERR_MSG("DjVuText.no_text") );
Packit df99a1
  // Encode text
Packit df99a1
  int textsize = textUTF8.length();
Packit df99a1
  bs.write24( textsize );
Packit df99a1
  bs.writall( (void*)(const char*)textUTF8, textsize );
Packit df99a1
  // Encode zones
Packit df99a1
  if (has_valid_zones())
Packit df99a1
  {
Packit df99a1
    bs.write8(Zone::version);
Packit df99a1
    page_zone.encode(gbs);
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
#endif
Packit df99a1
Packit df99a1
void 
Packit df99a1
DjVuTXT::decode(const GP<ByteStream> &gbs)
Packit df99a1
{
Packit df99a1
  ByteStream &bs=*gbs;
Packit df99a1
  // Read text
Packit df99a1
  textUTF8.empty();
Packit df99a1
  int textsize = bs.read24();
Packit df99a1
  char *buffer = textUTF8.getbuf(textsize);
Packit df99a1
  int readsize = bs.read(buffer,textsize);
Packit df99a1
  buffer[readsize] = 0;
Packit df99a1
  if (readsize < textsize)
Packit df99a1
    G_THROW( ERR_MSG("DjVuText.corrupt_chunk") );
Packit df99a1
  // Try reading zones
Packit df99a1
  unsigned char version;
Packit df99a1
  if ( bs.read( (void*) &version, 1 ) == 1) 
Packit df99a1
  {
Packit df99a1
    if (version != Zone::version)
Packit df99a1
      G_THROW( ERR_MSG("DjVuText.bad_version") "\t" + GUTF8String(version) );
Packit df99a1
    page_zone.decode(gbs, textsize);
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
// Returns a newly allocated copy of this object, made with the copy
// constructor.
GP<DjVuTXT> 
DjVuTXT::copy(void) const
{
  GP<DjVuTXT> duplicate = new DjVuTXT(*this);
  return duplicate;
}
Packit df99a1
Packit df99a1
Packit df99a1
static inline bool
Packit df99a1
intersects_zone(GRect box, const GRect &zone)
Packit df99a1
{
Packit df99a1
  return
Packit df99a1
    ((box.xmin < zone.xmin)
Packit df99a1
      ?(box.xmax >= zone.xmin)
Packit df99a1
      :(box.xmin <= zone.xmax))
Packit df99a1
    &&((box.ymin < zone.ymin)
Packit df99a1
      ?(box.ymax >= zone.ymin)
Packit df99a1
      :(box.ymin <= zone.ymax));
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::get_text_with_rect(const GRect &box, 
Packit df99a1
                                  int &string_start, int &string_end) const
Packit df99a1
{
Packit df99a1
  GPosition pos=children;
Packit df99a1
  if(pos?box.contains(rect):intersects_zone(box,rect))
Packit df99a1
  {
Packit df99a1
    const int text_end=text_start+text_length;
Packit df99a1
    if(string_start == string_end)
Packit df99a1
    {
Packit df99a1
      string_start=text_start;
Packit df99a1
      string_end=text_end;
Packit df99a1
    }else
Packit df99a1
    {
Packit df99a1
      if (string_end < text_end)
Packit df99a1
        string_end=text_end;
Packit df99a1
      if(text_start < string_start)
Packit df99a1
        string_start=text_start;
Packit df99a1
    }
Packit df99a1
  }else if(pos&&intersects_zone(box,rect))
Packit df99a1
  {
Packit df99a1
    do
Packit df99a1
    {
Packit df99a1
      children[pos].get_text_with_rect(box,string_start,string_end);
Packit df99a1
    } while(++pos);
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::find_zones(GList<Zone *> &list, 
Packit df99a1
                          const int string_start, const int string_end) const
Packit df99a1
{
Packit df99a1
  const int text_end=text_start+text_length;
Packit df99a1
  if(text_start >= string_start)
Packit df99a1
    {
Packit df99a1
      if(text_end <= string_end)
Packit df99a1
        {
Packit df99a1
          list.append(const_cast<Zone *>(this));
Packit df99a1
        }
Packit df99a1
      else if(text_start < string_end)
Packit df99a1
        {
Packit df99a1
          if (children.size())
Packit df99a1
            for (GPosition pos=children; pos; ++pos)
Packit df99a1
              children[pos].find_zones(list,string_start,string_end);
Packit df99a1
          else
Packit df99a1
            list.append(const_cast<Zone *>(this));
Packit df99a1
        }
Packit df99a1
    }
Packit df99a1
  else if( text_end > string_start)
Packit df99a1
    {
Packit df99a1
      if (children.size())
Packit df99a1
        for (GPosition pos=children; pos; ++pos)
Packit df99a1
          children[pos].find_zones(list,string_start,string_end);
Packit df99a1
      else
Packit df99a1
        list.append(const_cast<Zone *>(this));
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::get_smallest(GList<GRect> &list) const
Packit df99a1
{
Packit df99a1
  GPosition pos=children;
Packit df99a1
  if(pos)
Packit df99a1
    {
Packit df99a1
      do {
Packit df99a1
        children[pos].get_smallest(list);
Packit df99a1
      } while (++pos);
Packit df99a1
    }
Packit df99a1
  else
Packit df99a1
    {
Packit df99a1
      list.append(rect);
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::Zone::get_smallest(GList<GRect> &list, const int padding) const
Packit df99a1
{
Packit df99a1
  GPosition pos=children;
Packit df99a1
  if(pos)
Packit df99a1
    {
Packit df99a1
      do {
Packit df99a1
        children[pos].get_smallest(list,padding);
Packit df99a1
      } while (++pos);
Packit df99a1
    }
Packit df99a1
  else if(zone_parent && zone_parent->ztype >= PARAGRAPH)
Packit df99a1
    {
Packit df99a1
      const GRect &xrect=zone_parent->rect;
Packit df99a1
      if(xrect.height() < xrect.width())
Packit df99a1
        {
Packit df99a1
          list.append(GRect(rect.xmin-padding,xrect.ymin-padding,rect.width()
Packit df99a1
                            +2*padding,xrect.height()+2*padding));
Packit df99a1
        }
Packit df99a1
      else
Packit df99a1
        {
Packit df99a1
          list.append(GRect(xrect.xmin-padding,rect.ymin-padding,xrect.width()
Packit df99a1
                            +2*padding,rect.height()+2*padding));
Packit df99a1
        }
Packit df99a1
    }
Packit df99a1
  else
Packit df99a1
    {
Packit df99a1
      list.append(GRect(rect.xmin-padding,rect.ymin-padding,rect.width()
Packit df99a1
                        +2*padding,rect.height()+2*padding));
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::get_zones(int zone_type, const Zone *parent, 
Packit df99a1
                   GList<Zone *> & zone_list) const 
Packit df99a1
   // get all the zones of  type zone_type under zone node parent
Packit df99a1
{
Packit df99a1
   // search all branches under parent
Packit df99a1
   const Zone *zone=parent;
Packit df99a1
   for( int cur_ztype=zone->ztype; cur_ztype
Packit df99a1
   {
Packit df99a1
      GPosition pos;
Packit df99a1
      for(pos=zone->children; pos; ++pos)
Packit df99a1
      {
Packit df99a1
	 Zone *zcur=(Zone *)&zone->children[pos];
Packit df99a1
	 if ( zcur->ztype == zone_type )
Packit df99a1
	 {
Packit df99a1
	    GPosition zpos=zone_list;
Packit df99a1
	    if ( !zone_list.search(zcur,zpos) )
Packit df99a1
	       zone_list.append(zcur);
Packit df99a1
	 }
Packit df99a1
	 else if ( zone->children[pos].ztype < zone_type )
Packit df99a1
	    get_zones(zone_type, &zone->children[pos], zone_list);
Packit df99a1
      }
Packit df99a1
   }
Packit df99a1
}
Packit df99a1
Packit df99a1
// Finds the text selected by box.
//   box     - selection rectangle
//   text    - receives the substring of textUTF8 covered by the selection
//   padding - when >= 0, the returned rectangles are produced by the padded
//             variant of Zone::get_smallest(); when negative, by the plain
//             variant
// Returns the rectangles of the smallest zones covering the selected text
// (empty when nothing is selected).
GList<GRect>
DjVuTXT::find_text_with_rect(const GRect &box, GUTF8String &text, 
                             const int padding) const
{
  GList<GRect> retval;
  int first = 0;
  int last = 0;
  page_zone.get_text_with_rect(box,first,last);
  if (first != last)
  {
    GList<Zone *> hits;
    page_zone.find_zones(hits,first,last);
    for (GPosition h=hits; h; ++h)
    {
      Zone *hit = hits[h];
      if (padding >= 0)
        hit->get_smallest(retval,padding);
      else
        hit->get_smallest(retval);
    }
  }
  text = textUTF8.substr(first,last-first);
  return retval;
}
Packit df99a1
Packit df99a1
Packit df99a1
GList<DjVuTXT::Zone *>
Packit df99a1
DjVuTXT::find_text_in_rect(GRect target_rect, GUTF8String &text) const
Packit df99a1
   // returns a list of zones of type WORD in the nearest/selected paragraph 
Packit df99a1
{
Packit df99a1
   GList<Zone *> zone_list;
Packit df99a1
   GList<Zone *> lines;
Packit df99a1
Packit df99a1
   get_zones((int)PARAGRAPH, &page_zone, zone_list);
Packit df99a1
   // it's possible that no paragraph structure exists for reasons that  
Packit df99a1
   // 1) ocr engine is not capable 2) file was modified by user. In such case, 
Packit df99a1
   // we can only make a rough guess, i.e., select all the lines intersected with
Packit df99a1
   // target_rect
Packit df99a1
   if (zone_list.isempty())
Packit df99a1
   {
Packit df99a1
      get_zones((int)LINE, &page_zone, zone_list);
Packit df99a1
      GPosition pos;
Packit df99a1
      for(pos=zone_list; pos; ++pos)
Packit df99a1
      {
Packit df99a1
	 GRect rect=zone_list[pos]->rect;
Packit df99a1
	 int h0=rect.height()/2;
Packit df99a1
	 if(rect.intersect(rect,target_rect) && rect.height()>h0)
Packit df99a1
	    lines.append(zone_list[pos]);
Packit df99a1
      }
Packit df99a1
   } else 
Packit df99a1
   {
Packit df99a1
      GPosition pos, pos_sel=zone_list;
Packit df99a1
      float ar=0;
Packit df99a1
      for(pos=zone_list; pos; ++pos)
Packit df99a1
      {
Packit df99a1
	 GRect rect=zone_list[pos]->rect;
Packit df99a1
	 int area=rect.area();
Packit df99a1
	 if (rect.intersect(rect, target_rect))
Packit df99a1
	 {
Packit df99a1
	    float ftmp=rect.area()/(float)area;
Packit df99a1
	    if ( !ar || ar
Packit df99a1
	    {
Packit df99a1
	       ar=ftmp;
Packit df99a1
	       pos_sel=pos;
Packit df99a1
	    }
Packit df99a1
	 }
Packit df99a1
      }
Packit df99a1
      Zone *parag = 0;
Packit df99a1
      if ( ar>0 ) parag=zone_list[pos_sel];
Packit df99a1
      zone_list.empty();
Packit df99a1
      if ( ar>0 ) 
Packit df99a1
      {
Packit df99a1
	 get_zones((int)LINE, parag, zone_list);
Packit df99a1
	 if ( !zone_list.isempty() )
Packit df99a1
	 {
Packit df99a1
	    for(GPosition pos=zone_list; pos; ++pos)
Packit df99a1
	    {
Packit df99a1
	       GRect rect=zone_list[pos]->rect;
Packit df99a1
	       int h0=rect.height()/2;
Packit df99a1
	       if(rect.intersect(rect,target_rect) && rect.height()>h0)
Packit df99a1
		  lines.append(zone_list[pos]);
Packit df99a1
	    }
Packit df99a1
	 }
Packit df99a1
      }
Packit df99a1
   }
Packit df99a1
Packit df99a1
   zone_list.empty();
Packit df99a1
   if (!lines.isempty()) 
Packit df99a1
   {
Packit df99a1
      int i=1, lsize=lines.size();
Packit df99a1
Packit df99a1
      GList<Zone *> words;
Packit df99a1
      for (GPosition pos=lines; pos; ++pos, ++i)
Packit df99a1
      {
Packit df99a1
	 words.empty();
Packit df99a1
	 get_zones((int)WORD, lines[pos], words);
Packit df99a1
Packit df99a1
	 if ( lsize==1 )
Packit df99a1
	 {
Packit df99a1
	    for(GPosition p=words;p;++p)
Packit df99a1
	    {
Packit df99a1
	       GRect rect=words[p]->rect;
Packit df99a1
	       if(rect.intersect(rect,target_rect))
Packit df99a1
	       //if (target_rect.contains(words[p]->rect))
Packit df99a1
		  zone_list.append(words[p]);
Packit df99a1
	    }
Packit df99a1
	 } else
Packit df99a1
	 {
Packit df99a1
	    if (i==1)
Packit df99a1
	    {
Packit df99a1
	       bool start=true;
Packit df99a1
	       for(GPosition p=words; p; ++p)
Packit df99a1
	       {
Packit df99a1
		  if ( start )
Packit df99a1
		  {
Packit df99a1
		     GRect rect=words[p]->rect;
Packit df99a1
		     if(rect.intersect(rect,target_rect))
Packit df99a1
			//if (target_rect.contains(words[p]->rect))
Packit df99a1
		     {
Packit df99a1
			start=false;
Packit df99a1
			zone_list.append(words[p]);
Packit df99a1
		     }
Packit df99a1
		  } else 
Packit df99a1
		     zone_list.append(words[p]);
Packit df99a1
	       }
Packit df99a1
	    } else if (i==lsize)
Packit df99a1
	    {
Packit df99a1
	       bool end=true;
Packit df99a1
	       for(GPosition p=words.lastpos();p;--p)
Packit df99a1
	       {
Packit df99a1
		  if ( end )
Packit df99a1
		  {
Packit df99a1
		     GRect rect=words[p]->rect;
Packit df99a1
		     if(rect.intersect(rect,target_rect))
Packit df99a1
			//if(target_rect.contains(words[p]->rect) )
Packit df99a1
		     {
Packit df99a1
			end=false;
Packit df99a1
			zone_list.append(words[p]);
Packit df99a1
		     }
Packit df99a1
		  } else 
Packit df99a1
		     zone_list.append(words[p]);
Packit df99a1
	       }
Packit df99a1
	    }
Packit df99a1
Packit df99a1
	    if (i!=1 && i!=lsize )
Packit df99a1
	    {
Packit df99a1
	       for(GPosition p=words;p;++p)
Packit df99a1
		  zone_list.append(words[p]);
Packit df99a1
	    }
Packit df99a1
	 }
Packit df99a1
      }
Packit df99a1
   } 
Packit df99a1
Packit df99a1
   return zone_list;
Packit df99a1
}
Packit df99a1
Packit df99a1
unsigned int 
Packit df99a1
DjVuTXT::get_memory_usage() const
Packit df99a1
{
Packit df99a1
  return sizeof(*this) + textUTF8.length() + page_zone.memuse() - sizeof(page_zone); 
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
Packit df99a1
//***************************************************************************
Packit df99a1
//******************************** DjVuText *********************************
Packit df99a1
//***************************************************************************
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuText::decode(const GP<ByteStream> &gbs)
Packit df99a1
{
Packit df99a1
  GUTF8String chkid;
Packit df99a1
  GP<IFFByteStream> giff=IFFByteStream::create(gbs);
Packit df99a1
  IFFByteStream &iff=*giff;
Packit df99a1
  while( iff.get_chunk(chkid) )
Packit df99a1
  {
Packit df99a1
    if (chkid == "TXTa")
Packit df99a1
    {
Packit df99a1
      if (txt)
Packit df99a1
        G_THROW( ERR_MSG("DjVuText.dupl_text") );
Packit df99a1
      txt = DjVuTXT::create();
Packit df99a1
      txt->decode(iff.get_bytestream());
Packit df99a1
    }
Packit df99a1
    else if (chkid == "TXTz")
Packit df99a1
    {
Packit df99a1
      if (txt)
Packit df99a1
        G_THROW( ERR_MSG("DjVuText.dupl_text") );
Packit df99a1
      txt = DjVuTXT::create();
Packit df99a1
      const GP<ByteStream> gbsiff=BSByteStream::create(iff.get_bytestream());
Packit df99a1
      txt->decode(gbsiff);
Packit df99a1
    }
Packit df99a1
    // Add decoding of other chunks here
Packit df99a1
    iff.close_chunk();
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuText::encode(const GP<ByteStream> &gbs)
Packit df99a1
{
Packit df99a1
  if (txt)
Packit df99a1
  {
Packit df99a1
    const GP<IFFByteStream> giff=IFFByteStream::create(gbs);
Packit df99a1
    IFFByteStream &iff=*giff;
Packit df99a1
    iff.put_chunk("TXTz");
Packit df99a1
    {
Packit df99a1
      GP<ByteStream> gbsiff=BSByteStream::create(iff.get_bytestream(),50);
Packit df99a1
      txt->encode(gbsiff);
Packit df99a1
    }
Packit df99a1
    iff.close_chunk();
Packit df99a1
  }
Packit df99a1
  // Add encoding of other chunks here
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
// Returns a new DjVuText that is assigned from this one and then given its
// own copy of txt (when txt is set) instead of sharing it.
GP<DjVuText>
DjVuText::copy(void) const
{
   GP<DjVuText> clone = new DjVuText;
   // Member-wise assignment first ...
   *clone = *this;
   // ... then replace the shared substructure with a separate copy.
   if (txt)
     clone->txt = txt->copy();
   return clone;
}
Packit df99a1
Packit df99a1
// Returns a string made of the given number of space characters
// (empty when spaces is zero or negative).
static GUTF8String
indent ( int spaces)
{
  GUTF8String pad;
  for (int left = spaces; left > 0; --left)
    pad += ' ';
  return pad;
}
Packit df99a1
Packit df99a1
// Element names used by start_tag()/end_tag(), indexed by the integer value
// of a DjVuTXT::ZoneType.  Slot 0 holds a null pointer.
// NOTE(review): presumably index 1 is PAGE and index 7 is CHARACTER -
// confirm against the ZoneType enum in the header.
static const char *tags[8]=
{
  0,              // 0
  "HIDDENTEXT",   // 1
  "PAGECOLUMN",   // 2
  "REGION",       // 3
  "PARAGRAPH",    // 4
  "LINE",         // 5
  "WORD",         // 6
  "CHARACTER"     // 7
};
// Number of slots in tags[].
static const int tags_size=sizeof(tags)/sizeof(tags[0]);
Packit df99a1
Packit df99a1
// Returns the opening tag for zone, or an empty string when zone is not in
// the range 1 .. tags_size-1.  CHARACTER tags are bare; WORD tags are
// indented; all other tags are indented and followed by a newline.
static GUTF8String
start_tag(const DjVuTXT::ZoneType zone)
{
  GUTF8String markup;
  const int level = (int)zone;
  if (level <= 0 || level >= tags_size)
    return markup;

  if (zone == DjVuTXT::CHARACTER)
    markup = "<"+GUTF8String(tags[zone])+">";
  else if (zone == DjVuTXT::WORD)
    markup = indent(2*level+2)+"<"+tags[zone]+">";
  else
    markup = indent(2*level+2)+"<"+tags[zone]+">\n";
  return markup;
}
Packit df99a1
Packit df99a1
// Returns the opening tag for zone with attributes placed after the element
// name, or an empty string when zone is not in the range 1 .. tags_size-1.
// CHARACTER tags are bare; WORD tags are indented; all other tags are
// indented and followed by a newline.
static GUTF8String
start_tag(const DjVuTXT::ZoneType zone, const GUTF8String &attributes)
{
  GUTF8String markup;
  const int level = (int)zone;
  if (level <= 0 || level >= tags_size)
    return markup;

  if (zone == DjVuTXT::CHARACTER)
    markup = "<"+GUTF8String(tags[zone])+" "+attributes+">";
  else if (zone == DjVuTXT::WORD)
    markup = indent(2*level+2)+"<"+tags[zone]+" "+attributes+">";
  else
    markup = indent(2*level+2)+"<"+tags[zone]+" "+attributes+">\n";
  return markup;
}
Packit df99a1
Packit df99a1
// Convenience overload: interpret an integer layer number as a zone type
// and forward to the ZoneType overload of start_tag().
static inline GUTF8String
start_tag(const int layer)
{
  const DjVuTXT::ZoneType zone=static_cast<DjVuTXT::ZoneType>(layer);
  return start_tag(zone);
}
Packit df99a1
Packit df99a1
Packit df99a1
// Build the closing XML tag for a text zone.
//   zone  zone type; its integer value selects the element name in tags[].
// Returns an empty string when the zone value is not a usable index into
// tags[] (index 0 or anything outside the table).
// CHARACTER closes bare, WORD closes without indentation but with a trailing
// newline, and every other tag is indented and newline-terminated.
static GUTF8String
end_tag(const DjVuTXT::ZoneType zone)
{
  GUTF8String retval;
  // Slot 0 of tags[] is a null placeholder, so it is rejected here exactly
  // as start_tag() rejects it; accepting it would concatenate a null tag
  // name and produce a nameless closing tag.
  if((tags_size > (int)zone)&&((int)zone > 0))
  {
    switch (zone)
    {
      case DjVuTXT::CHARACTER:
        retval="</"+GUTF8String(tags[zone])+">";
        break;
      case DjVuTXT::WORD:
        retval="</"+GUTF8String(tags[zone])+">\n";
        break;
      default:
        retval=indent(2*(int)zone+2)+"</"+tags[zone]+">\n";
        break;
    }
  }
  return retval;
}
Packit df99a1
Packit df99a1
// Convenience overload: interpret an integer layer number as a zone type
// and forward to the ZoneType overload of end_tag().
static inline GUTF8String
end_tag(const int layer)
{
  const DjVuTXT::ZoneType zone=static_cast<DjVuTXT::ZoneType>(layer);
  return end_tag(zone);
}
Packit df99a1
Packit df99a1
// Move the current nesting level to next_layer and return the markup needed
// to get there.
//   layer       in/out: current layer number; equals next_layer on return.
//   next_layer  layer to reach.
// While layer is below the target, the opening tag of each layer passed is
// appended and layer is incremented.  While layer is above the target, layer
// is decremented first and the closing tag of the new value is appended.
static GUTF8String
tolayer(int &layer, const DjVuTXT::ZoneType next_layer)
{
  const int target=static_cast<int>(next_layer);
  GUTF8String markup;
  while(layer < target)
  {
    markup+=start_tag(layer);
    ++layer;
  }
  while(layer > target)
  {
    --layer;
    markup+=end_tag(layer);
  }
  return markup;
}
Packit df99a1
Packit df99a1
static void
Packit df99a1
writeText( ByteStream & str_out,
Packit df99a1
            const GUTF8String &textUTF8,
Packit df99a1
            const DjVuTXT::Zone &zone,
Packit df99a1
            const int WindowHeight );
Packit df99a1
Packit df99a1
static void
Packit df99a1
writeText( ByteStream & str_out,
Packit df99a1
           const GUTF8String &textUTF8,
Packit df99a1
           const DjVuTXT::ZoneType zlayer,
Packit df99a1
           const GList<DjVuTXT::Zone> &children,
Packit df99a1
           const int WindowHeight )
Packit df99a1
{
Packit df99a1
//  assert( txt->has_valid_zones() );
Packit df99a1
//  DEBUG_MSG( "--zonetype=" << txt->page_zone.ztype << "\n" );
Packit df99a1
Packit df99a1
  //  Beginning tags for missing layers
Packit df99a1
  int layer=(int)zlayer;
Packit df99a1
  //  Output the next layer
Packit df99a1
  for(GPosition pos=children ; pos ; ++pos )
Packit df99a1
  {
Packit df99a1
    str_out.writestring(tolayer(layer,children[pos].ztype));
Packit df99a1
    writeText( str_out,
Packit df99a1
                textUTF8,
Packit df99a1
                children[pos],
Packit df99a1
                WindowHeight );
Packit df99a1
  }
Packit df99a1
  str_out.writestring(tolayer(layer,zlayer));
Packit df99a1
}
Packit df99a1
Packit df99a1
static void
Packit df99a1
writeText( ByteStream & str_out,
Packit df99a1
            const GUTF8String &textUTF8,
Packit df99a1
            const DjVuTXT::Zone &zone,
Packit df99a1
            const int WindowHeight )
Packit df99a1
{
Packit df99a1
//  DEBUG_MSG( "--zonetype=" << zone.ztype << "\n" );
Packit df99a1
Packit df99a1
  const GUTF8String xindent(indent( 2 * zone.ztype + 2 ));
Packit df99a1
  GPosition pos=zone.children;
Packit df99a1
  // Build attribute string
Packit df99a1
  if( ! pos )
Packit df99a1
  {
Packit df99a1
    GUTF8String coords;
Packit df99a1
    coords.format("coords=\"%d,%d,%d,%d\"",
Packit df99a1
      zone.rect.xmin, WindowHeight - 1 - zone.rect.ymin,
Packit df99a1
      zone.rect.xmax, WindowHeight - 1 - zone.rect.ymax);
Packit df99a1
    const int start=zone.text_start;
Packit df99a1
    const int end=textUTF8.firstEndSpace(start,zone.text_length);
Packit df99a1
    str_out.writestring(start_tag(zone.ztype,coords));
Packit df99a1
    str_out.writestring(textUTF8.substr(start,end-start).toEscaped());
Packit df99a1
    str_out.writestring(end_tag(zone.ztype));
Packit df99a1
  } else
Packit df99a1
  {
Packit df99a1
    writeText(str_out,textUTF8,zone.ztype,zone.children,WindowHeight);
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuTXT::writeText(ByteStream &str_out,const int height) const
Packit df99a1
{
Packit df99a1
  if(has_valid_zones())
Packit df99a1
  {
Packit df99a1
    ::writeText(str_out,textUTF8,DjVuTXT::PAGE,page_zone.children,height);
Packit df99a1
  }else
Packit df99a1
  {
Packit df99a1
    str_out.writestring(start_tag(DjVuTXT::PAGE));
Packit df99a1
    str_out.writestring(end_tag(DjVuTXT::PAGE));
Packit df99a1
  }
Packit df99a1
}
Packit df99a1
Packit df99a1
void
Packit df99a1
DjVuText::writeText(ByteStream &str_out,const int height) const
Packit df99a1
{
Packit df99a1
  if(txt)
Packit df99a1
  {
Packit df99a1
    txt->writeText(str_out,height);
Packit df99a1
  }else
Packit df99a1
  {
Packit df99a1
    str_out.writestring("<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n");
Packit df99a1
  }
Packit df99a1
   
Packit df99a1
}
Packit df99a1
// Return the XML form of this text layer as a string.
//   height  forwarded to writeText().
// The markup is written into a ByteStream obtained from ByteStream::create(),
// which is then rewound and read back as UTF-8.
GUTF8String
DjVuTXT::get_xmlText(const int height) const
{
  GP<ByteStream> buffer=ByteStream::create();
  writeText(*buffer,height);
  // Rewind so the whole content is read back.
  buffer->seek(0L);
  return buffer->getAsUTF8();
}
Packit df99a1
Packit df99a1
// Return the XML form of the text layer held in txt.
//   height  forwarded to DjVuTXT::get_xmlText().
// When txt is not set, the result is a single self-closing PAGE element.
GUTF8String
DjVuText::get_xmlText(const int height) const
{
  if(txt)
    return txt->get_xmlText(height);
  return "<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n";
}
Packit df99a1
Packit df99a1
Packit df99a1
#ifdef HAVE_NAMESPACES
Packit df99a1
}
Packit df99a1
# ifndef NOT_USING_DJVU_NAMESPACE
Packit df99a1
using namespace DJVU;
Packit df99a1
# endif
Packit df99a1
#endif
Packit df99a1