Blame tools/djvutxt.cpp

Packit df99a1
//C-  -*- C++ -*-
Packit df99a1
//C- -------------------------------------------------------------------
Packit df99a1
//C- DjVuLibre-3.5
Packit df99a1
//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
Packit df99a1
//C- Copyright (c) 2001  AT&T
Packit df99a1
//C-
Packit df99a1
//C- This software is subject to, and may be distributed under, the
Packit df99a1
//C- GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- or (at your option) any later version. The license should have
Packit df99a1
//C- accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C-
Packit df99a1
//C- This program is distributed in the hope that it will be useful,
Packit df99a1
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit df99a1
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
Packit df99a1
//C- GNU General Public License for more details.
Packit df99a1
//C- 
Packit df99a1
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
Packit df99a1
//C- Lizardtech Software.  Lizardtech Software has authorized us to
Packit df99a1
//C- replace the original DjVu(r) Reference Library notice by the following
Packit df99a1
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
Packit df99a1
//C-
Packit df99a1
//C-  ------------------------------------------------------------------
Packit df99a1
//C- | DjVu (r) Reference Library (v. 3.5)
Packit df99a1
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
Packit df99a1
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
Packit df99a1
//C- | 6,058,214 and patents pending.
Packit df99a1
//C- |
Packit df99a1
//C- | This software is subject to, and may be distributed under, the
Packit df99a1
//C- | GNU General Public License, either Version 2 of the license,
Packit df99a1
//C- | or (at your option) any later version. The license should have
Packit df99a1
//C- | accompanied the software or you may obtain a copy of the license
Packit df99a1
//C- | from the Free Software Foundation at http://www.fsf.org .
Packit df99a1
//C- |
Packit df99a1
//C- | The computer code originally released by LizardTech under this
Packit df99a1
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
Packit df99a1
//C- | ORIGINAL CODE."  Subject to any third party intellectual property
Packit df99a1
//C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
Packit df99a1
//C- | non-exclusive license to make, use, sell, or otherwise dispose of 
Packit df99a1
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
Packit df99a1
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
Packit df99a1
//C- | General Public License.   This grant only confers the right to 
Packit df99a1
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
Packit df99a1
//C- | the extent such infringement is reasonably necessary to enable 
Packit df99a1
//C- | recipient to make, have made, practice, sell, or otherwise dispose 
Packit df99a1
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
Packit df99a1
//C- | any greater extent that may be necessary to utilize further 
Packit df99a1
//C- | modifications or combinations.
Packit df99a1
//C- |
Packit df99a1
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
Packit df99a1
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
Packit df99a1
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
Packit df99a1
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
Packit df99a1
//C- +------------------------------------------------------------------
Packit df99a1
Packit df99a1
Packit df99a1
/* Program djvutxt has been rewritten to use the ddjvuapi only.
Packit df99a1
 */
Packit df99a1
Packit df99a1
#ifdef HAVE_CONFIG_H
Packit df99a1
# include "config.h"
Packit df99a1
#endif
Packit df99a1
Packit df99a1
#include <stddef.h>
Packit df99a1
#include <stdlib.h>
Packit df99a1
#include <stdio.h>
Packit df99a1
#include <stdarg.h>
Packit df99a1
#include <string.h>
Packit df99a1
#include <locale.h>
Packit df99a1
#include <fcntl.h>
Packit df99a1
#include <errno.h>
Packit df99a1
Packit df99a1
#if defined(_WIN32) && !defined(__CYGWIN32__)
Packit df99a1
# include <mbctype.h>
Packit df99a1
#endif
Packit df99a1
Packit df99a1
#include "libdjvu/miniexp.h"
Packit df99a1
#include "libdjvu/ddjvuapi.h"
Packit df99a1
Packit df99a1
Packit df99a1
/* Some day we'll redo i18n right. */
Packit df99a1
#ifndef i18n
Packit df99a1
# define i18n(x) (x)
Packit df99a1
# define I18N(x) (x)
Packit df99a1
#endif
Packit df99a1
Packit df99a1
Packit df99a1
/* Options */
Packit df99a1
const char *inputfilename = 0;
Packit df99a1
const char *outputfilename = 0;
Packit df99a1
const char *detail = 0;
Packit df99a1
const char *pagespec = 0;
Packit df99a1
int escape = 0;
Packit df99a1
Packit df99a1
ddjvu_context_t *ctx;
Packit df99a1
ddjvu_document_t *doc;
Packit df99a1
Packit df99a1
Packit df99a1
void
Packit df99a1
handle(int wait)
Packit df99a1
{
Packit df99a1
  const ddjvu_message_t *msg;
Packit df99a1
  if (!ctx)
Packit df99a1
    return;
Packit df99a1
  if (wait)
Packit df99a1
    msg = ddjvu_message_wait(ctx);
Packit df99a1
  while ((msg = ddjvu_message_peek(ctx)))
Packit df99a1
    {
Packit df99a1
      switch(msg->m_any.tag)
Packit df99a1
        {
Packit df99a1
        case DDJVU_ERROR:
Packit df99a1
          fprintf(stderr,"djvutxt: %s\n", msg->m_error.message);
Packit df99a1
          if (msg->m_error.filename)
Packit df99a1
            fprintf(stderr,"djvutxt: '%s:%d'\n", 
Packit df99a1
                    msg->m_error.filename, msg->m_error.lineno);
Packit df99a1
          exit(10);
Packit df99a1
        default:
Packit df99a1
          break;
Packit df99a1
        }
Packit df99a1
      ddjvu_message_pop(ctx);
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
void 
Packit df99a1
die(const char *fmt, ...)
Packit df99a1
{
Packit df99a1
  /* Handling messages might give a better error message */
Packit df99a1
  handle(FALSE);
Packit df99a1
  /* Print */
Packit df99a1
  va_list args;
Packit df99a1
  fprintf(stderr,"djvutxt: ");
Packit df99a1
  va_start(args, fmt);
Packit df99a1
  vfprintf(stderr, fmt, args);
Packit df99a1
  va_end(args);
Packit df99a1
  fprintf(stderr,"\n");
Packit df99a1
  /* Terminates */
Packit df99a1
  exit(10);
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
void
Packit df99a1
dopage(int pageno)
Packit df99a1
{
Packit df99a1
  miniexp_t r = miniexp_nil;
Packit df99a1
  const char *lvl = (detail) ? detail : "page";
Packit df99a1
  while ((r = ddjvu_document_get_pagetext(doc,pageno-1,lvl))==miniexp_dummy)
Packit df99a1
    handle(TRUE);
Packit df99a1
  if (detail)
Packit df99a1
    {
Packit df99a1
      miniexp_io_t io;
Packit df99a1
      miniexp_io_init(&io);
Packit df99a1
#ifdef miniexp_io_print7bits
Packit df99a1
      int flags = (escape) ? miniexp_io_print7bits : 0;
Packit df99a1
      io.p_flags = &flag;;
Packit df99a1
#else
Packit df99a1
      io.p_print7bits = &escape;
Packit df99a1
#endif
Packit df99a1
      miniexp_pprint_r(&io, r, 72);
Packit df99a1
    }
Packit df99a1
  else if ((r = miniexp_nth(5, r)) && miniexp_stringp(r))
Packit df99a1
    {
Packit df99a1
      const char *s = miniexp_to_str(r); 
Packit df99a1
      if (! escape)
Packit df99a1
        fputs(s, stdout);
Packit df99a1
      else
Packit df99a1
        {
Packit df99a1
          unsigned char c;
Packit df99a1
          while ((c = *(unsigned char*)s++))
Packit df99a1
            {
Packit df99a1
              bool esc = false;
Packit df99a1
              if (c == '\\' || c >= 0x7f)
Packit df99a1
                esc = true; /* non-ascii */
Packit df99a1
              if (c < 0x20 && !strchr("\013\035\037\012", c))
Packit df99a1
                esc = true; /* non-printable other than separators */
Packit df99a1
              if (esc)
Packit df99a1
                printf("\\%03o", c);
Packit df99a1
              else
Packit df99a1
                putc(c, stdout);
Packit df99a1
            }
Packit df99a1
        }
Packit df99a1
      fputs("\n\f", stdout);
Packit df99a1
    }
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
void
Packit df99a1
parse_pagespec(const char *s, int max_page, void (*dopage)(int))
Packit df99a1
{
Packit df99a1
  static const char *err = I18N("invalid page specification: %s");
Packit df99a1
  int spec = 0;
Packit df99a1
  int both = 1;
Packit df99a1
  int start_page = 1;
Packit df99a1
  int end_page = max_page;
Packit df99a1
  int pageno;
Packit df99a1
  char *p = (char*)s;
Packit df99a1
  while (*p)
Packit df99a1
    {
Packit df99a1
      spec = 0;
Packit df99a1
      while (*p==' ')
Packit df99a1
        p += 1;
Packit df99a1
      if (! *p)
Packit df99a1
        break;
Packit df99a1
      if (*p>='0' && *p<='9') {
Packit df99a1
        end_page = strtol(p, &p, 10);
Packit df99a1
        spec = 1;
Packit df99a1
      } else if (*p=='$') {
Packit df99a1
        spec = 1;
Packit df99a1
        end_page = max_page;
Packit df99a1
        p += 1;
Packit df99a1
      } else if (both) {
Packit df99a1
        end_page = 1;
Packit df99a1
      } else {
Packit df99a1
        end_page = max_page;
Packit df99a1
      }
Packit df99a1
      while (*p==' ')
Packit df99a1
        p += 1;
Packit df99a1
      if (both) {
Packit df99a1
        start_page = end_page;
Packit df99a1
        if (*p == '-') {
Packit df99a1
          p += 1;
Packit df99a1
          both = 0;
Packit df99a1
          continue;
Packit df99a1
        }
Packit df99a1
      }
Packit df99a1
      both = 1;
Packit df99a1
      while (*p==' ')
Packit df99a1
        p += 1;
Packit df99a1
      if (*p && *p != ',')
Packit df99a1
        die(i18n(err), s);
Packit df99a1
      if (*p == ',')
Packit df99a1
        p += 1;
Packit df99a1
      if (! spec)
Packit df99a1
        die(i18n(err), s);
Packit df99a1
      if (end_page < 0)
Packit df99a1
        end_page = 0;
Packit df99a1
      if (start_page < 0)
Packit df99a1
        start_page = 0;
Packit df99a1
      if (end_page > max_page)
Packit df99a1
        end_page = max_page;
Packit df99a1
      if (start_page > max_page)
Packit df99a1
        start_page = max_page;
Packit df99a1
      if (start_page <= end_page)
Packit df99a1
        for(pageno=start_page; pageno<=end_page; pageno++)
Packit df99a1
          (*dopage)(pageno);
Packit df99a1
      else
Packit df99a1
        for(pageno=start_page; pageno>=end_page; pageno--)
Packit df99a1
          (*dopage)(pageno);
Packit df99a1
    }
Packit df99a1
  if (! spec)
Packit df99a1
    die(i18n(err), s);
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
void
Packit df99a1
usage()
Packit df99a1
{
Packit df99a1
#ifdef DJVULIBRE_VERSION
Packit df99a1
  fprintf(stderr, "DDJVU --- DjVuLibre-" DJVULIBRE_VERSION "\n");
Packit df99a1
#endif
Packit df99a1
  fprintf(stderr, "%s",
Packit df99a1
    i18n("DjVu text extraction utility\n\n"
Packit df99a1
         "Usage: djvutxt [options] <djvufile> [<outputfile>]\n\n"
Packit df99a1
         "Options:\n"
Packit df99a1
         " -page=PAGESPEC    Selects page(s) to be decoded.\n"
Packit df99a1
         " -detail=KEYWORD   Outputs S-expression with the text location.\n"
Packit df99a1
         "                   The optional keyword <page>, <region>, <para>,\n"
Packit df99a1
         "                   <line>,<word>, or <char> specify the finest\n"
Packit df99a1
         "                   level of detail. Default is <char>.\n"
Packit df99a1
         " -escape           Output octal escape sequences for all\n"
Packit df99a1
         "                   non ASCII UTF-8 characters.\n\n") );
Packit df99a1
  /* Terminate? */
Packit df99a1
  exit(10);
Packit df99a1
}
Packit df99a1
Packit df99a1
Packit df99a1
int
Packit df99a1
main(int argc, char **argv)
Packit df99a1
{
Packit df99a1
  int i;
Packit df99a1
#if defined(_WIN32) && !defined(__CYGWIN32__)
Packit df99a1
  _setmbcp(_MB_CP_OEM);
Packit df99a1
#endif
Packit df99a1
  /* Parse options */
Packit df99a1
  for (i=1; i
Packit df99a1
    {
Packit df99a1
      char *s = argv[i];
Packit df99a1
      if (s[0] == '-' && s[1] != 0)
Packit df99a1
        {
Packit df99a1
          char buf[32];
Packit df99a1
          const char *opt = s;
Packit df99a1
          const char *arg = strchr(opt, '=');
Packit df99a1
          if (*opt == '-')
Packit df99a1
            opt += 1;
Packit df99a1
          if (*opt == '-')
Packit df99a1
            opt += 1;
Packit df99a1
          if (arg)
Packit df99a1
            {
Packit df99a1
              int l = arg - opt;
Packit df99a1
              if (l > (int)sizeof(buf) - 1)
Packit df99a1
                l = sizeof(buf) - 1;
Packit df99a1
              strncpy(buf, opt, l);
Packit df99a1
              buf[l] = 0;
Packit df99a1
              opt = buf;
Packit df99a1
              arg += 1;
Packit df99a1
            }
Packit df99a1
          
Packit df99a1
          if (!strcmp(opt,"page") || 
Packit df99a1
              !strcmp(opt,"pages") )
Packit df99a1
            {
Packit df99a1
              if (!arg && i
Packit df99a1
                arg = argv[i++];
Packit df99a1
              if (!arg)
Packit df99a1
                die(i18n("option %s needs an argument."), s);
Packit df99a1
              if (pagespec)
Packit df99a1
                fprintf(stderr,i18n("warning: duplicate option --page=...\n"));
Packit df99a1
              pagespec = arg;
Packit df99a1
            }
Packit df99a1
          else if (!strcmp(opt, "detail"))
Packit df99a1
            {
Packit df99a1
              if (!arg)
Packit df99a1
                arg = "char";
Packit df99a1
              if (detail)
Packit df99a1
                fprintf(stderr,i18n("warning: duplicate option --detail.\n"));
Packit df99a1
              detail = arg;
Packit df99a1
            }
Packit df99a1
          else if (!strcmp(opt, "escape") && !arg)
Packit df99a1
            escape = 1;
Packit df99a1
          else
Packit df99a1
            die(i18n("unrecognized option %s."), s);
Packit df99a1
        }
Packit df99a1
      else if (!inputfilename)
Packit df99a1
        inputfilename = s;
Packit df99a1
      else if (! outputfilename)
Packit df99a1
        outputfilename = s;
Packit df99a1
      else
Packit df99a1
        usage();
Packit df99a1
    }
Packit df99a1
  
Packit df99a1
  /* Defaults */
Packit df99a1
  if (! inputfilename)
Packit df99a1
    usage();
Packit df99a1
  if (outputfilename)
Packit df99a1
    if (! freopen(outputfilename, "w", stdout))
Packit df99a1
      die(i18n("cannot open output file %s."), outputfilename);
Packit df99a1
  if (! pagespec)
Packit df99a1
    pagespec = "1-$";
Packit df99a1
  
Packit df99a1
  /* Create context and document */
Packit df99a1
  if (! (ctx = ddjvu_context_create(argv[0])))
Packit df99a1
    die(i18n("Cannot create djvu context."));
Packit df99a1
  if (! (doc = ddjvu_document_create_by_filename(ctx, inputfilename, TRUE)))
Packit df99a1
    die(i18n("Cannot open djvu document '%s'."), inputfilename);
Packit df99a1
  while (! ddjvu_document_decoding_done(doc))
Packit df99a1
    handle(TRUE);
Packit df99a1
  
Packit df99a1
  /* Process all pages */
Packit df99a1
  i = ddjvu_document_get_pagenum(doc);
Packit df99a1
  parse_pagespec(pagespec, i, dopage);
Packit df99a1
  
Packit df99a1
  /* Close */
Packit df99a1
  if (doc)
Packit df99a1
    ddjvu_document_release(doc);
Packit df99a1
  if (ctx)
Packit df99a1
    ddjvu_context_release(ctx);
Packit df99a1
Packit df99a1
  /* Return */
Packit df99a1
  minilisp_finish();  
Packit df99a1
  return 0;
Packit df99a1
}
Packit df99a1