//C-  -*- C++ -*-
//C- -------------------------------------------------------------------
//C- DjVuLibre-3.5
//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
//C- Copyright (c) 2001  AT&T
//C-
//C- This software is subject to, and may be distributed under, the
//C- GNU General Public License, either Version 2 of the license,
//C- or (at your option) any later version. The license should have
//C- accompanied the software or you may obtain a copy of the license
//C- from the Free Software Foundation at http://www.fsf.org .
//C-
//C- This program is distributed in the hope that it will be useful,
//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//C- GNU General Public License for more details.
//C- 
//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
//C- Lizardtech Software.  Lizardtech Software has authorized us to
//C- replace the original DjVu(r) Reference Library notice by the following
//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
//C-
//C-  ------------------------------------------------------------------
//C- | DjVu (r) Reference Library (v. 3.5)
//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
//C- | The DjVu Reference Library is protected by U.S. Pat. No.
//C- | 6,058,214 and patents pending.
//C- |
//C- | This software is subject to, and may be distributed under, the
//C- | GNU General Public License, either Version 2 of the license,
//C- | or (at your option) any later version. The license should have
//C- | accompanied the software or you may obtain a copy of the license
//C- | from the Free Software Foundation at http://www.fsf.org .
//C- |
//C- | The computer code originally released by LizardTech under this
//C- | license and unmodified by other parties is deemed "the LIZARDTECH
//C- | ORIGINAL CODE."  Subject to any third party intellectual property
//C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
//C- | non-exclusive license to make, use, sell, or otherwise dispose of 
//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
//C- | General Public License.   This grant only confers the right to 
//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
//C- | the extent such infringement is reasonably necessary to enable 
//C- | recipient to make, have made, practice, sell, or otherwise dispose 
//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
//C- | any greater extent that may be necessary to utilize further 
//C- | modifications or combinations.
//C- |
//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
//C- +------------------------------------------------------------------


/* Program djvutxt has been rewritten to use the ddjvuapi only.
 */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <locale.h>
#include <fcntl.h>
#include <errno.h>

#if defined(_WIN32) && !defined(__CYGWIN32__)
# include <mbctype.h>
#endif

#include "libdjvu/miniexp.h"
#include "libdjvu/ddjvuapi.h"


/* Some day we'll redo i18n right. */
/* Until then both markers are identity macros: each expands to its
   argument unchanged.  They are only defined here when i18n has not
   already been defined. */
#ifndef i18n
# define i18n(x) (x)
# define I18N(x) (x)
#endif


/* Options */
/* These are filled in by the command line parser in main(). */
const char *inputfilename = 0;   /* first non-option argument: document to open */
const char *outputfilename = 0;  /* second non-option argument: when set, stdout is reopened on it */
const char *detail = 0;          /* argument of -detail; null selects plain text output in dopage() */
const char *pagespec = 0;        /* argument of -page; main() falls back to "1-$" when null */
int escape = 0;                  /* set to 1 by -escape; enables octal escapes in the output */

/* Library handles created in main() and used by handle() and dopage(). */
ddjvu_context_t *ctx;
ddjvu_document_t *doc;


void
handle(int wait)
{
  const ddjvu_message_t *msg;
  if (!ctx)
    return;
  if (wait)
    msg = ddjvu_message_wait(ctx);
  while ((msg = ddjvu_message_peek(ctx)))
    {
      switch(msg->m_any.tag)
        {
        case DDJVU_ERROR:
          fprintf(stderr,"djvutxt: %s\n", msg->m_error.message);
          if (msg->m_error.filename)
            fprintf(stderr,"djvutxt: '%s:%d'\n", 
                    msg->m_error.filename, msg->m_error.lineno);
          exit(10);
        default:
          break;
        }
      ddjvu_message_pop(ctx);
    }
}


/* Report a fatal error and terminate with exit status 10.
   FMT and the following arguments are formatted like printf; the
   message is written to stderr prefixed with "djvutxt: " and followed
   by a newline.  This function does not return. */
void 
die(const char *fmt, ...)
{
  va_list ap;
  /* Pending library messages are processed first: if one of them is an
     error, handle() reports it and exits, which may give a better
     diagnostic than ours. */
  handle(FALSE);
  fputs("djvutxt: ", stderr);
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
  fputc('\n', stderr);
  exit(10);
}


void
dopage(int pageno)
{
  miniexp_t r = miniexp_nil;
  const char *lvl = (detail) ? detail : "page";
  while ((r = ddjvu_document_get_pagetext(doc,pageno-1,lvl))==miniexp_dummy)
    handle(TRUE);
  if (detail)
    {
      miniexp_io_t io;
      miniexp_io_init(&io);
#ifdef miniexp_io_print7bits
      int flags = (escape) ? miniexp_io_print7bits : 0;
      io.p_flags = &flags;
#else
      io.p_print7bits = &escape;
#endif
      miniexp_pprint_r(&io, r, 72);
    }
  else if ((r = miniexp_nth(5, r)) && miniexp_stringp(r))
    {
      const char *s = miniexp_to_str(r); 
      if (! escape)
        fputs(s, stdout);
      else
        {
          unsigned char c;
          while ((c = *(unsigned char*)s++))
            {
              bool esc = false;
              if (c == '\\' || c >= 0x7f)
                esc = true; /* non-ascii */
              if (c < 0x20 && !strchr("\013\035\037\012", c))
                esc = true; /* non-printable other than separators */
              if (esc)
                printf("\\%03o", c);
              else
                putc(c, stdout);
            }
        }
      fputs("\n\f", stdout);
    }
}


/* Parse the page specification S and call DOPAGE once per selected page.

   S is a comma separated list of ranges.  A range is either a single
   page or two pages separated by '-'.  A page is a decimal number or
   '$', which stands for MAX_PAGE.  The first page of a range may be
   omitted and then defaults to 1.  Spaces are allowed around pages.
   When the first page of a range is greater than the last one, the
   pages are visited in decreasing order.

   Page numbers are 1-based and are clamped into [1, MAX_PAGE] before
   DOPAGE is called, so DOPAGE never sees a page outside the document.
   Nothing is called when MAX_PAGE is smaller than 1.

   Calls die() (which does not return) when S is malformed, including
   when it is empty or ends with an unterminated range such as "3-". */
void
parse_pagespec(const char *s, int max_page, void (*dopage)(int))
{
  static const char *err = I18N("invalid page specification: %s");
  int spec = 0;             /* nonzero when the last page was given explicitly */
  int both = 1;             /* nonzero while reading the first page of a range */
  int start_page = 1;
  int end_page = max_page;
  int pageno;
  char *p = (char*)s;       /* non-const only because strtol wants a char** */
  while (*p)
    {
      spec = 0;
      while (*p==' ')
        p += 1;
      if (! *p)
        break;
      if (*p>='0' && *p<='9') {
        /* Only digits reach strtol, so the result is never negative.
           Clamp in the long domain before narrowing: a huge number
           (or LONG_MAX on overflow) must select the last page
           instead of wrapping around when converted to int. */
        long n = strtol(p, &p, 10);
        end_page = (n > (long)max_page) ? max_page : (int)n;
        spec = 1;
      } else if (*p=='$') {
        spec = 1;
        end_page = max_page;
        p += 1;
      } else if (both) {
        end_page = 1;
      } else {
        end_page = max_page;
      }
      while (*p==' ')
        p += 1;
      if (both) {
        start_page = end_page;
        if (*p == '-') {
          /* First half of a range: go read the second half. */
          p += 1;
          both = 0;
          continue;
        }
      }
      both = 1;
      while (*p==' ')
        p += 1;
      if (*p && *p != ',')
        die(i18n(err), s);
      if (*p == ',')
        p += 1;
      if (! spec)
        die(i18n(err), s);
      /* No valid page exists in an empty document. */
      if (max_page < 1)
        continue;
      /* Pages are 1-based: the smallest valid page is 1, not 0. */
      if (end_page < 1)
        end_page = 1;
      if (start_page < 1)
        start_page = 1;
      if (end_page > max_page)
        end_page = max_page;
      if (start_page > max_page)
        start_page = max_page;
      if (start_page <= end_page)
        for(pageno=start_page; pageno<=end_page; pageno++)
          (*dopage)(pageno);
      else
        for(pageno=start_page; pageno>=end_page; pageno--)
          (*dopage)(pageno);
    }
  /* Reject an empty specification, and a range whose '-' was the last
     character of the string (the loop ends with both==0 in that case). */
  if (! spec || ! both)
    die(i18n(err), s);
}


/* Print the program banner (when the version is known at compile
   time) and the usage text on stderr, then terminate with exit
   status 10.  This function does not return. */
void
usage()
{
#ifdef DJVULIBRE_VERSION
  /* The banner names this program, djvutxt. */
  fprintf(stderr, "DJVUTXT --- DjVuLibre-" DJVULIBRE_VERSION "\n");
#endif
  fprintf(stderr, "%s",
    i18n("DjVu text extraction utility\n\n"
         "Usage: djvutxt [options] <djvufile> [<outputfile>]\n\n"
         "Options:\n"
         " -page=PAGESPEC    Selects page(s) to be decoded.\n"
         " -detail=KEYWORD   Outputs S-expression with the text location.\n"
         "                   The optional keyword <page>, <region>, <para>,\n"
         "                   <line>,<word>, or <char> specify the finest\n"
         "                   level of detail. Default is <char>.\n"
         " -escape           Output octal escape sequences for all\n"
         "                   non ASCII UTF-8 characters.\n\n") );
  /* Terminate */
  exit(10);
}


/* Program entry point.
   Parses the command line, opens the document, extracts the text of
   the selected pages to stdout (or to the output file), and releases
   the library objects.  Returns 0 on success; every error path goes
   through die() or usage(), which exit with status 10.

   Options start with one or two dashes.  An option argument is given
   as -opt=arg; -page also accepts its argument as the next command
   line word.  A lone "-" is treated as a file name. */
int
main(int argc, char **argv)
{
  int i;
#if defined(_WIN32) && !defined(__CYGWIN32__)
  _setmbcp(_MB_CP_OEM);
#endif
  /* Parse options */
  for (i=1; i<argc; i++)
    {
      char *s = argv[i];
      if (s[0] == '-' && s[1] != 0)
        {
          char buf[32];
          const char *opt = s;
          const char *arg = strchr(opt, '=');
          /* Skip one or two leading dashes. */
          if (*opt == '-')
            opt += 1;
          if (*opt == '-')
            opt += 1;
          if (arg)
            {
              /* Copy the option name (the part before '=') into buf,
                 truncating overlong names, and make arg point to the
                 text that follows '='. */
              int l = arg - opt;
              if (l > (int)sizeof(buf) - 1)
                l = sizeof(buf) - 1;
              strncpy(buf, opt, l);
              buf[l] = 0;
              opt = buf;
              arg += 1;
            }
          
          if (!strcmp(opt,"page") || 
              !strcmp(opt,"pages") )
            {
              /* Without '=', the page specification is the NEXT
                 command line word: consume it so that the loop does
                 not see it again. */
              if (!arg && i < argc-1)
                arg = argv[++i];
              if (!arg)
                die(i18n("option %s needs an argument."), s);
              if (pagespec)
                fprintf(stderr,i18n("warning: duplicate option --page=...\n"));
              pagespec = arg;
            }
          else if (!strcmp(opt, "detail"))
            {
              /* The argument is optional. */
              if (!arg)
                arg = "char";
              if (detail)
                fprintf(stderr,i18n("warning: duplicate option --detail.\n"));
              detail = arg;
            }
          else if (!strcmp(opt, "escape") && !arg)
            escape = 1;
          else
            die(i18n("unrecognized option %s."), s);
        }
      else if (!inputfilename)
        inputfilename = s;
      else if (! outputfilename)
        outputfilename = s;
      else
        usage();
    }
  
  /* Defaults */
  if (! inputfilename)
    usage();
  if (outputfilename)
    if (! freopen(outputfilename, "w", stdout))
      die(i18n("cannot open output file %s."), outputfilename);
  if (! pagespec)
    pagespec = "1-$";
  
  /* Create context and document */
  if (! (ctx = ddjvu_context_create(argv[0])))
    die(i18n("Cannot create djvu context."));
  if (! (doc = ddjvu_document_create_by_filename(ctx, inputfilename, TRUE)))
    die(i18n("Cannot open djvu document '%s'."), inputfilename);
  /* Process messages until the document is decoded;
     handle() exits on the first error message. */
  while (! ddjvu_document_decoding_done(doc))
    handle(TRUE);
  
  /* Process all pages */
  i = ddjvu_document_get_pagenum(doc);
  parse_pagespec(pagespec, i, dopage);
  
  /* Close */
  if (doc)
    ddjvu_document_release(doc);
  if (ctx)
    ddjvu_context_release(ctx);

  /* Return */
  minilisp_finish();  
  return 0;
}