Blame tools/pdfscan.py

Packit 0f19cf
#! /usr/bin/env python
Packit 0f19cf
#
Packit 0f19cf
# This tool is provided by dblatex (http://dblatex.sourceforge.net) and has
Packit 0f19cf
# the same copyright.
Packit 0f19cf
#
Packit 0f19cf
# It was initially developed to find out the fonts used and their size because
# as strange as it may seem, no obvious tool gives the font sizes used (pdffonts
# just lists the font objects of the PDF). The script can be improved to give
# more information in a next release.
Packit 0f19cf
#
Packit 0f19cf
# To understand the PDF format, read:
Packit 0f19cf
#   * The reference:
Packit 0f19cf
#     http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/
Packit 0f19cf
#                                                      pdf_reference_1-7.pdf
Packit 0f19cf
#
Packit 0f19cf
#   * A useful introduction:
Packit 0f19cf
#     http://www.adobe.com/content/dam/Adobe/en/technology/pdfs/
Packit 0f19cf
#                                                      PDF_Day_A_Look_Inside.pdf
Packit 0f19cf
#
Packit 0f19cf
#
Packit 0f19cf
import os
Packit 0f19cf
import sys
Packit 0f19cf
import traceback
Packit 0f19cf
import zlib
Packit 0f19cf
import re
Packit 0f19cf
import logging
Packit 0f19cf
import tempfile
Packit 0f19cf
import shutil
Packit 0f19cf
import struct
Packit 0f19cf
import codecs
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class ErrorHandler:
    """
    Keep track of the failures met by the tool, and of the exit code to
    return to the caller.
    """
    def __init__(self):
        self._dump_stack = False
        self.rc = 0

    def dump_stack(self, dump=True):
        """
        Enable (or disable when <dump> is False) the printing of the current
        exception traceback each time a failure is tracked.
        """
        self._dump_stack = dump

    def failure_track(self, msg, rc=1):
        """
        Record <rc> as the exit code, print <msg> on stderr, and print the
        traceback of the exception being handled if dump_stack() asked for it.
        """
        self.rc = rc
        # A plain write() works the same with the print statement (Python 2)
        # and the print function (Python 3)
        sys.stderr.write("%s\n" % (msg,))
        if self._dump_stack:
            traceback.print_exc()

    def failed_exit(self, rc=1, msg=""):
        """
        Record the failure and exit the program with the code <rc>.

        The optional <msg> is reported through failure_track(). When no
        message is given only the exit code is recorded, and nothing is
        printed.
        """
        if msg:
            self.failure_track(msg, rc)
        else:
            self.rc = rc
        sys.exit(self.rc)
Packit 0f19cf
Packit 0f19cf
def pdfstring_is_list(data):
    """
    Tell if the PDF string <data> is enclosed in square brackets ('[...]').
    An empty (or None) <data> is returned as is, so the result is falsy.
    """
    if not data:
        return data
    opening, closing = data[0], data[-1]
    return opening == "[" and closing == "]"
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFResolver:
    """
    Class-level holder of the object resolver currently published, so that it
    can be reached without passing it around.
    """
    # The published resolver, shared through the class itself
    _resolver = None

    @classmethod
    def get_resolver(cls):
        """Return the resolver published last (None if there is none)."""
        return cls._resolver

    @classmethod
    def set_resolver(cls, resolver):
        """Publish <resolver> as the current resolver."""
        cls._resolver = resolver
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFBaseObject:
    """
    Base class giving logging shortcuts. Each shortcut forwards the text to
    the class logger '_log', that the subclasses can override.
    """
    _log = logging.getLogger("pdfscan.base")

    def __init__(self):
        pass

    def info(self, text):
        """Log <text> with the INFO level."""
        self._log.info(text)

    def debug(self, text):
        """Log <text> with the DEBUG level."""
        self._log.debug(text)

    def warning(self, text):
        """Log <text> with the WARNING level."""
        self._log.warning(text)

    def error(self, text):
        """Log <text> with the ERROR level."""
        self._log.error(text)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFFile(PDFBaseObject):
    """
    Main object that parses the PDF file and extracts the objects needed for
    scanning.

    load() only reads the cross-reference data. A PDF object is parsed from
    the file the first time it is asked through the resolver, and is then
    cached in the 'pdfobjects' group.
    """
    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self, stream_manager=None):
        self._file = None
        self.filesize = 0
        self.startxref_pos = 0
        self.trailer = None
        self.xref_first = None
        self.xref_table = {}
        self.xref_objstm = {}
        self.objstm_objects = {}
        self.page_objects = []
        self.pdfobjects = PDFObjectGroup()
        self.stream_manager = stream_manager or StreamManager()
        # Create and publish the object resolver
        self.resolver = PDFObjectResolver(self)
        PDFResolver.set_resolver(self.resolver)
        # Create a global font manager
        self.fontmgr = FontManager({})

        # Detect the beginning of a PDF Object
        self.re_objstart = re.compile(r"(\d+) (\d+) obj(.*$)", re.DOTALL)

    def cleanup(self):
        """Let the stream manager clean up what it handles."""
        self.stream_manager.cleanup()

    def load(self, filename):
        """
        Open the PDF file <filename> and build the cross-reference data used
        to resolve the objects. The file stays open because the objects are
        parsed from it later, on demand.
        """
        self.filesize = os.path.getsize(filename)
        self._file = open(filename, "rb")
        self.read_xref()
        self.build_final_xref()

    def find_startxref(self, offset_trailer=160):
        """
        Look for the last 'startxref' of the file, by reading backward chunks
        of <offset_trailer> bytes from the end of the file.

        Return the offset given by 'startxref' (also stored in startxref_pos),
        or 0 if it cannot be found.
        """
        offset, data = self.filesize, ""
        # Stop when the keyword is found or when the whole file is read.
        # Only the bytes not yet read are added, so that reaching the file
        # start neither loops forever nor duplicates data.
        while offset > 0 and not("startxref" in data):
            chunk_start = max(0, offset - offset_trailer)
            self._file.seek(chunk_start)
            data = self._file.read(offset - chunk_start) + data
            offset = chunk_start

        m = re.search(r"\sstartxref\s+(\d+)\s+%%EOF", data, re.M)
        if not(m):
            self.error("Problem in PDF file: startxref not found")
            return 0
        self.startxref_pos = int(m.group(1))
        return self.startxref_pos

    def read_xref(self):
        """
        Read the chain of xref data (xref sections or XRef objects), from the
        newest one pointed to by 'startxref' to the oldest one, following the
        '/Prev' trailer entries. The oldest xref found is stored in
        xref_first.
        """
        startxref = self.find_startxref()
        xref = None
        # Offsets already read, to detect a loop in the /Prev chain
        visited = set()

        while startxref:
            if startxref in visited:
                self.error("Problem in PDF file: loop in the xref chain "\
                           "at offset %d" % startxref)
                break
            visited.add(startxref)

            self._file.seek(startxref)
            line = self._file.readline()
            found_xref = None
            m = re.search(r"xref\s(.*)", line, re.M|re.DOTALL)
            if (m):
                found_xref = PDFXrefSection(self._file)
                found_xref.read_table(m.group(1))
            elif self.re_objstart.search(line):
                self.info("Xref section not found. Try to load XRef object")
                pdfobject, remain_line = self._parse_object(startxref)
                if pdfobject is not None:
                    found_xref = PDFXrefObject(pdfobject)

            # Without a valid xref and its trailer the chain cannot be
            # followed. Give up here but keep what has been read so far.
            if found_xref is None or found_xref.trailer is None:
                self.error("Problem in PDF file: no valid xref found "\
                           "at offset %d" % startxref)
                break

            startxref = int(found_xref.trailer.get("/Prev", 0))

            if xref: xref.set_older(found_xref)
            xref = found_xref

        self.xref_first = xref

    def build_final_xref(self):
        """
        Merge the xref data from the oldest to the newest one, so that the
        newest entries (and the newest trailer) override the older ones.
        """
        xref = self.xref_first
        while xref:
            self.trailer = xref.trailer
            self.xref_table.update(xref.table)
            self.xref_objstm.update(xref.objstm)
            xref = xref.newer

    def get_objstm(self, objstm_id):
        """Return the object stream already built for <objstm_id>, or None."""
        return self.objstm_objects.get(objstm_id, None)

    def create_objstm(self, pdfobject):
        """
        Build and record the object stream handler of <pdfobject>, whose
        stream is decoded first. Return the PDFObjectStream created.
        """
        self.debug("Create objstm %s" % pdfobject.ident())
        pdfobject.compute()
        pdfobject.stream_decode()
        self.pdfobjects.add_object(pdfobject)
        objstm = PDFObjectStream(pdfobject)
        self.objstm_objects[objstm.ident()] = objstm
        return objstm

    def xref_resolve_object(self, ident):
        """
        Parse the object <ident> from the file, at the offset given by the
        xref table. Return None when the table has no offset for it.
        """
        offset = self.xref_table.get(ident, 0)
        if offset == 0:
            return None
        pdfobject, remain_line = self._parse_object(offset)
        return pdfobject

    def xref_resolve(self, ident):
        """
        Resolve the object <ident> from the xref data: either directly from
        its offset in the file, or from the object stream that contains it.
        Return None if the object cannot be found.
        """
        # Try to resolve a standard object
        pdfobject = self.xref_resolve_object(ident)
        if pdfobject:
            return pdfobject

        # Find the ObjStm infos that contains that object
        objstm_data = self.xref_objstm.get(ident, 0)
        if objstm_data == 0:
            self.warning("ObjStm id for '%s' not found in xref table" % ident)
            return None

        # If the ObjStm itself is not resolved, resolve it first
        objstm_id = "%d 0" % objstm_data[0]
        object_idx = objstm_data[1]

        objstm = self.get_objstm(objstm_id)
        if not(objstm):
            pdfobject = self.xref_resolve_object(objstm_id)
            if pdfobject: objstm = self.create_objstm(pdfobject)
        if not(objstm):
            self.error("Object '%s' cannot be resolved: ObjStm '%s' not found" \
                      % (ident, objstm_id))
            return None

        # Ok, now get the object from the ObjStm
        pdfobject = objstm.get_object(object_idx)

        return pdfobject

    def resolve_object(self, ident):
        """
        Return the object <ident>, taken from the objects already known or
        resolved from the xref data (and then recorded). None if not found.
        """
        pdfobject = self.pdfobjects.get_object(ident)
        if not(pdfobject):
            pdfobject = self.xref_resolve(ident)
            if pdfobject:
                self.pdfobjects.add_object(pdfobject)
        return pdfobject

    def get_object(self, ident):
        """
        Return the object referenced by <ident>, that can be an identifier
        ('12 0') or a reference ('12 0 R'). The object found is linked to the
        resolver before being returned.
        """
        ident = ident.replace(" R", "").strip()
        pdfobject = self.resolver.get(ident)
        if pdfobject:
            pdfobject.link_to(self.resolver)
        return pdfobject

    def _parse_object(self, offset):
        """
        Parse the first PDF object found from <offset> in the file.

        Return the (object, remaining text) pair, where the remaining text is
        what follows 'endobj' on its line. The object is None if the end of
        file is reached before a complete object is read.
        """
        pdfobj = None
        parsed_object = None
        remain_line = ""

        self._file.seek(offset)

        while not(parsed_object):
            line = self._file.readline()
            if not(line):
                break

            while line:
                if pdfobj:
                    # Inside an object: feed it until 'endobj' is found
                    fields = line.split("endobj", 1)
                    if len(fields) > 1:
                        if fields[0]:
                            pdfobj.append_string(fields[0])
                        pdfobj.compute()
                        remain_line = fields[1]
                        parsed_object = pdfobj
                    else:
                        pdfobj.append_string(line)
                    line = ""
                else:
                    m = self.re_objstart.search(line)
                    if m:
                        number, revision = m.group(1), m.group(2)
                        pdfobj = PDFObject(number, revision,
                                       stream_manager=self.stream_manager)
                        # What follows 'obj' on the line is object data
                        line = m.group(3)
                    else:
                        # drop the line
                        line = ""

        return (parsed_object, remain_line)

    def _expand_pages(self, page_kids):
        """
        Return the list of the unitary pages (/Page) reachable from the
        <page_kids> list, that can contain groups of pages (/Pages).
        """
        # Iterations to make a list of unitary pages (/Page) from a list
        # containing group of pages (/Pages). The iterations stop when all
        # the objects in the list are replaced by unit pages and not
        # intermediate page groups
        page_list = page_kids
        has_kid = len(page_list)
        while has_kid:
            newlist = []
            has_kid = 0
            for kid in page_list:
                kid.link_to(self.resolver)
                if kid.get_type() == "/Pages":
                    kids = kid.descriptor.get("/Kids")
                    self.debug("Expand page list: %s -> %s" % (kid, kids))
                    has_kid += len(kids)
                elif kid.get_type() == "/Page":
                    kids = [kid]
                else:
                    self.error("%s: %s" % (kid, kid.descriptor.params))
                    self.error("%s: What's wrong? '%s'" % (kid, kid.get_type()))
                    kids = []
                newlist = newlist + kids
            page_list = newlist
        return page_list

    def load_pages(self):
        """
        Build the page_objects list from the page tree referenced by the
        document catalog, and check it against the announced page count.
        """
        root = self.trailer.get("/Root")
        catalog = self.get_object(root)
        pages = catalog.descriptor.get("/Pages")
        page_count = int(pages.descriptor.get("/Count"))

        self.info("Found %d pages" % page_count)
        pages.link_to(self.resolver)
        page_kids = pages.descriptor.get("/Kids")
        self.page_objects = self._expand_pages(page_kids)
        if len(self.page_objects) != page_count:
            self.error("Unconsistent pages found: %d vs %d" % \
                  (len(self.page_objects), page_count))
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFObjectResolver:
    """
    Dictionary-like access to the objects of a PDF file: get() asks the
    file to resolve the object identifier.
    """
    def __init__(self, pdffile):
        self.pdffile = pdffile

    def get(self, ident, default=None):
        """
        Return the object <ident> resolved by the PDF file, or <default> when
        the resolution gives nothing (or something evaluated as false).
        """
        return self.pdffile.resolve_object(ident) or default
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFObjectGroup(PDFBaseObject):
    """
    Group of the PDF Objects contained in a file. This wrapper is a dictionary
    of the objects, and consolidates the links between the objects.
    """
    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self):
        self.pdfobjects = {}
        self.objtypes = {}
        self.unresolved = []

    def count(self):
        """Return the number of objects recorded."""
        return len(self.pdfobjects)

    def types(self):
        """Return the object types met so far ('misc' for untyped objects)."""
        return self.objtypes.keys()

    def add_object(self, pdfobject):
        """
        Record <pdfobject> under its identifier and under its type, and put
        it in the list of the objects that remain to be linked.
        """
        self.pdfobjects[pdfobject.ident()] = pdfobject
        type_name = pdfobject.get_type() or "misc"
        self.objtypes.setdefault(type_name, []).append(pdfobject)
        self.unresolved.append(pdfobject)

    def get_objects_by_type(self, objtype):
        """Return the list of the objects of type <objtype> (can be empty)."""
        return self.objtypes.get(objtype, [])

    def get_object(self, ident):
        """Return the object recorded under <ident>, or None."""
        return self.pdfobjects.get(ident, None)

    def link_objects(self):
        """
        Link every pending object to the recorded objects. The objects whose
        link_to() call returns a true value stay in the pending list.
        """
        self.debug("%d objects to resolve" % (len(self.unresolved)))
        still_pending = [candidate for candidate in self.unresolved
                         if candidate.link_to(self.pdfobjects)]
        self.unresolved = still_pending

    def stream_decode(self):
        """Decode the stream of every recorded object."""
        for member in self.pdfobjects.values():
            member.stream_decode()
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFPage:
    """
    A page of the PDF file, built from its page object: it gathers the page
    contents, the fonts declared by the page resources, and the content
    streams to scan.
    """
    def __init__(self, pdf, page, pagenum=0):
        self.pagenum = pagenum
        self.pdf = pdf
        contents = page.descriptor.get("/Contents")
        resources = page.descriptor.get("/Resources")

        # The resources are given either by a dictionary, or by an object
        # whose descriptor is the dictionary
        if not(isinstance(resources, PDFDescriptor)):
            resources = resources.descriptor
        resources.link_to(pdf.resolver)

        # Same thing for the fonts, that can be missing
        fontdict = {}
        font = resources.get("/Font")
        if font:
            font.link_to(pdf.resolver)
            if not(isinstance(font, PDFDescriptor)):
                font = font.descriptor
            fontdict = font.infos()

        # A single content is handled like a list of one content
        if not(isinstance(contents, list)):
            contents = [contents]

        self.page = page
        self.contents = contents
        self.fontdict = fontdict
        self.fontmgr = FontManager(fontdict, pdf.fontmgr)
        self.streams = []

        self.link_to(pdf.resolver)
        self.load_streams()

    def link_to(self, resolver):
        """Link each content of the page to <resolver>."""
        for item in self.contents:
            item.link_to(resolver)

    def load_streams(self):
        """Build a content stream, using the page fonts, for each content."""
        for item in self.contents:
            self.streams.append(PDFContentStream(item, self.fontmgr))

    def find_fonts(self):
        """Return the fonts that the page font manager reports as used."""
        return self.fontmgr.get_used()
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFXrefSection(PDFBaseObject):
    """
    Section starting by 'xref' and followed by the 'trailer'. The xref data
    contain information about how to access to objects in the file and is
    therefore a crucial part of the object resolution.
    """
    _log = logging.getLogger("pdfscan.xref")

    # Extract a dictionary '<<...>>' leaf (does not contain another dict).
    # A single '<' or '>' (hexadecimal string delimiters) is allowed inside.
    _re_desc = re.compile("(<<(?:(?<!<)<(?!<)|[^<>]|(?<!>)>(?!>))*>>)",
                          re.MULTILINE)

    def __init__(self, fd):
        self.trailer = None
        self.table = {}
        self.objstm = {}
        self._file = fd
        self.older = None
        self.newer = None

    def set_older(self, older):
        """Chain this xref with <older>, the xref that precedes it in time."""
        self.older = older
        older.newer = self

    def _xref_fill_entry(self, fields, obj_id):
        """
        Record the offset of the object <obj_id> from the <fields> of its
        entry (offset, revision, 'n' or 'f'). Only the objects in use ('n')
        are recorded, and a malformed entry is skipped with a warning.
        """
        if len(fields) != 3:
            self.warning("Invalid xref entry for object %d: %s" % \
                         (obj_id, fields))
            return
        offset, revision, what = fields
        if what != "n":
            return
        try:
            ident = "%d %d" % (obj_id, int(revision))
            self.table[ident] = int(offset)
        except ValueError:
            self.warning("Invalid xref entry for object %d: %s" % \
                         (obj_id, fields))

    def read_table(self, linestart=""):
        """
        Read the xref subsections and then the trailer dictionary, from the
        current position of the file. <linestart> is the text that follows
        'xref' on its line, if any.

        On success the trailer attribute contains the trailer descriptor. It
        remains None if the section is truncated or malformed.
        """
        line = linestart.strip() or self._file.readline()
        subsection = line.split()

        while not(subsection) or not(subsection[0].startswith("trailer")):
            if not(line):
                # End of file reached before the trailer
                self.error("Problem in PDF file: cannot find valid trailer")
                return

            # A blank line is skipped, anything else is a subsection header
            if subsection:
                try:
                    start_ref = int(subsection[0])
                    object_count = int(subsection[1])
                except (ValueError, IndexError):
                    self.error("Problem in PDF file: invalid xref "\
                               "subsection '%s'" % line.strip())
                    return

                # The first entry can be on the same line as the header
                if len(subsection) == 5:
                    self._xref_fill_entry(subsection[2:], start_ref)
                    start_ref += 1
                    object_count -= 1

                for i in range(object_count):
                    line = self._file.readline()
                    if not(line):
                        break
                    self._xref_fill_entry(line.split(), start_ref+i)

            line = self._file.readline()
            subsection = line.split()

        data = " ".join(subsection)

        # Ensure we have a complete dictionary, without looping forever if
        # the end of file is reached first
        while not(">>" in data):
            line = self._file.readline()
            if not(line):
                break
            data += line

        m = self._re_desc.search(data)
        if not(m):
            self.error("Problem in PDF file: cannot find valid trailer")
            return
        self.trailer = PDFDescriptor(string=m.group(1))
        self.trailer.compute()
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFStreamHandler:
    """
    Core abstract class in charge to handle the stream of <pdfobject>. The
    identifier and the logging methods are the ones of the handled object.
    """
    def __init__(self, pdfobject):
        self.stream_object = pdfobject

    def ident(self):
        """Return the identifier of the handled object."""
        handled = self.stream_object
        return handled.ident()

    def info(self, text):
        """Log <text> as an information of the handled object."""
        handled = self.stream_object
        handled.info(text)

    def debug(self, text):
        """Log <text> as a debug message of the handled object."""
        handled = self.stream_object
        handled.debug(text)

    def warning(self, text):
        """Log <text> as a warning of the handled object."""
        handled = self.stream_object
        handled.warning(text)

    def error(self, text):
        """Log <text> as an error of the handled object."""
        handled = self.stream_object
        handled.error(text)
Packit 0f19cf
Packit 0f19cf
class PDFXrefObject(PDFStreamHandler):
    """
    A specific object that contains XRef entries in binary format. It is an
    alternative to the xref section.
    """
    def __init__(self, pdfobject):
        PDFStreamHandler.__init__(self, pdfobject)
        self.trailer = pdfobject.descriptor
        self.table = {}
        self.objstm = {}
        self.older = None
        self.newer = None

        if pdfobject.descriptor.get("/Type") != "/XRef":
            self.error("Not an XRef object. Give up")
            return

        # /W gives the size in bytes of the three fields of an entry
        _format = pdfobject.descriptor.get("/W")
        if not(_format):
            self.error("Invalid XRef object: no /W found. Give up")
            return
        self._format = self._int_list(_format)
        if len(self._format) < 3:
            self.error("Invalid XRef object: bad /W '%s'. Give up" % _format)
            return

        # An /XRef object must contains a stream
        pdfobject.stream_decode()
        self.data = pdfobject.stream_text()
        self.read_table()

    def set_older(self, older):
        """Chain this xref with <older>, the xref that precedes it in time."""
        self.older = older
        older.newer = self

    def _int_list(self, value):
        """
        Convert a PDF array of integers to a python list. <value> is expected
        to be the array string ('[1 2 1]'); a python list is accepted too.
        """
        if isinstance(value, (list, tuple)):
            items = value
        else:
            items = value.replace("[", " ").replace("]", " ").split()
        return [ int(item) for item in items ]

    def _subsections(self, nb_entries):
        """
        Return the list of the (first object number, entry count) pairs
        covered by the stream, as given by /Index. Without a usable /Index
        the <nb_entries> entries are numbered from 0.
        """
        default = [(0, nb_entries)]
        index = self.trailer.get("/Index")
        if not(index):
            return default
        try:
            values = self._int_list(index)
        except (AttributeError, TypeError, ValueError):
            values = []
        if not(values) or len(values) % 2:
            self.warning("Invalid /Index '%s' in XRef object: ignored" % index)
            return default
        return list(zip(values[0::2], values[1::2]))

    def _xref_fill_entry(self, fields, obj_id):
        """
        Record the offset of the object <obj_id> from the <fields> of its
        entry (offset, revision, 'n' or 'f'). Only the objects in use ('n')
        are recorded.
        """
        offset, revision, what = fields
        if what == "n":
            ident = "%d %d" % (obj_id, int(revision))
            self.table[ident] = int(offset)
            self.debug("Record xref entry: '%s' @ %s" % (ident, offset))

    def _xref_fill_objstm(self, fields, obj_id):
        """
        Record that the object <obj_id> is stored in an object stream.
        <fields> is the (object stream number, index in the stream) pair.
        """
        objstm_id, obj_index = fields
        ident = "%d %d" % (obj_id, 0)
        self.objstm[ident] = (objstm_id, obj_index)
        self.debug("Record xref entry in objstm: '%s' @ %s" % \
                   (ident, fields))

    def _int_of(self, string):
        """
        Convert to int a big-endian bytes string that can be of any size.
        An empty string gives 0.
        """
        value = 0
        for byte in bytearray(string):
            value = (value << 8) | byte
        return value

    def read_table(self, linestart=""):
        """
        Decode the binary entries of the stream, and fill the xref table and
        the table of the objects stored in object streams. <linestart> is not
        used, it is there to have the same interface as the xref section.
        """
        data = self.data
        widths = self._format[:3]
        entry_size = sum(self._format)
        if entry_size <= 0:
            # Nothing can be decoded, and reading would never progress
            self.error("Invalid XRef object: bad /W %s" % self._format)
            return

        # Only the complete entries are decoded
        nb_entries = len(data) // entry_size
        pos = 0

        for start_id, count in self._subsections(nb_entries):
            for obj_id in range(start_id, start_id + count):
                if pos + entry_size > len(data):
                    self.warning("XRef stream shorter than expected: "\
                                 "stopped at object %d" % obj_id)
                    return

                fields = []
                first = pos
                for width in widths:
                    fields.append(self._int_of(data[first:first+width]))
                    first += width
                pos += entry_size

                # When the type field is not present (its size is zero) the
                # entry type defaults to 1
                if widths[0] == 0:
                    fields[0] = 1

                if fields[0] == 1:
                    self._xref_fill_entry(fields[1:3] + ["n"], obj_id)
                elif fields[0] == 2:
                    self._xref_fill_objstm(fields[1:3], obj_id)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFObjectStream(PDFStreamHandler):
    """
    A PDF Object Stream contains in its stream some compressed PDF objects.
    This class works on a PDF object stream to build the contained PDF objects.
    """
    def __init__(self, pdfobject):
        PDFStreamHandler.__init__(self, pdfobject)
        self._pdfobjects = []
        # Set once compute() has run, so that an empty or invalid stream is
        # not parsed again at each access
        self._computed = False

    def pdfobjects(self):
        """Return the list of the objects contained in the stream."""
        if not(self._computed):
            self.compute()
        return self._pdfobjects

    def _getinfo(self, what):
        """Return the value of <what> in the stream object dictionary."""
        return self.stream_object.descriptor.get(what)

    def get_object(self, idx):
        """
        Return the object stored at the index <idx> in the stream, or None
        if there is no object at this index.
        """
        if not(self._computed):
            self.compute()
        if idx < 0 or idx >= len(self._pdfobjects):
            return None
        return self._pdfobjects[idx]

    def parse_object_list(self, data):
        """
        Parse the table found at the beginning of the stream, made of
        'object number' / 'byte offset' pairs. Return (and store in objlist)
        the list of the (number string, offset) tuples.
        """
        values = data.split()
        if len(values) % 2:
            self.warning("Odd number of values in the Stream Object table: "\
                         "'%s' is ignored" % values[-1])
            values = values[:-1]

        # The pair is ('object number', byte_offset)
        objlist = [ (values[i], int(values[i+1]))
                    for i in range(0, len(values), 2) ]
        self.objlist = objlist
        return objlist

    def compute(self):
        """
        Build the PDF objects contained in the stream. The stream must have
        been decoded before. Nothing is built if the handled object is not an
        object stream (/ObjStm).
        """
        self._computed = True
        _type = self._getinfo("/Type")
        if  _type != "/ObjStm":
            self.error("Cannot read object stream: Invalid type '%s'" % _type)
            return

        nb_objects = int(self._getinfo("/N"))
        objlist_b = int(self._getinfo("/First"))
        stream = self.stream_object.stream_cache

        try:
            objlist = self.parse_object_list(stream.read(objlist_b))

            if len(objlist) != nb_objects:
                self.warning("Error in parsing the Stream Object: found %d "\
                             "objects instead of %d" % \
                             (len(objlist), nb_objects))

            # List Terminator
            objlist.append(("",-1))

            bytes_read = 0
            for i in range(len(objlist)-1):
                # In ObjectStream, a PDF object revision is always '0'
                number, revision = objlist[i][0], "0"

                # The size of the object data is given by the position of the
                # next object. The terminator gives a negative size for the
                # last object, that takes all the remaining data.
                objsize = objlist[i+1][1] - bytes_read
                if objsize >= 0:
                    data = stream.read(objsize)
                else:
                    data = stream.read()
                bytes_read += len(data)
                self.debug("Object[%d] in stream: '%s' has %d bytes" % \
                           (i, number, objsize))

                # Build the PDF Object from stream data
                pdfobj = PDFObject(number, revision)
                pdfobj.append_string(data)
                pdfobj.compute()
                self._pdfobjects.append(pdfobj)
        finally:
            # Release the stream even if the parsing fails
            stream.close()
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFObject:
    """
    A PDF Object contains the data between the 'obj ... 'endobj' tags.
    It has a unique identifier given by the (number,revision) pair.
    The data contained by a PDF object can be dictionaries (descriptors),
    stream contents and other stuff.

    The raw text is accumulated with append_string() and parsed by compute(),
    which fills:
      - descriptors: every dictionary found, the deepest ones first
      - descriptor:  the last dictionary built (an empty PDFDescriptor when
                     the object contains no dictionary)
      - stream:      the text following the first 'stream' keyword, or None
      - data:        what remains of the object text once the dictionaries
                     and the stream are taken out
    """
    # Extract a dictionary '<<...>>' leaf (does not contain another dict).
    # A single '<' or '>' is accepted inside the leaf, a doubled one is not.
    _re_desc = re.compile(r"(<<(?:(?<!<)<(?!<)|[^<>]|(?<!>)>(?!>))*>>)",
                          re.MULTILINE)

    def __init__(self, number, revision, stream_manager=None):
        self.string = ""
        self.number = number
        self.revision = revision
        self.descriptors = []
        self.descriptor = None
        self.data = ""
        self.stream = None
        self.outfile = ""
        # A private manager is built when the caller does not share one
        self.stream_manager = stream_manager or StreamManager()
        self._log = logging.getLogger("pdfscan.pdfobject")
        self.debug("New Object")
        self.re_desc = self._re_desc

    def debug(self, text):
        self._log.debug(self.logstr(text))
    def warning(self, text):
        self._log.warning(self.logstr(text))
    def error(self, text):
        self._log.error(self.logstr(text))
    def info(self, text):
        self._log.info(self.logstr(text))

    def ident(self):
        """Return the object identifier as 'number revision'."""
        return "%s %s" % (self.number, self.revision)

    def __repr__(self):
        return "(%s R)" % self.ident()

    def __int__(self):
        """Convert the object data (not the dictionaries) to an integer."""
        return int(self.data)

    def logstr(self, text):
        """Prefix a log message with the object identifier."""
        return "Object [%s %s]: %s" % (self.number,self.revision,text)

    def append_string(self, string):
        """Append raw text to the object; it is parsed later by compute()."""
        self.string = self.string + string

    def compute(self):
        """
        Parse the accumulated text: isolate the stream content, build the
        descriptors from the dictionaries and keep the remaining data.
        """
        string = self.string

        # Only the first 'stream' keyword separates the dictionaries from the
        # stream content. The third positional argument of re.split() is
        # 'maxsplit', not 'flags', so it is passed by keyword.
        s = re.split(r"stream\s", string, maxsplit=1)
        if len(s) > 1:
            self.debug("Contains stream")
            self.stream = s[1].strip()

        string = s[0]

        # Iterate to build all the nested dictionaries/descriptors,
        # from the deepest to the main one
        self.descriptors = []
        while True:
            descs = self.re_desc.findall(string)
            if not(descs):
                break
            for desc_str in descs:
                desc = PDFDescriptor(string=desc_str)
                # Substitute one occurrence per match, so that two identical
                # dictionaries get their own descriptor index
                string = string.replace(desc_str,
                            "{descriptor(%d)}" % len(self.descriptors), 1)
                self.descriptors.append(desc)

        self.debug("Found %d descriptors" % len(self.descriptors))

        # The substituted '{descriptor(N)}' fields are resolved from the list
        for descobj in self.descriptors:
            descobj.compute(descriptors=self.descriptors)

        if self.descriptors:
            self.descriptor = self.descriptors[-1]
        else:
            self.descriptor = PDFDescriptor()

        self.data = re.sub(r"{descriptor\(\d+\)}", "",
                           string, flags=re.MULTILINE).strip()
        self.debug("Data: '%s'" % self.data)

    def stream_decode(self):
        """
        Write the stream content in a cache obtained from the stream manager,
        decompressed according to the '/Filter' field. Nothing is done when
        the object has no stream. The cache is not written if the filter is
        unknown.
        """
        if not(self.stream):
            return
        self.debug("Try to decode stream...")

        # Consolidate stream buffer from the /Length information
        stream_size = int(self.descriptor.get("/Length"))
        self.stream = self.stream[0:stream_size]

        # Put the stream in a cache
        self.stream_cache = self.stream_manager.cache(number=self.number,
                                                      revision=self.revision)

        method = self.descriptor.get("/Filter")
        if method == "/FlateDecode":
            method = "zlib"
        elif method == "/DCTDecode":
            # This is JPEG. Just dump it
            self.warning("this is a JPEG stream")
            method = ""
        elif method != "":
            self.error("don't know how to decode stream with filter '%s'" \
                     % method)
            return

        self.stream_cache.write(self.stream, compress_type=method)

    def stream_text(self):
        """
        Return the whole cached stream content and close the cache, or an
        empty string when the object has no stream. stream_decode() must have
        been called before, since it creates the cache.
        """
        if not(self.stream):
            return ""
        data = self.stream_cache.read()
        self.stream_cache.close()
        return data

    def get_type(self):
        """
        Return the '/Type' of the main descriptor if any, else 'stream',
        'list' or 'name tree' depending on the content. None is returned when
        none of these cases applies.
        """
        _type = self.descriptor.get("/Type")
        if _type:
            return _type
        if self.stream:
            return "stream"
        if pdfstring_is_list(self.data):
            return "list"
        if self.descriptor.is_name_tree_node():
            return "name tree"

    def link_to(self, pdfobjects):
        """Ask every descriptor to resolve its references from pdfobjects."""
        self.debug("Link objects")
        for desc in self.descriptors:
            desc.link_to(pdfobjects)

        # The references found in a data list are not resolved yet
        if pdfstring_is_list(self.data):
            pass
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFDescriptor:
    """
    Contains the data between the << ... >> brackets in PDF objects. It is
    a dictionary that can contain other descriptors/dictionaries.

    The fields are available once compute() is called. A field value is a
    string, or another PDFDescriptor when the value is a nested dictionary.
    After link_to() a value can also be an object found in the 'pdfobjects'
    mapping, or a list of them.
    """
    # Unique identifier for these objects
    _id = 0

    # Detect the dictionary fields covering these cases:
    # <<
    #  /Type /Page                    : the value is another keyword
    #  /Contents 5 0 R                : the value is a string up next keyword
    #  /Resources 4 0 R
    #  /MediaBox [0 0 595.276 841.89] : the value is an array
    #  /Parent 12 0 R
    # >>
    _re_dict = re.compile(r"/\w+\s*/[^/\s]+|/\w+\s*\[[^\]]*\]|/\w+\s*[^/]+")

    # Extract a dictionary keyword
    _re_key = re.compile(r"(/[^ \({/\[<]*)")

    # Extract the substituted descriptors
    _re_descobj = re.compile(r"{descriptor\((\d+)\)}")

    # Find the PDF object references
    _re_objref = re.compile(r"(\d+ \d+ R)")

    def __init__(self, string=""):
        self._ident = self._get_ident()
        self.string = string
        self.params = {}
        self._log = logging.getLogger("pdfscan.descriptor")

        self.re_dict = self._re_dict
        self.re_key = self._re_key
        self.re_descobj = self._re_descobj
        self.re_objref = self._re_objref

    def _get_ident(self):
        """Return a new identifier taken from the class counter."""
        _id = PDFDescriptor._id
        PDFDescriptor._id += 1
        return _id

    def ident(self):
        return self._ident

    def debug(self, text):
        self._log.debug("Descriptor [%d]: %s" % (self._ident, text))
    def error(self, text):
        self._log.error("Descriptor [%d]: %s" % (self._ident, text))
    def info(self, text):
        self._log.info("Descriptor [%d]: %s" % (self._ident, text))
    def warning(self, text):
        self._log.warning("Descriptor [%d]: %s" % (self._ident, text))

    def __repr__(self):
        return "desc[%d]" % self._ident

    def normalize_fields(self, string):
        """
        Split the dictionary text into a list of '/Key value' strings. The
        '<<' and '>>' brackets are dropped and the text is put on one line.
        """
        string = string.replace(">>", "")
        string = string.replace("<<", "")
        string = string.replace("\n", " ")
        fields = self.re_dict.findall(string)
        fields = [ f.strip() for f in fields if (f and f.strip()) ]
        return fields

    def compute(self, descriptors=None):
        """
        Fill the fields from the dictionary text. A '{descriptor(N)}' value is
        replaced by the Nth item of 'descriptors' when this list is given.
        """
        lines = self.normalize_fields(self.string)
        for line in lines:
            m = self.re_key.match(line)
            if not(m):
                continue
            param = m.group(1)
            # The value is what follows the key. Only the leading key is cut
            # because the value can contain the key text (/Type /Type1)
            value = line[m.end():].strip()
            m = self.re_descobj.match(value)
            if m and descriptors:
                value = descriptors[int(m.group(1))]
            self.params[param] = value

        self.debug(self.params)

    def get(self, param, default=""):
        """Return the value of the field 'param', or 'default' if missing."""
        return self.params.get(param, default)

    def values(self):
        return self.params.values()

    def keys(self):
        return self.params.keys()

    def infos(self):
        """Return the dictionary of the fields itself (not a copy)."""
        return self.params

    def is_name_tree_node(self):
        """
        Tell if the dictionary has one of the name tree node entries, that
        are '/Limits', '/Names' and '/Kids'.
        """
        if self.get("/Limits") or self.get("/Names") or self.get("/Kids"):
            return True
        else:
            return False

    def link_to(self, pdfobjects):
        """
        Replace the 'N R R' references found in the string values by the
        objects registered in 'pdfobjects' under the 'N R' key. A list value
        is replaced by the list of the objects. A value that mixes references
        and something else is left unchanged.

        Return the number of references not found in 'pdfobjects'.
        """
        unresolved = 0
        for param, value in self.params.items():
            # Point to something else than a string? Skip it
            if not(isinstance(value, str)):
                continue

            objects = []
            objrefs = self.re_objref.findall(value)
            # What remains of the value once the references are removed
            value2 = value
            for objref in objrefs:
                o = pdfobjects.get(objref.replace(" R", ""), None)
                # If the object is missing, keep the reference for another trial
                if not(o):
                    self.warning("Object '%s' not resolved" % objref)
                    unresolved += 1
                    o = objref
                objects.append(o)
                value2 = value2.replace(objref, "", 1)

            if not(objects):
                continue

            if pdfstring_is_list(value):
                # Something else than references between the list delimiters?
                if (value2[1:-1].strip()):
                    self.warning("Problem: cannot substitute objects: '%s'" \
                                 % value)
                else:
                    self.params[param] = objects
                    self.debug("Substitute %s: %s" % (param, objects))
            else:
                if value2.strip() or len(objects) > 1:
                    self.warning("Problem: cannot substitute object: '%s'" \
                                 % value)
                else:
                    self.params[param] = objects[0]
                    self.debug("Substitute %s: %s" % (param, objects[0]))

        return unresolved
Packit 0f19cf
 
Packit 0f19cf
Packit 0f19cf
class StreamManager(PDFBaseObject):
    """
    Build the caches where the decoded streams are stored. The caches are
    files in a cache directory when cache_method is 'file', and memory
    buffers for any other method.
    """
    # Write the cache file even if it already exists
    CACHE_REFRESH = 1
    # The cache files and directory must not be removed
    CACHE_REMANENT = 2
    # The cache directory is a temporary one created by the manager
    CACHE_TMPDIR = 4
    # The cache content is dropped when the cache is closed
    CACHE_DELONCLOSE = 8

    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self, cache_method="file", cache_dirname="", flags=0):
        self.cache_method = cache_method
        self.cache_format = "pdfstream.%(number)s.%(revision)s"
        self.cache_dirname = cache_dirname
        self.cache_files = []
        self.flags = flags
        # Don't want to remove something in a user directory
        if cache_dirname: self.flags = self.flags | self.CACHE_REMANENT

    def cleanup(self):
        """
        Remove the file caches, unless they are remanent. A temporary cache
        directory is removed as a whole, else the cache files are removed
        one by one.
        """
        if (self.cache_method != "file"):
            return

        if (self.flags & self.CACHE_REMANENT):
            if (self.flags & self.CACHE_TMPDIR):
                self.warning("'%s' not removed" % (self.cache_dirname))
            return

        if (self.flags & self.CACHE_TMPDIR):
            self.debug("Remove cache directory '%s'" % (self.cache_dirname))
            shutil.rmtree(self.cache_dirname)
        else:
            for fname in self.cache_files:
                try:
                    os.remove(fname)
                except OSError:
                    # The file can be already removed, or never written
                    self.debug("Cache file '%s' not removed" % fname)

    def cache(self, **kwargs):
        """
        Return a new cache object. The keywords are used to name the cache
        file and must then give 'number' and 'revision'.
        """
        if self.cache_method == "file":
            return self.cache_file(kwargs)
        else:
            return self.cache_memory(kwargs)

    def cache_file(self, kwargs):
        """
        Return a file cache located in the cache directory. A temporary
        directory is created if no cache directory is specified.
        """
        if not(self.cache_dirname):
            self.cache_dirname = tempfile.mkdtemp()
            self.flags = self.flags | self.CACHE_TMPDIR | self.CACHE_DELONCLOSE

        if not(os.path.exists(self.cache_dirname)):
            # The parent directories of a user directory are created as well
            os.makedirs(self.cache_dirname)

        cache_path = os.path.join(self.cache_dirname,
                                  self.cache_format % kwargs)
        stream_cache = StreamCacheFile(cache_path, flags=self.flags)
        self.cache_files.append(cache_path)
        return stream_cache

    def cache_memory(self, kwargs):
        """Return a memory cache. The keywords are not used."""
        stream_cache = StreamCacheMemory(flags=self.flags)
        return stream_cache
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class StreamCache:
    """
    Base of the stream caches. It only provides the decompression of the
    data to store.
    """
    def __init__(self, outfile, flags=0):
        # 'outfile' is part of the signature but the base class ignores it
        self.flags = flags

    def decompress(self, data, compress_type):
        """
        Return 'data' decompressed with the 'compress_type' method. The data
        are returned unchanged if no method is given. Only 'zlib' is handled,
        None is returned for any other method.
        """
        if compress_type == "zlib":
            return zlib.decompress(data)
        if compress_type:
            return None
        return data
Packit 0f19cf
Packit 0f19cf
class StreamCacheFile(StreamCache):
    """
    Stream cache stored in the file 'outfile'. The file is opened in binary
    mode since a stream can contain any kind of data (images, fonts).
    """
    def __init__(self, outfile, flags=0):
        self.flags = flags
        self.outfile = outfile
        # File opened by the first read(), released by close()
        self._file = None

    def write(self, data, compress_type=""):
        """
        Write the decompressed data in the cache file. An existing cache file
        is kept as is, unless the CACHE_REFRESH flag is set.
        """
        if ((self.flags & StreamManager.CACHE_REFRESH)
            or not(os.path.exists(self.outfile))):
            data = self.decompress(data, compress_type)
            with open(self.outfile, "wb") as f:
                f.write(data)

    def read(self, size=-1):
        """
        Return the next 'size' bytes of the cache file, or all the remaining
        data if 'size' is negative. The file is opened by the first call.
        """
        if not(self._file):
            self._file = open(self.outfile, "rb")
        if size >= 0:
            data = self._file.read(size)
        else:
            data = self._file.read()
        return data

    def close(self):
        """
        Close the cache file, and remove it if the cache is to delete on
        close and is not remanent.
        """
        if (self._file):
            self._file.close()
            # Forget the closed file so that a next read() opens it again
            self._file = None
        if (not(self.flags & StreamManager.CACHE_REMANENT) and \
            (self.flags & StreamManager.CACHE_DELONCLOSE)):
            # The file is missing if nothing has been written
            if os.path.exists(self.outfile):
                os.remove(self.outfile)
Packit 0f19cf
Packit 0f19cf
class StreamCacheMemory(StreamCache):
    """
    Stream cache stored in a memory buffer, read sequentially like a file.
    """
    def __init__(self, flags=0):
        self.flags = flags
        self._buffer = ""
        # Position of the next data to read in the buffer
        self._read_pos = 0

    def write(self, data, compress_type=""):
        """Append the decompressed data to the buffer."""
        self._buffer = self._buffer + self.decompress(data, compress_type)

    def read(self, size=-1):
        """
        Return the next 'size' items of the buffer, or all the remaining
        data if 'size' is negative or greater than what is left.
        """
        available = len(self._buffer) - self._read_pos
        if size < 0 or size > available:
            size = available
        start = self._read_pos
        self._read_pos = start + size
        return self._buffer[start:start + size]

    def close(self):
        """Drop the buffer if the cache is to delete on close."""
        if not (self.flags & StreamManager.CACHE_DELONCLOSE):
            return
        self._buffer = None
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
def extract_string_objects(data, re_pattern, replace_fmt,
                           delims=None, object_cls=None,  object_id=0,
                           **kwargs):
    """
    Find in 'data' the strings matching 're_pattern' and substitute each of
    them by 'replace_fmt' formatted with its rank, counted from 'object_id'.

    're_pattern' is a pattern string (searched in multiline and dotall mode)
    or a compiled pattern. When 'delims' is given, its two items are put
    around both the string found and its substitute. When 'object_cls' is
    given it is called with the string found and 'kwargs' to build the
    object to store.

    Return the tuple (list of the strings or objects, substituted data).
    """
    if isinstance(re_pattern, str):
        matches = re.findall(re_pattern, data, re.M|re.DOTALL)
    else:
        matches = re_pattern.findall(data)

    extracted = []
    rank = object_id
    for found in matches:
        placeholder = replace_fmt % rank
        rank += 1
        if delims:
            opening, closing = delims[0], delims[1]
            found = opening + found + closing
            placeholder = opening + placeholder + closing
        # Only the first occurrence still in place is substituted
        data = data.replace(found, placeholder, 1)
        extracted.append(object_cls(found, **kwargs) if object_cls else found)
    return (extracted, data)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFContentStream(PDFStreamHandler):
    """
    Data between the 'stream ... endstream' tags in a PDF object used as
    content (and not as image or object storage).

    The text objects (BT ... ET) are extracted from the stream, then the
    remaining data are organized as a tree of graphic states (q ... Q).
    """
    def __init__(self, pdfobject, fontmgr=None):
        PDFStreamHandler.__init__(self, pdfobject)
        self.data = ""
        self.qnode_root = None
        self.textobjects = None
        self.fontmgr = fontmgr or FontManager({})
        pdfobject.stream_decode()
        self.extract_textobjects(pdfobject.stream_text())
        self.make_graph_tree()

    def extract_textobjects(self, data):
        """
        Build a PDFTextObject from the data of each 'BT ... ET' section, and
        store in self.data the stream where these data are substituted by
        ' textobj(N) ', N being the rank of the text object.
        """
        fields = re.split(r"((?<=\s)BT(?=\s)|(?<=\s)ET(?=\s))", data)

        start_text = False
        textdata = ""
        textobjects = []
        # The stream is rebuilt field by field, so that the substitution is
        # done where the text object is, and not where the same text is
        # found first
        pieces = []

        for field in fields:
            if field == "BT":
                if start_text:
                    # 'BT' in an opened text object: only the data following
                    # the last 'BT' make the text object
                    pieces.append(textdata)
                start_text = True
                textdata = ""
                pieces.append(field)
            elif field == "ET" and start_text:
                textobject = PDFTextObject(textdata, fontmgr=self.fontmgr)
                pieces.append(" textobj(%d) " % len(textobjects))
                pieces.append(field)
                textobjects.append(textobject)
                start_text = False
            elif start_text:
                textdata += field
            else:
                # Data out of a text object, or 'ET' without 'BT'
                pieces.append(field)

        if start_text:
            # Text object not closed: its data are left unchanged
            pieces.append(textdata)

        self.debug("Found %d textobjects" % len(textobjects))
        self.textobjects = textobjects
        self.data = "".join(pieces)

    def make_graph_tree(self):
        """
        Build the tree of the graphic states found in self.data. The root
        node contains the data that are out of any 'q ... Q' section.
        """
        # 'q' and 'Q' are operators only when they are standalone tokens
        graph_stacks = re.split(r"((?<!\S)q\s|\sQ(?!\S))", self.data)

        self.qnode_root = GraphState()
        qnode = self.qnode_root
        for field in graph_stacks:
            token = field.strip()
            if token == "q":
                qnode = qnode.push(GraphState())
            elif token == "Q":
                # A 'Q' without its 'q' cannot go upper than the root
                qnode = qnode.pop() or self.qnode_root
            elif token:
                qnode.set_data(field)
                qnode.fill_textobjects(self.textobjects)

    def dump(self):
        self.qnode_root.dump()
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFMatrix(PDFBaseObject):
    """
    Transformation matrix given by its six coefficients [a, b, c, d, e, f]:

             | a  b  0 |
        Tm = | c  d  0 |
             | e  f  1 |

    A point is transformed by multiplying it on the left of the matrix, so
    the transformations are combined from the newest to the oldest:

        [x , y , 1] = [x1, y1, 1] x Tm1
        [x1, y1, 1] = [x2, y2, 1] x Tm2

     => [x , y , 1] = [x2, y2, 1] x Tm2 x Tm1

    """
    IDENT = [1, 0, 0, 1, 0, 0]

    def __init__(self, vector):
        # The list is stored as given, it is not copied
        self.vector = vector

    def tx(self):
        """Return the horizontal translation (e)."""
        return self.vector[4]

    def ty(self):
        """Return the vertical translation (f)."""
        return self.vector[5]

    def scale(self):
        """
        Return the scale factor for a matrix that scales the same way in
        both directions, with an horizontal or a vertical orientation. For
        any other matrix a warning is logged and 'a' is returned.
        """
        a, b, c, d, e, f = self.vector
        # Horizontal orientation
        horizontal = (abs(a) == abs(d) and b == 0 and c == 0)
        if horizontal:
            return abs(a)
        # vertical orientation
        vertical = (abs(b) == abs(c) and a == 0 and d == 0)
        if vertical:
            return abs(b)
        # Always return the first even if something is weird
        self.warning("Cannot interpret Tm matrix scale: %s" % self)
        return a

    def __str__(self):
        return str(self.vector)

    def __len__(self):
        return len(self.vector)

    def __mul__(self, vector):
        """
        Return the product of this matrix by another PDFMatrix (a new
        PDFMatrix), or the transformation [x2, y2, 1] of the point whose
        coordinates are the two first items of 'vector'.
        """
        a, b, c, d, e, f = self.vector
        if len(vector) != 6:
            x, y = vector[0:2]
            return [a * x + c * y + e,
                    b * x + d * y + f,
                    1]

        ar, br, cr, dr, er, fr = vector.vector
        product = [a * ar + b * cr + 0 * er,
                   a * br + b * dr + 0 * fr,
                   c * ar + d * cr + 0 * er,
                   c * br + d * dr + 0 * fr,
                   e * ar + f * cr + 1 * er,
                   e * br + f * dr + 1 * fr]
        return PDFMatrix(product)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class GraphState:
    """
    Graphic state starts with 'q' and ends with 'Q' in content stream.
    It can contain other graphic states and/or text objects.

    The graphic states make a tree: push() adds a child state, and pop()
    gives back the parent state.
    """
    def __init__(self):
        self._parent = None
        self._children = []
        self._level = 0
        self._data = ""
        self.textobjects = []
        self.matrix = PDFMatrix(PDFMatrix.IDENT)

    def level(self):
        """Return the depth of the state in the tree (0 for the root)."""
        return self._level

    def set_parent(self, qnode):
        self._parent = qnode
        self._level = qnode.level()+1

    def push(self, qnode):
        """Add 'qnode' as a child of this state, and return 'qnode'."""
        self._children.append(qnode)
        qnode.set_parent(self)
        return qnode

    def pop(self):
        """Return the parent state, that is None for the root state."""
        qnode = self._parent
        return qnode

    def set_data(self, data, textobjects=None):
        """
        Set the data of the state and extract its 'cm' matrix. The text
        objects referenced by the data are attached when 'textobjects' is
        given.
        """
        self._data = data
        if textobjects:
            self.fill_textobjects(textobjects)
        self.extract_matrix()

    def fill_textobjects(self, textobjects):
        """
        Attach to this state the items of 'textobjects' referenced by a
        ' textobj(N)' field in the data, and remove these fields from the
        data.
        """
        for index in re.findall(r" textobj\((\d+)\)", self._data):
            textobject = textobjects[int(index)]
            textobject.set_graphstate(self)
            self.textobjects.append(textobject)

        self._data = re.sub(r" textobj\(\d+\)", "",
                       self._data, flags=re.MULTILINE).strip()

    def extract_matrix(self):
        """
        Set the matrix from the six fields preceding the first 'cm' operator
        found in the data. The matrix is unchanged if there is no such
        operator.
        """
        m = re.search("("+6*r"[^\s]+\s+"+"cm"+")", self._data)
        if m:
            vector = [ float(v) for v in m.group(1).split()[0:6] ]
            self.matrix = PDFMatrix(vector)

    def dump(self):
        """Print the state and its children, indented by their level."""
        # A parenthesized single string prints the same with python2 and 3
        s = self._level * "  " + "q '" + self._data + "'"
        print(s)
        for q in self._children:
            q.dump()
        s = self._level * "  " + "Q"
        print(s)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFTextObject:
    """
    Data between the 'BT' and 'ET' tokens found in content streams.

    When the object is built the strings to show are extracted and replaced
    in the data by a 'textcontent{N}' reference, then the operators are
    parsed to build the text segments and the lines of text.
    """
    _font_op_pattern = r"/[^\s]+\s+[^\s]+\s+Tf"

    # Detect a 'Tf', 'Tm', 'Tj', 'TJ', Td, TD operator sequence in a text object
    # To use only when strings are extracted and replaced by their reference
    _re_seq = re.compile("(" + _font_op_pattern + "|" +
                         6 * r"[^\s]+\s+" + "Tm" + "|" +
                         r"\(textcontent\{\d+\}\)\s*Tj|" +
                         r"\[[^\]]*\]\s*TJ|" +
                         r"[^\s]+\s+[^\s]+\s+T[dD])", re.MULTILINE)

    # Find a font setup operator, like '/F10 9.47 Tf'
    _re_font = re.compile("("+_font_op_pattern+")", re.MULTILINE)

    # Find a sequence '(...\(...\)...) Tj'
    _re_text_show1 = re.compile(r"(\((?:" + r"[^()]" + "|" +
                                      r"(?<=\\)\(" + "|" +
                                      r"(?<=\\)\)" + r")*\)\s*Tj)", re.M)

    # Find a sequence '[...\[...\]...] TJ'
    _re_text_show2 = re.compile(r"\[((?:" + r"[^\[\]]" + "|" +
                                        r"(?<=\\)\[" + "|" +
                                        r"(?<=\\)\]" + r")*)\]\s*TJ", re.M)

    def __init__(self, data, fontmgr=None):
        self.data = data
        self.matrix = PDFMatrix(PDFMatrix.IDENT)
        self.fontmgr = fontmgr or FontManager({})
        self.qnode = None
        self.strings = []
        self.textsegments = []
        self.textlines = []
        self.extract_strings()
        self.extract_matrix()
        self.parse_data()

    def set_graphstate(self, gs):
        """Set the graphic state containing this text object."""
        self.qnode = gs

    def set_fontmanager(self, fontmgr):
        self.fontmgr = fontmgr

    def matrix_absolute(self):
        """
        Return the text matrix combined with the matrices of the graphic
        states containing the text object.
        """
        # The textobject matrix change is the last one, so on the full left
        m = self.matrix

        # We climb the graph stack from the deepest (newer) to the upper
        # (oldest) node so:
        # Absolute Matrix = Newest (m) x ... x Oldest (qnode.matrix)
        qnode = self.qnode
        while qnode:
            m = m * qnode.matrix
            qnode = qnode.pop()
        return m

    def extract_matrix(self):
        """
        Set the matrix from the six fields preceding the first 'Tm' operator
        found in the data. The matrix is unchanged if there is no such
        operator.
        """
        m = re.search("("+6*r"[^\s]+\s+"+"Tm"+")", self.data)
        if m:
            vector = [ float(v) for v in m.group(1).split()[0:6] ]
            self.matrix = PDFMatrix(vector)

    def extract_strings(self):
        """
        Move the strings shown by the 'Tj' and 'TJ' operators from the data
        to self.strings. A string is replaced in the data by 'textcontent{N}'
        where N is its index in self.strings.
        """
        objects, data = extract_string_objects(self.data, self._re_text_show1,
                                               "(textcontent{%d}) Tj")
        self.strings = objects
        # The 'TJ' strings are numbered after the 'Tj' ones
        objects, data = extract_string_objects(data, self._re_text_show2,
                                               "textcontent{%d}",
                                               delims=["[","]"],
                                               object_id=len(self.strings))
        self.strings += objects
        self.data = data

    def _newline(self):
        """Add an empty row to the lines of text, and return it."""
        linerow = []
        self.textlines.append(linerow)
        return linerow

    def get_font(self, font, size, scale):
        """Ask the font manager for 'font' at the scaled size."""
        return self.fontmgr.get_font(font, float(size)*scale)

    def parse_data(self):
        """
        Parse the operators of the text object to build the text segments
        and to spread them in lines. A segment starts at each 'Td' or 'TD'
        operator, and a line starts when the vertical move is not null. The
        texts shown are attached to the current segment with the font
        currently set.
        """
        linerow = self._newline()
        textline = PDFTextSegment("", PDFMatrix(PDFMatrix.IDENT))
        linerow.append(textline)

        # Find the operator sequences
        operators = self._re_seq.findall(self.data)

        font, size = "", 1

        for operator in operators:
            fields = operator.split()
            key = fields[-1]

            # Found a font setup, memorize the fontname and fontsize base
            if key == "Tf":
                font = fields[0]
                size = fields[1]
            # Found the matrix setup, memorize it
            elif key == "Tm":
                vector = [ float(c) for c in fields[0:6]]
                self.matrix = PDFMatrix(vector)
            # Found a text positioning
            elif key in ("Td", "TD"):
                tx, ty = [ float(c) for c in fields[0:2]]
                matrix = PDFMatrix([1, 0, 0, 1, tx, ty])
                textline = PDFTextSegment("", matrix)
                self.textsegments.append(textline)
                if matrix.ty() != 0:
                    linerow = self._newline()
                linerow.append(textline)
            # When text is shown, the current font/size setup applies and is
            # then recorded. The key is not split from the string reference
            # if there is no space between them
            elif "Tj" in key or "TJ" in key:
                m = re.search(r"textcontent\{(\d+)\}", operator)
                if not(m):
                    # The string has not been substituted by
                    # extract_strings(): there is nothing to show
                    continue
                text_string = self.strings[int(m.group(1))]
                scale = self.matrix.scale()
                pdffont = self.get_font(font, size, scale)
                text_shown = PDFTextShow(text_string, pdffont)
                textline.add_text_show(text_shown)
Packit 0f19cf
class PDFTextSegment:
    """
    Portion of text attached to one text position operator ('Td' or 'TD').

    The segment keeps the matrix it was created with and collects, in the
    order they are added, the text show objects coming from the 'Tj' and 'TJ'
    tokens related to this position.
    """
    def __init__(self, data, matrix):
        self.data = data
        self.matrix = matrix
        # Only stored through set_strings(); not read by the segment itself
        self.strings = None
        # Text show objects, in insertion order
        self.text_shown = []

    def __str__(self):
        # Raw form: str() of every text show, glued without separator
        return "".join([str(shown) for shown in self.text_shown])

    def text(self):
        # Readable form: text() of every text show, separated by one space
        pieces = [shown.text() for shown in self.text_shown]
        return " ".join(pieces)

    def set_strings(self, strings):
        self.strings = strings

    def add_text_show(self, text_shown):
        self.text_shown.append(text_shown)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFTextShow:
    """
    Data between the '( )' of the 'Tj' operator or '[ ]' of the 'TJ' operator
    that is intended to be shown.

    The raw data is kept as is. text() extracts the readable text: the
    literal strings '(...)' when there are some, otherwise the hexadecimal
    strings '<...>' translated through the font 'tounicode' object, if any.
    """
    _re_textascii = re.compile(r"\(((?:[^)]|(?<=\\)\))*)\)", re.M)
    _re_textunicode = re.compile(r"<([^>]+)>", re.M)
    # Shared by all the instances: non empty once the "substitute" codec error
    # handler has been registered
    _codec_handler_installed = {}

    def __init__(self, data, font):
        self.data = data
        # May be None when the caller could not find the font
        self.font = font
        self.encode = codecs.getencoder("latin1")
        if not(self._codec_handler_installed):
            codecs.register_error("substitute", PDFTextShow._encode_subs)
            self._codec_handler_installed["substitute"] = PDFTextShow._encode_subs

    def __str__(self):
        return self.data.replace("\n", " ")

    def text(self):
        """
        Return the text to show, or an empty string when nothing readable
        can be extracted from the data.
        """
        # Literal strings first: concatenate them and unescape '\(' and '\)'
        textdata = self._re_textascii.findall(self.data)
        textdata = "".join(textdata).replace("\\(", "(").replace("\\)", ")")
        if textdata:
            return textdata

        # Hexadecimal strings need the font to be translated. The font can be
        # missing (None), in which case there is nothing to decode with.
        if self.font is not None and self.font.tounicode:
            textdata = self._re_textunicode.findall(self.data)
            s = u" ".join(self.font.tounicode.decode(textdata))
            # Characters that do not fit in latin1 are replaced by the
            # "substitute" error handler
            return self.encode(s, "substitute")[0]
        return ""

    def get_font(self):
        return self.font

    @classmethod
    def _encode_subs(cls, exc):
        """
        Codec error handler replacing each character that cannot be encoded
        by '&#x<hexcode>;'. Returns the (replacement, resume position) tuple.
        """
        if not isinstance(exc, UnicodeEncodeError):
            # An error handler must return a (replacement, position) tuple or
            # raise: only encoding errors are handled here
            raise exc
        l = []
        for c in exc.object[exc.start:exc.end]:
            l.append(u"&#x%x;" % ord(c))
        return (u"".join(l), exc.end)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFFont:
    """
    A font object used at a given size, with the optional object able to
    translate the text shown with this font to unicode.
    """
    def __init__(self, fontobject, fontsize, tounicode=None):
        self.tounicode = tounicode
        self.fontsize = fontsize
        self.fontobject = fontobject

    def key(self):
        # '<name>/<size>' with the size formatted on 6 characters, 2 decimals
        return "%s/%6.2f" % (self.name(), self.size())

    def __cmp__(self, other):
        # Order by name first, and by size for fonts having the same name
        by_name = cmp(self.name(), other.name())
        if by_name != 0:
            return by_name
        return cmp(self.size(), other.size())

    def name(self):
        # The name is the /BaseFont entry of the font object descriptor
        descriptor = self.fontobject.descriptor
        return descriptor.get("/BaseFont")

    def size(self):
        return self.fontsize
Packit 0f19cf
Packit 0f19cf
class FontManager:
    """
    Build and cache the PDFFont objects used, from a dictionnary of font
    objects.

    A PDFFont is identified by the '/BaseFont' entry of its font object and
    by its size. When a global font manager is given, the PDFFont objects are
    requested from it and only referenced locally.
    """
    def __init__(self, fontdict, global_fontmgr=None):
        # Font objects, by font reference
        self.fontdict = fontdict
        # PDFFont objects already requested, by '<name>/<size>' key
        self.fontused = {}
        # ToUnicode objects already built, by object identifier
        self.tounicode = {}
        self.global_fontmgr = global_fontmgr
        self.resolver = PDFResolver.get_resolver()

    def get_pdffont(self, fontobj, fontsize):
        """
        Return the PDFFont for this font object and size, building it (or
        asking the global manager for it) the first time it is requested.
        """
        # Same key layout as PDFFont.key(). '%s' is used so that a font
        # object without any /BaseFont entry does not break the key building.
        key = "%s/%6.2f" % (fontobj.descriptor.get("/BaseFont"), fontsize)
        if key in self.fontused:
            return self.fontused[key]

        if self.global_fontmgr:
            pdffont = self.global_fontmgr.get_pdffont(fontobj, fontsize)
        else:
            pdffont = self._make_pdffont(fontobj, fontsize)
        self.fontused[key] = pdffont
        return pdffont

    def _make_pdffont(self, fontobj, fontsize):
        # Build a new PDFFont, with its ToUnicode translator when the font
        # object has a /ToUnicode entry
        fontobj.link_to(self.resolver)
        pdfobject = fontobj.descriptor.get("/ToUnicode")
        if pdfobject:
            pdfobject.link_to(self.resolver)
            tuc = self._get_tounicode(pdfobject)
        else:
            tuc = None
        pdffont = PDFFont(fontobj, fontsize, tuc)
        return pdffont

    def _get_tounicode(self, pdfobject):
        # A ToUnicode object is built only once per PDF object
        key = pdfobject.ident()
        if key in self.tounicode:
            tuc = self.tounicode[key]
        else:
            tuc = ToUnicode(pdfobject)
            self.tounicode[key] = tuc
        return tuc

    def get_font(self, fontref, size):
        """
        Return the PDFFont for the font reference at this size, or None if
        the reference is not in the font dictionnary.
        """
        fontobj = self.fontdict.get(fontref)
        if not(fontobj):
            return None
        return self.get_pdffont(fontobj, size)

    def get_used(self):
        # A new list is returned: the callers sort it in place
        return list(self.fontused.values())
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class ToUnicode(PDFStreamHandler):
    """
    Handle the /ToUnicode CMap object found in a font, in order to be able to
    translate the text content to readable text.

    The 'bfchar' and 'bfrange' sections of the CMap are parsed to build a
    sorted list of character maps, used to translate the hexadecimal codes.
    """
    _re_token = re.compile("(" + \
             "(?:\d+\s+(?:begincodespacerange|beginbfchar|beginbfrange))" + "|"\
             "(?:endcodespacerange|endbfchar|endbfrange)" + \
             ")", re.M)

    def __init__(self, pdfobject):
        PDFStreamHandler.__init__(self, pdfobject)
        self.charmaps = []
        pdfobject.stream_decode()
        self.data = pdfobject.stream_text()
        self.parse_cmap(self.data)
        self.debug("Create a ToUnicode object for '%s'" % pdfobject.ident())

    def _split_entries(self, fld):
        # Remove the spaces right inside the '< >' delimiters so that every
        # hexadecimal string is a single whitespace separated item
        fld = re.sub("<\s+", "<", fld)
        fld = re.sub("\s+>", ">", fld)
        return fld.split()

    def parse_cmap(self, data):
        # The data is split on the begin/end tokens: the text found between a
        # 'begin' and its 'end' is the body of the section being opened
        bfchar = None
        bfrange = None
        for fld in self._re_token.split(data):
            if "begincodespacerange" in fld:
                # TODO
                pass
            elif "beginbfchar" in fld:
                bfchar = BfRange(int(fld.split()[0]))
            elif "beginbfrange" in fld:
                bfrange = BfRange(int(fld.split()[0]))
            elif "endcodespacerange" in fld:
                pass
            elif "endbfchar" in fld:
                self.add_bfrange(bfchar)
                bfchar = None
            elif "endbfrange" in fld:
                self.add_bfrange(bfrange)
                bfrange = None
            elif bfchar:
                # Pairs '<code> <unicode>': a range limited to a single code
                items = self._split_entries(fld)
                for pos in range(0, len(items), 2):
                    bfchar.add_mapstr(items[pos], items[pos], items[pos+1])
            elif bfrange:
                # Triplets '<first code> <last code> <first unicode>'
                items = self._split_entries(fld)
                for pos in range(0, len(items), 3):
                    bfrange.add_mapstr(items[pos], items[pos+1], items[pos+2])

    def add_bfrange(self, bfrange):
        self.charmaps.extend(bfrange.charmaps)
        self.charmaps.sort()

    def get_uccode(self, bfchar):
        # The maps are sorted by first code. Once a range starting before the
        # code has been passed without containing it, meeting a range
        # starting after the code means that the code is not mapped: 0 is
        # returned, as when no range matches at all.
        range_passed = False
        for cmap in self.charmaps:
            if bfchar < cmap.bffirst:
                if range_passed:
                    return 0
                continue
            if bfchar <= cmap.bflast:
                return cmap.uccode + (bfchar - cmap.bffirst)
            range_passed = True
        return 0

    def decode_string(self, data):
        # The hexadecimal data is read by groups of 4 digits, each group
        # being a code to translate
        chars = [unichr(self.get_uccode(int(data[pos:pos+4], 16)))
                 for pos in range(0, len(data), 4)]
        return u"".join(chars)

    def decode(self, data):
        # A list of strings gives a list of decoded strings
        if not isinstance(data, list):
            return self.decode_string(data)
        return [self.decode_string(s) for s in data]
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class CharMap:
    """
    Mapping of the codes from 'bffirst' to 'bflast' to the consecutive values
    starting at 'uccode'.
    """
    def __init__(self, bffirst, bflast, uccode):
        self.uccode = uccode
        self.bflast = bflast
        self.bffirst = bffirst

    def __cmp__(self, other):
        # The maps are ordered by their first code: -1, 0 or 1 is returned
        mine, theirs = self.bffirst, other.bffirst
        return (mine > theirs) - (mine < theirs)
Packit 0f19cf
Packit 0f19cf
class BfRange:
    """
    Collect the character maps built from the entries of one CMap section.
    """
    def __init__(self, entry_count):
        # The count is only stored, it is not checked against the entries
        self.entry_count = entry_count
        self.charmaps = []

    def add_mapstr(self, bffirst_str, bflast_str, ucfirst_str):
        # The strings are like <045D>: the delimiters are dropped and the
        # content is read as an hexadecimal number
        bffirst, bflast, ucfirst = [
            int(hexstr[1:-1], 16)
            for hexstr in (bffirst_str, bflast_str, ucfirst_str)]
        self.charmaps.append(CharMap(bffirst, bflast, ucfirst))
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
#
Packit 0f19cf
# Starting from here is the command stuff
Packit 0f19cf
#
Packit 0f19cf
#
Packit 0f19cf
import textwrap
Packit 0f19cf
Packit 0f19cf
class BasicCmd:
    # Base of the command objects. No class docstring on purpose: help()
    # returns the docstring of the class, and this one must give no help.
    def __init__(self):
        pass

    def setup_parser(self, parser):
        # Nothing to add to the parser by default
        return True

    def help(self, cmd):
        # The help is the class docstring when there is a non empty one
        return self.__doc__ or None
Packit 0f19cf
Packit 0f19cf
class PageLayoutCmd(BasicCmd):
    """
    Show the position and fonts used for each text line contained by a page
    """
    # NOTE: the docstring above is what BasicCmd.help() returns for this
    # command, so it must remain the short description of the command.

    def __init__(self, scanner):
        self.scanner = scanner
        # Layout of the columns printed in front of each text line
        layout_fmt = "%5s %5s | %5s %5s | %8s | "
        self.padding = layout_fmt % (" "," "," "," "," ")
        self.headline = layout_fmt % (5*"_",5*"_",5*"_",5*"_",8*"_")
        self.header = layout_fmt % ("dX","dY","X","Y","FONTS")
        # Total width of a printed line, columns included
        self.width = 90
        self.show_matrix = False
        self.raw_text = False
        self.pt_factor = 1

    def setup_parser(self, parser):
        parser.add_argument("-w", "--width",
               help="Width of the printed layout information")
        parser.add_argument("-m", "--show-matrix", action="store_true",
               help="Print absolute Transformation Matrix for each textobject")
        parser.add_argument("-r", "--raw-text", action="store_true",
               help="Print the raw text contained by textobjects")

    def run(self, parser, args):
        """
        Apply the command options and print the layout of every page of the
        page groups of the scanner.
        """
        if args.width:
            self.width = int(args.width)
        if args.show_matrix:
            self.show_matrix = True
        if args.raw_text:
            self.raw_text = True

        for pg in self.scanner.page_groups:
            self.print_page_layout(pg)

    def print_page_layout(self, pdf_pages):
        """
        For each page print the sorted list of the fonts used, then the
        layout of the text objects of its first content stream.
        """
        for page in pdf_pages:
            fonts_used = page.find_fonts()
            fonts_used.sort()
            print("\nPage %d fonts used:" % page.pagenum)
            for i, font in enumerate(fonts_used):
                print("[%d] %-40s %6.2f pt" % (i, font.name(),
                                               self.pt_factor*font.size()))

            print("\nPage %d layout:" % page.pagenum)
            content_stream = page.streams[0]
            # Position of the previous line, to print the relative moves
            xp, yp = 0., 0.
            print(self.header)
            print(self.headline)
            for textobject in content_stream.textobjects:
                xp, yp = self._print_textobject_layout(textobject, xp, yp,
                                                       fonts_used)

    def _print_textobject_layout(self, textobject, xp, yp, fonts_used):
        """
        Print a row per text line of the text object: move from the previous
        line, position, indexes in 'fonts_used' of the fonts of the line, and
        the text wrapped on the remaining width. Return the position of the
        last line (or the given position when there is no line).
        """
        # textwrap.wrap() rejects a width that is not positive: keep at least
        # one column for the text when the requested width is too small
        wraplen = max(1, self.width - len(self.padding))

        m2 = textobject.matrix_absolute()

        for line in textobject.textlines:
            # Track the fonts used per line
            font_line = []
            for seg in line:
                for text_shown in seg.text_shown:
                    font = text_shown.get_font()
                    if not(font):
                        continue
                    idx = fonts_used.index(font)
                    if not(idx in font_line):
                        font_line.append(idx)

            # The line position is given by its first segment
            m2 = line[0].matrix * m2
            if self.show_matrix:
                print("%s" % m2)

            x, y = m2.tx(), m2.ty()
            # Convert before dividing, so that integer translations are not
            # truncated by an integer division
            x, y = float(x)/72, float(y)/72
            dx, dy = x - xp, y - yp
            info = "%5.2f %5.2f | %5.2f %5.2f | %8s | " % \
                  (dx, dy, x, y, font_line)
            if self.raw_text:
                text = "".join([str(s) for s in line])
            else:
                text = "".join([s.text() for s in line])
            textw = textwrap.wrap(text, wraplen)

            # Only the first wrapped row has the layout information
            if textw:
                print("%s%s" % (info, textw[0]))
                for txt in textw[1:]:
                    print("%s%s" % (self.padding, txt))

            xp, yp = x, y
            # The other segments of the line move the current position too
            for l in line[1:]:
                m2 = l.matrix * m2

        return (xp, yp)
Packit 0f19cf
Packit 0f19cf
class PageObjectCmd(BasicCmd):
    """
    List the PDF objects used per page
    """
    # NOTE: the docstring above is what BasicCmd.help() returns for this
    # command, so it must remain the short description of the command.

    def __init__(self, scanner):
        self.scanner = scanner

    def run(self, parser, args):
        # One row per page object, the pages being numbered from 1
        row_fmt = "Page %d %s: contents: %s, resources: %s"
        for page_num, page in enumerate(self.scanner.pdf.page_objects, 1):
            contents = page.descriptor.get("/Contents")
            resources = page.descriptor.get("/Resources")
            print(row_fmt % (page_num, page, contents, resources))
        # Empty line closing the list
        print("")
Packit 0f19cf
Packit 0f19cf
class PdfObjectCmd(BasicCmd):
    """
    Scan data on the PDF objects of the PDF File
    """
    # NOTE: the docstring above is what BasicCmd.help() returns for this
    # command, so it must remain the short description of the command.

    def __init__(self, scanner):
        self.scanner = scanner

    def setup_parser(self, parser):
        # Only one of the actions can be asked per command
        group = parser.add_mutually_exclusive_group()
        group.add_argument("-list", "--list-loaded", action="store_true",
               help="List the object loaded by the scanner")
        group.add_argument("-dict", "--dictionnary",
               metavar="'<number> <generation>'",
               help="Show the dictionnary of the object specified by its "\
                    "reference '<number> <generation>'")
        group.add_argument("-dump", "--dump-stream", nargs=2,
               metavar=("'<number> <generation>'","OUTFILE"),
               help="Write the stream content of the object specified by its "\
                    "reference '<number> <generation>'")

    def run(self, parser, args):
        """
        Run the action selected by the options. Nothing is done when the
        object reference given is not valid.
        """
        if args.list_loaded:
            self.list_pdfobjects()
        elif args.dictionnary:
            ident = self._sanitize_objref(args.dictionnary)
            if not(ident): return
            self.show_dictionnary(ident)
        elif args.dump_stream:
            ident = self._sanitize_objref(args.dump_stream[0])
            if not(ident): return
            self.dump_stream(ident, args.dump_stream[1])

    def _sanitize_objref(self, ident):
        """
        Return the reference as 'number generation' with a single space, or
        an empty string (after a message) if it does not have two fields.
        """
        flds = ident.split()
        if len(flds) != 2:
            print("Invalid object reference: must be in the form "\
                  "'number generation'")
            return ""
        else:
            return "%s %s" % (flds[0], flds[1])

    def show_dictionnary(self, ident):
        """
        Print the dictionnary of the PDF object having this reference.
        """
        pdfobject = self.scanner.pdf.get_object(ident)
        if not(pdfobject):
            print("PDF Object '%s' not found" % ident)
            return
        if pdfobject.stream:
            print("PDF Object '%s' has a stream. Its dictionnary:" % ident)
        else:
            print("PDF Object '%s' dictionnary:" % ident)
        self._print_dictionnary(pdfobject.descriptor)

    def _print_dictionnary(self, descriptor, level=1):
        # The nested descriptors are printed recursively, one more indent
        # level each time
        indent = "  "*level
        print("%s<<" % indent)
        for p, v in descriptor.infos().items():
            if isinstance(v, PDFDescriptor):
                print("%s%s:" % (indent, p))
                self._print_dictionnary(v, level=level+1)
            else:
                print("%s%s: %s" % (indent, p, v))
        print("%s>>" % indent)

    def list_pdfobjects(self):
        """
        Print the count of PDF objects loaded, and the count per object type.
        """
        pdfobjects = self.scanner.pdf.pdfobjects
        print("Found %s PDFObjects" % pdfobjects.count())
        print("Found the following PDFObject types:")
        types = pdfobjects.types()
        types.sort()
        total = 0
        for typ in types:
            n_type = len(pdfobjects.get_objects_by_type(typ))
            print(" %20s: %5d objects" % (typ, n_type))
            total = total + n_type
        print(" %20s: %5d objects" % ("TOTAL", total))

    def dump_stream(self, ident, outfile):
        """
        Write the decoded stream of the PDF object having this reference to
        the file 'outfile'.
        """
        pdfobject = self.scanner.pdf.get_object(ident)
        if not(pdfobject):
            print("PDF Object '%s' not found" % ident)
            return
        if not(pdfobject.stream):
            print("PDF Object '%s' has no stream. Give up." % ident)
            return
        pdfobject.stream_decode()
        # Get the data before opening the file, and let the 'with' statement
        # close the file even if the write fails
        data = pdfobject.stream_text()
        with open(outfile, "wb") as f:
            f.write(data)
        print("PDF Object '%s' stream written to file %s" % (ident, outfile))
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PageFontCmd(BasicCmd):
    """
    List the fonts used and their size, per page or as a summary, depending
    on the name of the command run.
    """
    def __init__(self, scanner):
        self.scanner = scanner
        self.header_fmt = "%4s %-40s %s"
        # Factor applied to the font sizes, and unit printed after the sizes
        self.pt_factor = 1
        self.font_unit = "pt"

    def help(self, cmd):
        # The object serves two commands: the help depends on the command
        if cmd == "font_summary":
            return "List the fonts used and their size in the specified pages"
        return "List the fonts used and their size for each page"

    def setup_parser(self, parser):
        parser.add_argument("-pt", "--point-type",
              help="Point type to use: 'dtp' (default), 'tex'")

    def run(self, parser, args):
        # With TeX points the sizes are scaled by 72.27/72
        if args.point_type == "tex":
            self.pt_factor = 72.27/72
            self.font_unit = "pt tex"

        if args.name == "font_summary":
            self.print_font_summary()
        else:
            self.print_font_page()

    def print_font_page(self):
        for page_group in self.scanner.page_groups:
            self.print_fonts_in_pages(page_group)

    def print_fonts_in_pages(self, pdf_pages, show=True):
        # Separator row, also printed under the header
        rule = self.header_fmt % (4*"-", 40*"-", 10*"-")
        if show:
            print(self.header_fmt % ("PAGE", "FONT", "SIZE"))
            print(rule)

        for page in pdf_pages:
            # The fonts are looked for and sorted even when nothing is shown
            fonts_used = page.find_fonts()
            fonts_used.sort()
            if not show:
                continue
            for font in fonts_used:
                print("%4d %-40s %6.2f %s" % (page.pagenum, font.name(),
                          self.pt_factor * font.size(), self.font_unit))
            print(rule)

    def print_font_summary(self):
        # Label of each non empty page group: '<first>' or '<first>-<last>'
        labels = []
        for page_group in self.scanner.page_groups:
            if not(page_group):
                continue
            label = "%d" % (page_group[0].pagenum)
            if len(page_group) > 1:
                label += "-%d" % (page_group[-1].pagenum)
            labels.append(label)

        print("\nFonts used in pages %s:" % (",".join(labels)))
        fonts_used = self.scanner.pdf.fontmgr.get_used()
        fonts_used.sort()
        for font in fonts_used:
            size = self.pt_factor*font.size()
            print("%-40s %6.2f %s" % (font.name(), size, self.font_unit))
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
class PDFScannerCommand:
    """
    Front-end of the scanner: declares the options and the commands in the
    argument parser, loads the PDF file and runs the requested commands on
    the requested pages.
    """
    def __init__(self):
        # Known commands, in the order they are run when several are asked
        self._commands = [
             ("page_object", PageObjectCmd),
             ("page_font", PageFontCmd),
             ("page_layout", PageLayoutCmd),
             ("font_summary", PageFontCmd),
             ("pdfobject", PdfObjectCmd)
            ]
        self.commands_to_run = []
        self.pdf = None
        self.page_ranges = []
        self.page_groups = []
        self.fonts_used = {}

    def commands(self):
        # Names of the known commands
        return [name for name, _cls in self._commands]

    def setup_options(self, parser):
        parser.add_argument("-p", "--pages", action="append",
              help="Page range in the form '<first>[-[<last>]]'")
        parser.add_argument("-v", "--verbose", action="append",
              help="Verbose mode in the form '[group:]level' with level "\
                   "in 'debug', 'info', 'warning', 'error' and "\
                   "group in 'pdffile', 'pdfobject', 'descriptor'")
        parser.add_argument("-c", "--cache-stream-dir",
              help="Directory where to store the decompressed stream")
        parser.add_argument("-m", "--no-cache-stream", action="store_true",
              help="No stream cache on disk used: leave streams in memory")
        parser.add_argument("-d", "--cache-remanent", action="store_true",
              help="Equivalent to -fremanent")
        parser.add_argument("-f", "--cache-flags",
              help="Comma separated list of stream cache setup options: "\
                   "'remanent' and/or 'refresh'")

    def setup_parser(self, parser):
        self.setup_options(parser)

        if not(self._commands):
            return
        partial = True
        subparsers = parser.add_subparsers()
        # Don't duplicate objects used for several commands: a single object
        # is built per command class
        cmdobj_by_class = {}
        for cmd, cls in self._commands:
            if cls not in cmdobj_by_class:
                cmdobj_by_class[cls] = cls(self)
            cmdobj = cmdobj_by_class[cls]

            kwargs = {}
            help_text = cmdobj.help(cmd)
            if help_text:
                kwargs["help"] = help_text
            subparser = subparsers.add_parser(cmd, **kwargs)
            partial = cmdobj.setup_parser(subparser) or partial
            # The parsed arguments tell which method to run for the command
            subparser.set_defaults(run=cmdobj.run, name=cmd)
        return partial

    def prepare(self, parser, options, argslist, pdffile):
        self.options = options
        # Sort the commands in the right order, that is the order of the
        # known commands and not the order of the command line
        requested = [args.name for args in argslist]
        self.commands_to_run = [argslist[requested.index(cmd)]
                                for cmd in self.commands()
                                if cmd in requested]

        self.logger_setup(self._option_group_loglevels())

        self.page_ranges = self._option_page_ranges()

        self.pdf = PDFFile(stream_manager=self._option_cache_setup())

    def cleanup(self):
        if self.pdf:
            self.pdf.cleanup()

    def run(self, parser, options, argslist, pdffile):
        self.prepare(parser, options, argslist, pdffile)
        self.pdf.load(pdffile)
        self.pdf.load_pages()
        self._build_pages()
        for args in self.commands_to_run:
            args.run(parser, args)

    def _build_pages(self):
        # One group of pages is built per page range
        page_count = len(self.pdf.page_objects)
        for page_range in self.page_ranges:
            page_first, page_last = self._page_range(page_range, page_count)
            page_objects = self.pdf.page_objects[page_first-1:page_last]
            self.page_groups.append(
                self._build_pages_from_objects(page_objects, page_first))

    def _build_pages_from_objects(self, page_objects, page_first):
        # The pages are numbered from the number of the first page
        return [PDFPage(self.pdf, page_object, pagenum)
                for pagenum, page_object in enumerate(page_objects, page_first)]

    def _page_range(self, page_range, max_range):
        # 0 stands for the first page as a range start, and for the last page
        # as a range end. A given range is fixed in place.
        if not(page_range):
            page_range = [1, max_range]
        if page_range[0] == 0:
            page_range[0] = 1
        if page_range[1] == 0 or page_range[1] > max_range:
            page_range[1] = max_range
        return page_range

    def _option_page_ranges(self):
        # No page option: a single range covering all the pages
        if not(self.options.pages):
            return [[0, 0]]

        page_ranges = []
        for spec in self.options.pages:
            # The added '-x' allows to tell '<first>' ('x' is then the second
            # field) from '<first>-' (the second field is then empty)
            first, last = (spec + "-x").split("-")[0:2]
            if not(last):
                last = 0
            elif (last == "x"):
                last = first
            page_ranges.append([int(first), int(last)])

        return page_ranges

    def _option_group_loglevels(self):
        log_groups = {"pdffile":   "info",
                      "pdfobject": "info",
                      "descriptor": "error",
                      "base": "info"}

        log_levels = ("debug", "info", "warning", "error")

        verbose = self.options.verbose
        if not(verbose):
            return log_groups

        groups = log_groups.keys()
        for verbose_opt in verbose:
            # When no group is specified the level applies to all the groups
            group, level = ("all:" + verbose_opt).split(":")[-2:]
            if level not in log_levels:
                print("Invalid verbose level: '%s'" % level)
                continue
            if group == "all":
                for name in groups:
                    log_groups[name] = level
            elif group in groups:
                log_groups[group] = level
            else:
                print("Invalid verbose group: '%s'" % group)
                continue
        return log_groups

    def _option_cache_setup(self):
        cache_in_memory = self.options.no_cache_stream
        cache_dirname = self.options.cache_stream_dir

        # Names of the cache flags asked, the 'remanent' shortcut option
        # being added after the ones of the flags option
        flag_names = []
        if self.options.cache_flags:
            flag_names = self.options.cache_flags.split(",")
        if self.options.cache_remanent:
            flag_names.append("remanent")

        flags = 0
        for flag_name in flag_names:
            if flag_name == "remanent":
                flags = flags | StreamManager.CACHE_REMANENT
            elif flag_name == "refresh":
                flags = flags | StreamManager.CACHE_REFRESH

        if cache_in_memory:
            return StreamManager(cache_method="memory")

        if not(cache_dirname):
            return StreamManager(flags=flags)

        cache_dirname = os.path.realpath(cache_dirname)
        if not(os.path.exists(cache_dirname)):
            print("Invalid cache dir: '%s'. Temporary dir used instead" % \
                  cache_dirname)
            return None
        return StreamManager(cache_method="file",
                             cache_dirname=cache_dirname,
                             flags=flags)

    def logger_setup(self, log_groups):
        loglevels = { "error":   logging.ERROR,
                      "warning": logging.WARNING,
                      "info":    logging.INFO,
                      "debug":   logging.DEBUG }

        # All the group loggers write the bare messages to the same handler
        console = logging.StreamHandler()
        console.setFormatter(logging.Formatter("%(message)s"))

        for group, level in log_groups.items():
            log = logging.getLogger("pdfscan.%s" % group)
            log.setLevel(loglevels.get(level, logging.INFO)-1)
            log.addHandler(console)
Packit 0f19cf
Packit 0f19cf
Packit 0f19cf
def main():
    """
    Entry point of the script: parse the command line, run the requested
    commands on the PDF file given as last argument, and exit with the
    return code of the error handler.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Scan information from a PDF file')
    parser.add_argument("-D", "--dump-stack", action="store_true",
          help="Dump error stack (debug purpose)")

    scanner = PDFScannerCommand()
    scanner.setup_parser(parser)

    # First parsing, only to get the global options
    options, remain_args =  parser.parse_known_args()

    # Several commands can be given: the arguments not consumed by a parsing
    # are parsed again, until only the PDF file remains
    argslist = []
    remain_args = sys.argv[1:]
    while len(remain_args) > 1:
        args, remain_args =  parser.parse_known_args(remain_args)
        args.remain_args = remain_args
        argslist.append(args)

    if not(remain_args) or remain_args[0] in scanner.commands():
        print("Missing the PDF File")
        # Asking for the help makes the parser print the usage and exit
        parser.parse_args(["-h"])

    error = ErrorHandler()
    if options.dump_stack: error.dump_stack()

    try:
        pdffile = remain_args[0]
        scanner.run(parser, options, argslist, pdffile)
    except Exception as e:
        error.failure_track("Error: '%s'" % (e))
    finally:
        # Clean up whatever happens, an interruption included, and not only
        # when the run ends or fails with an Exception
        scanner.cleanup()

    sys.exit(error.rc)

if __name__ == "__main__":
    main()