|
Packit |
0f19cf |
#! /usr/bin/env python
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
# This tool is provided by dblatex (http://dblatex.sourceforge.net) and has
|
|
Packit |
0f19cf |
# the same copyright.
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
# It was initially developed to find out the fonts used and their size because
|
|
Packit |
0f19cf |
# as strange as it may seem, no obvious tool gives the font sizes used (pdffonts
|
|
Packit |
0f19cf |
# just lists the font objects of the PDF). The script can be improved to give
|
|
Packit |
0f19cf |
# more information in a next release.
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
# To understand the PDF format, read:
|
|
Packit |
0f19cf |
# * The reference:
|
|
Packit |
0f19cf |
# http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/
|
|
Packit |
0f19cf |
# pdf_reference_1-7.pdf
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
# * A useful introduction:
|
|
Packit |
0f19cf |
# http://www.adobe.com/content/dam/Adobe/en/technology/pdfs/
|
|
Packit |
0f19cf |
# PDF_Day_A_Look_Inside.pdf
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
import os
|
|
Packit |
0f19cf |
import sys
|
|
Packit |
0f19cf |
import traceback
|
|
Packit |
0f19cf |
import zlib
|
|
Packit |
0f19cf |
import re
|
|
Packit |
0f19cf |
import logging
|
|
Packit |
0f19cf |
import tempfile
|
|
Packit |
0f19cf |
import shutil
|
|
Packit |
0f19cf |
import struct
|
|
Packit |
0f19cf |
import codecs
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class ErrorHandler:
    """
    Track failures: remember the last return code, report the message on
    stderr, and optionally dump the current exception traceback.
    """
    def __init__(self):
        self._dump_stack = False
        self.rc = 0

    def dump_stack(self, dump=True):
        """Enable/disable traceback dumping on failure."""
        self._dump_stack = dump

    def failure_track(self, msg, rc=1):
        """Record the failure code *rc* and print *msg* on stderr."""
        self.rc = rc
        # sys.stderr.write() instead of the Python2-only 'print >>' syntax;
        # the output is identical (message followed by a newline)
        sys.stderr.write("%s\n" % msg)
        if self._dump_stack:
            traceback.print_exc()

    def failed_exit(self, rc=1, msg=""):
        """Report the failure and terminate the process with code *rc*."""
        # Fix: 'msg' was referenced but never defined (NameError); it is now
        # an optional parameter, appended after 'rc' to stay call-compatible
        self.failure_track(msg, rc)
        sys.exit(self.rc)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def pdfstring_is_list(data):
    """Tell whether a raw PDF string is an array literal, i.e. '[...]'."""
    return data and data.startswith("[") and data.endswith("]")
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFResolver:
    """
    Process-wide registry of the single object resolver in use, so any
    entity can reach it without carrying a reference around.
    """
    # The currently published resolver (None until one is set)
    _resolver = None

    @classmethod
    def set_resolver(cls, resolver):
        """Publish *resolver* as the global resolver."""
        cls._resolver = resolver

    @classmethod
    def get_resolver(cls):
        """Return the published resolver, or None when none was set."""
        return cls._resolver
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFBaseObject:
    """Base class providing the logging helpers shared by the PDF entities."""
    # Default channel; subclasses override _log with their own logger
    _log = logging.getLogger("pdfscan.base")

    def __init__(self):
        pass

    def debug(self, text):
        """Log *text* at DEBUG level on this object's channel."""
        self._log.debug(text)

    def warning(self, text):
        """Log *text* at WARNING level on this object's channel."""
        self._log.warning(text)

    def error(self, text):
        """Log *text* at ERROR level on this object's channel."""
        self._log.error(text)

    def info(self, text):
        """Log *text* at INFO level on this object's channel."""
        self._log.info(text)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFFile(PDFBaseObject):
    """
    Main object that parses the PDF file and extract the objects needed for
    scanning.
    """
    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self, stream_manager=None):
        self._file = None          # file object opened by load()
        self.filesize = 0
        self.startxref_pos = 0     # offset announced by 'startxref'
        self.trailer = None        # consolidated trailer (most recent wins)
        self.xref_first = None     # oldest xref in the /Prev chain
        self.xref_table = {}       # ident -> byte offset
        self.xref_objstm = {}      # ident -> (objstm id, index)
        self.objstm_objects = {}   # objstm ident -> PDFObjectStream
        self.page_objects = []     # flat list of /Page objects
        self.pdfobjects = PDFObjectGroup()
        self.stream_manager = stream_manager or StreamManager()
        # Create and publish the object resolver
        self.resolver = PDFObjectResolver(self)
        PDFResolver.set_resolver(self.resolver)
        # Create a global font manager
        self.fontmgr = FontManager({})

        # Detect the beginning of a PDF Object
        self.re_objstart = re.compile(r"(\d+) (\d+) obj(.*$)", re.DOTALL)

    def cleanup(self):
        """Remove the temporary files created for the decoded streams."""
        self.stream_manager.cleanup()

    def load(self, filename):
        """Open *filename* and build the consolidated xref table."""
        self.filesize = os.path.getsize(filename)
        self._file = open(filename, "rb")
        self.read_xref()
        self.build_final_xref()

    def find_startxref(self, offset_trailer=160):
        """
        Locate the 'startxref' marker near the file end and return the
        offset it announces (0 on failure).
        """
        # Read backward by chunks until the marker shows up or the whole
        # file has been scanned. Fix: the original condition
        # 'not(... in data) or offset == 0' looped forever on a file
        # without any 'startxref' marker.
        offset, data = self.filesize, ""
        while ("startxref" not in data) and offset > 0:
            offset = max(0, offset - offset_trailer)
            self._file.seek(offset)
            data = self._file.read(offset_trailer) + data

        m = re.search(r"\sstartxref\s+(\d+)\s+%%EOF", data, re.M)
        if not(m):
            self.error("Problem in PDF file: startxref not found")
            return 0
        self.startxref_pos = int(m.group(1))
        return self.startxref_pos

    def read_xref(self):
        """
        Follow the chain of xref sections (or /XRef objects), newest
        first, linking them together.
        """
        startxref = self.find_startxref()
        xref = None

        while startxref:
            self._file.seek(startxref)
            line = self._file.readline()
            m = re.search(r"xref\s(.*)", line, re.M | re.DOTALL)
            if m:
                # Classical textual 'xref' section
                found_xref = PDFXrefSection(self._file)
                found_xref.read_table(m.group(1))
            elif self.re_objstart.search(line):
                self.info("Xref section not found. Try to load XRef object")
                pdfobject, remain_line = self._parse_object(startxref)
                found_xref = PDFXrefObject(pdfobject)
            else:
                # Robustness fix: neither form found -> report and stop
                # instead of crashing on an unbound 'found_xref'
                self.error("No xref section or object at offset %d" \
                           % startxref)
                break

            # /Prev chains to the previous (older) xref, 0 ends the chain
            startxref = int(found_xref.trailer.get("/Prev", 0))

            if xref: xref.set_older(found_xref)
            xref = found_xref

        self.xref_first = xref

    def build_final_xref(self):
        """
        Merge the chained xrefs, oldest first, so that the most recent
        entries and trailer override the older ones.
        """
        xref = self.xref_first
        while xref:
            self.trailer = xref.trailer
            self.xref_table.update(xref.table)
            self.xref_objstm.update(xref.objstm)
            xref = xref.newer

    def get_objstm(self, objstm_id):
        """Return the already-built object stream *objstm_id*, or None."""
        return self.objstm_objects.get(objstm_id, None)

    def create_objstm(self, pdfobject):
        """Register *pdfobject* and wrap it as a PDFObjectStream."""
        self.debug("Create objstm %s" % pdfobject.ident())
        pdfobject.compute()
        pdfobject.stream_decode()
        self.pdfobjects.add_object(pdfobject)
        objstm = PDFObjectStream(pdfobject)
        self.objstm_objects[objstm.ident()] = objstm
        return objstm

    def xref_resolve_object(self, ident):
        """Resolve *ident* from the plain xref table (None if unknown)."""
        offset = self.xref_table.get(ident, 0)
        if offset != 0:
            pdfobject, remain_line = self._parse_object(offset)
            return pdfobject

    def xref_resolve(self, ident):
        """
        Resolve *ident* either as a plain object or through the object
        stream that contains it.
        """
        # Try to resolve a standard object
        pdfobject = self.xref_resolve_object(ident)
        if pdfobject:
            return pdfobject

        # Find the ObjStm infos that contains that object
        objstm_data = self.xref_objstm.get(ident, 0)
        if objstm_data == 0:
            self.warning("ObjStm id for '%s' not found in xref table" % ident)
            return None

        # If the ObjStm itself is not resolved, resolve it first.
        # An ObjStm ident always has revision 0.
        objstm_id = "%d 0" % objstm_data[0]
        object_idx = objstm_data[1]

        objstm = self.get_objstm(objstm_id)
        if not(objstm):
            pdfobject = self.xref_resolve_object(objstm_id)
            if pdfobject: objstm = self.create_objstm(pdfobject)
        if not(objstm):
            self.error("Object '%s' cannot be resolved: ObjStm '%s' not found" \
                       % (ident, objstm_id))
            return None

        # Ok, now get the object from the ObjStm
        return objstm.get_object(object_idx)

    def resolve_object(self, ident):
        """Return the object for *ident*, loading and caching on demand."""
        pdfobject = self.pdfobjects.get_object(ident)
        if not(pdfobject):
            pdfobject = self.xref_resolve(ident)
            if pdfobject:
                self.pdfobjects.add_object(pdfobject)
        return pdfobject

    def get_object(self, ident):
        """Resolve an indirect reference like '12 0 R' into its object."""
        ident = ident.replace(" R", "").strip()
        pdfobject = self.resolver.get(ident)
        if pdfobject:
            pdfobject.link_to(self.resolver)
        return pdfobject

    def _parse_object(self, offset):
        """
        Read the PDF object starting at *offset* up to its 'endobj' tag.
        Return (object, text remaining after 'endobj').
        """
        pdfobj = None
        parsed_object = None
        remain_line = ""

        self._file.seek(offset)

        while not(parsed_object):
            line = self._file.readline()
            if not(line):
                break

            while line:
                if pdfobj:
                    fields = line.split("endobj", 1)
                    if len(fields) > 1:
                        # End of object found: flush and parse
                        if fields[0]:
                            pdfobj.append_string(fields[0])
                        pdfobj.compute()
                        remain_line = fields[1]
                        parsed_object = pdfobj
                    else:
                        pdfobj.append_string(line)
                    line = ""
                else:
                    m = self.re_objstart.search(line)
                    if m:
                        number, revision = m.group(1), m.group(2)
                        pdfobj = PDFObject(number, revision,
                                           stream_manager=self.stream_manager)
                        # Keep what follows 'obj' on the same line
                        line = m.group(3)
                    else:
                        # drop the line
                        line = ""

        return (parsed_object, remain_line)

    def _expand_pages(self, page_kids):
        """
        Flatten a /Pages tree into the list of unitary /Page objects.
        Iterations stop when no intermediate /Pages group remains.
        """
        page_list = page_kids
        has_kid = len(page_list)
        while has_kid:
            newlist = []
            has_kid = 0
            for kid in page_list:
                kid.link_to(self.resolver)
                if kid.get_type() == "/Pages":
                    kids = kid.descriptor.get("/Kids")
                    self.debug("Expand page list: %s -> %s" % (kid, kids))
                    has_kid += len(kids)
                elif kid.get_type() == "/Page":
                    kids = [kid]
                else:
                    self.error("%s: %s" % (kid, kid.descriptor.params))
                    self.error("%s: What's wrong? '%s'" % (kid, kid.get_type()))
                    kids = []
                newlist = newlist + kids
            page_list = newlist
        return page_list

    def load_pages(self):
        """Load the document catalog and build the flat list of pages."""
        root = self.trailer.get("/Root")
        catalog = self.get_object(root)
        pages = catalog.descriptor.get("/Pages")
        page_count = int(pages.descriptor.get("/Count"))

        self.info("Found %d pages" % page_count)
        pages.link_to(self.resolver)
        page_kids = pages.descriptor.get("/Kids")
        self.page_objects = self._expand_pages(page_kids)
        if len(self.page_objects) != page_count:
            self.error("Unconsistent pages found: %d vs %d" % \
                       (len(self.page_objects), page_count))
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFObjectResolver:
    """Adapter giving dict-like get() access to the objects of a PDFFile."""

    def __init__(self, pdffile):
        self.pdffile = pdffile

    def get(self, ident, default=None):
        """Resolve *ident* through the PDF file, or return *default*."""
        pdfobject = self.pdffile.resolve_object(ident)
        return pdfobject if pdfobject else default
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFObjectGroup(PDFBaseObject):
    """
    Group of the PDF Objects contained in a file. This wrapper is a
    dictionary of the objects, and consolidates the links between the
    objects.
    """
    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self):
        self.pdfobjects = {}   # ident -> PDFObject
        self.objtypes = {}     # "/Type" value -> list of objects
        self.unresolved = []   # objects whose cross-references are pending

    def count(self):
        """Number of objects registered so far."""
        # Fix: len() on the dict directly; len(values()) built a throwaway
        # list for nothing
        return len(self.pdfobjects)

    def types(self):
        """The object types seen so far."""
        return self.objtypes.keys()

    def add_object(self, pdfobject):
        """Register *pdfobject*, indexing it by ident and by type."""
        self.pdfobjects[pdfobject.ident()] = pdfobject
        # Objects without a /Type are filed under 'misc'
        objtype = pdfobject.get_type() or "misc"
        self.objtypes.setdefault(objtype, []).append(pdfobject)
        self.unresolved.append(pdfobject)

    def get_objects_by_type(self, objtype):
        """All the registered objects of this /Type."""
        return self.objtypes.get(objtype, [])

    def get_object(self, ident):
        """The object registered under *ident*, or None."""
        return self.pdfobjects.get(ident, None)

    def link_objects(self):
        """Link every pending object, keeping those still unresolved."""
        self.debug("%d objects to resolve" % (len(self.unresolved)))
        unresolved = []
        for pdfobj in self.unresolved:
            if pdfobj.link_to(self.pdfobjects):
                unresolved.append(pdfobj)
        self.unresolved = unresolved

    def stream_decode(self):
        """Decode the stream of every registered object."""
        for pdfobj in self.pdfobjects.values():
            pdfobj.stream_decode()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFPage:
    """
    Scanning view of a single /Page object: resolves its /Resources and
    /Contents entries and wraps each content stream for font analysis.
    """
    def __init__(self, pdf, page, pagenum=0):
        self.pagenum = pagenum
        self.pdf = pdf
        contents = page.descriptor.get("/Contents")
        resources = page.descriptor.get("/Resources")

        # /Resources is either an inline dictionary or an indirect object
        if isinstance(resources, PDFDescriptor):
            rsc_descriptor = resources
        else:
            rsc_descriptor = resources.descriptor
        rsc_descriptor.link_to(pdf.resolver)

        # Same duality for the /Font entry, which may be missing entirely
        fontdict = {}
        font = rsc_descriptor.get("/Font")
        if font:
            font.link_to(pdf.resolver)
            if isinstance(font, PDFDescriptor):
                fontdict = font.infos()
            else:
                fontdict = font.descriptor.infos()

        # Normalize /Contents to a list of content objects
        if not(isinstance(contents, list)):
            contents = [contents]

        self.page = page
        self.contents = contents
        self.fontdict = fontdict
        self.fontmgr = FontManager(fontdict, pdf.fontmgr)
        self.streams = []

        self.link_to(pdf.resolver)
        self.load_streams()

    def link_to(self, resolver):
        """Resolve the indirect references of every content element."""
        for content in self.contents:
            content.link_to(resolver)

    def load_streams(self):
        """Wrap each content object into a PDFContentStream."""
        for content in self.contents:
            self.streams.append(PDFContentStream(content, self.fontmgr))

    def find_fonts(self):
        """Return the fonts actually used by the page content."""
        return self.fontmgr.get_used()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFXrefSection(PDFBaseObject):
    """
    Section starting by 'xref' and followed by the 'trailer'. The xref data
    contain information about how to access to objects in the file and is
    therefore a crucial part of the object resolution.
    """
    _log = logging.getLogger("pdfscan.xref")

    # Extract a '<<...>>' dictionary (used to grab the trailer).
    # NOTE(review): this pattern looks corrupted by extraction -- '(?]' is
    # not valid regex syntax and raises re.error at class creation time.
    # Restore it from the upstream dblatex pdfscan sources before use.
    _re_desc = re.compile("(<<(?:(?]|(?)>(?!>))*>>)",
                          re.MULTILINE)

    def __init__(self, fd):
        # Trailer dictionary (PDFDescriptor), filled by read_table()
        self.trailer = None
        # ident "num rev" -> byte offset of the object in the file
        self.table = {}
        # ident -> (objstm id, index); always empty for a textual section,
        # kept for a uniform interface with PDFXrefObject
        self.objstm = {}
        self._file = fd
        self.older = None   # previous xref in the /Prev chain
        self.newer = None   # more recent xref, set by set_older()

    def set_older(self, older):
        """Chain *older* behind this section (doubly linked)."""
        self.older = older
        older.newer = self

    def _xref_fill_entry(self, fields, obj_id):
        """Record one 'offset revision n|f' line for object *obj_id*."""
        offset, revision, what = fields
        # Only in-use ('n') entries are recorded; free ('f') are skipped
        if what == "n":
            ident = "%d %d" % (obj_id, int(revision))
            self.table[ident] = int(offset)

    def read_table(self, linestart=""):
        """
        Read the xref subsections up to the 'trailer' keyword, then parse
        the trailer dictionary that follows.
        """
        line = linestart.strip() or self._file.readline()
        subsection = line.split()

        while subsection[0] != "trailer":
            # Each subsection header is 'first-id object-count'
            start_ref = int(subsection[0])
            object_count = int(subsection[1])
            # Some writers put the first entry on the header line itself
            if len(subsection) == 5:
                self._xref_fill_entry(subsection[2:], start_ref)
                start_ref += 1
                object_count -= 1

            for i in range(object_count):
                line = self._file.readline()
                self._xref_fill_entry(line.split(), start_ref+i)

            line = self._file.readline()
            subsection = line.split()

        if subsection[0] == "trailer":
            data = " ".join(subsection)

            # Ensure we have a complete dictionary
            while not(">>" in data):
                data += self._file.readline()

            m = self._re_desc.search(data)
            if not(m):
                self.error("Problem in PDF file: cannot find valid trailer")
                return
            self.trailer = PDFDescriptor(string=m.group(1))
            self.trailer.compute()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFStreamHandler:
    """
    Core abstract class in charge to handle the stream of <pdfobject>.
    Identification and logging are delegated to the wrapped object.
    """
    def __init__(self, pdfobject):
        # The PDFObject whose stream is handled
        self.stream_object = pdfobject

    def ident(self):
        """Identifier of the underlying stream object."""
        return self.stream_object.ident()

    def debug(self, text):
        self.stream_object.debug(text)

    def warning(self, text):
        self.stream_object.warning(text)

    def error(self, text):
        self.stream_object.error(text)

    def info(self, text):
        self.stream_object.info(text)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFXrefObject(PDFStreamHandler):
    """
    A specific object that contains XRef entries in binary format. It is an
    alternative to the xref section.
    """
    def __init__(self, pdfobject):
        PDFStreamHandler.__init__(self, pdfobject)
        # The XRef object's own dictionary doubles as the trailer
        self.trailer = pdfobject.descriptor
        self.table = {}    # ident -> byte offset (type 1 entries)
        self.objstm = {}   # ident -> (objstm id, index) (type 2 entries)
        self.older = None
        self.newer = None

        if pdfobject.descriptor.get("/Type") != "/XRef":
            self.error("Not an XRef object. Give up")
            return

        # /W gives the byte width of each of the 3 fields of an entry,
        # e.g. '[1 2 1]'
        _format = pdfobject.descriptor.get("/W")
        _format = _format.replace("[", "").replace("]", "")
        self._format = [ int(f) for f in _format.split() ]

        # An /XRef object must contain a stream
        pdfobject.stream_decode()
        self.data = pdfobject.stream_text()
        self.read_table()

    def set_older(self, older):
        """Chain *older* behind this xref (doubly linked)."""
        self.older = older
        older.newer = self

    def _xref_fill_entry(self, fields, obj_id):
        """Record a type-1 entry: object stored at a plain byte offset."""
        offset, revision, what = fields
        if what == "n":
            ident = "%d %d" % (obj_id, int(revision))
            self.table[ident] = int(offset)
            self.debug("Record xref entry: '%s' @ %s" % (ident, offset))

    def _xref_fill_objstm(self, fields, obj_id):
        """Record a type-2 entry: object packed inside an object stream."""
        objstm_id, obj_index = fields
        # Objects stored in a stream always have revision 0
        ident = "%d %d" % (obj_id, 0)
        self.objstm[ident] = (objstm_id, obj_index)
        self.debug("Record xref entry in objstm: '%s' @ %s" % \
                   (ident, fields))

    def _int_of(self, string):
        # Convert to int from bytes string that can be of any size
        # (big-endian; relies on Python 2 str iteration where each 'c'
        # is a one-character string for struct.unpack("B", c))
        m = len(string)
        d = 0
        for i, c in enumerate(string):
            d += (1 << (8*(m - i-1))) * struct.unpack("B", c)[0]
        return d

    def read_table(self, linestart=""):
        """
        Decode the fixed-width binary entries of the stream; field widths
        come from /W and fields[0] selects the entry type.
        """
        data = self.data
        fields = 3 * [0]
        entry_size = sum(self._format)
        # TODO: use /Index
        obj_id = 0

        while data:
            first = 0
            last = 0
            for i in range(3):
                last += self._format[i]
                fields[i] = self._int_of(data[first:last])
                first = last

            data = data[entry_size:]

            if fields[0] == 1:
                # Type 1: ordinary object located at byte offset fields[1]
                self._xref_fill_entry(fields[1:3] + ["n"], obj_id)
            elif fields[0] == 2:
                # Type 2: fields[2]-th object inside object stream fields[1]
                self._xref_fill_objstm(fields[1:3], obj_id)

            obj_id += 1
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFObjectStream(PDFStreamHandler):
    """
    A PDF Object Stream contains in its stream some compressed PDF objects.
    This class works on a PDF object stream to build the contained PDF
    objects.
    """
    def __init__(self, pdfobject):
        PDFStreamHandler.__init__(self, pdfobject)
        self._pdfobjects = []   # lazily built by compute()

    def pdfobjects(self):
        """Return all the embedded objects, building them on first call."""
        if not(self._pdfobjects):
            self.compute()
        return self._pdfobjects

    def _getinfo(self, what):
        """Shortcut to a key of the stream object's dictionary."""
        return self.stream_object.descriptor.get(what)

    def get_object(self, idx):
        """Return the *idx*-th embedded object, or None when out of range."""
        if not(self._pdfobjects):
            self.compute()
        if idx < 0 or idx >= len(self._pdfobjects):
            return None
        return self._pdfobjects[idx]

    def parse_object_list(self, data):
        """
        Parse the stream header: a flat sequence of pairs locating each
        embedded object.
        """
        values = data.split()
        objlist = []

        for i in range(0, len(values), 2):
            # The pair is ('object number', byte_offset)
            objlist.append((values[i], int(values[i+1])))
        self.objlist = objlist
        return objlist

    def compute(self):
        """Decode the stream and build the list of PDFObjects it packs."""
        _type = self._getinfo("/Type")
        if _type != "/ObjStm":
            self.error("Cannot read object stream: Invalid type '%s'" % _type)
            return

        nb_objects = int(self._getinfo("/N"))       # announced object count
        objlist_b = int(self._getinfo("/First"))    # offset of first object
        stream = self.stream_object.stream_cache

        objlist = self.parse_object_list(stream.read(objlist_b))

        if len(objlist) != nb_objects:
            # Fix: 'nb_object' was an undefined name (NameError whenever
            # this warning fired), and the wrapped string lacked a space
            # ('%dobjects')
            self.warning("Error in parsing the Stream Object: found %d "
                         "objects instead of %d" % (len(objlist), nb_objects))

        # List terminator, so the size of the last object can be computed
        objlist.append(("",-1))

        bytes_read = 0
        for i in range(len(objlist)-1):
            # In ObjectStream, a PDF object revision is always '0'
            number, revision = objlist[i][0], "0"

            # The size of the object data is given by the position of the
            # next one; the terminator (-1) makes the last read go to the
            # end of the stream
            objsize = objlist[i+1][1] - bytes_read
            if objsize >= 0:
                data = stream.read(objsize)
            else:
                data = stream.read()
            bytes_read += len(data)
            self.debug("Object[%d] in stream: '%s' has %d bytes" % \
                       (i, number, objsize))

            # Build the PDF Object from stream data
            pdfobj = PDFObject(number, revision)
            pdfobj.append_string(data)
            pdfobj.compute()
            self._pdfobjects.append(pdfobj)

        stream.close()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFObject:
    """
    A PDF Object contains the data between the 'obj' ... 'endobj' tags.
    It has a unique identifier given by the (number,revision) pair.
    The data contained by a PDF object can be dictionaries (descriptors),
    stream contents and other stuff.
    """
    # Extract a dictionary '<<...>>' leaf (does not contain another dict).
    # NOTE(review): this pattern looks corrupted by extraction -- '(?]' is
    # not valid regex syntax and raises re.error at class creation time.
    # Restore it from the upstream dblatex pdfscan sources before use.
    _re_desc = re.compile("(<<(?:(?]|(?)>(?!>))*>>)",
                          re.MULTILINE)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
    def __init__(self, number, revision, stream_manager=None):
        """
        Build an (initially empty) object identified by (number, revision).
        Raw text is accumulated with append_string() and parsed by compute().
        """
        self.string = ""         # raw accumulated object text
        self.number = number
        self.revision = revision
        self.descriptors = []    # every nested << >> dictionary found
        self.descriptor = None   # the main (outermost) dictionary
        self.data = ""
        self.stream = None       # raw stream content, if any
        self.outfile = ""
        self.stream_manager = stream_manager or StreamManager()
        self._log = logging.getLogger("pdfscan.pdfobject")
        self.debug("New Object")
        # Per-instance alias of the class-level dictionary regex
        self.re_desc = self._re_desc
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
    def debug(self, text):
        """Log at DEBUG level, prefixed with the object identity."""
        self._log.debug(self.logstr(text))

    def warning(self, text):
        """Log at WARNING level, prefixed with the object identity."""
        self._log.warning(self.logstr(text))

    def error(self, text):
        """Log at ERROR level, prefixed with the object identity."""
        self._log.error(self.logstr(text))

    def info(self, text):
        """Log at INFO level, prefixed with the object identity."""
        self._log.info(self.logstr(text))
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
    def ident(self):
        """Identifier string 'number revision', as used in xref tables."""
        return "%s %s" % (self.number, self.revision)

    def __repr__(self):
        # Rendered like an indirect reference, e.g. '(12 0 R)'
        return "(%s R)" % self.ident()

    def __int__(self):
        # Scalar objects carry their value in 'data'
        return int(self.data)

    def logstr(self, text):
        """Prefix *text* with the object identity for the log channel."""
        return "Object [%s %s]: %s" % (self.number,self.revision,text)

    def append_string(self, string):
        """Accumulate raw object text; parsing happens later in compute()."""
        self.string = self.string + string
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def compute(self):
|
|
Packit |
0f19cf |
string = self.string
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
s = re.split("stream\s", string, re.MULTILINE)
|
|
Packit |
0f19cf |
if len(s) > 1:
|
|
Packit |
0f19cf |
self.debug("Contains stream")
|
|
Packit |
0f19cf |
self.stream = s[1].strip()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
string = s[0]
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
# Iterate to build all the nested dictionnaries/descriptors,
|
|
Packit |
0f19cf |
# from the deepest to the main one
|
|
Packit |
0f19cf |
self.descriptors = []
|
|
Packit |
0f19cf |
while True:
|
|
Packit |
0f19cf |
descs = self.re_desc.findall(string)
|
|
Packit |
0f19cf |
if not(descs):
|
|
Packit |
0f19cf |
break
|
|
Packit |
0f19cf |
for desc_str in descs:
|
|
Packit |
0f19cf |
desc = PDFDescriptor(string=desc_str)
|
|
Packit |
0f19cf |
string = string.replace(desc_str,
|
|
Packit |
0f19cf |
"{descriptor(%d)}" % len(self.descriptors))
|
|
Packit |
0f19cf |
self.descriptors.append(desc)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
self.debug("Found %d descriptors" % len(self.descriptors))
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
for descobj in self.descriptors:
|
|
Packit |
0f19cf |
descobj.compute(descriptors=self.descriptors)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
if self.descriptors:
|
|
Packit |
0f19cf |
self.descriptor = self.descriptors[-1]
|
|
Packit |
0f19cf |
else:
|
|
Packit |
0f19cf |
self.descriptor = PDFDescriptor()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
self.data = re.sub("{descriptor\(\d+\)}", "",
|
|
Packit |
0f19cf |
string, flags=re.MULTILINE).strip()
|
|
Packit |
0f19cf |
self.debug("Data: '%s'" % self.data)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def stream_decode(self):
|
|
Packit |
0f19cf |
if not(self.stream):
|
|
Packit |
0f19cf |
return
|
|
Packit |
0f19cf |
self.debug("Try to decode stream...")
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
# Consolidate stream buffer from the /Length information
|
|
Packit |
0f19cf |
stream_size = int(self.descriptor.get("/Length"))
|
|
Packit |
0f19cf |
self.stream = self.stream[0:stream_size]
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
# Put the stream in a cache
|
|
Packit |
0f19cf |
self.stream_cache = self.stream_manager.cache(number=self.number,
|
|
Packit |
0f19cf |
revision=self.revision)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
method = self.descriptor.get("/Filter")
|
|
Packit |
0f19cf |
if method == "/FlateDecode":
|
|
Packit |
0f19cf |
method = "zlib"
|
|
Packit |
0f19cf |
elif method == "/DCTDecode":
|
|
Packit |
0f19cf |
# This is JPEG. Just dump it
|
|
Packit |
0f19cf |
self.warning("this is a JPEG stream")
|
|
Packit |
0f19cf |
method = ""
|
|
Packit |
0f19cf |
elif method != "":
|
|
Packit |
0f19cf |
self.error("don't know how to decode stream with filter '%s'" \
|
|
Packit |
0f19cf |
% method)
|
|
Packit |
0f19cf |
return
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
self.stream_cache.write(self.stream, compress_type=method)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def stream_text(self):
|
|
Packit |
0f19cf |
if not(self.stream):
|
|
Packit |
0f19cf |
return ""
|
|
Packit |
0f19cf |
data = self.stream_cache.read()
|
|
Packit |
0f19cf |
self.stream_cache.close()
|
|
Packit |
0f19cf |
return data
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def get_type(self):
|
|
Packit |
0f19cf |
_type = self.descriptor.get("/Type")
|
|
Packit |
0f19cf |
if _type:
|
|
Packit |
0f19cf |
return _type
|
|
Packit |
0f19cf |
if self.stream:
|
|
Packit |
0f19cf |
return "stream"
|
|
Packit |
0f19cf |
if pdfstring_is_list(self.data):
|
|
Packit |
0f19cf |
return "list"
|
|
Packit |
0f19cf |
if self.descriptor.is_name_tree_node():
|
|
Packit |
0f19cf |
return "name tree"
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def link_to(self, pdfobjects):
|
|
Packit |
0f19cf |
self.debug("Link objects")
|
|
Packit |
0f19cf |
for desc in self.descriptors:
|
|
Packit |
0f19cf |
desc.link_to(pdfobjects)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
if pdfstring_is_list(self.data):
|
|
Packit |
0f19cf |
pass
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFDescriptor:
    """
    Contains the data between the << ... >> brackets in PDF objects. It is
    a dictionnary that can contain other descriptors/dictionnaries.
    """
    # Unique identifier for these objects
    _id = 0

    # Detect the dictionnary fields covering these cases:
    # <<
    # /Type /Page                    : the value is another keyword
    # /Contents 5 0 R                : the value is a string up next keyword
    # /Resources 4 0 R
    # /MediaBox [0 0 595.276 841.89] : the value is an array
    # /Parent 12 0 R
    # >>
    _re_dict = re.compile("/\w+\s*/[^/\s]+|/\w+\s*\[[^\]]*\]|/\w+\s*[^/]+")

    # Extract a dictionnary keyword
    _re_key = re.compile("(/[^ \({/\[<]*)")

    # Extract the substituted descriptors
    _re_descobj = re.compile("{descriptor\((\d+)\)}")

    # Find the PDF object references
    _re_objref = re.compile("(\d+ \d+ R)")

    def __init__(self, string=""):
        self._ident = self._get_ident()
        self.string = string
        # Parsed key/value pairs, filled by compute()
        self.params = {}
        self._log = logging.getLogger("pdfscan.descriptor")

        self.re_dict = self._re_dict
        self.re_key = self._re_key
        self.re_descobj = self._re_descobj
        self.re_objref = self._re_objref

    def _get_ident(self):
        # Allocate a new unique descriptor identifier
        _id = PDFDescriptor._id
        PDFDescriptor._id += 1
        return _id

    def ident(self):
        return self._ident

    def debug(self, text):
        self._log.debug("Descriptor [%d]: %s" % (self._ident, text))
    def error(self, text):
        self._log.error("Descriptor [%d]: %s" % (self._ident, text))
    def info(self, text):
        self._log.info("Descriptor [%d]: %s" % (self._ident, text))
    def warning(self, text):
        self._log.warning("Descriptor [%d]: %s" % (self._ident, text))

    def __repr__(self):
        return "desc[%d]" % self._ident

    def normalize_fields(self, string):
        """Return the list of stripped '/Key value' fields found in the
        dictionnary string (brackets and newlines removed)."""
        string = string.replace(">>", "")
        string = string.replace("<<", "")
        string = string.replace("\n", " ")
        fields = self.re_dict.findall(string)
        fields = [ f.strip() for f in fields if (f and f.strip()) ]
        return fields

    def compute(self, descriptors=None):
        """Parse the raw string and fill the params dictionnary. The
        '{descriptor(n)}' placeholders are resolved against the
        <descriptors> list when provided."""
        lines = self.normalize_fields(self.string)
        for line in lines:
            m = self.re_key.match(line)
            if not(m):
                continue
            param = m.group(1)
            value = line.replace(param, "").strip()
            # A nested dictionnary was substituted by its placeholder
            m = self.re_descobj.match(value)
            if m and descriptors:
                value = descriptors[int(m.group(1))]
            self.params[param] = value

        self.debug(self.params)

    def get(self, param, default=""):
        return self.params.get(param, default)

    def values(self):
        return self.params.values()

    def keys(self):
        return self.params.keys()

    def infos(self):
        return self.params

    def is_name_tree_node(self):
        """True if the descriptor looks like a PDF name tree node."""
        if self.get("/Limits") or self.get("/Names") or self.get("/Kid"):
            return True
        else:
            return False

    def link_to(self, pdfobjects):
        """Replace the 'n m R' object references found in the values by
        the actual objects taken from the <pdfobjects> mapping. Return
        the count of references that could not be resolved."""
        unresolved = 0
        for param, value in self.params.items():
            # Point to something else than a string? Skip it
            if not(isinstance(value, str)):
                continue

            objects = []
            objrefs = self.re_objref.findall(value)
            value2 = value
            for objref in objrefs:
                o = pdfobjects.get(objref.replace(" R", ""), None)
                # If the object is missing, keep the reference for another trial
                if not(o):
                    self.warning("Object '%s' not resolved" % objref)
                    unresolved += 1
                    o = objref
                objects.append(o)
                value2 = value2.replace(objref, "", 1)

            if not(objects):
                continue

            if pdfstring_is_list(value):
                if (value2[1:-1].strip()):
                    self.warning("Problem: cannot substitute objects: '%s'" \
                                 % value)
                else:
                    self.params[param] = objects
                    self.debug("Substitute %s: %s" % (param, objects))
            else:
                if value2.strip() or len(objects) > 1:
                    # Fix: the format string was missing its '%s'
                    # placeholder, so this warning raised a TypeError
                    # ("not all arguments converted") instead of logging.
                    self.warning("Problem: cannot substitute object '%s'" \
                                 % value)
                else:
                    self.params[param] = objects[0]
                    self.debug("Substitute %s: %s" % (param, objects[0]))

        return unresolved
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class StreamManager(PDFBaseObject):
    """
    Factory of stream caches, either file based or memory based, that
    keeps track of the created cache files so that cleanup() can remove
    them afterwards.
    """
    # Rewrite the cache file even if it already exists
    CACHE_REFRESH = 1
    # Keep the cache data on cleanup (user-owned directory)
    CACHE_REMANENT = 2
    # The cache directory is a temporary one created by the manager
    CACHE_TMPDIR = 4
    # Remove the cache data when the cache is closed
    CACHE_DELONCLOSE = 8

    _log = logging.getLogger("pdfscan.pdffile")

    def __init__(self, cache_method="file", cache_dirname="", flags=0):
        self.cache_method = cache_method
        self.cache_format = "pdfstream.%(number)s.%(revision)s"
        self.cache_dirname = cache_dirname
        self.cache_files = []
        self.flags = flags
        # Don't want to remove something in a user directory
        if cache_dirname: self.flags = self.flags | self.CACHE_REMANENT

    def cleanup(self):
        """Remove the cache files/directory, unless they must be kept
        (CACHE_REMANENT)."""
        if (self.cache_method != "file"):
            return

        if (self.flags & self.CACHE_REMANENT):
            if (self.flags & self.CACHE_TMPDIR):
                self.warning("'%s' not removed" % (self.cache_dirname))
            return

        if (self.flags & self.CACHE_TMPDIR):
            self.debug("Remove cache directory '%s'" % (self.cache_dirname))
            shutil.rmtree(self.cache_dirname)
        else:
            # The directory belongs to the caller: remove only the files
            # created by this manager. Fix: the previous code just printed
            # a "shutil.remove(" placeholder and never deleted anything.
            for fname in self.cache_files:
                self.debug("Remove cache file '%s'" % (fname))
                if os.path.exists(fname):
                    os.remove(fname)

    def cache(self, **kwargs):
        """Return a new stream cache of the configured kind. The keyword
        arguments (number, revision) name the file-based caches."""
        if self.cache_method == "file":
            return self.cache_file(kwargs)
        else:
            return self.cache_memory(kwargs)

    def cache_file(self, kwargs):
        # Build the cache directory on first use
        if not(self.cache_dirname):
            self.cache_dirname = tempfile.mkdtemp()
            self.flags = self.flags | self.CACHE_TMPDIR | self.CACHE_DELONCLOSE

        if not(os.path.exists(self.cache_dirname)):
            os.mkdir(self.cache_dirname)

        cache_path = os.path.join(self.cache_dirname,
                                  self.cache_format % kwargs)
        stream_cache = StreamCacheFile(cache_path, flags=self.flags)
        self.cache_files.append(cache_path)
        return stream_cache

    def cache_memory(self, kwargs):
        stream_cache = StreamCacheMemory(flags=self.flags)
        return stream_cache
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class StreamCache:
    """
    Base class of the stream caches. Provides the decompression helper
    shared by the file and memory implementations.
    """
    def __init__(self, outfile, flags=0):
        # Only the flags are kept at this level; <outfile> is the concern
        # of the file-based subclass
        self.flags = flags

    def decompress(self, data, compress_type):
        """Return <data> inflated according to <compress_type>. An empty
        type means the data is stored as-is; unknown types yield None."""
        if compress_type == "zlib":
            return zlib.decompress(data)
        if not compress_type:
            return data
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class StreamCacheFile(StreamCache):
    """
    Stream cache backed by a file on disk. The file is written once,
    lazily opened for reading, and optionally removed on close.
    """
    def __init__(self, outfile, flags=0):
        self.flags = flags
        self.outfile = outfile
        # Read handle, opened lazily by read()
        self._file = None

    def write(self, data, compress_type=""):
        """Store the (decompressed) data in the cache file. Nothing is
        done when the file already exists, unless CACHE_REFRESH is set."""
        refresh = self.flags & StreamManager.CACHE_REFRESH
        if not refresh and os.path.exists(self.outfile):
            return
        raw = self.decompress(data, compress_type)
        handle = open(self.outfile, "w")
        handle.write(raw)
        handle.close()

    def read(self, size=-1):
        """Read up to <size> bytes from the cache file (everything when
        size is negative)."""
        if self._file is None:
            self._file = open(self.outfile)
        return self._file.read(size) if size >= 0 else self._file.read()

    def close(self):
        """Close the read handle and delete the cache file when it is
        neither remanent nor meant to survive the close."""
        if (self._file):
            self._file.close()
        deletable = (self.flags & StreamManager.CACHE_DELONCLOSE) \
                    and not (self.flags & StreamManager.CACHE_REMANENT)
        if deletable:
            os.remove(self.outfile)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class StreamCacheMemory(StreamCache):
    """
    Stream cache kept entirely in memory, with a read cursor emulating
    the sequential file API.
    """
    def __init__(self, flags=0):
        self.flags = flags
        self._buffer = ""
        # Current read offset in the buffer
        self._read_pos = 0

    def write(self, data, compress_type=""):
        # Append the decompressed chunk to the in-memory buffer
        self._buffer = self._buffer + self.decompress(data, compress_type)

    def read(self, size=-1):
        """Read up to <size> bytes from the current position (everything
        left when size is negative)."""
        left = len(self._buffer) - self._read_pos
        count = left if size < 0 else min(size, left)
        start = self._read_pos
        self._read_pos = start + count
        return self._buffer[start:start + count]

    def close(self):
        # Drop the buffer when the cache must not survive the close
        if (self.flags & StreamManager.CACHE_DELONCLOSE):
            del self._buffer
            self._buffer = None
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def extract_string_objects(data, re_pattern, replace_fmt,
                           delims=None, object_cls=None, object_id=0,
                           **kwargs):
    """
    Find in <data> every string matching <re_pattern> (a pattern string
    or a compiled regex), replace each occurrence in place by
    <replace_fmt> filled with a unique index starting at <object_id>,
    and return the tuple (objects, new_data).

    When <delims> is given, both the found string and its replacement
    are wrapped by the two delimiters. When <object_cls> is given, each
    found string is wrapped into object_cls(found, **kwargs).
    """
    if isinstance(re_pattern, str):
        matches = re.findall(re_pattern, data, re.M|re.DOTALL)
    else:
        matches = re_pattern.findall(data)

    objects = []
    counter = object_id
    for found in matches:
        token = replace_fmt % counter
        counter += 1
        if delims:
            found = delims[0] + found + delims[1]
            token = delims[0] + token + delims[1]
        # Substitute only the first (leftmost) remaining occurrence
        data = data.replace(found, token, 1)
        objects.append(object_cls(found, **kwargs) if object_cls else found)
    return (objects, data)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFContentStream(PDFStreamHandler):
    """
    Data between the 'stream ... endstream' tags in a PDF object used as
    content (and not as image or object storage).

    On construction the stream is decoded, its BT/ET sections are turned
    into PDFTextObject instances, and the q/Q graphic states are arranged
    into a tree rooted at qnode_root.
    """
    def __init__(self, pdfobject, fontmgr=None):
        PDFStreamHandler.__init__(self, pdfobject)
        # Stream text once the BT/ET sections are replaced by placeholders
        self.data = ""
        # Root of the graphic state (q/Q) tree built by make_graph_tree()
        self.qnode_root = None
        # PDFTextObject list built from the BT/ET sections
        self.textobjects = None
        self.fontmgr = fontmgr or FontManager({})
        pdfobject.stream_decode()
        self.extract_textobjects(pdfobject.stream_text())
        self.make_graph_tree()

    def extract_textobjects(self, data):
        """Build a PDFTextObject from each 'BT ... ET' section of <data>
        and replace the section content by a ' textobj(n) ' placeholder."""
        # Split on the standalone BT/ET tokens, keeping the tokens in the
        # resulting list so they can drive the state machine below
        fields = re.split("((?<=\s)BT(?=\s)|(?<=\s)ET(?=\s))", data)

        start_text = False
        textdata = ""
        textobject = None
        textobjects = []

        for field in fields:
            if field == "BT":
                # Entering a text object: start recording its content
                start_text = True
                textdata = ""
            elif field == "ET":
                # Leaving the text object: build it and substitute the
                # recorded content by its placeholder (first match only)
                textobject = PDFTextObject(textdata, fontmgr=self.fontmgr)
                data = data.replace(textdata,
                                    " textobj(%d) " % len(textobjects), 1)
                textobjects.append(textobject)
                start_text = False
            elif start_text:
                textdata += field

        self.debug("Found %d textobjects" % len(textobjects))
        self.textobjects = textobjects
        self.data = data

    def make_graph_tree(self):
        """Build the tree of graphic states from the q/Q operators found
        in the stream data."""
        graph_stacks = re.split("(q\s|\sQ)", self.data)

        self.qnode_root = GraphState()
        qnode = self.qnode_root
        for field in graph_stacks:
            if "q" in field:
                # 'q': enter a new nested graphic state
                qnode = qnode.push(GraphState())
            elif "Q" in field:
                # 'Q': restore the parent graphic state
                qnode = qnode.pop()
            elif field.strip():
                qnode.set_data(field)
        # Attach the text objects referenced by the last visited node
        qnode.fill_textobjects(self.textobjects)

    def dump(self):
        # Print the graphic state tree (debug helper)
        self.qnode_root.dump()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFMatrix(PDFBaseObject):
    """
    Transformation matrix stored as its 6 free coefficients:

         | a b 0 |
    Tm = | c d 0 |
         | e f 1 |

    [x , y , 1] = [x1, y1, 1] x Tm1
    [x1, y1, 1] = [x2, y2, 1] x Tm2

    => [x , y , 1] = [x2, y2, 1] x Tm2 x Tm1
    """
    IDENT = [1, 0, 0, 1, 0, 0]

    def __init__(self, vector):
        self.vector = vector

    def tx(self):
        # Horizontal translation coefficient (e)
        return self.vector[4]

    def ty(self):
        # Vertical translation coefficient (f)
        return self.vector[5]

    def scale(self):
        """Return the scaling factor encoded by the matrix."""
        a, b, c, d = self.vector[0], self.vector[1], \
                     self.vector[2], self.vector[3]
        # Horizontal orientation
        if (b == 0 and c == 0 and abs(a) == abs(d)):
            return abs(a)
        # vertical orientation
        if (a == 0 and d == 0 and abs(b) == abs(c)):
            return abs(b)
        # Always return the first even if something is weird
        self.warning("Cannot interpret Tm matrix scale: %s" % self)
        return a

    def __str__(self):
        return str(self.vector)

    def __len__(self):
        return len(self.vector)

    def __mul__(self, vector):
        """Multiply by another matrix (6 coefficients) giving a new
        PDFMatrix, or apply the matrix to a point giving [x, y, 1]."""
        a, b, c, d, e, f = self.vector
        if len(vector) != 6:
            # Row vector [x, y, 1] times the matrix
            x, y = vector[0], vector[1]
            return [a*x + c*y + e, b*x + d*y + f, 1]
        ar, br, cr, dr, er, fr = vector.vector
        product = [a*ar + b*cr,
                   a*br + b*dr,
                   c*ar + d*cr,
                   c*br + d*dr,
                   e*ar + f*cr + er,
                   e*br + f*dr + fr]
        return PDFMatrix(product)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class GraphState:
|
|
Packit |
0f19cf |
"""
|
|
Packit |
0f19cf |
Graphic state starts with 'q' and ends with 'Q' in content stream.
|
|
Packit |
0f19cf |
It can contain other graphic states and/or text objects.
|
|
Packit |
0f19cf |
"""
|
|
Packit |
0f19cf |
def __init__(self):
|
|
Packit |
0f19cf |
self._parent = None
|
|
Packit |
0f19cf |
self._children = []
|
|
Packit |
0f19cf |
self._level = 0
|
|
Packit |
0f19cf |
self._data = ""
|
|
Packit |
0f19cf |
self.textobjects = []
|
|
Packit |
0f19cf |
self.matrix = PDFMatrix(PDFMatrix.IDENT)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def level(self):
|
|
Packit |
0f19cf |
return self._level
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def set_parent(self, qnode):
|
|
Packit |
0f19cf |
self._parent = qnode
|
|
Packit |
0f19cf |
self._level = qnode.level()+1
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def push(self, qnode):
|
|
Packit |
0f19cf |
self._children.append(qnode)
|
|
Packit |
0f19cf |
qnode.set_parent(self)
|
|
Packit |
0f19cf |
return qnode
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def pop(self):
|
|
Packit |
0f19cf |
qnode = self._parent
|
|
Packit |
0f19cf |
return qnode
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def set_data(self, data, textobjects=None):
|
|
Packit |
0f19cf |
self._data = data
|
|
Packit |
0f19cf |
if textobjects:
|
|
Packit |
0f19cf |
self.fill_textobjects(textobjects)
|
|
Packit |
0f19cf |
self.extract_matrix()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def fill_textobjects(self, textobjects):
|
|
Packit |
0f19cf |
#print self._data #***
|
|
Packit |
0f19cf |
tos = re.findall(" (textobj\(\d+\))", self._data)
|
|
Packit |
0f19cf |
for to in tos:
|
|
Packit |
0f19cf |
m = re.match("textobj\((\d+)\)", to)
|
|
Packit |
0f19cf |
if m:
|
|
Packit |
0f19cf |
textobject = textobjects[int(m.group(1))]
|
|
Packit |
0f19cf |
textobject.set_graphstate(self)
|
|
Packit |
0f19cf |
self.textobjects.append(textobject)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
self._data = re.sub(" textobj\(\d+\)", "",
|
|
Packit |
0f19cf |
self._data, flags=re.MULTILINE).strip()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def extract_matrix(self):
|
|
Packit |
0f19cf |
m = re.search("("+6*"[^\s]+\s+"+"cm"+")", self._data)
|
|
Packit |
0f19cf |
if m:
|
|
Packit |
0f19cf |
vector = [ float(v) for v in m.group(1).split()[0:6] ]
|
|
Packit |
0f19cf |
self.matrix = PDFMatrix(vector)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def dump(self):
|
|
Packit |
0f19cf |
s = self._level * " " + "q '" + self._data + "'"
|
|
Packit |
0f19cf |
print s
|
|
Packit |
0f19cf |
for q in self._children:
|
|
Packit |
0f19cf |
q.dump()
|
|
Packit |
0f19cf |
s = self._level * " " + "Q"
|
|
Packit |
0f19cf |
print s
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFTextObject:
    """
    Data between the 'BT' and 'ET' tokens found in content streams.

    On construction the shown strings are extracted and replaced by
    'textcontent{n}' placeholders, the text matrix (Tm) is picked, and
    the operator sequences are parsed into text segments and lines.
    """
    # Font setup operator, e.g. '/F10 9.47 Tf'
    _font_op_pattern = "/[^\s]+\s+[^\s]+\s+Tf"

    # Detect a 'Tf', 'Tm', 'Tj', 'TJ', Td, TD operator sequence in a text object
    # To use only when strings are extracted and replaced by their reference
    _re_seq = re.compile("(" + _font_op_pattern + "|"+\
                         6*"[^\s]+\s+"+"Tm"+"|"+\
                         "\(textcontent\{\d+\}\)\s*Tj|"+\
                         "\[[^\]]*\]\s*TJ|"+\
                         "[^\s]+\s+[^\s]+\s+T[dD])", re.MULTILINE)

    # Find a font setup operator, like '/F10 9.47 Tf'
    _re_font = re.compile("("+_font_op_pattern+")", re.MULTILINE)

    # Find a sequence '(...\(...\)...) Tj'
    _re_text_show1 = re.compile("(\((?:" + "[^()]" + "|" +\
                                r"(?<=\\)\(" + "|" +\
                                r"(?<=\\)\)" + ")*\)\s*Tj)", re.M)

    # Find a sequence '[...\[...\]...] TJ'
    _re_text_show2 = re.compile("\[((?:" + "[^\[\]]" + "|" +\
                                r"(?<=\\)\[" + "|" +\
                                r"(?<=\\)\]" + ")*)\]\s*TJ", re.M)

    def __init__(self, data, fontmgr=None):
        # Raw BT/ET section content
        self.data = data
        # Text matrix (Tm operator), identity by default
        self.matrix = PDFMatrix(PDFMatrix.IDENT)
        self.fontmgr = fontmgr or FontManager({})
        # Enclosing graphic state, set later by set_graphstate()
        self.qnode = None
        # Strings shown by the Tj/TJ operators, in placeholder order
        self.strings = []
        # PDFTextSegment list (one per Td/TD positionning)
        self.textsegments = []
        # Segments grouped by line (new line on vertical move)
        self.textlines = []
        self.extract_strings()
        self.extract_matrix()
        self.parse_data()

    def set_graphstate(self, gs):
        self.qnode = gs

    def set_fontmanager(self, fontmgr):
        self.fontmgr = fontmgr

    def matrix_absolute(self):
        """Return the text matrix combined with the matrixes of all the
        enclosing graphic states."""
        # The textobject matrix change is the last one, so on the full left
        m = self.matrix

        # We climb the graph stack from the deepest (newer) to the upper
        # (oldest) node so:
        # Absolute Matrix = Newest (m) x ... x Oldest (qnode.matrix)
        qnode = self.qnode
        while qnode:
            m = m * qnode.matrix
            qnode = qnode.pop()
        return m

    def extract_matrix(self):
        """Pick the 'Tm' operator, if any, to set the text matrix."""
        m = re.search("("+6*"[^\s]+\s+"+"Tm"+")", self.data)
        if m:
            vector = [ float(v) for v in m.group(1).split()[0:6] ]
            self.matrix = PDFMatrix(vector)

    def extract_strings(self):
        """Replace the '(...) Tj' and '[...] TJ' shown strings by
        'textcontent{n}' placeholders and record them in self.strings."""
        objects, data = extract_string_objects(self.data, self._re_text_show1,
                                               "(textcontent{%d}) Tj")
        self.strings = objects
        # The TJ arrays come after, so their ids follow the Tj ones
        objects, data = extract_string_objects(data, self._re_text_show2,
                                               "textcontent{%d}",
                                               delims=["[","]"],
                                               object_id=len(self.strings))
        self.strings += objects
        self.data = data

    def _newline(self):
        # Start a new (empty) line of text segments
        linerow = []
        self.textlines.append(linerow)
        return linerow

    def get_font(self, font, size, scale):
        # The effective size is the Tf size scaled by the text matrix
        return self.fontmgr.get_font(font, float(size)*scale)

    def parse_data(self):
        """Interpret the operator sequences to build the text segments
        and the text lines with their effective fonts."""
        linerow = self._newline()
        textline = PDFTextSegment("", PDFMatrix(PDFMatrix.IDENT))
        linerow.append(textline)

        # Find the operator sequences
        operators = self._re_seq.findall(self.data)

        font, size = "", 1
        last_key = ""

        for tx in operators:
            fields = tx.split()
            key = fields[-1]

            # Found a font setup, memorize the fontname and fontsize base
            if key == "Tf":
                font = fields[0]
                size = fields[1]
            # Found the matrix setup, memorize it
            elif key == "Tm":
                vector = [ float(c) for c in fields[0:6]]
                self.matrix = PDFMatrix(vector)
            # Found a text positionning
            elif key in ("Td", "TD"):
                tx, ty = [ float(c) for c in fields[0:2]]
                matrix = PDFMatrix([1, 0, 0, 1, tx, ty])
                textline = PDFTextSegment("", matrix)
                self.textsegments.append(textline)
                # A vertical move starts a new line
                if matrix.ty() != 0:
                    linerow = self._newline()
                linerow.append(textline)
            # When text is shown, the current font/size setup applies and is
            # then recorded
            elif "Tj" in key or "TJ" in key:
                m = re.search("textcontent\{(\d+)\}", tx)
                text_string = self.strings[int(m.group(1))]
                scale = self.matrix.scale()
                pdffont = self.get_font(font, size, scale)
                text_shown = PDFTextShow(text_string, pdffont)
                textline.add_text_show(text_shown)
            last_key = key
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFTextSegment:
    """
    One portion of text attached to a text positioning operator ('Td' or
    'TD').  It aggregates, in drawing order, every text show ('Tj'/'TJ')
    issued from that position.
    """

    def __init__(self, data, matrix):
        self.matrix = matrix      # positioning matrix of the segment
        self.data = data          # raw data the segment was built from
        self.strings = None       # optional string table (see set_strings)
        self.text_shown = []      # PDFTextShow objects, in drawing order

    def __str__(self):
        # Raw concatenation of the shown chunks
        return "".join(str(shown) for shown in self.text_shown)

    def text(self):
        """Return the readable text of the segment, chunks space-separated."""
        return " ".join(shown.text() for shown in self.text_shown)

    def set_strings(self, strings):
        """Attach the string table the segment data refers to."""
        self.strings = strings

    def add_text_show(self, text_shown):
        """Record one more shown text chunk (a PDFTextShow)."""
        self.text_shown.append(text_shown)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFTextShow:
    """
    Data between the '( )' of the 'Tj' operator or '[ ]' of the 'TJ' operator
    that is intended to be shown.
    """
    # Literal strings: '(...)' with escaped parentheses allowed inside
    _re_textascii = re.compile(r"\(((?:[^)]|(?<=\\)\))*)\)", re.M)
    # Hexadecimal strings: '<...>'
    _re_textunicode = re.compile(r"<([^>]+)>", re.M)
    # Class-level flag dict: the codec error handler must be registered
    # only once per process
    _codec_handler_installed = {}

    def __init__(self, data, font):
        # data: raw string operand of the show operator
        # font: PDFFont applying to this text (may carry a ToUnicode CMap)
        self.data = data
        self.font = font
        self.encode = codecs.getencoder("latin1")
        if not(self._codec_handler_installed):
            # Install the 'substitute' handler turning unencodable
            # characters into XML character references (see _encode_subs)
            codecs.register_error("substitute", PDFTextShow._encode_subs)
            self._codec_handler_installed["substitute"] = PDFTextShow._encode_subs

    def __str__(self):
        # Raw data on a single line
        return self.data.replace("\n", " ")

    def text(self):
        """
        Return a readable form of the shown text: either the literal
        '(...)' content with escaped parentheses unescaped, or the
        '<...>' hex content decoded through the font ToUnicode CMap,
        or "" when neither is available.
        """
        textdata = self._re_textascii.findall(self.data)
        textdata = "".join(textdata).replace("\(", "(").replace("\)", ")")
        if textdata:
            return textdata
        if (self.font.tounicode):
            textdata = self._re_textunicode.findall(self.data)
            s = u" ".join(self.font.tounicode.decode(textdata))
            # Encode to latin1, substituting unencodable chars
            return self.encode(s, "substitute")[0]
        else:
            return ""

    def get_font(self):
        """Return the PDFFont this text is shown with."""
        return self.font

    @classmethod
    def _encode_subs(cls, exc):
        # Codec error handler: replace each unencodable character by an
        # XML character reference '&#xNN;'. Returns u"" for unexpected
        # error types (decode errors), as register_error may pass any
        # UnicodeError subclass.
        if not isinstance(exc, UnicodeEncodeError):
            return u""
        l = []
        for c in exc.object[exc.start:exc.end]:
            l.append(u"&#x%x;" % ord(c))
        return (u"".join(l), exc.end)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFFont:
    """A PDF font object bound to the effective size it is used at."""

    def __init__(self, fontobject, fontsize, tounicode=None):
        # fontobject: PDF font object (provides the /BaseFont descriptor)
        # fontsize:   effective size in points
        # tounicode:  optional ToUnicode CMap used to decode text strings
        self.fontobject = fontobject
        self.fontsize = fontsize
        self.tounicode = tounicode

    def name(self):
        """Return the /BaseFont name from the font object descriptor."""
        return self.fontobject.descriptor.get("/BaseFont")

    def size(self):
        """Return the effective font size."""
        return self.fontsize

    def key(self):
        """Return the 'name/size' identifier of this (font, size) pair."""
        return "%s/%6.2f" % (self.name(), self.size())

    def __cmp__(self, other):
        # Python 2 ordering: by font name first, then by size
        return cmp(self.name(), other.name()) or cmp(self.size(), other.size())
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class FontManager:
    """
    Build and cache the PDFFont objects used by a content stream.

    A (font object, size) pair maps to one PDFFont, cached under the
    key 'BaseFont/size'. When a global manager is given, font creation
    is delegated to it so the cache is shared between streams.
    """
    def __init__(self, fontdict, global_fontmgr=None):
        # fontdict: maps the font references of the page resources
        #           (e.g. '/F1') to the PDF font objects
        self.fontdict = fontdict
        self.fontused = {}       # key 'BaseFont/size' -> PDFFont
        self.tounicode = {}      # font object ident -> ToUnicode (shared)
        self.global_fontmgr = global_fontmgr
        self.resolver = PDFResolver.get_resolver()

    def get_pdffont(self, fontobj, fontsize):
        """
        Return the cached PDFFont for (fontobj, fontsize), creating it
        (possibly through the global manager) on first use.
        """
        key = fontobj.descriptor.get("/BaseFont")+"/"+"%6.2f" % fontsize
        if self.fontused.has_key(key):
            return self.fontused.get(key)
        elif self.global_fontmgr:
            # Delegate creation, but also record it locally so that
            # get_used() reflects the fonts used by this stream
            pdffont = self.global_fontmgr.get_pdffont(fontobj, fontsize)
            self.fontused[key] = pdffont
        else:
            pdffont = self._make_pdffont(fontobj, fontsize)
            self.fontused[key] = pdffont
        return pdffont

    def _make_pdffont(self, fontobj, fontsize):
        # Build a new PDFFont, resolving its optional /ToUnicode CMap
        fontobj.link_to(self.resolver)
        pdfobject = fontobj.descriptor.get("/ToUnicode")
        if pdfobject:
            pdfobject.link_to(self.resolver)
            tuc = self._get_tounicode(pdfobject)
        else:
            tuc = None
        pdffont = PDFFont(fontobj, fontsize, tuc)
        return pdffont

    def _get_tounicode(self, pdfobject):
        # One ToUnicode object per CMap stream, shared by all the fonts
        # referencing it
        key = pdfobject.ident()
        if self.tounicode.has_key(key):
            tuc = self.tounicode.get(key)
        else:
            tuc = ToUnicode(pdfobject)
            self.tounicode[key] = tuc
        return tuc

    def get_font(self, fontref, size):
        """
        Return the PDFFont for a font reference (e.g. '/F1') at the
        given size, or None when the reference is unknown.
        """
        fontobj = self.fontdict.get(fontref)
        if not(fontobj):
            return None
        return self.get_pdffont(fontobj, size)

    def get_used(self):
        """Return the list of PDFFont objects used so far."""
        return self.fontused.values()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class ToUnicode(PDFStreamHandler):
    """
    Handle the /ToUnicode CMap object found in a font, in order to be able to
    translate the text content to readable text
    """
    # Tokens delimiting the CMap sections of interest:
    # 'N beginbfchar', 'N beginbfrange', 'N begincodespacerange' and
    # their matching 'end*' keywords
    _re_token = re.compile("(" + \
        "(?:\d+\s+(?:begincodespacerange|beginbfchar|beginbfrange))" + "|"\
        "(?:endcodespacerange|endbfchar|endbfrange)" + \
        ")", re.M)

    def __init__(self, pdfobject):
        # pdfobject: the PDF stream object containing the CMap text
        PDFStreamHandler.__init__(self, pdfobject)
        self.charmaps = []   # sorted CharMap list built from the CMap
        pdfobject.stream_decode()
        self.data = pdfobject.stream_text()
        self.parse_cmap(self.data)
        self.debug("Create a ToUnicode object for '%s'" % pdfobject.ident())

    def parse_cmap(self, data):
        """
        Parse the CMap text: collect the bfchar and bfrange mappings
        into self.charmaps. codespacerange sections are currently
        ignored.
        """
        flds = self._re_token.split(data)

        # Current open section, if any (both None outside a section)
        bfchar = None
        bfrange = None
        for fld in flds:
            if "begincodespacerange" in fld:
                # TODO
                pass
            elif "beginbfchar" in fld:
                # The keyword is prefixed by the entry count
                n = int(fld.split()[0])
                bfchar = BfRange(n)
            elif "beginbfrange" in fld:
                n = int(fld.split()[0])
                bfrange = BfRange(n)
            elif "endcodespacerange" in fld:
                pass
            elif "endbfchar" in fld:
                self.add_bfrange(bfchar)
                bfchar = None
            elif "endbfrange" in fld:
                self.add_bfrange(bfrange)
                bfrange = None
            elif bfchar:
                # Section body: pairs '<src> <dst>'. A bfchar maps a
                # single code, i.e. a degenerate range (src, src, dst)
                fld = re.sub("<\s+", "<", fld)
                fld = re.sub("\s+>", ">", fld)
                data = fld.split()
                for i in range(0, len(data), 2):
                    bfchar.add_mapstr(data[i], data[i], data[i+1])
            elif bfrange:
                # Section body: triples '<first> <last> <dstfirst>'
                fld = re.sub("<\s+", "<", fld)
                fld = re.sub("\s+>", ">", fld)
                data = fld.split()
                for i in range(0, len(data), 3):
                    bfrange.add_mapstr(data[i], data[i+1], data[i+2])

    def add_bfrange(self, bfrange):
        """Merge a parsed section, keeping charmaps sorted by first code."""
        self.charmaps.extend(bfrange.charmaps)
        self.charmaps.sort()

    def get_uccode(self, bfchar):
        """
        Return the Unicode code point mapped to the character code
        'bfchar', or 0 when no range contains it.
        """
        mustbe_in_next = False
        for m in self.charmaps:
            if bfchar >= m.bffirst:
                if bfchar <= m.bflast:
                    return m.uccode + (bfchar - m.bffirst)
                else:
                    # Past this range: if the next (sorted) range does
                    # not start low enough, the code is unmapped
                    mustbe_in_next = True
            elif mustbe_in_next:
                return 0
        return 0

    def decode_string(self, data):
        """
        Decode a hex string into unicode text, 4 hex digits (one 2-byte
        code) at a time.
        """
        ul = []
        for i in range(0, len(data), 4):
            s = data[i:i+4]
            ul.append(unichr(self.get_uccode(int(s,16))))
        return u"".join(ul)

    def decode(self, data):
        """Decode a hex string, or a list of hex strings, to unicode."""
        if isinstance(data, list):
            return [self.decode_string(s) for s in data]
        else:
            return self.decode_string(data)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class CharMap:
    """A contiguous range of character codes mapped onto Unicode points."""

    def __init__(self, bffirst, bflast, uccode):
        self.bffirst = bffirst   # first source code of the range
        self.bflast = bflast     # last source code of the range (inclusive)
        self.uccode = uccode     # Unicode code point mapped to bffirst

    def __cmp__(self, other):
        # Python 2 ordering: ranges sort by their first source code
        return cmp(self.bffirst, other.bffirst)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class BfRange:
    """Collect the mappings declared by one bfchar/bfrange CMap section."""

    def __init__(self, entry_count):
        self.entry_count = entry_count   # entry count announced by the CMap
        self.charmaps = []               # CharMap objects built so far

    def add_mapstr(self, bffirst_str, bflast_str, ucfirst_str):
        """Add one mapping given as hex tokens like '<045D>'."""
        bounds = [int(tok[1:-1], 16)
                  for tok in (bffirst_str, bflast_str, ucfirst_str)]
        self.charmaps.append(CharMap(*bounds))
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
# Starting from here is the command stuff
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
#
|
|
Packit |
0f19cf |
import textwrap
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class BasicCmd:
    # Base class of the scanner sub-commands. Intentionally no class
    # docstring: help() reports self.__doc__, and the base class must
    # report no help text of its own.

    def __init__(self):
        pass

    def setup_parser(self, parser):
        # Nothing to register by default; True tells the caller the
        # parser setup is complete
        return True

    def help(self, cmd):
        # The subclass docstring is the default help text
        return self.__doc__ if self.__doc__ else None
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PageLayoutCmd(BasicCmd):
    """
    Show the position and fonts used for each text line contained by a page
    """
    def __init__(self, scanner):
        # scanner: the PDFScannerCommand owning the loaded PDF and pages
        self.scanner = scanner
        layout_fmt = "%5s %5s | %5s %5s | %8s | "
        self.padding = layout_fmt % (" "," "," "," "," ")
        self.headline = layout_fmt % (5*"_",5*"_",5*"_",5*"_",8*"_")
        self.header = layout_fmt % ("dX","dY","X","Y","FONTS")
        self.width = 90             # total printed width (chars)
        self.show_matrix = False    # -m: print the absolute matrices
        self.raw_text = False       # -r: print raw data, not decoded text
        self.pt_factor = 1          # point conversion factor

    def setup_parser(self, parser):
        # Register the page_layout specific options
        parser.add_argument("-w", "--width",
            help="Width of the printed layout information")
        parser.add_argument("-m", "--show-matrix", action="store_true",
            help="Print absolute Transformation Matrix for each textobject")
        parser.add_argument("-r", "--raw-text", action="store_true",
            help="Print the raw text contained by textobjects")

    def run(self, parser, args):
        # Apply the command line options, then print each page group
        if args.width:
            self.width = int(args.width)
        if args.show_matrix:
            self.show_matrix = True
        if args.raw_text:
            self.raw_text = True

        for pg in self.scanner.page_groups:
            self.print_page_layout(pg)

    def print_page_layout(self, pdf_pages):
        """Print the font list then the line layout of each page."""
        for page in pdf_pages:
            fonts_used = page.find_fonts()
            fonts_used.sort()
            print "\nPage %d fonts used:" % page.pagenum
            for i, font in enumerate(fonts_used):
                # The [i] index is the one shown in the FONTS column below
                print "[%d] %-40s %6.2f pt" % (i, font.name(),
                                               self.pt_factor*font.size())

            print "\nPage %d layout:" % page.pagenum
            # NOTE(review): only the first content stream of the page is
            # printed -- confirm pages never split text across streams
            content_stream = page.streams[0]
            xp, yp = 0., 0.
            print self.header
            print self.headline
            for textobject in content_stream.textobjects:
                xp, yp = self._print_textobject_layout(textobject, xp, yp,
                                                       fonts_used)

    def _print_textobject_layout(self, textobject, xp, yp, fonts_used):
        # Print one line per text line of the textobject, with its
        # position delta (dX, dY), absolute position (X, Y) in inches
        # (points/72), and the indexes of the fonts used on the line.
        # Returns the updated previous position (xp, yp).
        wraplen = self.width - len(self.padding)

        m2 = textobject.matrix_absolute()

        for line in textobject.textlines:
            # Track the fonts used per line
            font_line = []
            for seg in line:
                for text_shown in seg.text_shown:
                    font = text_shown.get_font()
                    if not(font):
                        continue
                    idx = fonts_used.index(font)
                    if not(idx in font_line):
                        font_line.append(idx)

            # Apply the first segment displacement to get the line origin
            m2 = line[0].matrix * m2
            if self.show_matrix: print "%s" % m2

            x, y = m2.tx(), m2.ty()
            x, y = float(x/72), float(y/72)
            dx, dy = x - xp, y - yp
            info = "%5.2f %5.2f | %5.2f %5.2f | %8s | " % \
                   (dx, dy, x, y, font_line)
            if self.raw_text:
                text = "".join([str(s) for s in line])
            else:
                text = "".join([s.text() for s in line])
            textw = textwrap.wrap(text, wraplen)

            if textw:
                # First wrapped chunk goes after the info columns, the
                # following ones are aligned under it
                print "%s%s" % (info, textw[0])
                for txt in textw[1:]:
                    print "%s%s" % (self.padding, txt)

            xp, yp = x, y
            # Accumulate the remaining segment displacements of the line
            for l in line[1:]:
                m2 = l.matrix * m2

        return (xp, yp)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PageObjectCmd(BasicCmd):
    """
    List the PDF objects used per page
    """
    def __init__(self, scanner):
        # scanner: the PDFScannerCommand owning the loaded PDF
        self.scanner = scanner

    def run(self, parser, args):
        # Print, for each page object, its /Contents and /Resources entries
        page_first = 1
        for i, page in enumerate(self.scanner.pdf.page_objects):
            page_num = i+page_first
            contents = page.descriptor.get("/Contents")
            resources = page.descriptor.get("/Resources")
            print "Page %d %s: contents: %s, resources: %s" % \
                  (page_num, page, contents, resources)
        print
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PdfObjectCmd(BasicCmd):
    """
    Scan data on the PDF objects of the PDF File
    """
    def __init__(self, scanner):
        # scanner: the PDFScannerCommand owning the loaded PDF
        self.scanner = scanner

    def setup_parser(self, parser):
        # The three actions are mutually exclusive
        group = parser.add_mutually_exclusive_group()
        group.add_argument("-list", "--list-loaded", action="store_true",
            help="List the object loaded by the scanner")
        group.add_argument("-dict", "--dictionnary",
            metavar="'<number> <generation>'",
            help="Show the dictionnary of the object specified by its "\
                 "reference '<number> <generation>'")
        group.add_argument("-dump", "--dump-stream", nargs=2,
            metavar=("'<number> <generation>'","OUTFILE"),
            help="Write the stream content of the object specified by its "\
                 "reference '<number> <generation>'")

    def run(self, parser, args):
        # Dispatch to the action selected on the command line
        if args.list_loaded:
            self.list_pdfobjects()
        elif args.dictionnary:
            ident = self._sanitize_objref(args.dictionnary)
            if not(ident): return
            self.show_dictionnary(ident)
        elif args.dump_stream:
            ident = self._sanitize_objref(args.dump_stream[0])
            if not(ident): return
            self.dump_stream(ident, args.dump_stream[1])

    def _sanitize_objref(self, ident):
        # Normalize a '<number> <generation>' reference; "" when invalid
        flds = ident.split()
        if len(flds) != 2:
            print "Invalid object reference: must be in the form "\
                  "'number generation'"
            return ""
        else:
            return "%s %s" % (flds[0], flds[1])

    def show_dictionnary(self, ident):
        """Print the dictionnary of the object referenced by 'ident'."""
        pdfobject = self.scanner.pdf.get_object(ident)
        if not(pdfobject):
            print "PDF Object '%s' not found" % ident
            return
        if pdfobject.stream:
            print "PDF Object '%s' has a stream. Its dictionnary:" % ident
        else:
            print "PDF Object '%s' dictionnary:" % ident
        self._print_dictionnary(pdfobject.descriptor)

    def _print_dictionnary(self, descriptor, level=1):
        # Recursively pretty-print a descriptor as '<< key: value >>',
        # indenting nested dictionnaries one level deeper
        indent = " "*level
        print "%s<<" % indent
        for p, v in descriptor.infos().items():
            if isinstance(v, PDFDescriptor):
                print "%s%s:" % (indent, p)
                self._print_dictionnary(v, level=level+1)
            else:
                print "%s%s: %s" % (indent, p, v)
        print "%s>>" % indent

    def list_pdfobjects(self):
        """Print the count of loaded objects, broken down by type."""
        pdfobjects = self.scanner.pdf.pdfobjects
        print "Found %s PDFObjects" % pdfobjects.count()
        print "Found the following PDFObject types:"
        types = pdfobjects.types()
        types.sort()
        total = 0
        for typ in types:
            n_type = len(pdfobjects.get_objects_by_type(typ))
            print " %20s: %5d objects" % (typ, n_type)
            total = total + n_type
        print " %20s: %5d objects" % ("TOTAL", total)

    def dump_stream(self, ident, outfile):
        """Write the decoded stream of the referenced object to 'outfile'."""
        pdfobject = self.scanner.pdf.get_object(ident)
        if not(pdfobject):
            print "PDF Object '%s' not found" % ident
            return
        if not(pdfobject.stream):
            print "PDF Object '%s' has no stream. Give up." % ident
            return
        pdfobject.stream_decode()
        f = open(outfile, "wb")
        f.write(pdfobject.stream_text())
        f.close()
        print "PDF Object '%s' stream written to file %s" % (ident, outfile)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PageFontCmd(BasicCmd):
    # Implements both the 'page_font' and 'font_summary' commands: the
    # former lists fonts per page, the latter lists the fonts used over
    # all the selected pages. No class docstring on purpose: help() is
    # overridden to provide a per-command help text.
    def __init__(self, scanner):
        # scanner: the PDFScannerCommand owning the loaded PDF and pages
        self.scanner = scanner
        self.header_fmt = "%4s %-40s %s"
        self.pt_factor = 1        # point conversion factor (see -pt)
        self.font_unit = "pt"     # unit label printed after sizes

    def help(self, cmd):
        # The same object serves two commands: pick the matching help
        if cmd == "font_summary":
            _help = "List the fonts used and their size in the specified pages"
        else:
            _help = "List the fonts used and their size for each page"
        return _help

    def setup_parser(self, parser):
        parser.add_argument("-pt", "--point-type",
            help="Point type to use: 'dtp' (default), 'tex'")

    def run(self, parser, args):
        # TeX points are slightly smaller than DTP points (72.27/inch)
        if args.point_type == "tex":
            self.pt_factor = 72.27/72
            self.font_unit = "pt tex"

        if args.name == "font_summary":
            self.print_font_summary()
        else:
            self.print_font_page()

    def print_font_page(self):
        """Print a per-page font table for each selected page group."""
        for pg in self.scanner.page_groups:
            self.print_fonts_in_pages(pg)

    def print_fonts_in_pages(self, pdf_pages, show=True):
        # Print the fonts used by each page; with show=False the pages
        # are still scanned (find_fonts populates the caches) but
        # nothing is printed
        if show:
            print self.header_fmt % ("PAGE", "FONT", "SIZE")
            print self.header_fmt % (4*"-", 40*"-", 10*"-")

        for page in pdf_pages:
            fonts_used = page.find_fonts()
            fonts_used.sort()
            for font in fonts_used:
                if show:
                    print "%4d %-40s %6.2f %s" % (page.pagenum, font.name(),
                          self.pt_factor * font.size(), self.font_unit)
        if show: print self.header_fmt % (4*"-", 40*"-", 10*"-")

    def print_font_summary(self):
        """Print the fonts used over all the selected page ranges."""
        # Build the printable page range list, e.g. '1-4,7'
        pages = []
        for pg in self.scanner.page_groups:
            if not(pg):
                continue
            s = "%d" % (pg[0].pagenum)
            if len(pg) > 1:
                s += "-%d" % (pg[-1].pagenum)
            pages.append(s)

        print "\nFonts used in pages %s:" % (",".join(pages))
        fonts_used = self.scanner.pdf.fontmgr.get_used()
        fonts_used.sort()
        for font in fonts_used:
            print "%-40s %6.2f %s" % \
                  (font.name(), self.pt_factor*font.size(), self.font_unit)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
class PDFScannerCommand:
|
|
Packit |
0f19cf |
def __init__(self):
    """Set up the command table and the empty scanner state."""
    # (name, class) pairs in execution-priority order; a class may
    # serve several command names (PageFontCmd handles both
    # 'page_font' and 'font_summary')
    command_table = [
        ("page_object", PageObjectCmd),
        ("page_font", PageFontCmd),
        ("page_layout", PageLayoutCmd),
        ("font_summary", PageFontCmd),
        ("pdfobject", PdfObjectCmd)
    ]
    self._commands = command_table
    self.commands_to_run = []    # argparse namespaces, in priority order
    self.pdf = None              # PDFFile once prepare() has run
    self.page_ranges = []        # [first, last] pairs from the -p options
    self.page_groups = []        # one PDFPage list per page range
    self.fonts_used = {}
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def commands(self):
|
|
Packit |
0f19cf |
return [c[0] for c in self._commands]
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def setup_options(self, parser):
    """Register the options shared by every sub-command on 'parser'."""
    verbose_help = ("Verbose mode in the form '[group:]level' with level "
                    "in 'debug', 'info', 'warning', 'error' and "
                    "group in 'pdffile', 'pdfobject', 'descriptor'")
    flags_help = ("Comma separated list of stream cache setup options: "
                  "'remanent' and/or 'refresh'")

    parser.add_argument("-p", "--pages", action="append",
                        help="Page range in the form '<first>[-[<last>]]'")
    parser.add_argument("-v", "--verbose", action="append",
                        help=verbose_help)
    parser.add_argument("-c", "--cache-stream-dir",
                        help="Directory where to store the decompressed stream")
    parser.add_argument("-m", "--no-cache-stream", action="store_true",
                        help="No stream cache on disk used: leave streams in memory")
    parser.add_argument("-d", "--cache-remanent", action="store_true",
                        help="Equivalent to -fremanent")
    parser.add_argument("-f", "--cache-flags",
                        help=flags_help)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def setup_parser(self, parser):
    """
    Register the global options and one argparse subparser per
    command. Returns the last truthy value returned by the command
    setup_parser() calls (None when there is no command at all).
    """
    self.setup_options(parser)

    if not(self._commands):
        return
    partial = True
    subparsers = parser.add_subparsers() #title=title)
    clsused = []
    cmdobjs = []
    for cmd, cls in self._commands:
        # Don't duplicate objects used for several commands
        if cls in clsused:
            cmdobj = cmdobjs[clsused.index(cls)]
        else:
            cmdobj = cls(self)
            cmdobjs.append(cmdobj)
            clsused.append(cls)
        # Only pass a help= keyword when the command provides one
        kwargs = {}
        if cmdobj.help(cmd):
            kwargs["help"] = cmdobj.help(cmd)
        p = subparsers.add_parser(cmd, **kwargs)
        partial = cmdobj.setup_parser(p) or partial
        # Remember which object/command this subparser belongs to
        p.set_defaults(run=cmdobj.run, name=cmd)
    return partial
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def prepare(self, parser, options, argslist, pdffile):
    """
    Interpret the parsed options and set up the scanner state: the
    ordered command list, the loggers, the page ranges and the (not
    yet loaded) PDFFile.
    """
    self.options = options
    # Sort the commands in the right order: self.commands() gives the
    # execution priority, argslist the commands actually requested
    cmds = [ args.name for args in argslist ]
    self.commands_to_run = []
    for cmd in self.commands():
        if cmd in cmds:
            i = cmds.index(cmd)
            self.commands_to_run.append(argslist[i])

    log_groups = self._option_group_loglevels()
    self.logger_setup(log_groups)

    self.page_ranges = self._option_page_ranges()

    stream_manager = self._option_cache_setup()
    self.pdf = PDFFile(stream_manager=stream_manager)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def cleanup(self):
|
|
Packit |
0f19cf |
if self.pdf:
|
|
Packit |
0f19cf |
self.pdf.cleanup()
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def run(self, parser, options, argslist, pdffile):
    """
    Entry point: prepare the scanner state, load the PDF file and its
    pages, build the selected page groups, then run each requested
    command in priority order.
    """
    self.prepare(parser, options, argslist, pdffile)
    self.pdf.load(pdffile)
    self.pdf.load_pages()
    self._build_pages()
    for args in self.commands_to_run:
        # args.run was bound to the command object by set_defaults()
        args.run(parser, args)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _build_pages(self):
    # Turn each selected page range into a group of PDFPage objects
    # stored in self.page_groups
    page_count = len(self.pdf.page_objects)
    for page_range in self.page_ranges:
        # Clamp the range to the actual document size
        page_first, page_last = self._page_range(page_range, page_count)
        # Page numbers are 1-based, list indexes 0-based
        page_objects = self.pdf.page_objects[page_first-1:page_last]

        pdf_pages = self._build_pages_from_objects(page_objects, page_first)
        self.page_groups.append(pdf_pages)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _build_pages_from_objects(self, page_objects, page_first):
    """Wrap each page object into a PDFPage, numbered from page_first."""
    return [PDFPage(self.pdf, page_object, pagenum)
            for pagenum, page_object in enumerate(page_objects, page_first)]
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _page_range(self, page_range, max_range):
|
|
Packit |
0f19cf |
if not(page_range): page_range = [1, max_range]
|
|
Packit |
0f19cf |
if page_range[0] == 0: page_range[0] = 1
|
|
Packit |
0f19cf |
if page_range[1] == 0 or page_range[1] > max_range:
|
|
Packit |
0f19cf |
page_range[1] = max_range
|
|
Packit |
0f19cf |
return page_range
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _option_page_ranges(self):
|
|
Packit |
0f19cf |
page_ranges = []
|
|
Packit |
0f19cf |
if not(self.options.pages):
|
|
Packit |
0f19cf |
page_ranges.append([0, 0])
|
|
Packit |
0f19cf |
return page_ranges
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
for page_range in self.options.pages:
|
|
Packit |
0f19cf |
p1, p2 = (page_range + "-x").split("-")[0:2]
|
|
Packit |
0f19cf |
if not(p2):
|
|
Packit |
0f19cf |
p2 = 0
|
|
Packit |
0f19cf |
elif (p2 == "x"):
|
|
Packit |
0f19cf |
p2 = p1
|
|
Packit |
0f19cf |
page_ranges.append([int(p1), int(p2)])
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
return page_ranges
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _option_group_loglevels(self):
|
|
Packit |
0f19cf |
verbose = self.options.verbose
|
|
Packit |
0f19cf |
log_groups = {"pdffile": "info",
|
|
Packit |
0f19cf |
"pdfobject": "info",
|
|
Packit |
0f19cf |
"descriptor": "error",
|
|
Packit |
0f19cf |
"base": "info"}
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
log_levels = ("debug", "info", "warning", "error")
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
if not(verbose):
|
|
Packit |
0f19cf |
return log_groups
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
groups = log_groups.keys()
|
|
Packit |
0f19cf |
for verbose_opt in verbose:
|
|
Packit |
0f19cf |
group, level = ("all:" + verbose_opt).split(":")[-2:]
|
|
Packit |
0f19cf |
if not(level in log_levels):
|
|
Packit |
0f19cf |
print "Invalid verbose level: '%s'" % level
|
|
Packit |
0f19cf |
continue
|
|
Packit |
0f19cf |
if group == "all":
|
|
Packit |
0f19cf |
for group in groups:
|
|
Packit |
0f19cf |
log_groups[group] = level
|
|
Packit |
0f19cf |
elif group in groups:
|
|
Packit |
0f19cf |
log_groups[group] = level
|
|
Packit |
0f19cf |
else:
|
|
Packit |
0f19cf |
print "Invalid verbose group: '%s'" % group
|
|
Packit |
0f19cf |
continue
|
|
Packit |
0f19cf |
return log_groups
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def _option_cache_setup(self):
|
|
Packit |
0f19cf |
cache_in_memory = self.options.no_cache_stream
|
|
Packit |
0f19cf |
cache_dirname = self.options.cache_stream_dir
|
|
Packit |
0f19cf |
cache_flags = self.options.cache_flags
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
if self.options.cache_remanent:
|
|
Packit |
0f19cf |
if cache_flags:
|
|
Packit |
0f19cf |
cache_flags += ",remanent"
|
|
Packit |
0f19cf |
else:
|
|
Packit |
0f19cf |
cache_flags = "remanent"
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
flags = 0
|
|
Packit |
0f19cf |
if cache_flags:
|
|
Packit |
0f19cf |
cache_flags = cache_flags.split(",")
|
|
Packit |
0f19cf |
for cflag in cache_flags:
|
|
Packit |
0f19cf |
if cflag == "remanent":
|
|
Packit |
0f19cf |
flags = flags | StreamManager.CACHE_REMANENT
|
|
Packit |
0f19cf |
elif cflag == "refresh":
|
|
Packit |
0f19cf |
flags = flags | StreamManager.CACHE_REFRESH
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
if cache_in_memory:
|
|
Packit |
0f19cf |
mgr = StreamManager(cache_method="memory")
|
|
Packit |
0f19cf |
elif cache_dirname:
|
|
Packit |
0f19cf |
cache_dirname = os.path.realpath(cache_dirname)
|
|
Packit |
0f19cf |
if not(os.path.exists(cache_dirname)):
|
|
Packit |
0f19cf |
print "Invalid cache dir: '%s'. Temporary dir used instead" % \
|
|
Packit |
0f19cf |
cache_dirname
|
|
Packit |
0f19cf |
return None
|
|
Packit |
0f19cf |
mgr = StreamManager(cache_method="file",
|
|
Packit |
0f19cf |
cache_dirname=cache_dirname,
|
|
Packit |
0f19cf |
flags=flags)
|
|
Packit |
0f19cf |
else:
|
|
Packit |
0f19cf |
mgr = StreamManager(flags=flags)
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
return mgr
|
|
Packit |
0f19cf |
|
|
Packit |
0f19cf |
def logger_setup(self, log_groups):
    """Configure one console logger per group.

    log_groups maps a group name to a symbolic level ("error",
    "warning", "info" or "debug"); each "pdfscan.<group>" logger gets a
    plain-message StreamHandler and its threshold set accordingly.
    """
    level_map = {"error": logging.ERROR,
                 "warning": logging.WARNING,
                 "info": logging.INFO,
                 "debug": logging.DEBUG}

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(message)s"))

    for group, level in log_groups.items():
        logger = logging.getLogger("pdfscan.%s" % group)
        # Threshold is set one unit below the numeric level (unknown
        # names fall back to INFO), mirroring the original scheme.
        logger.setLevel(level_map.get(level, logging.INFO) - 1)
        logger.addHandler(handler)
|
Packit |
0f19cf |
def main():
    """Command-line entry point.

    Parses the global options plus one option chunk per chained
    subcommand, runs the scanner on the trailing PDF file argument, and
    exits with the error handler's return code.

    Fixes: `except Exception, e` (invalid in Python 3) becomes
    `except Exception as e` (valid from Python 2.6), and the
    Python-2-only `print` statement becomes the call form.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description='Scan information from a PDF file')
    parser.add_argument("-D", "--dump-stack", action="store_true",
                        help="Dump error stack (debug purpose)")

    scanner = PDFScannerCommand()
    scanner.setup_parser(parser)

    options, remain_args = parser.parse_known_args()

    # Re-parse the full command line chunk by chunk so that several
    # subcommands can be chained in a single invocation; each chunk
    # keeps a reference to what remained after it.
    argslist = []
    remain_args = sys.argv[1:]
    while len(remain_args) > 1:
        args, remain_args = parser.parse_known_args(remain_args)
        args.remain_args = remain_args
        argslist.append(args)

    # The last remaining argument must be the PDF file, not a command.
    if not remain_args or remain_args[0] in scanner.commands():
        print("Missing the PDF File")
        parser.parse_args(["-h"])

    error = ErrorHandler()
    if options.dump_stack:
        error.dump_stack()

    try:
        pdffile = remain_args[0]
        scanner.run(parser, options, argslist, pdffile)
    except Exception as e:
        error.failure_track("Error: '%s'" % (e))

    scanner.cleanup()
    sys.exit(error.rc)
|
Packit |
0f19cf |
# Run the scanner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()