From 899fa414dcb07565d5ae2cc01349ddfe5d06a2b2 Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Fri, 6 Mar 2020 15:18:31 +0100
Subject: [PATCH 1/6] Fix object stream parsing

Commit 8cc27b6a broke object stream parsing by resetting the content
cursor PDFParser.charCounter to zero on every invocation of
readUntilSymbol(). Reproducer:

$ echo -e "create pdf\ncreate object_stream\nall\nsave /tmp/foo.pdf" | \
	peepdf -i

Without fix:

$ peepdf -j /tmp/foo.pdf
Error: An error has occurred while parsing an indirect object!!

With this change: JSON output as expected (same for other outputs).

$ peepdf -j /tmp/foo.pdf
{
    "peepdf_analysis": {
[...]
           "version": "0.3"
        }
    }
}

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
---
 peepdf/PDFCore.py | 1 -
 1 file changed, 1 deletion(-)
diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index a61d776..44e56f8 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -8153,7 +8153,6 @@ def readUntilSymbol(self, string, symbol):
 
         newString = string[self.charCounter:]
 
-        self.charCounter = 0
         index = newString.find(symbol)
         if index == -1:
             errorMessage = 'Symbol "'+symbol+'" not found'

From d7f0be5052bd27e881326d0878da6584c580bf55 Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Fri, 6 Mar 2020 18:25:01 +0100
Subject: [PATCH 2/6] Delay reading of objects until references are resolved

For unclear reasons, PDFObjectStream.update() delays decoding of the
modified raw stream until all references can be resolved. It does
however then go on to always try to extract objects from the still empty
decoded stream. This produces an error from peepdf cli:

$ peepdf image.php
Error: An error has occurred while parsing an indirect object!!

The error from PDFObjectStream.update() is "Missing offsets in object
stream" because self.decodedStream is still empty at that point, making
offsetsSection and eventually the numbers list empty, causing the abort.

This is triggered by /Length being a reference and setting updateNeeded
to True. Sample: https://www.infotek.co.jp/pdflib/demo/sample/image.php.
Relevant PDF structure:

32 0 obj
<</Length 43 0 R/Filter/FlateDecode/Type/ObjStm/N 7/First 47>>
stream
[...]
endstream
endobj
43 0 obj
461
endobj

(/Length in the dict of object 32 references object 43, which contains
461, presumably the length of the stream - which does not seem to be
used or checked for consistency by peepdf atm, btw.)

This resolves the first half of jesparza/peepdf#70 in that force mode is
no longer necessary to parse such files at all.

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
---
 peepdf/PDFCore.py | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index 44e56f8..4fc312a 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -3000,29 +3000,31 @@ def update(self, modifiedCompressedObjects=False, onlyElements=False, decrypt=Fa
                                     if self.isEncodedStream:
                                         self.decode()
                                 self.size = len(self.rawStream)
-                        offsetsSection = self.decodedStream[:self.firstObjectOffset]
-                        objectsSection = self.decodedStream[self.firstObjectOffset:]
-                        numbers = re.findall('\d{1,10}', offsetsSection)
-                        if numbers != [] and len(numbers) % 2 == 0:
-                            for i in range(0, len(numbers), 2):
-                                id = int(numbers[i])
-                                offset = int(numbers[i+1])
-                                ret = PDFParser().readObject(objectsSection[offset:])
-                                if ret[0] == -1:
-                                    if isForceMode:
-                                        object = None
-                                        self.addError(ret[1])
+
+                        if not self.updateNeeded:
+                            offsetsSection = self.decodedStream[:self.firstObjectOffset]
+                            objectsSection = self.decodedStream[self.firstObjectOffset:]
+                            numbers = re.findall('\d{1,10}', offsetsSection)
+                            if numbers != [] and len(numbers) % 2 == 0:
+                                for i in range(0, len(numbers), 2):
+                                    id = int(numbers[i])
+                                    offset = int(numbers[i+1])
+                                    ret = PDFParser().readObject(objectsSection[offset:])
+                                    if ret[0] == -1:
+                                        if isForceMode:
+                                            object = None
+                                            self.addError(ret[1])
+                                        else:
+                                            return ret
                                     else:
-                                        return ret
-                                else:
-                                    object = ret[1]
-                                self.compressedObjectsDict[id] = [offset, object]
-                                self.indexes.append(id)
-                        else:
-                            if isForceMode:
-                                self.addError('Missing offsets in object stream')
+                                        object = ret[1]
+                                    self.compressedObjectsDict[id] = [offset, object]
+                                    self.indexes.append(id)
                             else:
-                                return (-1, 'Missing offsets in object stream')
+                                if isForceMode:
+                                    self.addError('Missing offsets in object stream')
+                                else:
+                                    return (-1, 'Missing offsets in object stream')
                     elif modifiedCompressedObjects:
                         tmpStreamObjects = ''
                         tmpStreamObjectsInfo = ''

From 431e872eaed18587e038f102f46c7b546221a601 Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Fri, 6 Mar 2020 19:39:10 +0100
Subject: [PATCH 3/6] Avoid TypeError on reference resolution

With the previous change deferring reading of objects from the decoded
stream until references can be resolved, it now runs into
jesparza/peepdf#70. This change takes a different approach to fixing it
than #6 by syncing the code with the other locations where identical code
is in use:

1. Force the numbers extracted by re.findall to int() as before,
   avoiding the TypeError exception:

Traceback (most recent call last):
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/main.py", line 409, in main
    ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis)
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 7117, in parse
    ret = body.updateObjects()
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 4291, in updateObjects
    object.resolveReferences()
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 3256, in resolveReferences
    ret = PDFParser.readObject(objectsSection[offset:])
TypeError: slice indices must be integers or None or have an __index__ method

2. Instantiate a new PDFParser object by adding the missing braces,
   avoiding another TypeError because readObject is no class method:

Traceback (most recent call last):
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/main.py", line 409, in main
    ret, pdf = pdfParser.parse(fileName, options.isForceMode, options.isLooseMode, options.isManualAnalysis)
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 7118, in parse
    ret = body.updateObjects()
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 4292, in updateObjects
    object.resolveReferences()
  File "peepdf-venv2/lib64/python2.7/site-packages/peepdf/PDFCore.py", line 3256, in resolveReferences
    ret = PDFParser.readObject(objectsSection[offset:])
TypeError: unbound method readObject() must be called with PDFParser instance as first argument (got str instance instead)

3. Explicitly force the id to be an int() as well and append it to the
   list of indices as at the other callsites of this code. This solves
   no issue I have run into but seems sensible to avoid other potential
   TypeErrors and keep internal bookkeeping of the object consistent.

This should conclusively resolve jesparza/peepdf#70 and supersedes #6.

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
---
 peepdf/PDFCore.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index 4fc312a..6905bb8 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -3251,8 +3251,9 @@ def resolveReferences(self):
                 numbers = re.findall('\d{1,10}', offsetsSection)
                 if numbers != [] and len(numbers) % 2 == 0:
                     for i in range(0, len(numbers), 2):
-                        offset = numbers[i+1]
-                        ret = PDFParser.readObject(objectsSection[offset:])
+                        id = int(numbers[i])
+                        offset = int(numbers[i+1])
+                        ret = PDFParser().readObject(objectsSection[offset:])
                         if ret[0] == -1:
                             if isForceMode:
                                 object = None
@@ -3261,7 +3262,8 @@ def resolveReferences(self):
                                 return ret
                         else:
                             object = ret[1]
-                        self.compressedObjectsDict[numbers[i]] = [offset, object]
+                        self.compressedObjectsDict[id] = [offset, object]
+                        self.indexes.append(id)
                 else:
                     errorMessage = 'Missing offsets in object stream'
                     if isForceMode:

From d1015554cd4953f3c82ca012d234f2279e025345 Mon Sep 17 00:00:00 2001
From: Anselm Kruis <anselm.kruis@atos.net>
Date: Fri, 19 Jun 2020 13:32:11 +0200
Subject: [PATCH 4/6] Fix PDFParser.readSymbol(), if white-space characters
 precede the symbol

In PDF files the Cross-Reference Table or a Cross-Reference Stream
contains byte offsets for the start of objects within the file or the
uncompressed stream. Such an offset does not always point to the first byte
of the initial token (see ISO 32000-1:2008 section 7.2.2) of the referenced
object. The object may be preceded by white-space characters and comments.

Without this commit PDFParser.readSymbol() fails to read a symbol, if the
first character to be processed is a white-space character. This commit
changes PDFParser.readSymbol() to skip leading white-space characters.
(PDFParser.readSymbol() already skips any number of leading comments followed
by white-space characters.) This enables parsing of PDF files with sloppy
cross-reference offsets.
---
 peepdf/PDFCore.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index 6905bb8..8a6d197 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -8003,6 +8003,7 @@ def readSymbol(self, string, symbol, deleteSpaces=True):
             errorMessage = 'EOF while looking for symbol "'+symbol+'"'
             pdfFile.addError(errorMessage)
             return (-1, errorMessage)
+        self.readSpaces(string)
         while string[self.charCounter] == '%':
             ret = self.readUntilEndOfLine(string)
             if ret[0] == -1:

From f50847fcf1551db26c55a45ffb5fc6593fb5e46f Mon Sep 17 00:00:00 2001
From: Anselm Kruis <anselm.kruis@atos.net>
Date: Fri, 19 Jun 2020 18:00:44 +0200
Subject: [PATCH 5/6] Add the missing method PDFArray.getJSCode()

An object of class PDFArray can contain JS-code, if one or more array-elements
contain JS-code. The getter method was simply missing.
---
 peepdf/PDFCore.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index 8a6d197..81ba762 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -1217,6 +1217,14 @@ def setElements(self, newElements):
         ret = self.update()
         return ret
 
+    def getJSCode(self):
+        '''
+            Gets the Javascript code of the object
+
+            @return: An array of Javascript code sections
+        '''
+        return self.JSCode
+
 
 class PDFDictionary(PDFObject):
     def __init__(self, rawContent='', elements={}, rawNames={}):

From 90720a4b28bb3e83b062c97b6f938e33fb8c5e41 Mon Sep 17 00:00:00 2001
From: Michael Weiser <michael.weiser@gmx.de>
Date: Wed, 1 Jul 2020 11:36:46 +0200
Subject: [PATCH 6/6] Handle sloppy cross references more and less generically

A previous commit adjusted readSymbol() to skip leading whitespace in
order to avoid errors with sloppy cross references. This did not fix
handling of literals such as numbers and booleans in readObject()
because they're not accessed using readSymbol(). Also, adjusting the
very low-level readSymbol() function might generate fallout.

So instead, this change moves the skipping of leading whitespace into
readObject() so that it affects all types of referenced objects equally
but not all symbol lookups altogether.

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
---
 peepdf/PDFCore.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/peepdf/PDFCore.py b/peepdf/PDFCore.py
index 81ba762..28d24eb 100644
--- a/peepdf/PDFCore.py
+++ b/peepdf/PDFCore.py
@@ -7857,6 +7857,11 @@ def readObject(self, content, objectType=None, forceMode=False, looseMode=False)
         pdfObject = None
         oldCounter = self.charCounter
         self.charCounter = 0
+        # skip leading whitespace in case of sloppy reference offsets
+        self.readSpaces(content)
+        if self.charCounter > 0:
+            content = content[self.charCounter:]
+            self.charCounter = 0
         if objectType is not None:
             objectsTypeArray = [self.delimiters[i][2] for i in range(len(self.delimiters))]
             index = objectsTypeArray.index(objectType)
@@ -8011,7 +8016,6 @@ def readSymbol(self, string, symbol, deleteSpaces=True):
             errorMessage = 'EOF while looking for symbol "'+symbol+'"'
             pdfFile.addError(errorMessage)
             return (-1, errorMessage)
-        self.readSpaces(string)
         while string[self.charCounter] == '%':
             ret = self.readUntilEndOfLine(string)
             if ret[0] == -1: