Skip to content

Commit 768d1d6

Browse files
yasheenak00ni
andauthored
Fixes #478 (/Index problem) (#479)
* Add files via upload Fixing problem of incomplete analysis of the /Index entry. * Delete RawDataParser.php Wrong subdirectory. * Add files via upload Fix problem of uncomplete analysis of /Index entry. * Update RawDataParser.php optical changes * Update RawDataParser.php optical changes * Update RawDataParser.php optical changes * Add files via upload After adding a description to the file, the valid /Index entry now contains two entries (consisting of 2 values: first object number, number of objects): /Index[2 1 21 2] * Update RawDataParserTest.php Adding test for issue 479 * Update RawDataParserTest.php Forgot a { * Update RawDataParser.php Code style update * Update RawDataParserTest.php Added more description and more checks. * Update PageTest.php Issue #331 is fixed by issue #479: test updated * Update RawDataParserTest.php optical fix * Update PageTest.php optical changes * Update RawDataParser.php change to remove the native_function_invocation message * Update tests/Integration/PageTest.php Co-authored-by: Konrad Abicht <[email protected]> * Update RawDataParser.php Added comments... * Update RawDataParser.php Changes for CS fixer * Update PageTest.php Comment update * Update tests/Integration/PageTest.php Co-authored-by: Konrad Abicht <[email protected]> Co-authored-by: Konrad Abicht <[email protected]>
1 parent a216ccd commit 768d1d6

File tree

4 files changed

+65
-10
lines changed

4 files changed

+65
-10
lines changed

samples/bugs/Issue479.pdf

10.5 KB
Binary file not shown.

src/Smalot/PdfParser/RawData/RawDataParser.php

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -269,8 +269,11 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
269269
) {
270270
$valid_crs = true;
271271
} elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) {
272-
// first object number in the subsection
273-
$index_first = (int) ($sarr[($k + 1)][1][0][1]);
272+
// initialize list for: first object number in the subsection / number of objects
273+
$index_blocks = [];
274+
for ($m = 0; $m < \count($sarr[($k + 1)][1]); $m += 2) {
275+
$index_blocks[] = [$sarr[($k + 1)][1][$m][1], $sarr[($k + 1)][1][$m + 1][1]];
276+
}
274277
} elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
275278
// get previous xref offset
276279
$prevxref = (int) ($sarr[($k + 1)][1]);
@@ -432,8 +435,9 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
432435
}
433436

434437
// fill xref
435-
if (isset($index_first)) {
436-
$obj_num = $index_first;
438+
if (isset($index_blocks)) {
439+
// load the first object number of the first /Index entry
440+
$obj_num = $index_blocks[0][0];
437441
} else {
438442
$obj_num = 0;
439443
}
@@ -463,6 +467,21 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
463467
break;
464468
}
465469
++$obj_num;
470+
if (isset($index_blocks)) {
471+
// reduce the number of remaining objects
472+
--$index_blocks[0][1];
473+
if (0 == $index_blocks[0][1]) {
474+
// remove the exhausted /Index entry (its remaining object count has reached zero)
475+
array_shift($index_blocks);
476+
if (0 < \count($index_blocks)) {
477+
// load the first object number of the following /Index entry
478+
$obj_num = $index_blocks[0][0];
479+
} else {
480+
// if there are no more entries, remove $index_blocks to avoid actions on an empty array
481+
unset($index_blocks);
482+
}
483+
}
484+
}
466485
}
467486
} // end decoding data
468487
if (isset($prevxref)) {

tests/Integration/PageTest.php

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -486,12 +486,17 @@ public function testGetPages()
486486
$document = $this->getParserInstance()->parseFile($filename);
487487
$pages = $document->getPages();
488488

489-
// This should actually be 3 pages, but as long as the cause for issue #331
490-
// has not been found and the issue is not fixed, we'll settle for 2 here.
491-
// We still test for the count, so in case the bug should be fixed
492-
// unknowingly, we don't forget to resolve the issue as well and make sure
493-
// this assertion is present.
494-
$this->assertCount(2, $pages);
489+
/*
490+
* The problem of issue #331 is fixed by the pull request of the issue #479.
491+
* The original Issue331.pdf was modified, so for the updated version (current
492+
* version) a new xref was added and now the valid /Index has the following value:
493+
* [1 1 3 1 7 1 175 1 178 1 219 2]
494+
* This means that there are 6 pairs containing the values for 'first object id'
495+
* and 'number of objects'. Till now only the first entry was used and so the
496+
* objects of all following entries got a wrong id.
497+
* By the fix of issue #479 now the expected number of pages is counted.
498+
*/
499+
$this->assertCount(3, $pages);
495500

496501
foreach ($pages as $page) {
497502
$this->assertTrue($page instanceof Page);

tests/Integration/RawData/RawDataParserTest.php

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,35 @@ public function testDecodeObjectHeaderIssue405()
119119

120120
$this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
121121
}
122+
123+
/**
124+
* Regression test for decodeXrefStream and /Index arrays with several entries.
125+
*
126+
* When a PDF has more than one entry in the /Index area (for example after changing
127+
* the document description), only the first entry was used before the fix.
128+
* Without the fix, the array returned by getDetails() contains only the entry
129+
* with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
130+
* 'Subject' (which come from the 'Info' object) are not listed, because the
131+
* 'Info' object gets a wrong object id while the data is parsed into the xref structure.
132+
* So the object id listed at the /Info entry is not valid and the data of the info object
133+
* cannot be loaded while executing Document::buildDetails().
134+
*
135+
* @see https://github.com/smalot/pdfparser/pull/479
136+
*/
137+
public function testDecodeXrefStreamIssue479()
138+
{
139+
$filename = $this->rootDir.'/samples/bugs/Issue479.pdf';
140+
141+
$parser = $this->getParserInstance();
142+
$document = $parser->parseFile($filename);
143+
$details = $document->getDetails();
144+
145+
$this->assertArrayHasKey('Author', $details);
146+
$this->assertArrayHasKey('CreationDate', $details);
147+
$this->assertArrayHasKey('Creator', $details);
148+
$this->assertArrayHasKey('ModDate', $details);
149+
$this->assertArrayHasKey('Producer', $details);
150+
$this->assertArrayHasKey('Subject', $details);
151+
$this->assertArrayHasKey('Title', $details);
152+
}
122153
}

0 commit comments

Comments
 (0)