Skip to content

Commit 98d31ba

Browse files
authored
Stop Image XObjects being included as empty strings in PDFObject::getTextArray(). (#775)
* Stop Image XObjects being included as empty strings in PDFObject::getTextArray().
* Fix code style issues.
* Add test coverage to PDFObject::getTextArray() to cover the Do command case.
1 parent 8452a04 commit 98d31ba

File tree

2 files changed

+60
-10
lines changed

2 files changed

+60
-10
lines changed

src/Smalot/PdfParser/PDFObject.php

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -775,16 +775,27 @@ public function getTextArray(?Page $page = null): array
775775
break;
776776

777777
case 'Do':
778-
if (null !== $page) {
779-
$args = preg_split('/\s/s', $command[self::COMMAND]);
780-
$id = trim(array_pop($args), '/ ');
781-
$xobject = $page->getXObject($id);
782-
783-
// @todo $xobject could be a ElementXRef object, which would then throw an error
784-
if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
785-
// Not a circular reference.
786-
$text[] = $xobject->getText($page);
787-
}
778+
if (is_null($page)) {
779+
break;
780+
}
781+
782+
$args = preg_split('/\s/s', $command[self::COMMAND]);
783+
$id = trim(array_pop($args), '/ ');
784+
$xobject = $page->getXObject($id);
785+
786+
// Check we got a PDFObject back.
787+
if (!$xobject instanceof self) {
788+
break;
789+
}
790+
791+
// If the PDFObject is an image, do nothing, as images aren't text.
792+
if ($xobject instanceof Image) {
793+
break;
794+
}
795+
796+
// Check this is not a circular reference.
797+
if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
798+
$text[] = $xobject->getText($page);
788799
}
789800
break;
790801

tests/PHPUnit/Unit/PDFObjectTest.php

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@
66

77
use PHPUnitTests\TestCase;
88
use Smalot\PdfParser\Document;
9+
use Smalot\PdfParser\Element;
10+
use Smalot\PdfParser\Element\ElementArray;
11+
use Smalot\PdfParser\Header;
912
use Smalot\PdfParser\Page;
1013
use Smalot\PdfParser\PDFObject;
14+
use Smalot\PdfParser\XObject\Image;
1115

1216
class PDFObjectTest extends TestCase
1317
{
@@ -22,4 +26,39 @@ public function testGetTextOnPageWithoutContent(): void
2226

2327
static::assertSame(' ', (new PDFObject($document, null, null))->getText(new Page($document)));
2428
}
29+
30+
/**
 * Checks how getTextArray() treats XObjects invoked through the "Do" operator.
 *
 * Two pages are built by hand: one whose only XObject resource is an Image
 * and one whose only XObject resource is a plain PDFObject. The image page
 * is expected to yield an empty text array, while the plain-object page is
 * expected to yield a single ' ' entry.
 */
public function testTextArrayObjects(): void
{
    $document = new Document();
    $document->init();

    $image = new Image($document);
    $xObject = new PDFObject($document);

    // The name passed to the Do operator must match the resource key ('Im0',
    // with a digit zero); otherwise the operator refers to a resource this
    // page does not define and the image case is not exercised at all.
    $header1 = new Header([
        'Resources' => new Header([
            'XObject' => new Header([
                'Im0' => $image,
            ]),
        ]),
        'Contents' => new ElementArray([new Element('/Im0 Do', $document)], $document),
    ]);
    $page1 = new Page($document, $header1);

    $header2 = new Header([
        'Resources' => new Header([
            'XObject' => new Header([
                'Ps0' => $xObject,
            ]),
        ]),
        'Contents' => new ElementArray([new Element('/Ps0 Do', $document)], $document),
    ]);
    $page2 = new Page($document, $header2);

    // Page 1 contains an image, which should not appear in the text array.
    self::assertSame([], $page1->getTextArray());

    // Page 2 contains a non-image object, which should appear in the text array.
    self::assertSame([' '], $page2->getTextArray());
}
2564
}

0 commit comments

Comments
 (0)