Skip to content

Commit 14adf31

Browse files
authored
Strengthen check for UTF-8 conformity (#704)
In some cases a binary string may be reported as valid UTF-8 by `mb_check_encoding(..., 'UTF-8')` (for example, a run of `0x01` control bytes). Use a comprehensive regexp adapted from the W3C instead to be **certain** we aren't trying to parse binary content in `formatContent()`. Reference: https://www.w3.org/International/questions/qa-forms-utf-8.en
1 parent e13c495 commit 14adf31

File tree

2 files changed

+32
-7
lines changed

2 files changed

+32
-7
lines changed

src/Smalot/PdfParser/PDFObject.php

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -214,13 +214,31 @@ private function formatContent(?string $content): string
214214
return '';
215215
}
216216

217-
// Outside of (String) content in PDF document streams, all
218-
// text should conform to UTF-8. Test for binary content by
219-
// deleting everything after the first open-parenthesis ( which
220-
// indicates the beginning of a string. Then test what remains
221-
// for valid UTF-8. If it's not UTF-8, return an empty string
222-
// as this $content is most likely binary.
223-
if (false === mb_check_encoding(preg_replace('/\(.*$/s', '', $content), 'UTF-8')) {
217+
// Outside of (String) and inline image content in PDF document
218+
// streams, all text should conform to UTF-8. Test for binary
219+
// content by deleting everything after the first open-
220+
// parenthesis ( which indicates the beginning of a string, or
221+
// the first ID command which indicates the beginning of binary
222+
// inline image content. Then test what remains for valid
223+
// UTF-8. If it's not UTF-8, return an empty string as this
224+
// $content is most likely binary. Unfortunately, using
225+
// mb_check_encoding(..., 'UTF-8') is not strict enough, so the
226+
// following regexp, adapted from the W3, is used. See:
227+
// https://www.w3.org/International/questions/qa-forms-utf-8.en
228+
// We use preg_replace() instead of preg_match() to avoid "JIT
229+
// stack limit exhausted" errors on larger files.
230+
$utf8Filter = preg_replace('/(
231+
[\x09\x0A\x0D\x20-\x7E] | # ASCII
232+
[\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte
233+
\xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs
234+
[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte
235+
\xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates
236+
\xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3
237+
[\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15
238+
\xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
239+
)/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));
240+
241+
if ('' !== $utf8Filter) {
224242
return '';
225243
}
226244

tests/PHPUnit/Integration/PDFObjectTest.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,13 @@ public function testFormatContent(): void
284284

285285
// Binary check is done before a regexp that causes an error
286286
$this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
287+
288+
// mb_check_encoding(..., 'UTF-8') returns true here,
289+
// necessitating a test for UTF-8 that's more strict
290+
$content = hex2bin('0101010101010101');
291+
$cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);
292+
293+
$this->assertEquals('', $cleaned);
287294
}
288295

289296
public function testGetSectionsText(): void

0 commit comments

Comments
 (0)