Strengthen check for UTF-8 conformity (#704)

GreyWyvern · web-flow · commit 14adf318f862 · 2024-04-29T08:36:50.000+02:00
In some cases a binary string may be reported as valid UTF-8 by the `mb_check_encoding(..., 'UTF-8')` function. Use a comprehensive regular expression from the W3C instead to be **certain** we aren't trying to parse binary content in `formatContent()`. Reference: https://www.w3.org/International/questions/qa-forms-utf-8.en
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -214,13 +214,31 @@ private function formatContent(?string $content): string
             return '';
         }
 
-        // Outside of (String) content in PDF document streams, all
-        // text should conform to UTF-8. Test for binary content by
-        // deleting everything after the first open-parenthesis ( which
-        // indicates the beginning of a string. Then test what remains
-        // for valid UTF-8. If it's not UTF-8, return an empty string
-        // as this $content is most likely binary.
-        if (false === mb_check_encoding(preg_replace('/\(.*$/s', '', $content), 'UTF-8')) {
+        // Outside of (String) and inline image content in PDF document
+        // streams, all text should conform to UTF-8. Test for binary
+        // content by deleting everything after the first open-
+        // parenthesis ( which indicates the beginning of a string, or
+        // the first ID command which indicates the beginning of binary
+        // inline image content. Then test what remains for valid
+        // UTF-8. If it's not UTF-8, return an empty string as this
+        // $content is most likely binary. Unfortunately, using
+        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
+        // following regexp, adapted from the W3, is used. See:
+        // https://www.w3.org/International/questions/qa-forms-utf-8.en
+        // We use preg_replace() instead of preg_match() to avoid "JIT
+        // stack limit exhausted" errors on larger files.
+        $utf8Filter = preg_replace('/(
+            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
+            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
+            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
+            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
+            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
+            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
+            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
+            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
+        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));
+
+        if ('' !== $utf8Filter) {
             return '';
         }
 
diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php
@@ -284,6 +284,13 @@ public function testFormatContent(): void
 
         // Binary check is done before a regexp that causes an error
         $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
+
+        // mb_check_encoding(..., 'UTF-8') returns true here,
+        // necessitating a test for UTF-8 that's more strict
+        $content = hex2bin('0101010101010101');
+        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);
+
+        $this->assertEquals('', $cleaned);
     }
 
     public function testGetSectionsText(): void