From e8de7cde1a99a72e9d9bb54de54ae7343826315b Mon Sep 17 00:00:00 2001 From: Christoph Singer Date: Fri, 13 Oct 2023 18:13:45 +0200 Subject: [PATCH] Protect closing tags in html strings within scripts --- Tests/HtmlPageTest.php | 23 +++++++++++++++++++++++ src/Helpers.php | 16 +++++++++++++--- src/HtmlPage.php | 21 +-------------------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/Tests/HtmlPageTest.php b/Tests/HtmlPageTest.php index 1dbd50a..e74f6db 100644 --- a/Tests/HtmlPageTest.php +++ b/Tests/HtmlPageTest.php @@ -344,4 +344,27 @@ public function testSaveOnFileName() $hp->save(vfsStream::url('root/save.html')); $this->assertFileExists(vfsStream::url('root/save.html')); } + + public function testEmbeddedScriptWithHtml() + { + // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements + // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string + $html = << + + + test + + +
+ +
+ + +END; + $hp = new HtmlPage($html); + $this->assertEquals($html . "\n", $hp->save()); + } } diff --git a/src/Helpers.php b/src/Helpers.php index f481168..af2c3c7 100644 --- a/src/Helpers.php +++ b/src/Helpers.php @@ -74,8 +74,15 @@ public static function cssArrayToString($array) */ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8') { - $unsafeLibXml = \LIBXML_VERSION < 20900; + $html = '' . $html . ''; + $d = self::loadHtml($html, $charset); + return $d->getElementsByTagName('body')->item(0); + } + + public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument + { + $unsafeLibXml = \LIBXML_VERSION < 20900; $current = libxml_use_internal_errors(true); if($unsafeLibXml) { $disableEntities = libxml_disable_entity_loader(true); @@ -89,11 +96,14 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8') ) { $html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset); } - @$d->loadHTML($html); + // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements + // Option LIBXML_SCHEMA_CREATE seems to prevent this + // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string + @$d->loadHTML($html, \LIBXML_SCHEMA_CREATE); libxml_use_internal_errors($current); if($unsafeLibXml) { libxml_disable_entity_loader($disableEntities); } - return $d->getElementsByTagName('body')->item(0); + return $d; } } diff --git a/src/HtmlPage.php b/src/HtmlPage.php index 624c162..a1b6951 100644 --- a/src/HtmlPage.php +++ b/src/HtmlPage.php @@ -41,31 +41,12 @@ class HtmlPage public function __construct($content = '', $url = '', $charset = 'UTF-8') { - $unsafeLibXml = \LIBXML_VERSION < 20900; $this->charset = $charset; $this->url = $url; if ($content == '') { $content = ''; } - $current = libxml_use_internal_errors(true); - if($unsafeLibXml) { - $disableEntities = libxml_disable_entity_loader(true); - } - - $this->dom = new \DOMDocument('1.0', $charset); - $this->dom->validateOnParse = true; - - - if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) { - $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); - } - - @$this->dom->loadHTML($content); - - libxml_use_internal_errors($current); - if($unsafeLibXml) { - libxml_disable_entity_loader($disableEntities); - } + $this->dom = Helpers::loadHtml($content, $charset); $this->crawler = new HtmlPageCrawler($this->dom); }