Skip to content

Commit

Permalink
Protect closing tags in html strings within scripts
Browse files Browse the repository at this point in the history
(cherry picked from commit e8de7cd)
  • Loading branch information
Christoph Singer authored and wasinger committed Oct 16, 2023
1 parent b407e89 commit 5147809
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 23 deletions.
23 changes: 23 additions & 0 deletions Tests/HtmlPageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -344,4 +344,27 @@ public function testSaveOnFileName()
$hp->save(vfsStream::url('root/save.html'));
$this->assertFileExists(vfsStream::url('root/save.html'));
}

/**
 * Regression test: HTML markup inside a JavaScript string literal must survive
 * a load/save round trip through HtmlPage unchanged.
 *
 * DOMDocument::loadHTML() tends to "eat" closing tags (such as "</b>" or "</div>")
 * that occur in string literals within <script> elements; see
 * https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
 */
public function testEmbeddedScriptWithHtml()
{
    // The heredoc body is intentionally left unindented: it is compared verbatim below.
    $html = <<<END
<!DOCTYPE html>
<html lang="de">
<head>
<title>test</title>
</head>
<body>
<div>
<script>
var html = '<b>Status</b><div>' + it_status_text + '</div>';
</script>
</div>
</body>
</html>
END;
    $page = new HtmlPage($html);

    // The saved document is expected to be the input plus one trailing newline.
    // assertSame (instead of assertEquals) additionally pins the result to be a string.
    $this->assertSame($html . "\n", $page->save());
}
}
16 changes: 13 additions & 3 deletions src/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,15 @@ public static function cssArrayToString($array)
*/
public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
{
    // Wrap the fragment in a minimal document skeleton so that there is a <body>
    // element to hand back. The libxml error/entity-loader handling is done inside
    // loadHtml(), so no libxml version check is needed here.
    $document = self::loadHtml('<html><body>' . $html . '</body></html>', $charset);

    // Return the first <body> element of the parsed document.
    return $document->getElementsByTagName('body')->item(0);
}

public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
{
$unsafeLibXml = \LIBXML_VERSION < 20900;
$current = libxml_use_internal_errors(true);
if($unsafeLibXml) {
$disableEntities = libxml_disable_entity_loader(true);
Expand All @@ -89,11 +96,14 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
) {
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
}
@$d->loadHTML($html);
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
// Option LIBXML_SCHEMA_CREATE seems to prevent this
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
libxml_use_internal_errors($current);
if($unsafeLibXml) {
libxml_disable_entity_loader($disableEntities);
}
return $d->getElementsByTagName('body')->item(0);
return $d;
}
}
21 changes: 1 addition & 20 deletions src/HtmlPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,12 @@ class HtmlPage

/**
 * Build a page from an HTML string.
 *
 * @param string $content HTML source to parse; when empty, a minimal HTML5 skeleton is used instead
 * @param string $url     stored on the instance as the page URL
 * @param string $charset stored on the instance and passed to Helpers::loadHtml() as the input charset
 */
public function __construct($content = '', $url = '', $charset = 'UTF-8')
{
    $this->charset = $charset;
    $this->url = $url;

    // Loose comparison is kept on purpose so that null is treated like an empty string, as before.
    if ($content == '') {
        $content = '<!DOCTYPE html><html><head><title></title></head><body></body></html>';
    }

    // Parsing (including charset conversion and libxml error / entity-loader handling)
    // is delegated to Helpers::loadHtml(). Parsing inline first and then overwriting
    // $this->dom would do the work twice and feed already entity-converted content
    // into a second charset conversion.
    $this->dom = Helpers::loadHtml($content, $charset);
    $this->crawler = new HtmlPageCrawler($this->dom);
}

Expand Down

0 comments on commit 5147809

Please sign in to comment.