Skip to content

Commit

Permalink
Adopt helper functions for loading HTML from Symfony
Browse files Browse the repository at this point in the history
  • Loading branch information
Christoph Singer committed Oct 17, 2023
1 parent e8de7cd commit 90d587f
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 22 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"php":"^8.0",
"ext-dom":"*",
"ext-libxml":"*",
"ext-mbstring":"*",
"symfony/polyfill-mbstring": "~1.0",
"symfony/dom-crawler":"^6",
"symfony/css-selector":"^6"
},
Expand Down
69 changes: 48 additions & 21 deletions src/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,28 +82,55 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')

public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
{
$unsafeLibXml = \LIBXML_VERSION < 20900;
$current = libxml_use_internal_errors(true);
if($unsafeLibXml) {
$disableEntities = libxml_disable_entity_loader(true);
}
$d = new \DOMDocument('1.0', $charset);
$d->validateOnParse = true;
if (function_exists('mb_convert_encoding') && in_array(
strtolower($charset),
array_map('strtolower', mb_list_encodings())
)
) {
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);
return self::parseXhtml($html, $charset);
}
/**
* Function originally taken from Symfony\Component\DomCrawler\Crawler
* (c) Fabien Potencier <fabien@symfony.com>
* License: MIT
*/
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
$htmlContent = self::convertToHtmlEntities($htmlContent, $charset);

$internalErrors = libxml_use_internal_errors(true);

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;

if ('' !== trim($htmlContent)) {
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
// Option LIBXML_SCHEMA_CREATE seems to prevent this
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
@$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
}
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
// Option LIBXML_SCHEMA_CREATE seems to prevent this
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
libxml_use_internal_errors($current);
if($unsafeLibXml) {
libxml_disable_entity_loader($disableEntities);

libxml_use_internal_errors($internalErrors);

return $dom;
}

/**
* Converts the non-ASCII characters of a string in the given charset to numeric HTML entities to ensure valid parsing.
* Function taken from Symfony\Component\DomCrawler\Crawler
* (c) Fabien Potencier <fabien@symfony.com>
* License: MIT
*/
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
// Temporarily turn any PHP warning/notice raised below into an exception,
// so that failures of the conversion functions reach the catch blocks.
set_error_handler(function () { throw new \Exception(); });

try {
// First attempt: read the input as $charset and replace every code point
// in the range 0x80-0x10FFFF with its numeric character reference.
return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
} catch (\Exception|\ValueError) {
try {
// Fallback: transcode the input to UTF-8 with iconv first,
// then run the same entity encoding on the UTF-8 string.
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
$htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
} catch (\Exception|\ValueError) {
// Deliberately ignored (best effort): $htmlContent is returned below
// in whatever state it had when the fallback failed.
}
return $htmlContent;
} finally {
// Always remove the temporary error handler installed above, on every exit path.
restore_error_handler();
}
// NOTE(review): this statement is unreachable (both the try and the catch branch return)
// and $d is not defined in this function; it looks like a removed line left over
// from the diff rendering. Confirm against the actual file.
return $d;
}
}

0 comments on commit 90d587f

Please sign in to comment.