From 90d587f063901efefe6a286c0e771fd050f09199 Mon Sep 17 00:00:00 2001 From: Christoph Singer Date: Tue, 17 Oct 2023 17:02:26 +0200 Subject: [PATCH] Adopt helper functions for loading HTML from Symfony --- composer.json | 2 +- src/Helpers.php | 69 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/composer.json b/composer.json index 1c5ac5b..3462de2 100644 --- a/composer.json +++ b/composer.json @@ -16,7 +16,7 @@ "php":"^8.0", "ext-dom":"*", "ext-libxml":"*", - "ext-mbstring":"*", + "symfony/polyfill-mbstring": "~1.0", "symfony/dom-crawler":"^6", "symfony/css-selector":"^6" }, diff --git a/src/Helpers.php b/src/Helpers.php index af2c3c7..55dc309 100644 --- a/src/Helpers.php +++ b/src/Helpers.php @@ -82,28 +82,55 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8') public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument { - $unsafeLibXml = \LIBXML_VERSION < 20900; - $current = libxml_use_internal_errors(true); - if($unsafeLibXml) { - $disableEntities = libxml_disable_entity_loader(true); - } - $d = new \DOMDocument('1.0', $charset); - $d->validateOnParse = true; - if (function_exists('mb_convert_encoding') && in_array( - strtolower($charset), - array_map('strtolower', mb_list_encodings()) - ) - ) { - $html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset); + return self::parseXhtml($html, $charset); + } + /** + * Function originally taken from Symfony\Component\DomCrawler\Crawler + * (c) Fabien Potencier + * License: MIT + */ + private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument + { + $htmlContent = self::convertToHtmlEntities($htmlContent, $charset); + + $internalErrors = libxml_use_internal_errors(true); + + $dom = new \DOMDocument('1.0', $charset); + $dom->validateOnParse = true; + + if ('' !== trim($htmlContent)) { + // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements + // Option LIBXML_SCHEMA_CREATE seems to prevent this + // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string + @$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE); } - // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements - // Option LIBXML_SCHEMA_CREATE seems to prevent this - // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string - @$d->loadHTML($html, \LIBXML_SCHEMA_CREATE); - libxml_use_internal_errors($current); - if($unsafeLibXml) { - libxml_disable_entity_loader($disableEntities); + + libxml_use_internal_errors($internalErrors); + + return $dom; + } + + /** + * Converts charset to HTML-entities to ensure valid parsing. + * Function taken from Symfony\Component\DomCrawler\Crawler + * (c) Fabien Potencier + * License: MIT + */ + private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string + { + set_error_handler(function () { throw new \Exception(); }); + + try { + return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset); + } catch (\Exception|\ValueError) { + try { + $htmlContent = iconv($charset, 'UTF-8', $htmlContent); + $htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'); + } catch (\Exception|\ValueError) { + } + return $htmlContent; + } finally { + restore_error_handler(); } - return $d; } }