diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dbe1e80..b5b1a49 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -14,7 +14,7 @@ jobs: strategy: matrix: - php: [7.4, 8.0, 8.1, 8.2] + php: [7.4, 8.0, 8.1, 8.2, 8.3] dependency-version: [prefer-lowest, prefer-stable] steps: diff --git a/src/Helpers.php b/src/Helpers.php index af2c3c7..a0b06b8 100644 --- a/src/Helpers.php +++ b/src/Helpers.php @@ -82,28 +82,61 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8') public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument { - $unsafeLibXml = \LIBXML_VERSION < 20900; - $current = libxml_use_internal_errors(true); - if($unsafeLibXml) { + return self::parseXhtml($html, $charset); + } + /** + * Function originally taken from Symfony\Component\DomCrawler\Crawler + * (c) Fabien Potencier + * License: MIT + */ + private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument + { + $htmlContent = self::convertToHtmlEntities($htmlContent, $charset); + + $internalErrors = libxml_use_internal_errors(true); + if (\LIBXML_VERSION < 20900) { $disableEntities = libxml_disable_entity_loader(true); } - $d = new \DOMDocument('1.0', $charset); - $d->validateOnParse = true; - if (function_exists('mb_convert_encoding') && in_array( - strtolower($charset), - array_map('strtolower', mb_list_encodings()) - ) - ) { - $html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset); + + $dom = new \DOMDocument('1.0', $charset); + $dom->validateOnParse = true; + + if ('' !== trim($htmlContent)) { + // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements + // Option LIBXML_SCHEMA_CREATE seems to prevent this + // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string + @$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE); } - // PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements - // Option LIBXML_SCHEMA_CREATE seems to prevent this - // see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string - @$d->loadHTML($html, \LIBXML_SCHEMA_CREATE); - libxml_use_internal_errors($current); - if($unsafeLibXml) { + + libxml_use_internal_errors($internalErrors); + if (\LIBXML_VERSION < 20900) { libxml_disable_entity_loader($disableEntities); } - return $d; + + return $dom; + } + + /** + * Converts charset to HTML-entities to ensure valid parsing. + * Function taken from Symfony\Component\DomCrawler\Crawler + * (c) Fabien Potencier + * License: MIT + */ + private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string + { + set_error_handler(function () { throw new \Exception(); }); + + try { + return mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset); + } catch (\Exception|\ValueError $e) { + try { + $htmlContent = iconv($charset, 'UTF-8', $htmlContent); + $htmlContent = mb_encode_numericentity($htmlContent, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'); + } catch (\Exception|\ValueError $e) { + } + return $htmlContent; + } finally { + restore_error_handler(); + } } }