From 8da101889fdbd72f220b1acaeb64b15f76a4a301 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 03:11:38 +0800 Subject: [PATCH 01/18] Update HtmlPageCrawler.php Replace `_root` by constant `self::FRAGMENT_ROOT_TAGNAME` --- src/HtmlPageCrawler.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index e91cefc..9654551 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -887,7 +887,7 @@ public function saveHTML() return $this->getDOMDocument()->saveHTML(); } else { $doc = new \DOMDocument('1.0', 'UTF-8'); - $root = $doc->appendChild($doc->createElement('_root')); + $root = $doc->appendChild($doc->createElement(self::FRAGMENT_ROOT_TAGNAME)); foreach ($this as $node) { $root->appendChild($doc->importNode($node, true)); } From 332ffcc3e14878536bd8353a054583726477baa9 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 03:27:00 +0800 Subject: [PATCH 02/18] Update HtmlPageCrawler.php Try to fix utf-8 encoding problem. --- src/HtmlPageCrawler.php | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 9654551..f08e36b 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -383,10 +383,11 @@ public function getInnerHtml() $node = $this->getNode(0); if ($node instanceof \DOMNode) { $doc = new \DOMDocument('1.0', 'UTF-8'); + $doc->loadHTML(''); $doc->appendChild($doc->importNode($node, true)); $html = trim($doc->saveHTML()); $tag = $node->nodeName; - return preg_replace('@^<' . $tag . '[^>]*>|$@', '', $html); + return preg_replace('@^.*<' . $tag . '[^>]*>|$@s', '', $html); } else { return ''; } @@ -883,17 +884,19 @@ public function wrapInner($content) */ public function saveHTML() { - if ($this->isHtmlDocument()) { - return $this->getDOMDocument()->saveHTML(); - } else { + /* don't see any reason we should handle the complete HTML document seperately. 
*/ + // if ($this->isHtmlDocument()) { + // return $this->getDOMDocument()->saveHTML(); + // } else { $doc = new \DOMDocument('1.0', 'UTF-8'); + $doc->loadHTML(''); $root = $doc->appendChild($doc->createElement(self::FRAGMENT_ROOT_TAGNAME)); foreach ($this as $node) { $root->appendChild($doc->importNode($node, true)); } $html = trim($doc->saveHTML()); - return preg_replace('@^<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|$@', '', $html); - } + return preg_replace('@^.*<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|$@s', '', $html); + // } } public function __toString() From 478d5766831c4ee56f8974cd02d825d55a9887ee Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 03:32:07 +0800 Subject: [PATCH 03/18] Change tab indents to space indents. --- src/HtmlPageCrawler.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index f08e36b..52b44ec 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -383,7 +383,7 @@ public function getInnerHtml() $node = $this->getNode(0); if ($node instanceof \DOMNode) { $doc = new \DOMDocument('1.0', 'UTF-8'); - $doc->loadHTML(''); + $doc->loadHTML(''); $doc->appendChild($doc->importNode($node, true)); $html = trim($doc->saveHTML()); $tag = $node->nodeName; @@ -884,12 +884,12 @@ public function wrapInner($content) */ public function saveHTML() { - /* don't see any reason we should handle the complete HTML document seperately. */ - // if ($this->isHtmlDocument()) { + /* don't see any reason we should handle the complete HTML document seperately. 
*/ + // if ($this->isHtmlDocument()) { // return $this->getDOMDocument()->saveHTML(); // } else { $doc = new \DOMDocument('1.0', 'UTF-8'); - $doc->loadHTML(''); + $doc->loadHTML(''); $root = $doc->appendChild($doc->createElement(self::FRAGMENT_ROOT_TAGNAME)); foreach ($this as $node) { $root->appendChild($doc->importNode($node, true)); From 02d7a6fa78dd6253e538426a0132dd4913a4a994 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 03:40:52 +0800 Subject: [PATCH 04/18] Fixes UTF-8 encoding problem. --- src/HtmlPage.php | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/HtmlPage.php b/src/HtmlPage.php index ac63bdf..092b211 100644 --- a/src/HtmlPage.php +++ b/src/HtmlPage.php @@ -50,6 +50,7 @@ public function __construct($content = '', $url = '', $charset = 'UTF-8') $disableEntities = libxml_disable_entity_loader(true); $this->dom = new \DOMDocument('1.0', $charset); + $this->dom->loadHTML(''); $this->dom->validateOnParse = true; @@ -265,11 +266,15 @@ public function __toString() */ public function save($filename = '') { + $html = $this->__toString(); + if (function_exists('mb_convert_encoding') && in_array(strtolower($this->charset), array_map('strtolower', mb_list_encodings()))) { + $html = mb_convert_encoding($html, $this->charset, 'HTML-ENTITIES'); + } if ($filename != '') { - file_put_contents($filename, $this->__toString()); + file_put_contents($filename, $html); return; } else { - return $this->__toString(); + return $html; } } From 7bf5088ed99e72790ddbd65682d38ac601d3ab47 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 03:44:00 +0800 Subject: [PATCH 05/18] `mb_convert_encoding` to charset in `__toString()` --- src/HtmlPage.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/HtmlPage.php b/src/HtmlPage.php index 092b211..63a52e4 100644 --- a/src/HtmlPage.php +++ b/src/HtmlPage.php @@ -255,7 +255,11 @@ public function getBody() public function __toString() { - return 
$this->dom->saveHTML(); + $html = $this->dom->saveHTML(); + if (function_exists('mb_convert_encoding') && in_array(strtolower($this->charset), array_map('strtolower', mb_list_encodings()))) { + $html = mb_convert_encoding($html, $this->charset, 'HTML-ENTITIES'); + } + return $html; } /** @@ -266,15 +270,11 @@ public function __toString() */ public function save($filename = '') { - $html = $this->__toString(); - if (function_exists('mb_convert_encoding') && in_array(strtolower($this->charset), array_map('strtolower', mb_list_encodings()))) { - $html = mb_convert_encoding($html, $this->charset, 'HTML-ENTITIES'); - } if ($filename != '') { - file_put_contents($filename, $html); + file_put_contents($filename, $this->__toString()); return; } else { - return $html; + return $this->__toString(); } } From 5c33cbe2f5676f74d22c1f4ff84396b6b329cb51 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 04:03:43 +0800 Subject: [PATCH 06/18] Use parent method instead. --- src/HtmlPageCrawler.php | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 52b44ec..48c41df 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -380,17 +380,7 @@ public function html($html = null) */ public function getInnerHtml() { - $node = $this->getNode(0); - if ($node instanceof \DOMNode) { - $doc = new \DOMDocument('1.0', 'UTF-8'); - $doc->loadHTML(''); - $doc->appendChild($doc->importNode($node, true)); - $html = trim($doc->saveHTML()); - $tag = $node->nodeName; - return preg_replace('@^.*<' . $tag . '[^>]*>|$@s', '', $html); - } else { - return ''; - } + return parent::html(); } /** @@ -884,19 +874,11 @@ public function wrapInner($content) */ public function saveHTML() { - /* don't see any reason we should handle the complete HTML document seperately. 
*/ - // if ($this->isHtmlDocument()) { - // return $this->getDOMDocument()->saveHTML(); - // } else { - $doc = new \DOMDocument('1.0', 'UTF-8'); - $doc->loadHTML(''); - $root = $doc->appendChild($doc->createElement(self::FRAGMENT_ROOT_TAGNAME)); - foreach ($this as $node) { - $root->appendChild($doc->importNode($node, true)); - } - $html = trim($doc->saveHTML()); - return preg_replace('@^.*<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|$@s', '', $html); - // } + $html = ''; + foreach ($this as $node) { + $html .= $node->ownerDocument->saveHTML($node); + } + return $html; } public function __toString() From 94a2c3c29ee686e32265d73a68b3244676833129 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 05:23:51 +0800 Subject: [PATCH 07/18] Correct some tests --- Tests/HtmlPageCrawlerTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/HtmlPageCrawlerTest.php b/Tests/HtmlPageCrawlerTest.php index 9c2880d..29f6f0f 100644 --- a/Tests/HtmlPageCrawlerTest.php +++ b/Tests/HtmlPageCrawlerTest.php @@ -43,7 +43,7 @@ public function testManipulationFunctions() $this->assertEquals('

Ein neuer Inhalt

', $content->getInnerHtml()); $content->prepend('

Neue Überschrift'); - $this->assertEquals('

Neue Überschrift

Ein neuer Inhalt

', $content->getInnerHtml()); + $this->assertEquals('

Neue Überschrift

Ein neuer Inhalt

', $content->getInnerHtml()); $h1 = $content->filter('h1'); $this->assertEquals('Neue Überschrift', $h1->text()); @@ -440,7 +440,7 @@ public function testUTF8Characters() $c = HtmlPageCrawler::create($text); $expected =<<< END -

Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.

Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt – einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.

+

Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.

Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt – einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.

END; $this->assertEquals($expected, $c->filter('p')->saveHTML()); From 7d90051b1683d83ca89763f0de44e162283962ac Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 05:25:12 +0800 Subject: [PATCH 08/18] Correct some tests. --- Tests/HtmlPageTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/HtmlPageTest.php b/Tests/HtmlPageTest.php index cf68a3f..3454e5a 100644 --- a/Tests/HtmlPageTest.php +++ b/Tests/HtmlPageTest.php @@ -34,7 +34,7 @@ public function testHtmlPage() $content = '

Überschrift

bla bla
fett

'; $hp->setHtmlById('content', $content); // echo $hp; - $this->assertEquals(mb_convert_encoding($content, 'HTML-ENTITIES', 'utf8'), $hp->getElementById('content')->getInnerHtml()); + $this->assertEquals($content, $hp->getElementById('content')->getInnerHtml()); $url = 'http://www.tuebingen.de/'; $hp->setBaseHref($url); From 50ab050d5767551ebcad9653ef0bec7d31a053b7 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 05:37:19 +0800 Subject: [PATCH 09/18] Output DocType if exists --- src/HtmlPageCrawler.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 48c41df..d46c3a3 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -875,8 +875,11 @@ public function wrapInner($content) public function saveHTML() { $html = ''; + /* Output DocType if exists */ + $documentHtml = $this->getDOMDocument()->saveHTML(); + $html .= preg_match("//is", $documentHtml, $match) ? $match[0]."\n" : ''; foreach ($this as $node) { - $html .= $node->ownerDocument->saveHTML($node); + $html .= $this->getDOMDocument()->saveHTML($node); } return $html; } From d086de62d440d72508cf9eb7f86db405d208d84b Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 05:47:18 +0800 Subject: [PATCH 10/18] trim every node in saveHTML --- src/HtmlPageCrawler.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index d46c3a3..b13f99a 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -879,7 +879,7 @@ public function saveHTML() $documentHtml = $this->getDOMDocument()->saveHTML(); $html .= preg_match("//is", $documentHtml, $match) ? 
$match[0]."\n" : ''; foreach ($this as $node) { - $html .= $this->getDOMDocument()->saveHTML($node); + $html .= trim($this->getDOMDocument()->saveHTML($node)); } return $html; } From 84005f3cd473378afb091fa01791794fdc4dd1be Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 05:50:33 +0800 Subject: [PATCH 11/18] Adding php 7.1 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 3bd07e4..b75da2c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ php: - 5.5 - 5.6 - 7.0 + - 7.1 - hhvm env: From a7dfad235c53fe380ddf51863c5127f5ae4d41fe Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 06:10:43 +0800 Subject: [PATCH 12/18] Fixes DocType Don't output DocType if only `saveHTML` of elements. --- src/HtmlPageCrawler.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index b13f99a..de6a110 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -875,9 +875,11 @@ public function wrapInner($content) public function saveHTML() { $html = ''; - /* Output DocType if exists */ - $documentHtml = $this->getDOMDocument()->saveHTML(); - $html .= preg_match("//is", $documentHtml, $match) ? $match[0]."\n" : ''; + if ( $this->isHtmlDocument() ) { + /* Output DocType if exists */ + $documentHtml = $this->getDOMDocument()->saveHTML(); + $html .= preg_match("//is", $documentHtml, $match) ? 
$match[0]."\n" : ''; + } foreach ($this as $node) { $html .= trim($this->getDOMDocument()->saveHTML($node)); } return $html; } From c43221520c20a5341cfa8e27a9e86ea9459e31bb Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 12:20:25 +0800 Subject: [PATCH 13/18] Changes Comment --- src/HtmlPageCrawler.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index de6a110..23e8744 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -866,8 +866,8 @@ public function wrapInner($content) /** * Get the HTML code fragment of all elements and their contents. * - * If the first node contains a complete HTML document return only - * the full code of this document. + * If the first node contains a complete HTML document return the + * DocType if exists * * @return string HTML code (fragment) * @api @@ -918,7 +918,7 @@ public function isHtmlDocument() */ public function getDOMDocument() { - $node = $this->getNode(0); + $node = $this->getNode(0); $r = null; if ($node instanceof \DOMElement && $node->ownerDocument instanceof \DOMDocument From c2975e503c03aa695b3f811515255cb251b07290 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 12:46:58 +0800 Subject: [PATCH 14/18] Bug Fixes --- src/HtmlPageCrawler.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 23e8744..2f1474b 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -1,4 +1,5 @@ /is", $documentHtml, $match) ?
$match[0]."\n" : ''; } foreach ($this as $node) { - $html .= trim($this->getDOMDocument()->saveHTML($node)); + $html .= trim($node->ownerDocument->saveHTML($node)); } return $html; } From acfa3de6c4112d13317ec30ea2bf6b5f6b489248 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 14:50:18 +0800 Subject: [PATCH 15/18] Trim getInnerHTML for each element --- src/HtmlPageCrawler.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 2f1474b..9e7e8df 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -1,5 +1,4 @@ getNode(0)->childNodes as $node) { + $html .= trim($node->ownerDocument->saveHTML($node)); + } + echo $html; } /** From 1ea192047b78ff6d7e091030d6d1214b37c9a6a7 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 14:57:18 +0800 Subject: [PATCH 16/18] Fix Typo mistake --- src/HtmlPageCrawler.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index 9e7e8df..beda7ac 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -384,7 +384,7 @@ public function getInnerHtml() foreach ($this->getNode(0)->childNodes as $node) { $html .= trim($node->ownerDocument->saveHTML($node)); } - echo $html; + return $html; } /** From abbcc399d133b6b5b53b74cbd1d3238378283473 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 15:19:29 +0800 Subject: [PATCH 17/18] using rtrim instead of trim --- src/HtmlPageCrawler.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index beda7ac..acf68de 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -382,7 +382,7 @@ public function getInnerHtml() { $html = ''; foreach ($this->getNode(0)->childNodes as $node) { - $html .= trim($node->ownerDocument->saveHTML($node)); + $html .= rtrim($node->ownerDocument->saveHTML($node), "\n"); } return $html; } @@ 
-885,7 +885,7 @@ public function saveHTML() $html .= preg_match("//is", $documentHtml, $match) ? $match[0]."\n" : ''; } foreach ($this as $node) { - $html .= trim($node->ownerDocument->saveHTML($node)); + $html .= rtrim($node->ownerDocument->saveHTML($node), "\n"); } return $html; } From 2b20848474ac2f57e76d200a0ea6e1de04d26313 Mon Sep 17 00:00:00 2001 From: Kyle Tse Date: Tue, 28 Feb 2017 16:57:58 +0800 Subject: [PATCH 18/18] Cancel trimming on elements I think we should keep the newline and whitespace. --- src/HtmlPageCrawler.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HtmlPageCrawler.php b/src/HtmlPageCrawler.php index acf68de..13040a4 100644 --- a/src/HtmlPageCrawler.php +++ b/src/HtmlPageCrawler.php @@ -382,7 +382,7 @@ public function getInnerHtml() { $html = ''; foreach ($this->getNode(0)->childNodes as $node) { - $html .= rtrim($node->ownerDocument->saveHTML($node), "\n"); + $html .= $node->ownerDocument->saveHTML($node); } return $html; } @@ -885,7 +885,7 @@ public function saveHTML() $html .= preg_match("//is", $documentHtml, $match) ? $match[0]."\n" : ''; } foreach ($this as $node) { - $html .= rtrim($node->ownerDocument->saveHTML($node), "\n"); + $html .= $node->ownerDocument->saveHTML($node); } return $html; }