Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added getNodes() #20

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ php:
- 5.5
- 5.6
- 7.0
- 7.1
- hhvm

env:
Expand Down
4 changes: 2 additions & 2 deletions Tests/HtmlPageCrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public function testManipulationFunctions()
$this->assertEquals('<p>Ein neuer <b>Inhalt</b></p>', $content->getInnerHtml());

$content->prepend('<h1>Neue Überschrift');
$this->assertEquals('<h1>Neue &Uuml;berschrift</h1><p>Ein neuer <b>Inhalt</b></p>', $content->getInnerHtml());
$this->assertEquals('<h1>Neue Überschrift</h1><p>Ein neuer <b>Inhalt</b></p>', $content->getInnerHtml());

$h1 = $content->filter('h1');
$this->assertEquals('Neue Überschrift', $h1->text());
Expand Down Expand Up @@ -440,7 +440,7 @@ public function testUTF8Characters()
$c = HtmlPageCrawler::create($text);

$expected =<<< END
<p style="margin: 0cm 0cm 0pt;"><span>Die Burse&nbsp;wurde unmittelbar (1478 bis 1482) nach der Universit&auml;tsgr&uuml;ndung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Geb&auml;ude im Stil des Klassizismus zum ersten T&uuml;binger Klinikum umgebaut. Einer der ersten Patienten war Friedrich H&ouml;lderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.</span></p><p style="margin: 0cm 0cm 0pt;"><span>Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat &ndash; r&uuml;ckw&auml;rts gewandt &ndash; einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.</span></p>
<p style="margin: 0cm 0cm 0pt;"><span>Die Burse wurde unmittelbar (1478 bis 1482) nach der Universitätsgründung als Studentenwohnhaus und -lehranstalt errichtet. Hier lehrte der Humanist und Reformator Philipp Melanchthon bis zu seiner Berufung nach Wittenberg 1518, an ihn erinnert eine Gedenktafel. 1803 bis 1805 wurde das Gebäude im Stil des Klassizismus zum ersten Tübinger Klinikum umgebaut. Einer der ersten Patienten war Friedrich Hölderlin, der nach einer 231 Tage dauernden Behandlung am 3. Mai 1807 als unheilbar entlassen wurde.</span></p><p style="margin: 0cm 0cm 0pt;"><span>Einst Badeanstalt vor der Stadtmauer. Wer durch das kleine Stadttor geht, hat – rückwärts gewandt einen guten Blick auf die Stadtbefestigung mit "Pechnasen" und Spuren des alten Wehrgangs.</span></p>
END;

$this->assertEquals($expected, $c->filter('p')->saveHTML());
Expand Down
2 changes: 1 addition & 1 deletion Tests/HtmlPageTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public function testHtmlPage()
$content = '<h1>Überschrift</h1><p>bla bla <br><b>fett</b></p>';
$hp->setHtmlById('content', $content);
// echo $hp;
$this->assertEquals(mb_convert_encoding($content, 'HTML-ENTITIES', 'utf8'), $hp->getElementById('content')->getInnerHtml());
$this->assertEquals($content, $hp->getElementById('content')->getInnerHtml());

$url = 'http://www.tuebingen.de/';
$hp->setBaseHref($url);
Expand Down
7 changes: 6 additions & 1 deletion src/HtmlPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public function __construct($content = '', $url = '', $charset = 'UTF-8')
$disableEntities = libxml_disable_entity_loader(true);

$this->dom = new \DOMDocument('1.0', $charset);
$this->dom->loadHTML('<meta http-equiv="Content-Type" content="text/html;charset='.$charset.'">');
$this->dom->validateOnParse = true;


Expand Down Expand Up @@ -254,7 +255,11 @@ public function getBody()

public function __toString()
{
return $this->dom->saveHTML();
$html = $this->dom->saveHTML();
if (function_exists('mb_convert_encoding') && in_array(strtolower($this->charset), array_map('strtolower', mb_list_encodings()))) {
$html = mb_convert_encoding($html, $this->charset, 'HTML-ENTITIES');
}
return $html;
}

/**
Expand Down
54 changes: 31 additions & 23 deletions src/HtmlPageCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,11 @@ public function html($html = null)
*/
public function getInnerHtml()
{
$node = $this->getNode(0);
if ($node instanceof \DOMNode) {
$doc = new \DOMDocument('1.0', 'UTF-8');
$doc->appendChild($doc->importNode($node, true));
$html = trim($doc->saveHTML());
$tag = $node->nodeName;
return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
} else {
return '';
$html = '';
foreach ($this->getNode(0)->childNodes as $node) {
$html .= $node->ownerDocument->saveHTML($node);
}
return $html;
}

/**
Expand Down Expand Up @@ -875,25 +870,24 @@ public function wrapInner($content)
/**
* Get the HTML code fragment of all elements and their contents.
*
* If the first node contains a complete HTML document return only
* the full code of this document.
* If the first node contains a complete HTML document, the output
* starts with the document's DocType (if one exists).
*
* @return string HTML code (fragment)
* @api
*/
public function saveHTML()
{
if ($this->isHtmlDocument()) {
return $this->getDOMDocument()->saveHTML();
} else {
$doc = new \DOMDocument('1.0', 'UTF-8');
$root = $doc->appendChild($doc->createElement('_root'));
foreach ($this as $node) {
$root->appendChild($doc->importNode($node, true));
}
$html = trim($doc->saveHTML());
return preg_replace('@^<'.self::FRAGMENT_ROOT_TAGNAME.'[^>]*>|</'.self::FRAGMENT_ROOT_TAGNAME.'>$@', '', $html);
$html = '';
if ( $this->isHtmlDocument() ) {
/* Output DocType if exists */
$documentHtml = $this->getDOMDocument()->saveHTML();
$html .= preg_match("/<!DOCTYPE.*?>/is", $documentHtml, $match) ? $match[0]."\n" : '';
}
foreach ($this as $node) {
$html .= $node->ownerDocument->saveHTML($node);
}
return $html;
}

public function __toString()
Expand Down Expand Up @@ -928,7 +922,7 @@ public function isHtmlDocument()
*/
public function getDOMDocument()
{
$node = $this->getNode(0);
$node = $this->getNode(0);
$r = null;
if ($node instanceof \DOMElement
&& $node->ownerDocument instanceof \DOMDocument
Expand Down Expand Up @@ -1026,6 +1020,20 @@ public function getNode($position)
return parent::getNode($position);
}

/**
* Get all nodes of this crawler as an array, so that they can be
* looped over with foreach.
*
* @return HtmlPageCrawler[]
*/
public function getNodes()
{
    // Identity callback: whatever each() passes in is handed back unchanged,
    // so the result of each() is simply the list of those items.
    $passThrough = function ($item) {
        return $item;
    };

    return $this->each($passThrough);
}

/**
* Returns the node name of the first node of the list.
*
Expand Down Expand Up @@ -1097,7 +1105,7 @@ public function isDisconnected()
$parent = $this->getNode(0)->parentNode;
return ($parent == null || $parent->tagName == self::FRAGMENT_ROOT_TAGNAME);
}

public function __get($name)
{
switch ($name) {
Expand Down