summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-12-11 12:21:45 +0000
committerAndres Rey <[email protected]>2016-12-11 12:21:45 +0000
commit7acb69c7f6391fa2121466cd4d013727dac64e36 (patch)
tree6cb7d0671807eeb584db10256f97feadeffd037e /src
parent3f80edf8a2ff66be411815e12ceeb1d4ea584d46 (diff)
Added option to filter empty DOMText nodes while getting children.
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php15
-rw-r--r--src/Readability.php19
2 files changed, 24 insertions, 10 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 022f616..27e376d 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -318,7 +318,7 @@ class HTMLParser
* algorithm with DIVs which are, in practice, paragraphs.
*/
if ($this->hasSinglePNode($node)) {
- $pNode = $node->getChildren()[0];
+ $pNode = $node->getChildren(true)[0];
$node->replaceChild($pNode);
$node = $pNode;
} elseif (!$this->hasSingleChildBlockElement($node)) {
@@ -877,17 +877,12 @@ class HTMLParser
private function hasSinglePNode(Readability $node)
{
// There should be exactly 1 element child which is a P:
- if ($node->hasChildren()) {
- $children = $node->getChildren();
-
- if (count($children) === 1) {
- if ($children[0]->tagNameEqualsTo('p')) {
- return true;
- }
- }
+ // And there should be no text nodes with real content (param true on ->getChildren)
+ if (count($children = $node->getChildren(true)) !== 1 || !$children[0]->tagNameEqualsTo('p')) {
+ return false;
}
- return false;
+ return true;
}
private function hasSingleChildBlockElement(Readability $node)
diff --git a/src/Readability.php b/src/Readability.php
index e3122dd..9255bc6 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -476,4 +476,23 @@ class Readability extends Element implements ReadabilityInterface
}
return false;
}
+
+ /**
+  * Collect the direct child nodes of this element, each wrapped in a new instance of the called class.
+  *
+  * @param bool $filterEmptyDOMText When true, text nodes whose value is empty or whitespace-only are skipped.
+  *
+  * @return array List of wrapped child nodes, in document order.
+  */
+ public function getChildren($filterEmptyDOMText = false)
+ {
+     $ret = array();
+     /** @var \DOMNode $node */
+     foreach ($this->node->childNodes as $node) {
+         // Compare against '' explicitly: a truthiness test would also discard a text node
+         // whose content is "0", because PHP treats the string "0" as false.
+         if ($filterEmptyDOMText && $node->nodeName === '#text' && trim($node->nodeValue) === '') {
+             continue;
+         }
+
+         $ret[] = new static($node);
+     }
+
+     return $ret;
+ }
}