summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/HTMLParser.php 15
-rw-r--r-- src/Readability.php 19
2 files changed, 24 insertions, 10 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 022f616..27e376d 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -318,7 +318,7 @@ class HTMLParser
* algorithm with DIVs with are, in practice, paragraphs.
*/
if ($this->hasSinglePNode($node)) {
- $pNode = $node->getChildren()[0];
+ $pNode = $node->getChildren(true)[0];
$node->replaceChild($pNode);
$node = $pNode;
} elseif (!$this->hasSingleChildBlockElement($node)) {
@@ -877,17 +877,12 @@ class HTMLParser
private function hasSinglePNode(Readability $node)
{
// There should be exactly 1 element child which is a P:
- if ($node->hasChildren()) {
- $children = $node->getChildren();
-
- if (count($children) === 1) {
- if ($children[0]->tagNameEqualsTo('p')) {
- return true;
- }
- }
+ // No text node with real content is allowed either: getChildren(true) drops only the empty text nodes.
+ if (count($children = $node->getChildren(true)) !== 1 || !$children[0]->tagNameEqualsTo('p')) {
+ return false;
}
- return false;
+ return true;
}
private function hasSingleChildBlockElement(Readability $node)
diff --git a/src/Readability.php b/src/Readability.php
index e3122dd..9255bc6 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -476,4 +476,23 @@ class Readability extends Element implements ReadabilityInterface
}
return false;
}
+
+ /**
+ * Wrap every child of the underlying DOM node in a new instance of this class.
+ * @param bool $filterEmptyDOMText Skip #text nodes whose value is empty after trim()?
+ * @return array
+ */
+ public function getChildren($filterEmptyDOMText = false)
+ {
+ $ret = array();
+ /** @var \DOMNode $node */
+ foreach ($this->node->childNodes as $node) {
+ // Compare against '' explicitly: a text node holding "0" is falsy in PHP but is real content.
+ if ($filterEmptyDOMText && $node->nodeName === '#text' && trim($node->nodeValue) === '') {
+ continue;
+ }
+ $ret[] = new static($node);
+ }
+ return $ret;
+ }
}