summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-09 18:37:40 +0000
committerAndres Rey <[email protected]>2017-11-09 18:37:40 +0000
commit6437e87b77eec9870ac042298731c8cf34fc8b74 (patch)
treebc5ecdd36b4b193fbcd82cd3a4a88532c5a82d20 /src
parent0eb5fe0c29ac48f1beee2bdc0c9984e9d7ea85f9 (diff)
Better detection of empty paragraphs
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php11
-rw-r--r--src/Readability.php12
2 files changed, 21 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 8f00eda..1345bc1 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -1454,11 +1454,18 @@ class HTMLParser
private function hasSinglePNode(Readability $node)
{
// There should be exactly 1 element child which is a P:
- // And there should be no text nodes with real content (param true on ->getChildren)
- if (count($children = $node->getChildren(true)) !== 1 || !$children[0]->tagNameEqualsTo('p')) {
+ if (count($children = $node->getChildren()) !== 1 || !$children[0]->tagNameEqualsTo('p')) {
return false;
}
+ // And there should be no text nodes with real content (param true on ->getChildren)
+ foreach ($children as $child) {
+ /** @var $child Readability */
+ if ($child->nodeTypeEqualsTo(XML_TEXT_NODE) && !preg_match('/\S$/', $child->getTextContent())) {
+ return false;
+ }
+ }
+
return true;
}
diff --git a/src/Readability.php b/src/Readability.php
index b0a5830..922e60f 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -78,6 +78,18 @@ class Readability extends Element implements ReadabilityInterface
}
/**
+ * Checks for the node type.
+ *
+ * @param string $value Type of node to compare to
+ *
+ * @return bool
+ */
+ public function nodeTypeEqualsTo($value)
+ {
+ return $this->node->nodeType === $value;
+ }
+
+ /**
* Get the ancestors of the current node.
*
* @param int $maxLevel Max amount of ancestors to get.