From 6437e87b77eec9870ac042298731c8cf34fc8b74 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Thu, 9 Nov 2017 18:37:40 +0000 Subject: Better detection of empty paragraphs --- src/HTMLParser.php | 11 +++++++++-- src/Readability.php | 12 ++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 8f00eda..1345bc1 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -1454,11 +1454,18 @@ class HTMLParser private function hasSinglePNode(Readability $node) { // There should be exactly 1 element child which is a P: - // And there should be no text nodes with real content (param true on ->getChildren) - if (count($children = $node->getChildren(true)) !== 1 || !$children[0]->tagNameEqualsTo('p')) { + if (count($children = $node->getChildren()) !== 1 || !$children[0]->tagNameEqualsTo('p')) { return false; } + // And there should be no text nodes with real content (param true on ->getChildren) + foreach ($children as $child) { + /** @var $child Readability */ + if ($child->nodeTypeEqualsTo(XML_TEXT_NODE) && !preg_match('/\S$/', $child->getTextContent())) { + return false; + } + } + return true; } diff --git a/src/Readability.php b/src/Readability.php index b0a5830..922e60f 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -77,6 +77,18 @@ class Readability extends Element implements ReadabilityInterface return false; } + /** + * Checks for the node type. + * + * @param string $value Type of node to compare to + * + * @return bool + */ + public function nodeTypeEqualsTo($value) + { + return $this->node->nodeType === $value; + } + /** * Get the ancestors of the current node. * -- cgit v1.2.3