summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-12 12:48:49 +0000
committerAndres Rey <[email protected]>2017-11-12 12:48:49 +0000
commitd39bdd7ef4dd5976b1a2b6d8dc2917a8e830b80e (patch)
tree3339addc29d2d756381387e8287f55bafb820964 /src
parentd51c2bcf367bd4b80a91d2eaaff30f485033ba90 (diff)
Remove empty or just whitespace P elements during rating
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php3
-rw-r--r--src/Readability.php18
2 files changed, 14 insertions, 7 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index f35a950..5d86065 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -673,7 +673,8 @@ class HTMLParser
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
if (($node->tagNameEqualsTo('div') || $node->tagNameEqualsTo('section') || $node->tagNameEqualsTo('header') ||
$node->tagNameEqualsTo('h1') || $node->tagNameEqualsTo('h2') || $node->tagNameEqualsTo('h3') ||
- $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6')) &&
+ $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6') ||
+ $node->tagNameEqualsTo('p')) &&
$node->isElementWithoutContent()) {
$node = $node->removeAndGetNext($node);
continue;
diff --git a/src/Readability.php b/src/Readability.php
index 8d856d8..c8ce01a 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -529,15 +529,21 @@ class Readability extends Element implements ReadabilityInterface
{
return ($this->node instanceof \DOMElement &&
// /\x{00A0}|\s+/u TODO to be replaced with regexps array
- mb_strlen(preg_replace('/\x{00A0}|\s+/u','',$this->node->textContent)) === 0 &&
+ mb_strlen(preg_replace('/\x{00A0}|\s+/u', '', $this->node->textContent)) === 0 &&
($this->node->childNodes->length === 0 ||
- $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length ||
+ $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length
/*
- * Special DOMDocument case: When there's an empty tag with a space inside, like "<h3> </h3>", the
- * previous if will fail because DOMElement will say that it has one node inside (A DOMText) and this
- * in JS doesn't happens. So here we check if we have exactly one node, and that node is a DOMText one.
+ * Special DOMDocument case: We also need to count how many DOMText we have inside the node.
+ * If there's an empty tag with a space inside and a BR (for example "<p> <br/></p>"), counting only BRs and
+ * HRs will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
+ * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
+ * are dealing with (And at this point we know they are empty or are just whitespace, because of the
+ * mb_strlen in this chain of checks).
*/
- ($this->node->childNodes->length === 1 && $this->node->childNodes->item(0)->nodeType === XML_TEXT_NODE)
+ + count(array_filter(iterator_to_array($this->node->childNodes), function ($child) {
+ return $child instanceof \DOMText;
+ }))
+
));
}
}