summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-11-12 01:32:03 +0000
committer Andres Rey <[email protected]> 2017-11-12 01:32:03 +0000
commit fcf8ba9de0a532433178686bde2f78afc6e063c2 (patch)
tree 4c550f7a85efe12d5b02316b3408c2b366777f8e
parent 00ab1e503a7d804a19365ffe3ee19cd9ca9f8643 (diff)
Add new regexp to check for whitespace, including the Unicode version of &nbsp;
-rw-r--r-- src/HTMLParser.php 4
-rw-r--r-- src/Readability.php 3
2 files changed, 5 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 5589e83..f35a950 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -37,6 +37,8 @@ class HTMLParser
'prevLink' => '/(prev|earl|old|new|<|«)/i',
'whitespace' => '/^\s*$/',
'hasContent' => '/\S$/',
+ // \x{00A0} is the unicode version of &nbsp;
+ 'onlyWhitespace' => '/\x{00A0}|\s+/u'
];
private $defaultTagsToScore = [
@@ -1261,7 +1263,7 @@ class HTMLParser
$iframeCount = $paragraph->getElementsByTagName('iframe')->length;
$totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
- if ($totalCount === 0 && !trim($paragraph->textContent)) {
+ if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) {
// TODO must be done via readability
$paragraph->parentNode->removeChild($paragraph);
}
diff --git a/src/Readability.php b/src/Readability.php
index b5bc723..8d856d8 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -528,7 +528,8 @@ class Readability extends Element implements ReadabilityInterface
public function isElementWithoutContent()
{
return ($this->node instanceof \DOMElement &&
- mb_strlen(trim($this->node->textContent)) === 0 &&
+ // /\x{00A0}|\s+/u TODO to be replaced with regexps array
+ mb_strlen(preg_replace('/\x{00A0}|\s+/u','',$this->node->textContent)) === 0 &&
($this->node->childNodes->length === 0 ||
$this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length ||
/*