summary refs log tree commit diff
path: root/src/HTMLParser.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r-- src/HTMLParser.php 4
1 files changed, 3 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 5589e83..f35a950 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -37,6 +37,8 @@ class HTMLParser
'prevLink' => '/(prev|earl|old|new|<|«)/i',
'whitespace' => '/^\s*$/',
'hasContent' => '/\S$/',
+ // \x{00A0} is U+00A0 NO-BREAK SPACE, the Unicode character that the &nbsp; entity decodes to
+ 'onlyWhitespace' => '/\x{00A0}|\s+/u'
];
private $defaultTagsToScore = [
@@ -1261,7 +1263,7 @@ class HTMLParser
$iframeCount = $paragraph->getElementsByTagName('iframe')->length;
$totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
- if ($totalCount === 0 && !trim($paragraph->textContent)) {
+ if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) {
// TODO must be done via readability
$paragraph->parentNode->removeChild($paragraph);
}