diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 4 | ||||
-rw-r--r-- | src/Readability.php | 3 |
2 files changed, 5 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5589e83..f35a950 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -37,6 +37,8 @@ class HTMLParser 'prevLink' => '/(prev|earl|old|new|<|«)/i', 'whitespace' => '/^\s*$/', 'hasContent' => '/\S$/', + // \x{00A0} is the unicode version of &nbsp; + 'onlyWhitespace' => '/\x{00A0}|\s+/u' ]; private $defaultTagsToScore = [ @@ -1261,7 +1263,7 @@ class HTMLParser $iframeCount = $paragraph->getElementsByTagName('iframe')->length; $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount; - if ($totalCount === 0 && !trim($paragraph->textContent)) { + if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) { // TODO must be done via readability $paragraph->parentNode->removeChild($paragraph); } diff --git a/src/Readability.php b/src/Readability.php index b5bc723..8d856d8 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -528,7 +528,8 @@ class Readability extends Element implements ReadabilityInterface public function isElementWithoutContent() { return ($this->node instanceof \DOMElement && - mb_strlen(trim($this->node->textContent)) === 0 && + // /\x{00A0}|\s+/u TODO to be replaced with regexps array + mb_strlen(preg_replace('/\x{00A0}|\s+/u','',$this->node->textContent)) === 0 && ($this->node->childNodes->length === 0 || $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length || /* |