diff options
author | Andres Rey <[email protected]> | 2017-11-12 01:32:03 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-11-12 01:32:03 +0000 |
commit | fcf8ba9de0a532433178686bde2f78afc6e063c2 (patch) | |
tree | 4c550f7a85efe12d5b02316b3408c2b366777f8e | |
parent | 00ab1e503a7d804a19365ffe3ee19cd9ca9f8643 (diff) |
Add new regexp to check for whitespace, including the Unicode version of &nbsp; (U+00A0)
-rw-r--r-- | src/HTMLParser.php | 4 | ||||
-rw-r--r-- | src/Readability.php | 3 |
2 files changed, 5 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5589e83..f35a950 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -37,6 +37,8 @@ class HTMLParser 'prevLink' => '/(prev|earl|old|new|<|«)/i', 'whitespace' => '/^\s*$/', 'hasContent' => '/\S$/', + // \x{00A0} is the unicode version of + 'onlyWhitespace' => '/\x{00A0}|\s+/u' ]; private $defaultTagsToScore = [ @@ -1261,7 +1263,7 @@ class HTMLParser $iframeCount = $paragraph->getElementsByTagName('iframe')->length; $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount; - if ($totalCount === 0 && !trim($paragraph->textContent)) { + if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) { // TODO must be done via readability $paragraph->parentNode->removeChild($paragraph); } diff --git a/src/Readability.php b/src/Readability.php index b5bc723..8d856d8 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -528,7 +528,8 @@ class Readability extends Element implements ReadabilityInterface public function isElementWithoutContent() { return ($this->node instanceof \DOMElement && - mb_strlen(trim($this->node->textContent)) === 0 && + // /\x{00A0}|\s+/u TODO to be replaced with regexps array + mb_strlen(preg_replace('/\x{00A0}|\s+/u','',$this->node->textContent)) === 0 && ($this->node->childNodes->length === 0 || $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length || /* |