diff options
author | FiveFilters.org <[email protected]> | 2021-08-20 16:41:40 +0200 |
---|---|---|
committer | FiveFilters.org <[email protected]> | 2021-08-20 16:41:40 +0200 |
commit | 0a53c54ee9aa708b6f2978633e4df8d94f972cb4 (patch) | |
tree | 46e156de1b527e5315e4fdf7e3e20fbc0b15ae81 /src | |
parent | 9bc37532b5e5b241961aa7cb5edce24e9178b8b5 (diff) |
remove nodes with role=complementary
https://github.com/mozilla/readability/commit/d5621f85e775229332bf0f6f2b1d3d789c638f2d
Diffstat (limited to 'src')
-rw-r--r-- | src/Readability.php | 8 |
1 files changed, 7 insertions, 1 deletions
```diff
diff --git a/src/Readability.php b/src/Readability.php
index 73a8a54..bdbb56d 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -720,6 +720,12 @@ class Readability
                 }
             }
 
+            if ($node->getAttribute('role') === 'complementary') {
+                $this->logger->debug(sprintf('Removing complementary content - %s', $matchString));
+                $node = NodeUtility::removeAndGetNext($node);
+                continue;
+            }
+
             // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
             if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||
                     $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||
@@ -891,11 +897,11 @@ class Readability
         $noscripts = iterator_to_array($dom->getElementsByTagName('noscript'));
         array_walk($noscripts, function($noscript) use($dom) {
             // Parse content of noscript and make sure it only contains image
+            // [PHP port] Could copy innerHTML support over for the commented lines below, but is it needed?
             // var tmp = doc.createElement("div");
             // tmp.innerHTML = noscript.innerHTML;
             $tmp = $noscript->cloneNode(true);
             $dom->importNode($tmp);
-            //NodeUtility::setNodeTag($tmp, 'div');
             if (!$this->isSingleImage($tmp)) {
                 return;
             }
```