From 0a53c54ee9aa708b6f2978633e4df8d94f972cb4 Mon Sep 17 00:00:00 2001
From: "FiveFilters.org"
Date: Fri, 20 Aug 2021 16:41:40 +0200
Subject: remove nodes with role=complementary

https://github.com/mozilla/readability/commit/d5621f85e775229332bf0f6f2b1d3d789c638f2d
---
 src/Readability.php | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/Readability.php')

diff --git a/src/Readability.php b/src/Readability.php
index 73a8a54..bdbb56d 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -720,6 +720,12 @@ class Readability
                 }
             }
 
+            if ($node->getAttribute('role') === 'complementary') {
+                $this->logger->debug(sprintf('Removing complementary content - %s', $matchString));
+                $node = NodeUtility::removeAndGetNext($node);
+                continue;
+            }
+
             // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
             if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||
                 $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||
@@ -891,11 +897,11 @@ class Readability
         $noscripts = iterator_to_array($dom->getElementsByTagName('noscript'));
         array_walk($noscripts, function($noscript) use($dom) {
             // Parse content of noscript and make sure it only contains image
+            // [PHP port] Could copy innerHTML support over for the commented lines below, but is it needed?
             // var tmp = doc.createElement("div");
             // tmp.innerHTML = noscript.innerHTML;
             $tmp = $noscript->cloneNode(true);
             $dom->importNode($tmp);
-            //NodeUtility::setNodeTag($tmp, 'div');
             if (!$this->isSingleImage($tmp)) {
                 return;
             }
-- 
cgit v1.2.3