From 4fe7aa2a39c12f9dd4bddc8699f76e9bc3eb4b4f Mon Sep 17 00:00:00 2001
From: "FiveFilters.org"
Date: Wed, 25 Aug 2021 03:06:54 +0200
Subject: exclude additional elements based on their role

https://github.com/mozilla/readability/commit/d5eea06a0095b3138dbd1f6233f656d690200509
---
 src/Readability.php                     |  9 +++++++--
 test/test-pages/nytimes-1/expected.html | 13 +------------
 test/test-pages/nytimes-2/expected.html | 13 +------------
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/Readability.php b/src/Readability.php
index 2027db3..1089769 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -124,6 +124,11 @@ class Readability
         'pre',
     ];
 
+    /**
+     * @var array
+     */
+    private $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog'];
+
     /**
      * @var array
      */
@@ -896,8 +901,8 @@ class Readability
                 }
             }
 
-            if ($node->getAttribute('role') === 'complementary') {
-                $this->logger->debug(sprintf('Removing complementary content - %s', $matchString));
+            if (in_array($node->getAttribute('role'), $this->unlikelyRoles)) {
+                $this->logger->debug(sprintf('Removing content with role %s - %s', $node->getAttribute('role'), $matchString));
                 $node = NodeUtility::removeAndGetNext($node);
                 continue;
             }
diff --git a/test/test-pages/nytimes-1/expected.html b/test/test-pages/nytimes-1/expected.html
index a18a21e..4151659 100644
--- a/test/test-pages/nytimes-1/expected.html
+++ b/test/test-pages/nytimes-1/expected.html
@@ -68,18 +68,7 @@ -
- - - - - -
+ diff --git a/test/test-pages/nytimes-2/expected.html b/test/test-pages/nytimes-2/expected.html index e0e48ff..b8ca26b 100644 --- a/test/test-pages/nytimes-2/expected.html +++ b/test/test-pages/nytimes-2/expected.html @@ -71,18 +71,7 @@ -
- - - - - -
+ -- cgit v1.2.3