From 9b0c58b8a610d109e88b22ba5377576f2fe4a575 Mon Sep 17 00:00:00 2001
From: "FiveFilters.org"
Date: Fri, 20 Aug 2021 15:54:08 +0200
Subject: Add method to unwrap img inside noscript

https://github.com/mozilla/readability/commit/d784bf7e20e25ec1b3a6102a20c83d35fe3ef87d
(but code based on current version of Readability.js)
---
 src/Nodes/DOM/DOMElement.php                       |   35 +
 src/Readability.php                                |  102 ++
 test/test-pages/citylab-1/expected-images.json     |    1 +
 test/test-pages/citylab-1/expected.html            |    2 +-
 test/test-pages/lazy-image-1/expected-images.json  |   12 +
 .../test-pages/lazy-image-1/expected-metadata.json |    8 +
 test/test-pages/lazy-image-1/expected.html         |  379 ++++++
 test/test-pages/lazy-image-1/source.html           | 1277 ++++++++++++++++++++
 test/test-pages/mozilla-1/expected-images.json     |    2 +-
 test/test-pages/mozilla-1/expected.html            |    2 +-
 test/test-pages/seattletimes-1/expected.html       |    2 +-
 test/test-pages/yahoo-1/expected.html              |    2 +-
 12 files changed, 1819 insertions(+), 5 deletions(-)
 create mode 100644 test/test-pages/lazy-image-1/expected-images.json
 create mode 100644 test/test-pages/lazy-image-1/expected-metadata.json
 create mode 100644 test/test-pages/lazy-image-1/expected.html
 create mode 100644 test/test-pages/lazy-image-1/source.html

diff --git a/src/Nodes/DOM/DOMElement.php b/src/Nodes/DOM/DOMElement.php
index 900ad56..7486bad 100644
--- a/src/Nodes/DOM/DOMElement.php
+++ b/src/Nodes/DOM/DOMElement.php
@@ -7,4 +7,39 @@ use fivefilters\Readability\Nodes\NodeTrait;
 class DOMElement extends \DOMElement
 {
     use NodeTrait;
+
+    /**
+     * Returns the child elements of this element.
+     *
+     * To get all child nodes, including non-element nodes like text and comment nodes, use childNodes.
+     *
+     * @return DOMNodeList
+     */
+    public function children()
+    {
+        $newList = new DOMNodeList();
+        foreach ($this->childNodes as $node) {
+            if ($node->nodeType === XML_ELEMENT_NODE) {
+                $newList->add($node);
+            }
+        }
+        return $newList;
+    }
+
+    /**
+     * Returns the Element immediately prior to the specified one in its parent's children list, or null if the specified element is the first one in the list.
+     *
+     * @see https://wiki.php.net/rfc/dom_living_standard_api
+     * @return DOMElement|null
+     */
+    public function previousElementSibling()
+    {
+        // Test for null before the first dereference: a first child has no previous sibling at all.
+        for ($previous = $this->previousSibling; $previous !== null; $previous = $previous->previousSibling) {
+            if ($previous->nodeType === XML_ELEMENT_NODE) {
+                return $previous;
+            }
+        }
+        return null;
+    }
 }
diff --git a/src/Readability.php b/src/Readability.php
index d85b987..73a8a54 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -305,6 +305,9 @@ class Readability
         }
         $dom->encoding = 'UTF-8';
 
+        // Unwrap image from noscript
+        $this->unwrapNoscriptImages($dom);
+
         $this->removeScripts($dom);
 
         $this->prepDocument($dom);
@@ -834,6 +837,105 @@ class Readability
         return false;
     }
 
+
+    /**
+     * Check if node is image, or if node contains exactly only one image
+     * whether as a direct child or as its descendants.
+     *
+     * @param DOMElement $node
+     */
+    private function isSingleImage(DOMElement $node) {
+        if ($node->tagName === 'img') {
+            return true;
+        }
+
+        if ($node->children()->length !== 1 || trim($node->textContent) !== '') {
+            return false;
+        }
+
+        return $this->isSingleImage($node->children()->item(0));
+    }
+
+    /**
+     * Find all