diff options
author | FiveFilters.org <[email protected]> | 2021-08-20 15:54:08 +0200 |
---|---|---|
committer | FiveFilters.org <[email protected]> | 2021-08-20 15:54:08 +0200 |
commit | 9b0c58b8a610d109e88b22ba5377576f2fe4a575 (patch) | |
tree | e66ef2ec6ad82f0088cc856bb0123efdc00b422a /src/Readability.php | |
parent | 37196ac5c2ee73ef113fccb2daa39d1fb8bfb52b (diff) |
Add method to unwrap img inside noscript
https://github.com/mozilla/readability/commit/d784bf7e20e25ec1b3a6102a20c83d35fe3ef87d (but code based on current version of Readability.js)
Diffstat (limited to 'src/Readability.php')
-rw-r--r-- | src/Readability.php | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/src/Readability.php b/src/Readability.php
index d85b987..73a8a54 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -305,6 +305,9 @@ class Readability
         }
         $dom->encoding = 'UTF-8';
 
+        // Unwrap image from noscript
+        $this->unwrapNoscriptImages($dom);
+
         $this->removeScripts($dom);
 
         $this->prepDocument($dom);
@@ -834,6 +837,105 @@ class Readability
 
         return false;
     }
+
+    /**
+     * Check if node is an image, or if node contains exactly one image,
+     * whether as a direct child or as a deeper descendant. Wrapper elements
+     * may hold only whitespace besides that single image.
+     *
+     * @param DOMElement $node Element to inspect.
+     * @return bool True when $node is, or wraps nothing but, a single <img>.
+     */
+    private function isSingleImage(DOMElement $node)
+    {
+        if ($node->tagName === 'img') {
+            return true;
+        }
+
+        // Readability.js uses String.prototype.trim(), which also strips Unicode spaces
+        // such as U+00A0; PHP's trim() is ASCII-only, so use a Unicode-aware pattern.
+        $isBlank = preg_match('/^[\s\p{Z}\x{FEFF}]*$/u', $node->textContent) === 1;
+        $children = $node->children();
+        if (!$isBlank || $children->length !== 1) {
+            return false;
+        }
+
+        return $this->isSingleImage($children->item(0));
+    }
+
+    /**
+     * Find all <noscript> that are located after <img> nodes, and which contain only one
+     * <img> element. Replace the first image with the image from inside the <noscript> tag.
+     * This improves the quality of the images we use on some sites (e.g. Medium).
+     *
+     * The <noscript> element itself is not removed here. NOTE(review): the caller runs
+     * removeScripts() right after this method, which presumably drops it - confirm.
+     *
+     * @param DOMDocument $dom Document to modify in place.
+     */
+    private function unwrapNoscriptImages(DOMDocument $dom)
+    {
+        // Remove every img that has no source and no attribute that might hold an image, so
+        // that a bare placeholder img is not replaced by the img from noscript in the next
+        // step. The live node list is copied to an array so removals cannot disturb the loop.
+        foreach (iterator_to_array($dom->getElementsByTagName('img')) as $img) {
+            foreach ($img->attributes as $attr) {
+                if (
+                    in_array($attr->name, ['src', 'srcset', 'data-src', 'data-srcset'], true)
+                    || preg_match('/\.(jpg|jpeg|png|webp)/i', $attr->value)
+                ) {
+                    continue 2;
+                }
+            }
+
+            $img->parentNode->removeChild($img);
+        }
+
+        // Next find noscript and try to extract its image.
+        foreach (iterator_to_array($dom->getElementsByTagName('noscript')) as $noscript) {
+            // Inspect a detached deep copy; it already belongs to $dom, so no importNode().
+            $tmp = $noscript->cloneNode(true);
+            if (!$this->isSingleImage($tmp)) {
+                continue;
+            }
+
+            // If noscript has a previous sibling that only contains an image,
+            // replace it with the noscript content. However, we also keep old
+            // attributes that might contain an image.
+            $prevElement = $noscript->previousElementSibling();
+            if (!$prevElement || !$this->isSingleImage($prevElement)) {
+                continue;
+            }
+
+            $prevImg = $prevElement;
+            if ($prevImg->tagName !== 'img') {
+                $prevImg = $prevElement->getElementsByTagName('img')->item(0);
+            }
+
+            $newImg = $tmp->getElementsByTagName('img')->item(0);
+            foreach ($prevImg->attributes as $attr) {
+                if ($attr->value === '') {
+                    continue;
+                }
+
+                $mayHoldImage = $attr->name === 'src' || $attr->name === 'srcset'
+                    || preg_match('/\.(jpg|jpeg|png|webp)/i', $attr->value);
+                if (!$mayHoldImage || $newImg->getAttribute($attr->name) === $attr->value) {
+                    continue;
+                }
+
+                // Keep the old value under a data-old-* name rather than overwrite one.
+                $attrName = $attr->name;
+                if ($newImg->hasAttribute($attrName)) {
+                    $attrName = 'data-old-' . $attrName;
+                }
+                $newImg->setAttribute($attrName, $attr->value);
+            }
+
+            $noscript->parentNode->replaceChild($tmp->getFirstElementChild(), $prevElement);
+        }
+    }
+
     /**
      * Removes all the scripts of the html.
      *