diff options
Diffstat (limited to 'plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php')
-rw-r--r-- | plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php | 832 |
1 files changed, 719 insertions, 113 deletions
diff --git a/plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php b/plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php index 6bcbf78d7..6407a9292 100644 --- a/plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php +++ b/plugins/af_readability/vendor/fivefilters/readability.php/src/Readability.php @@ -1,13 +1,16 @@ <?php -namespace andreskrey\Readability; +namespace fivefilters\Readability; -use andreskrey\Readability\Nodes\DOM\DOMDocument; -use andreskrey\Readability\Nodes\DOM\DOMElement; -use andreskrey\Readability\Nodes\DOM\DOMNode; -use andreskrey\Readability\Nodes\DOM\DOMText; -use andreskrey\Readability\Nodes\NodeUtility; +use fivefilters\Readability\Nodes\DOM\DOMDocument; +use fivefilters\Readability\Nodes\DOM\DOMElement; +use fivefilters\Readability\Nodes\DOM\DOMNode; +use fivefilters\Readability\Nodes\DOM\DOMText; +use fivefilters\Readability\Nodes\NodeUtility; use Psr\Log\LoggerInterface; +use \Masterminds\HTML5; +use League\Uri\Http; +use League\Uri\UriResolver; /** * Class Readability. @@ -71,6 +74,14 @@ class Readability protected $direction = null; /** + * Base URI + * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml + * + * @var string|null + */ + protected $baseURI = null; + + /** * Configuration object. * * @var Configuration @@ -85,6 +96,13 @@ class Readability private $logger; /** + * JSON-LD + * + * @var array + */ + private $jsonld = []; + + /** * Collection of attempted text extractions. * * @var array @@ -109,6 +127,11 @@ class Readability /** * @var array */ + private $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog']; + + /** + * @var array + */ private $alterToDIVExceptions = [ 'div', 'article', @@ -117,6 +140,17 @@ class Readability ]; /** + * @var array + */ + private $htmlEscapeMap = [ + 'lt' => '<', + 'gt' => '>', + 'amp' => '&', + 'quot' => '"', + 'apos' => '\'', + ]; + + /** * Readability constructor. * * @param Configuration $configuration @@ -154,6 +188,7 @@ class Readability $this->getMainImage(); while (true) { + $this->logger->debug('Starting parse loop'); $root = $root->firstChild; $elementsToScore = $this->getNodes($root); @@ -195,7 +230,7 @@ class Readability // No luck after removing flags, just return the longest text we found during the different loops usort($this->attempts, function ($a, $b) { - return $a['textLength'] < $b['textLength']; + return $b['textLength'] - $a['textLength']; }); // But first check if we actually have something @@ -215,6 +250,11 @@ class Readability } } + if (!$result) { + $this->logger->info('*** Parse failed :('); + return false; + } + $result = $this->postProcessContent($result); // If we haven't found an excerpt in the article's metadata, use the article's @@ -254,28 +294,54 @@ class Readability // To avoid throwing a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); - $dom = new DOMDocument('1.0', 'utf-8'); + //$html = preg_replace('/(<br[^>]*>[ \n\r\t]*){2,}/i', '</p><p>', $html); + + if ($this->configuration->getParser() === 'html5') { + $this->logger->debug('[Loading] Using HTML5 parser...'); + $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]); + $dom = $html5->loadHTML($html); + //TODO: Improve this so it looks inside <html><head><base>, not just any <base> + $base = $dom->getElementsByTagName('base'); + if ($base->length > 0) { + $base = $base->item(0); + $base = $base->getAttribute('href'); + if ($base != '') { + $this->baseURI = $base; + } + } + } else { + $this->logger->debug('[Loading] Using libxml parser...'); + $dom = new DOMDocument('1.0', 'utf-8'); + if ($this->configuration->getNormalizeEntities()) { + $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.'); + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + } if (!$this->configuration->getSubstituteEntities()) { // Keep the original HTML entities $dom->substituteEntities = false; } - if ($this->configuration->getNormalizeEntities()) { - $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.'); - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - if ($this->configuration->getSummonCthulhu()) { $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘'); $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); } // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + if ($this->configuration->getParser() !== 'html5') { + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $this->baseURI = $dom->baseURI; + } $dom->encoding = 'UTF-8'; + // Unwrap image from noscript + $this->unwrapNoscriptImages($dom); + + // Extract JSON-LD metadata before removing scripts + $this->jsonld = $this->configuration->getDisableJSONLD() ? [] : $this->getJSONLD($dom); + $this->removeScripts($dom); $this->prepDocument($dom); @@ -286,6 +352,99 @@ class Readability } /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * + * @param DOMDocument $dom + * @return Object with any metadata that could be extracted (possibly none) + */ + private function getJSONLD(DOMDocument $dom) + { + $scripts = $this->_getAllNodesWithTag($dom, ['script']); + + $jsonLdElement = $this->findNode($scripts, function ($el) { + return $el->getAttribute('type') === 'application/ld+json'; + }); + + if ($jsonLdElement) { + try { + // Strip CDATA markers if present + $content = preg_replace('/^\s*<!\[CDATA\[|\]\]>\s*$/', '', $jsonLdElement->textContent); + $parsed = json_decode($content, true); + $metadata = []; + if ( + !isset($parsed['@context']) || + !is_string($parsed['@context']) || + !preg_match('/^https?\:\/\/schema\.org$/', $parsed['@context']) + ) { + return $metadata; + } + + if (!isset($parsed['@type']) && isset($parsed['@graph']) && is_array($parsed['@graph'])) { + $_found = null; + foreach ($parsed['@graph'] as $it) { + if (isset($it['@type']) && is_string($it['@type']) && preg_match(NodeUtility::$regexps['jsonLdArticleTypes'], $it['@type'])) { + $_found = $it; + } + } + $parsed = $_found; + } + + if ( + !$parsed || + !isset($parsed['@type']) || + !is_string($parsed['@type']) || + !preg_match(NodeUtility::$regexps['jsonLdArticleTypes'], $parsed['@type']) + ) { + return $metadata; + } + if (isset($parsed['name']) && is_string($parsed['name'])) { + $metadata['title'] = trim($parsed['name']); + } elseif (isset($parsed['headline']) && is_string($parsed['headline'])) { + $metadata['title'] = trim($parsed['headline']); + } + if (isset($parsed['author'])) { + if (isset($parsed['author']['name']) && is_string($parsed['author']['name'])) { + $metadata['byline'] = trim($parsed['author']['name']); + } elseif ( + is_array($parsed['author']) && + isset($parsed['author'][0]) && + is_array($parsed['author'][0]) && + isset($parsed['author'][0]['name']) && + is_string($parsed['author'][0]['name']) + ) { + $metadata['byline'] = array_filter($parsed['author'], function ($author) { + return is_array($author) && isset($author['name']) && is_string($author['name']); + }); + $metadata['byline'] = array_map(function ($author) { + return trim($author['name']); + }, $metadata['byline']); + $metadata['byline'] = implode(', ', $metadata['byline']); + } + } + if (isset($parsed['description']) && is_string($parsed['description'])) { + $metadata['excerpt'] = trim($parsed['description']); + } + if ( + isset($parsed['publisher']) && + is_array($parsed['publisher']) && + isset($parsed['publisher']['name']) && + is_string($parsed['publisher']['name']) + ) { + $metadata['siteName'] = trim($parsed['publisher']['name']); + } + return $metadata; + } catch (\Exception $err) { + // The try-catch blocks are from the JS version. Not sure if there's anything + // here in the PHP version that would trigger an error or exception, so perhaps we can + // remove the try-catch blocks here (or at least translate errors to exceptions for this bit) + $this->logger->debug('[JSON-LD] Error parsing: ' . $err->getMessage()); + } + } + return []; + } + + /** * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties. */ private function getMetadata() @@ -304,19 +463,15 @@ class Readability /* @var DOMNode $meta */ $elementName = $meta->getAttribute('name'); $elementProperty = $meta->getAttribute('property'); - $content = $meta->getAttribute('content'); + $content = $meta->getAttribute('content'); $matches = null; $name = null; if ($elementProperty) { if (preg_match($propertyPattern, $elementProperty, $matches)) { - for ($i = count($matches) - 1; $i >= 0; $i--) { - // Convert to lowercase, and remove any whitespace - // so we can match below. - $name = preg_replace('/\s/', '', mb_strtolower($matches[$i])); - // multiple authors - $values[$name] = trim($content); - } + $name = preg_replace('/\s/', '', mb_strtolower($matches[0])); + // multiple authors + $values[$name] = trim($content); } } @@ -349,7 +504,11 @@ class Readability 'twitter:title' ], array_keys($values))); - $this->setTitle(isset($values[$key]) ? trim($values[$key]) : null); + if (isset($this->jsonld['title'])) { + $this->setTitle($this->jsonld['title']); + } else { + $this->setTitle(isset($values[$key]) ? trim($values[$key]) : null); + } if (!$this->getTitle()) { $this->setTitle($this->getArticleTitle()); @@ -362,7 +521,11 @@ class Readability 'author' ], array_keys($values))); - $this->setAuthor(isset($values[$key]) ? $values[$key] : null); + if (isset($this->jsonld['byline'])) { + $this->setAuthor($this->jsonld['byline']); + } else { + $this->setAuthor(isset($values[$key]) ? $values[$key] : null); + } // get description $key = current(array_intersect([ @@ -375,7 +538,11 @@ class Readability 'twitter:description' ], array_keys($values))); - $this->setExcerpt(isset($values[$key]) ? $values[$key] : null); + if (isset($this->jsonld['excerpt'])) { + $this->setExcerpt($this->jsonld['excerpt']); + } else { + $this->setExcerpt(isset($values[$key]) ? $values[$key] : null); + } // get main image $key = current(array_intersect([ @@ -390,7 +557,18 @@ class Readability 'og:site_name' ], array_keys($values))); - $this->setSiteName(isset($values[$key]) ? $values[$key] : null); + if (isset($this->jsonld['siteName'])) { + $this->setSiteName($this->jsonld['siteName']); + } else { + $this->setSiteName(isset($values[$key]) ? $values[$key] : null); + } + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + $this->setTitle($this->unescapeHtmlEntities($this->getTitle())); + $this->setAuthor($this->unescapeHtmlEntities($this->getAuthor())); + $this->setExcerpt($this->unescapeHtmlEntities($this->getExcerpt())); + $this->setSiteName($this->unescapeHtmlEntities($this->getSiteName())); } /** @@ -458,6 +636,37 @@ class Readability } /** + * Remove unnecessary nested elements + * + * @param DOMDocument $article + * + * @return void + */ + private function simplifyNestedElements(DOMDocument $article) + { + $node = $article; + + while ($node) { + if ($node->parentNode && in_array($node->nodeName, ['div', 'section']) && !($node->hasAttribute('id') && strpos($node->getAttribute('id'), 'readability') === 0)) { + if ($node->isElementWithoutContent()) { + $node = NodeUtility::removeAndGetNext($node); + continue; + } elseif ($node->hasSingleTagInsideElement('div') || $node->hasSingleTagInsideElement('section')) { + $child = $node->children()->item(0); + for ($i = 0; $i < $node->attributes->length; $i++) { + $child->setAttribute($node->attributes->item($i)->name, $node->attributes->item($i)->value); + } + $node->parentNode->replaceChild($child, $node); + $node = $child; + continue; + } + } + + $node = NodeUtility::getNextNode($node); + } + } + + /** * Returns the title of the html. Prioritizes the title from the metadata against the title tag. * * @return string|null @@ -491,7 +700,7 @@ class Readability * I can assure you it works properly if you let the code run. */ if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) { - $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle); + $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /', $curTitle); $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle); $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle)); @@ -540,7 +749,7 @@ class Readability } } - $curTitle = trim($curTitle); + $curTitle = preg_replace(NodeUtility::$regexps['normalize'], ' ', trim($curTitle)); /* * If we now have 4 words or fewer as our title, and either no @@ -572,6 +781,8 @@ class Readability { list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL()); + $uri = trim($uri); + // If this is already an absolute URI, return it. if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) { return $uri; @@ -587,18 +798,23 @@ class Readability return $prePath . $uri; } - // Dotslash relative URI. - if (strpos($uri, './') === 0) { - return $pathBase . substr($uri, 2); - } // Ignore hash URIs: if (substr($uri, 0, 1) === '#') { return $uri; } + // Dotslash relative URI. + //if (strpos($uri, './') === 0) { + // return $pathBase . substr($uri, 2); + //} + + $baseUri = Http::createFromString($pathBase); + $relativeUri = Http::createFromString($uri); + return (string)UriResolver::resolve($relativeUri, $baseUri); + // Standard relative URI; add entire path. pathBase already includes a // trailing "/". - return $pathBase . $uri; + //return $pathBase . $uri; } /** @@ -611,13 +827,13 @@ class Readability public function getPathInfo($url) { // Check for base URLs - if ($this->dom->baseURI !== null) { - if (substr($this->dom->baseURI, 0, 1) === '/') { + if ($this->baseURI !== null) { + if (substr($this->baseURI, 0, 1) === '/') { // URLs starting with '/' override completely the URL defined in the link - $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . $this->dom->baseURI; + $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . $this->baseURI; } else { // Otherwise just prepend the base to the actual path - $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/' . rtrim($this->dom->baseURI, '/') . '/'; + $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/'.rtrim($this->baseURI, '/') . '/'; } } else { $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/'; @@ -644,6 +860,8 @@ class Readability $elementsToScore = []; + $shouldRemoveTitleHeader = true; + /* * First, node prepping. Trash nodes that look cruddy (like ones with the * class name "comment", etc), and turn divs into P tags where they have been @@ -673,11 +891,20 @@ class Readability continue; } + if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) { + $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent())); + $shouldRemoveTitleHeader = false; + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Remove unlikely candidates if ($stripUnlikelyCandidates) { if ( preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) && !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) && + !$node->hasAncestorTag( 'table') && + !$node->hasAncestorTag( 'code') && $node->nodeName !== 'body' && $node->nodeName !== 'a' ) { @@ -687,6 +914,12 @@ class Readability } } + if (in_array($node->getAttribute('role'), $this->unlikelyRoles)) { + $this->logger->debug(sprintf('Removing content with role %s - %s', $node->getAttribute('role'), $matchString)); + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' || $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' || @@ -753,6 +986,30 @@ class Readability } /** + * compares second text to first one + * 1 = same text, 0 = completely different text + * works the way that it splits both texts into words and then finds words that are unique in second text + * the result is given by the lower length of unique parts + * + * @param string $textA + * @param string $textB + * + * @return int 1 = same text, 0 = completely different text + */ + private function textSimilarity(string $textA, string $textB) { + $tokensA = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textA))); + $tokensB = array_filter(preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textB))); + if (!count($tokensA) || !count($tokensB)) { + return 0; + } + $uniqTokensB = array_filter($tokensB, function ($token) use (&$tokensA) { + return !in_array($token, $tokensA); + }); + $distanceB = mb_strlen(implode(' ', $uniqTokensB)) / mb_strlen(implode(' ', $tokensB)); + return 1 - $distanceB; + } + + /** * Checks if the node is a byline. * * @param DOMNode $node @@ -774,10 +1031,11 @@ class Readability } $rel = $node->getAttribute('rel'); + $itemprop = $node->getAttribute("itemprop"); - if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { - $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent())); - $this->setAuthor(trim($node->getTextContent())); + if ($rel === 'author' || ($itemprop && strpos($itemprop, 'author') !== false) || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent(false))) { + $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent(false))); + $this->setAuthor(trim($node->getTextContent(false))); return true; } @@ -804,6 +1062,132 @@ class Readability } /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param string $str - a string to unescape. + * @return string without HTML entity. + */ + private function unescapeHtmlEntities($str) { + if (!$str) { + return $str; + } + + $htmlEscapeMap = $this->htmlEscapeMap; + $str = preg_replace_callback('/&(quot|amp|apos|lt|gt);/', function ($tag) use ($htmlEscapeMap) { + return $htmlEscapeMap[$tag[1]]; + }, $str); + $str = preg_replace_callback('/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/i', function ($matches) { + $hex = $matches[1]; + $numStr = $matches[2]; + if ($hex !== '') { + $num = intval($hex, 16); + } else { + $num = intval($numStr, 10); + } + return mb_chr($num); + }, $str); + return $str; + } + + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param DOMElement $node + */ + private function isSingleImage(DOMElement $node) { + if ($node->tagName === 'img') { + return true; + } + + if ($node->children()->length !== 1 || trim($node->textContent) !== '') { + return false; + } + + return $this->isSingleImage($node->children()->item(0)); + } + + /** + * Find all <noscript> that are located after <img> nodes, and which contain only one + * <img> element. Replace the first image with the image from inside the <noscript> tag, + * and remove the <noscript> tag. This improves the quality of the images we use on + * some sites (e.g. Medium). + * + * @param DOMDocument $dom + */ + private function unwrapNoscriptImages(DOMDocument $dom) { + // Find img without source or attributes that might contains image, and remove it. + // This is done to prevent a placeholder img is replaced by img from noscript in next step. + $imgs = iterator_to_array($dom->getElementsByTagName('img')); + array_walk($imgs, function ($img) { + for ($i = 0; $i < $img->attributes->length; $i++) { + $attr = $img->attributes->item($i); + switch ($attr->name) { + case 'src': + case 'srcset': + case 'data-src': + case 'data-srcset': + return; + } + + if (preg_match('/\.(jpg|jpeg|png|webp)/i', $attr->value)) { + return; + } + } + + $img->parentNode->removeChild($img); + }); + + // Next find noscript and try to extract its image + $noscripts = iterator_to_array($dom->getElementsByTagName('noscript')); + array_walk($noscripts, function ($noscript) use($dom) { + // Parse content of noscript and make sure it only contains image + // [PHP port] Could copy innerHTML support over for the commented lines below, but is it needed? + // var tmp = doc.createElement("div"); + // tmp.innerHTML = noscript.innerHTML; + $tmp = $noscript->cloneNode(true); + $dom->importNode($tmp); + if (!$this->isSingleImage($tmp)) { + return; + } + + // If noscript has previous sibling and it only contains image, + // replace it with noscript content. However we also keep old + // attributes that might contains image. + $prevElement = $noscript->previousElementSibling(); + if ($prevElement && $this->isSingleImage($prevElement)) { + $prevImg = $prevElement; + if ($prevImg->tagName !== 'img') { + $prevImg = $prevElement->getElementsByTagName('img')->item(0); + } + + $newImg = $tmp->getElementsByTagName('img')->item(0); + for ($i = 0; $i < $prevImg->attributes->length; $i++) { + $attr = $prevImg->attributes->item($i); + if ($attr->value === '') { + continue; + } + + if ($attr->name === 'src' || $attr->name === 'srcset' || preg_match('/\.(jpg|jpeg|png|webp)/i', $attr->value)) { + if ($newImg->getAttribute($attr->name) === $attr->value) { + continue; + } + + $attrName = $attr->name; + if ($newImg->hasAttribute($attrName)) { + $attrName = 'data-old-' . $attrName; + } + + $newImg->setAttribute($attrName, $attr->value); + } + } + + $noscript->parentNode->replaceChild($tmp->getFirstElementChild(), $prevElement); + } + }); + } + + /** * Removes all the scripts of the html. * * @param DOMDocument $dom @@ -841,7 +1225,7 @@ class Readability * or non-whitespace. This leaves behind the first <br> in the chain * (which will be replaced with a <p> later). */ - while (($next = NodeUtility::nextElement($next)) && ($next->nodeName === 'br')) { + while (($next = NodeUtility::nextNode($next)) && ($next->nodeName === 'br')) { $this->logger->debug('[PrepDocument] Removing chain of BR nodes...'); $replaced = true; @@ -864,7 +1248,7 @@ class Readability while ($next) { // If we've hit another <br><br>, we're done adding children to this <p>. if ($next->nodeName === 'br') { - $nextElem = NodeUtility::nextElement($next->nextSibling); + $nextElem = NodeUtility::nextNode($next->nextSibling); if ($nextElem && $nextElem->nodeName === 'br') { break; } @@ -882,10 +1266,14 @@ class Readability $next = $sibling; } - while ($p->lastChild && $p->lastChild->isWhitespace()) { + while ($p && $p->lastChild && $p->lastChild->isWhitespace()) { $p->removeChild($p->lastChild); } + while ($p && $p->firstChild && $p->firstChild->isWhitespace()) { + $p->removeChild($p->firstChild); + } + if ($p->parentNode->tagName === 'p') { NodeUtility::setNodeTag($p->parentNode, 'div'); } @@ -893,11 +1281,11 @@ class Readability } // Replace font tags with span - $fonts = $dom->getElementsByTagName('font'); - $length = $fonts->length; + $fonts = $this->_getAllNodesWithTag($dom, ['font']); + $length = count($fonts); for ($i = 0; $i < $length; $i++) { $this->logger->debug('[PrepDocument] Converting font tag into a span tag.'); - $font = $fonts->item($length - 1 - $i); + $font = $fonts[$length - 1 - $i]; NodeUtility::setNodeTag($font, 'span'); } } @@ -926,7 +1314,7 @@ class Readability continue; } - $ancestors = $node->getNodeAncestors(); + $ancestors = $node->getNodeAncestors(5); // Exclude nodes with no ancestor if (count($ancestors) === 0) { @@ -1046,7 +1434,7 @@ class Readability $parentOfTopCandidate = $topCandidate->parentNode; // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher - while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) { + while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) { $listsContainingThisAncestor = 0; for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) { $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]); @@ -1076,7 +1464,7 @@ class Readability $scoreThreshold = $lastScore / 3; /* @var DOMElement $parentOfTopCandidate */ - while ($parentOfTopCandidate->nodeName !== 'body') { + while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body') { $parentScore = $parentOfTopCandidate->contentScore; if ($parentScore < $scoreThreshold) { break; @@ -1095,7 +1483,7 @@ class Readability // If the top candidate is the only child, use parent instead. This will help sibling // joining logic when adjacent content is actually located in parent's sibling node. $parentOfTopCandidate = $topCandidate->parentNode; - while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) { + while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) { $topCandidate = $parentOfTopCandidate; $parentOfTopCandidate = $topCandidate->parentNode; } @@ -1216,20 +1604,26 @@ class Readability // visually linked to other content-ful elements (text, images, etc.). $this->_markDataTables($article); + $this->_fixLazyImages($article); + // Clean out junk from the article content $this->_cleanConditionally($article, 'form'); $this->_cleanConditionally($article, 'fieldset'); $this->_clean($article, 'object'); $this->_clean($article, 'embed'); - $this->_clean($article, 'h1'); $this->_clean($article, 'footer'); $this->_clean($article, 'link'); $this->_clean($article, 'aside'); // Clean out elements have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". + + $shareElementThreshold = $this->configuration->getCharThreshold(); + foreach ($article->childNodes as $child) { - $this->_cleanMatchedNodes($child, '/share/i'); + $this->_cleanMatchedNodes($child, function ($node, $matchString) use ($shareElementThreshold) { + return (preg_match(NodeUtility::$regexps['shareElements'], $matchString) && mb_strlen($node->textContent) < $shareElementThreshold); + }); } /* @@ -1237,6 +1631,7 @@ class Readability * they are probably using it as a header and not a subheader, * so remove it since we already extract the title separately. */ + /* $h2 = $article->getElementsByTagName('h2'); if ($h2->length === 1) { $lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->getTitle())) / max(mb_strlen($this->getTitle()), 1); @@ -1253,6 +1648,7 @@ class Readability } } } + */ $this->_clean($article, 'iframe'); $this->_clean($article, 'input'); @@ -1267,10 +1663,15 @@ class Readability $this->_cleanConditionally($article, 'ul'); $this->_cleanConditionally($article, 'div'); + // replace H1 with H2 as H1 should be only title that is displayed separately + foreach (iterator_to_array($article->getElementsByTagName('h1')) as $h1) { + NodeUtility::setNodeTag($h1, 'h2'); + } + $this->_cleanExtraParagraphs($article); foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) { - $next = $br->nextSibling; + $next = NodeUtility::nextNode($br->nextSibling); if ($next && $next->nodeName === 'p') { $this->logger->debug('[PrepArticle] Removing br node next to a p node.'); $br->parentNode->removeChild($br); @@ -1357,6 +1758,84 @@ class Readability } /** + * convert images and figures that have properties like data-src into images that can be loaded without JS + * + * @param DOMDocument $article + * + * @return void + */ + public function _fixLazyImages(DOMDocument $article) + { + $images = $this->_getAllNodesWithTag($article, ['img', 'picture', 'figure']); + foreach ($images as $elem) { + // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. + // So, here we check if the data uri is too short, just might as well remove it. + if ($elem->getAttribute('src') && preg_match(NodeUtility::$regexps['b64DataUrl'], $elem->getAttribute('src'), $parts)) { + // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. + if ($parts[1] === 'image/svg+xml') { + continue; + } + + // Make sure this element has other attributes which contains image. + // If it doesn't, then this src is important and shouldn't be removed. + $srcCouldBeRemoved = false; + for ($i = 0; $i < $elem->attributes->length; $i++) { + $attr = $elem->attributes->item($i); + if ($attr->name === 'src') { + continue; + } + + if (preg_match('/\.(jpg|jpeg|png|webp)/i', $attr->value)) { + $srcCouldBeRemoved = true; + break; + } + } + + // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) + // it will be too small, therefore it might be placeholder image. + if ($srcCouldBeRemoved) { + $b64starts = stripos($elem->getAttribute('src'), 'base64') + 7; + $b64length = strlen($elem->getAttribute('src')) - $b64starts; + if ($b64length < 133) { + $elem->removeAttribute('src'); + } + } + } + + // Don't remove if there's a src or srcset attribute, and there's no sign of 'lazy' loading in the class + // attribute value. + if (($elem->getAttribute('src') || $elem->getAttribute('srcset')) && mb_stripos($elem->getAttribute('class'), 'lazy') === false) { + continue; + } + + for ($j = 0; $j < $elem->attributes->length; $j++) { + $attr = $elem->attributes->item($j); + if ($attr->name === 'src' || $attr->name === 'srcset' || $attr->name === 'alt') { + continue; + } + $copyTo = null; + if (preg_match('/\.(jpg|jpeg|png|webp)\s+\d/', $attr->value)) { + $copyTo = 'srcset'; + } elseif (preg_match('/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/', $attr->value)) { + $copyTo = 'src'; + } + if ($copyTo) { + //if this is an img or picture, set the attribute directly + if ($elem->tagName === 'img' || $elem->tagName === 'picture') { + $elem->setAttribute($copyTo, $attr->value); + } elseif ($elem->tagName === 'figure' && empty($this->_getAllNodesWithTag($elem, ['img', 'picture']))) { + //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure + //see the nytimes-3 testcase for an example + $img = $article->createElement('img'); + $img->setAttribute($copyTo, $attr->value); + $elem->appendChild($img); + } + } + } + } + } + + /** * Remove the style attribute on every e and under. * * @param $node DOMDocument|DOMNode @@ -1390,20 +1869,20 @@ class Readability } /** - * Clean out elements whose id/class combinations match specific string. + * Clean out elements that match the specified conditions * * @param $node DOMElement Node to clean - * @param $regex string Match id/class combination. + * @param $filter callable Function determines whether a node should be removed * * @return void **/ - public function _cleanMatchedNodes($node, $regex) + public function _cleanMatchedNodes($node, callable $filter) { $endOfSearchMarkerNode = NodeUtility::getNextNode($node, true); $next = NodeUtility::getNextNode($node); while ($next && $next !== $endOfSearchMarkerNode) { - if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) { - $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $next->getAttribute('class'), $next->getAttribute('id'))); + if ($filter($next, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) { + $this->logger->debug(sprintf('Removing matched node, node class was: \'%s\', id: \'%s\'', $next->getAttribute('class'), $next->getAttribute('id'))); $next = NodeUtility::removeAndGetNext($next); } else { $next = NodeUtility::getNextNode($next); @@ -1418,11 +1897,11 @@ class Readability */ public function _cleanExtraParagraphs(DOMDocument $article) { - $paragraphs = $article->getElementsByTagName('p'); - $length = $paragraphs->length; + $paragraphs = $this->_getAllNodesWithTag($article, ['p']); + $length = count($paragraphs); for ($i = 0; $i < $length; $i++) { - $paragraph = $paragraphs->item($length - 1 - $i); + $paragraph = $paragraphs[$length - 1 - $i]; $imgCount = $paragraph->getElementsByTagName('img')->length; $embedCount = $paragraph->getElementsByTagName('embed')->length; @@ -1438,6 +1917,19 @@ class Readability } } + private function getTextDensity($e, array $tags) { + $textLength = mb_strlen($e->getTextContent(true)); + if ($textLength === 0) { + return 0; + } + $childrenLength = 0; + $children = $this->_getAllNodesWithTag($e, $tags); + foreach ($children as $child) { + $childrenLength += mb_strlen($child->getTextContent(true)); + } + return $childrenLength / $textLength; + } + /** * @param DOMDocument $article * @param string $tag Tag to clean conditionally @@ -1450,27 +1942,53 @@ class Readability return; } - $isList = in_array($tag, ['ul', 'ol']); - /* * Gather counts for other typical elements embedded within. * Traverse backwards so we can remove nodes at the same time * without effecting the traversal. */ - $DOMNodeList = $article->getElementsByTagName($tag); - $length = $DOMNodeList->length; + $allNodesWithTag = $this->_getAllNodesWithTag($article, [$tag]); + $length = count($allNodesWithTag); for ($i = 0; $i < $length; $i++) { /** @var $node DOMElement */ - $node = $DOMNodeList->item($length - 1 - $i); + $node = $allNodesWithTag[$length - 1 - $i]; + + $isList = in_array($tag, ['ul', 'ol']); + /* + // Doesn't seem to work as expected + // compared to JS version: https://github.com/mozilla/readability/commit/3c833899866ffb1f9130767110197fd6f5c08d4c + if (!$isList) { + $listLength = 0; + $listNodes = $this->_getAllNodesWithTag($node, ['ul', 'ol']); + array_walk($listNodes, function ($list) use(&$listLength) { + $listLength += mb_strlen($list->getTextContent()); + }); + $nodeTextLength = mb_strlen($node->getTextContent()); + if (!$nodeTextLength) { + $isList = true; + } else { + $isList = $listLength / $nodeTextLength > 0.9; + } + } + */ + + // First check if this node IS data table, in which case don't remove it. + if ($tag === 'table' && $node->isReadabilityDataTable()) { + continue; + } - // First check if we're in a data table, in which case don't remove us. + // Next check if we're inside a data table, in which case don't remove it as well. if ($node->hasAncestorTag('table', -1, function ($node) { return $node->isReadabilityDataTable(); })) { continue; } + if ($node->hasAncestorTag('code')) { + continue; + } + $weight = 0; if ($this->configuration->getWeightClasses()) { $weight = $node->getClassWeight(); @@ -1483,7 +2001,7 @@ class Readability continue; } - if (substr_count($node->getTextContent(), ',') < 10) { + if (substr_count($node->getTextContent(false), ',') < 10) { /* * If there are not very many commas, and the number of * non-paragraph elements is more than paragraphs or other @@ -1494,14 +2012,24 @@ class Readability $img = $node->getElementsByTagName('img')->length; $li = $node->getElementsByTagName('li')->length - 100; $input = $node->getElementsByTagName('input')->length; + $headingDensity = $this->getTextDensity($node, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']); $embedCount = 0; - $embeds = $node->getElementsByTagName('embed'); + $embeds = $this->_getAllNodesWithTag($node, ['object', 'embed', 'iframe']); foreach ($embeds as $embedNode) { - if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) { - $embedCount++; + for ($j = 0; $j < $embedNode->attributes->length; $j++) { + if (preg_match(NodeUtility::$regexps['videos'], $embedNode->attributes->item($j)->nodeValue)) { + continue 3; + } + } + + // For embed with <object> tag, check inner HTML as well. + if ($embedNode->tagName === "object" && preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) { + continue 2; } + + $embedCount++; } $linkDensity = $node->getLinkDensity(); @@ -1511,7 +2039,7 @@ class Readability ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) || (!$isList && $li > $p) || ($input > floor($p / 3)) || - (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) || + (!$isList && $headingDensity < 0.9 && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) || (!$isList && $weight < 25 && $linkDensity > 0.2) || ($weight >= 25 && $linkDensity > 0.5) || (($embedCount === 1 && $contentLength < 75) || $embedCount > 1); @@ -1525,6 +2053,17 @@ class Readability } } + public function _getAllNodesWithTag($node, array $tagNames) { + $nodes = []; + foreach ($tagNames as $tag) { + $nodeList = $node->getElementsByTagName($tag); + foreach ($nodeList as $n) { + $nodes[] = $n; + } + } + return $nodes; + } + /** * Clean a node of all elements of type "tag". * (Unless it's a youtube/vimeo video. People love movies.). @@ -1538,10 +2077,10 @@ class Readability { $isEmbed = in_array($tag, ['object', 'embed', 'iframe']); - $DOMNodeList = $article->getElementsByTagName($tag); - $length = $DOMNodeList->length; + $allNodesWithTag = $this->_getAllNodesWithTag($article, [$tag]); + $length = count($allNodesWithTag); for ($i = 0; $i < $length; $i++) { - $item = $DOMNodeList->item($length - 1 - $i); + $item = $allNodesWithTag[$length - 1 - $i]; // Allow youtube and vimeo videos through as people usually want to see those. if ($isEmbed) { @@ -1556,8 +2095,8 @@ class Readability continue; } - // Then check the elements inside this element for the same. - if (preg_match(NodeUtility::$regexps['videos'], $item->C14N())) { + // For embed with <object> tag, check inner HTML as well. + if ($item->tagName === 'object' && preg_match(NodeUtility::$regexps['videos'], $item->C14N())) { continue; } } @@ -1568,7 +2107,7 @@ class Readability } /** - * Clean out spurious headers from an Element. Checks things like classnames and link density. + * Clean out spurious headers from an Element. * * @param DOMDocument $article * @@ -1576,25 +2115,43 @@ class Readability **/ public function _cleanHeaders(DOMDocument $article) { - for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) { - $headers = $article->getElementsByTagName('h' . $headerIndex); - /** @var $header DOMElement */ - foreach ($headers as $header) { - $weight = 0; - if ($this->configuration->getWeightClasses()) { - $weight = $header->getClassWeight(); - } + $headingNodes = $this->_getAllNodesWithTag($article, ['h1', 'h2']); + /** @var $header DOMElement */ + foreach ($headingNodes as $header) { + $weight = 0; + if ($this->configuration->getWeightClasses()) { + $weight = $header->getClassWeight(); + } + $shouldRemove = $weight < 0; - if ($weight < 0) { - $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128))); + if ($shouldRemove) { + $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128))); - NodeUtility::removeNode($header); - } + NodeUtility::removeNode($header); } } } /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param DOMNode the node to check. + * @return boolean indicating whether this is a title-like header. + */ + private function headerDuplicatesTitle($node) { + if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') { + return false; + } + if (!isset($this->title)) { + return false; + } + $heading = $node->getTextContent(false); + $this->logger->debug(sprintf('Evaluating similarity of header: %s"', $heading)); + return $this->textSimilarity($this->title, $heading) > 0.75; + } + + /** * Removes the class="" attribute from every element in the given * subtree. * @@ -1631,13 +2188,23 @@ class Readability /** @var DOMElement $link */ $href = $link->getAttribute('href'); if ($href) { - // Replace links with javascript: URIs with text content, since + // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (strpos($href, 'javascript:') === 0) { $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128))); - $text = $article->createTextNode($link->textContent); - $link->parentNode->replaceChild($text, $link); + // if the link only contains simple text content, it can be converted to a text node + if ($link->childNodes->length === 1 && $link->childNodes->item(0)->nodeType === XML_TEXT_NODE) { + $text = $article->createTextNode($link->textContent); + $link->parentNode->replaceChild($text, $link); + } else { + // if the link has multiple children, they should all be preserved + $container = $article->createElement('span'); + while ($link->firstChild) { + $container->appendChild($link->firstChild); + } + $link->parentNode->replaceChild($container, $link); + } } else { $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128))); @@ -1646,35 +2213,67 @@ class Readability } } - foreach ($article->getElementsByTagName('img') as $img) { - /** @var DOMElement $img */ - /* - * Extract all possible sources of img url and select the first one on the list. - */ - $url = [ - $img->getAttribute('src'), - $img->getAttribute('data-src'), - $img->getAttribute('data-original'), - $img->getAttribute('data-orig'), - $img->getAttribute('data-url') - ]; - - $src = array_filter($url); - $src = reset($src); + $medias = $this->_getAllNodesWithTag($article, [ + 'img', 'picture', 'figure', 'video', 'audio', 'source' + ]); + + array_walk($medias, function ($media) { + $src = $media->getAttribute('src'); + $poster = $media->getAttribute('poster'); + $srcset = $media->getAttribute('srcset'); + if ($src) { $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128))); - $img->setAttribute('src', $this->toAbsoluteURI($src)); + $media->setAttribute('src', $this->toAbsoluteURI($src)); } - } + + if ($poster) { + $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($poster, 0, 128))); + + $media->setAttribute('poster', $this->toAbsoluteURI($poster)); + } + + if ($srcset) { + $newSrcset = preg_replace_callback(NodeUtility::$regexps['srcsetUrl'], function ($matches) { + $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($matches[1], 0, 128))); + + return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3]; + }, $srcset); + + $media->setAttribute('srcset', $newSrcset); + } + }); } - $this->_cleanClasses($article); + $this->simplifyNestedElements($article); + + if (!$this->configuration->getKeepClasses()) { + $this->_cleanClasses($article); + } return $article; } /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return DOMNode|null + */ + private function findNode(array $nodeList, callable $fn) + { + foreach ($nodeList as $node) { + if ($fn($node)) { + return $node; + } + } + return null; + } + + /** * @return null|string */ public function __toString() @@ -1703,7 +2302,14 @@ class Readability */ public function getContent() { - return ($this->content instanceof DOMDocument) ? $this->content->C14N() : null; + if ($this->content instanceof DOMDocument) { + $html5 = new HTML5(['disable_html_ns' => true]); + // by using childNodes below we make sure HTML5PHP's serialiser + // doesn't output the <!DOCTYPE html> string at the start. + return $html5->saveHTML($this->content->childNodes); + } else { + return null; + } } /** |