update php-readability

author: Andrew Dolgov <[email protected]> 2019-02-13 14:49:27 +0300
committer: Andrew Dolgov <[email protected]> 2019-02-13 14:49:27 +0300
commit: 8b6ef90385874cefcb904e59801b3c0482805849 (patch)
tree: cf40ecb7a13dc4c5a25e3d506185872355dec5a7 /vendor
parent: 874a2d21704c41e625b4b7ad565b2326ce5b95cd (diff)
5 files changed, 356 insertions, 147 deletions
diff --git a/vendor/andreskrey/Readability/Configuration.php b/vendor/andreskrey/Readability/Configuration.php
index 951740ed0..6c17bc757 100644
--- a/vendor/andreskrey/Readability/Configuration.php
+++ b/vendor/andreskrey/Readability/Configuration.php
@@ -21,7 +21,7 @@ class Configuration
     /**
      * @var int
      */
-    protected $wordThreshold = 500;
+    protected $charThreshold = 500;
 
     /**
      * @var bool
@@ -109,9 +109,9 @@ class Configuration
         // If no logger has been set, just return a null logger
         if ($this->logger === null) {
             return new NullLogger();
-        } else {
-            return $this->logger;
         }
+
+        return $this->logger;
     }
 
     /**
@@ -149,19 +149,45 @@ class Configuration
     /**
      * @return int
      */
+    public function getCharThreshold()
+    {
+        return $this->charThreshold;
+    }
+
+    /**
+     * @param int $charThreshold
+     *
+     * @return $this
+     */
+    public function setCharThreshold($charThreshold)
+    {
+        $this->charThreshold = $charThreshold;
+
+        return $this;
+    }
+
+    /**
+     * @deprecated Use getCharThreshold. Will be removed in version 2.0
+     *
+     * @return int
+     */
     public function getWordThreshold()
     {
-        return $this->wordThreshold;
+        @trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
+
+        return $this->charThreshold;
     }
 
     /**
-     * @param int $wordThreshold
+     * @param int $charThreshold
      *
      * @return $this
      */
-    public function setWordThreshold($wordThreshold)
+    public function setWordThreshold($charThreshold)
     {
-        $this->wordThreshold = $wordThreshold;
+        @trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
+
+        $this->charThreshold = $charThreshold;
 
         return $this;
     }
diff --git a/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php b/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php
index f1944c44b..7c3c4f3a2 100644
--- a/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php
+++ b/vendor/andreskrey/Readability/Nodes/DOM/DOMNode.php
@@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\NodeTrait;
 
 /**
  * @method getAttribute($attribute)
+ * @method hasAttribute($attribute)
  */
 class DOMNode extends \DOMNode
 {
diff --git a/vendor/andreskrey/Readability/Nodes/NodeTrait.php b/vendor/andreskrey/Readability/Nodes/NodeTrait.php
index 13611c9e7..d7060ccbb 100644
--- a/vendor/andreskrey/Readability/Nodes/NodeTrait.php
+++ b/vendor/andreskrey/Readability/Nodes/NodeTrait.php
@@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\DOM\DOMDocument;
 use andreskrey\Readability\Nodes\DOM\DOMElement;
 use andreskrey\Readability\Nodes\DOM\DOMNode;
 use andreskrey\Readability\Nodes\DOM\DOMText;
+use DOMNodeList;
 
 /**
  * @method \DOMNode removeAttribute($name)
@@ -51,6 +52,21 @@ trait NodeTrait
     ];
 
     /**
+     * The commented out elements qualify as phrasing content but tend to be
+     * removed by readability when put into paragraphs, so we ignore them here.
+     *
+     * @var array
+     */
+    private $phrasing_elems = [
+        // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
+        'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data',
+        'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label',
+        'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q',
+        'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub',
+        'sup', 'textarea', 'time', 'var', 'wbr'
+    ];
+
+    /**
      * initialized getter.
      *
      * @return bool
@@ -65,7 +81,19 @@ trait NodeTrait
      */
     public function isReadabilityDataTable()
     {
-        return $this->readabilityDataTable;
+        /*
+         * This is a workaround that I'd like to remove in the future.
+         * Seems that although we are extending the base DOMElement and adding custom properties (like this one,
+         * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName.
+         * This means that even if we mark the tables in a previous step, when we want to retrieve that information,
+         * all the custom properties are in their default values. Somehow we need to find a way to make these properties
+         * permanent across the whole DOM.
+         *
+         * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
+         */
+        return $this->hasAttribute('readabilityDataTable')
+            && $this->getAttribute('readabilityDataTable') === '1';
+//        return $this->readabilityDataTable;
     }
 
     /**
@@ -73,7 +101,9 @@ trait NodeTrait
      */
     public function setReadabilityDataTable($param)
     {
-        $this->readabilityDataTable = $param;
+        // Can't be "true" because DOMDocument casts it to "1"
+        $this->setAttribute('readabilityDataTable', $param ? '1' : '0');
+//        $this->readabilityDataTable = $param;
     }
 
     /**
@@ -149,6 +179,24 @@ trait NodeTrait
     }
 
     /**
+     * Override for native hasAttribute.
+     *
+     * @see getAttribute
+     *
+     * @param string $attributeName
+     *
+     * @return bool
+     */
+    public function hasAttribute($attributeName)
+    {
+        if (!is_null($this->attributes)) {
+            return parent::hasAttribute($attributeName);
+        }
+
+        return false;
+    }
+
+    /**
      * Get the ancestors of the current node.
      *
      * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
@@ -332,22 +380,26 @@ trait NodeTrait
      * Check if a given node has one of its ancestor tag name matching the
      * provided one.
      *
-     * @param DOMElement $node
      * @param string $tagName
      * @param int $maxDepth
+     * @param callable $filterFn
      *
      * @return bool
      */
-    public function hasAncestorTag($node, $tagName, $maxDepth = 3)
+    public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
     {
         $depth = 0;
+        $node = $this;
+
         while ($node->parentNode) {
             if ($maxDepth > 0 && $depth > $maxDepth) {
                 return false;
             }
-            if ($node->parentNode->nodeName === $tagName) {
+
+            if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) {
                 return true;
             }
+
             $node = $node->parentNode;
             $depth++;
         }
@@ -356,30 +408,29 @@ trait NodeTrait
     }
 
     /**
-     * Checks if the current node has a single child and if that child is a P node.
-     * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
-     * tags are, in practice, paragraphs.
+     * Check if this node has only whitespace and a single element with given tag.
+     * Returns false if it contains no element with given tag or more than 1 element.
      *
-     * @param DOMNode $node
+     * @param string $tag Name of tag
      *
      * @return bool
      */
-    public function hasSinglePNode()
+    public function hasSingleTagInsideElement($tag)
     {
-        // There should be exactly 1 element child which is a P:
-        if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') {
+        // There should be exactly 1 element child with given tag
+        if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) {
             return false;
         }
 
-        // And there should be no text nodes with real content (param true on ->getChildren)
-        foreach ($children as $child) {
-            /** @var $child DOMNode */
-            if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) {
+        // And there should be no text nodes with real content
+        return array_reduce($children, function ($carry, $child) {
+            if (!$carry === false) {
                 return false;
             }
-        }
 
-        return true;
+            /* @var DOMNode $child */
+            return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent()));
+        });
     }
 
     /**
@@ -431,4 +482,79 @@ trait NodeTrait
 
             );
     }
+
+    /**
+     * Determine if a node qualifies as phrasing content.
+     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
+     *
+     * @return bool
+     */
+    public function isPhrasingContent()
+    {
+        return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false ||
+            (!is_null($this->childNodes) &&
+                ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') &&
+                array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) {
+                    return $node->isPhrasingContent() && $carry;
+                }, true)
+            );
+    }
+
+    public function isProbablyVisible()
+    {
+        /*
+         * The original JS project checks whether the node's style has display=none, which unfortunately we have no
+         * way of knowing in our case. So we just check for the hidden attribute or an inline "display: none" style.
+         *
+         * Might be a good idea to check for classes or other attributes like 'aria-hidden'
+         */
+
+        return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden');
+    }
+
+    public function isWhitespace()
+    {
+        return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
+            ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
+    }
+
+    /**
+     * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
+     *
+     * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
+     * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
+     * orphan node and will give an exception if you try to do anything with it.
+     *
+     * Shifting also occurs when converting parent nodes (like a P to a DIV). In that case the found nodes are
+     * removed from the foreach "pool", but the internal index of the foreach is not aware of it and skips over nodes
+     * that were never looped over. (Index is at position 5, 2 nodes are removed, next one should be node 3, but the
+     * foreach tries to access node 6.)
+     *
+     * This function solves this by searching for the nodes on every loop and keeping track of the count differences.
+     * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
+     * used only when the results of the search are going to be used to remove the nodes.
+     *
+     * @param string $tag
+     *
+     * @return \Generator
+     */
+    public function shiftingAwareGetElementsByTagName($tag)
+    {
+        /** @var DOMNodeList $nodes */
+        $nodes = $this->getElementsByTagName($tag);
+        $count = $nodes->length;
+
+        for ($i = 0; $i < $count; $i = max(++$i, 0)) {
+            yield $nodes->item($i);
+
+            // Search for all the nodes again
+            $nodes = $this->getElementsByTagName($tag);
+
+            // Subtract the amount of nodes removed from the current index
+            $i -= $count - $nodes->length;
+
+            // Subtract the amount of nodes removed from the current count
+            $count -= ($count - $nodes->length);
+        }
+    }
 }
diff --git a/vendor/andreskrey/Readability/Nodes/NodeUtility.php b/vendor/andreskrey/Readability/Nodes/NodeUtility.php
index 752e9f410..7a1f18ee4 100644
--- a/vendor/andreskrey/Readability/Nodes/NodeUtility.php
+++ b/vendor/andreskrey/Readability/Nodes/NodeUtility.php
@@ -17,13 +17,13 @@ class NodeUtility
      * @var array
      */
     public static $regexps = [
-        'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+        'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
         'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
         'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
         'byline' => '/byline|author|dateline|writtenby|p-author/i',
         'replaceFonts' => '/<(\/?)font[^>]*>/gi',
         'normalize' => '/\s{2,}/',
-        'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
+        'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
         'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
         'prevLink' => '/(prev|earl|old|new|<|«)/i',
         'whitespace' => '/^\s*$/',
@@ -45,8 +45,8 @@ class NodeUtility
     {
         $next = $node;
         while ($next
-            && $next->nodeName !== '#text'
-            && trim($next->textContent)) {
+            && $next->nodeType !== XML_ELEMENT_NODE
+            && $next->isWhitespace()) {
             $next = $next->nextSibling;
         }
 
@@ -57,12 +57,13 @@ class NodeUtility
      * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
      * element with the new tag name and importing it to the main DOMDocument.
      *
+     * @param DOMNode $node
      * @param string $value
      * @param bool $importAttributes
      *
      * @return DOMNode
      */
-    public static function setNodeTag($node, $value, $importAttributes = false)
+    public static function setNodeTag($node, $value, $importAttributes = true)
     {
         $new = new DOMDocument('1.0', 'utf-8');
         $new->appendChild($new->createElement($value));
diff --git a/vendor/andreskrey/Readability/Readability.php b/vendor/andreskrey/Readability/Readability.php
index 93fc81070..7b7eed6bf 100644
--- a/vendor/andreskrey/Readability/Readability.php
+++ b/vendor/andreskrey/Readability/Readability.php
@@ -127,7 +127,7 @@ class Readability
      *
      * @throws ParseException
      *
-     * @return array|bool
+     * @return bool
      */
     public function parse($html)
     {
@@ -164,14 +164,11 @@ class Readability
 
             $length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
 
-            $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
-
-            $parseSuccessful = true;
+            $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));
 
-            if ($result && $length < $this->configuration->getWordThreshold()) {
+            if ($result && $length < $this->configuration->getCharThreshold()) {
                 $this->dom = $this->loadHTML($html);
                 $root = $this->dom->getElementsByTagName('body')->item(0);
-                $parseSuccessful = false;
 
                 if ($this->configuration->getStripUnlikelyCandidates()) {
                     $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
@@ -204,7 +201,6 @@ class Readability
                     $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
 
                     $result = $this->attempts[0]['articleContent'];
-                    $parseSuccessful = true;
                     break;
                 }
             } else {
@@ -212,26 +208,24 @@ class Readability
             }
         }
 
-        if ($parseSuccessful) {
-            $result = $this->postProcessContent($result);
-
-            // If we haven't found an excerpt in the article's metadata, use the article's
-            // first paragraph as the excerpt. This can be used for displaying a preview of
-            // the article's content.
-            if (!$this->getExcerpt()) {
-                $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
-                $paragraphs = $result->getElementsByTagName('p');
-                if ($paragraphs->length > 0) {
-                    $this->setExcerpt(trim($paragraphs->item(0)->textContent));
-                }
+        $result = $this->postProcessContent($result);
+
+        // If we haven't found an excerpt in the article's metadata, use the article's
+        // first paragraph as the excerpt. This can be used for displaying a preview of
+        // the article's content.
+        if (!$this->getExcerpt()) {
+            $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
+            $paragraphs = $result->getElementsByTagName('p');
+            if ($paragraphs->length > 0) {
+                $this->setExcerpt(trim($paragraphs->item(0)->textContent));
             }
+        }
 
-            $this->setContent($result);
+        $this->setContent($result);
 
-            $this->logger->info('*** Parse successful :)');
+        $this->logger->info('*** Parse successful :)');
 
-            return true;
-        }
+        return true;
     }
 
     /**
@@ -292,77 +286,98 @@ class Readability
         $this->logger->debug('[Metadata] Retrieving metadata...');
 
         $values = [];
-        // Match "description", or Twitter's "twitter:description" (Cards)
-        // in name attribute.
-        $namePattern = '/^\s*((twitter)\s*:\s*)?(description|title|image)\s*$/i';
+        // property is a space-separated list of values
+        $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i';
 
-        // Match Facebook's Open Graph title & description properties.
-        $propertyPattern = '/^\s*og\s*:\s*(description|title|image)\s*$/i';
+        // name is a single value
+        $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i';
 
+        // Find description tags.
         foreach ($this->dom->getElementsByTagName('meta') as $meta) {
             /* @var DOMNode $meta */
             $elementName = $meta->getAttribute('name');
             $elementProperty = $meta->getAttribute('property');
+            $content = $meta->getAttribute('content');
+            $matches = null;
+            $name = null;
 
-            if (in_array('author', [$elementName, $elementProperty])) {
-                $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
-                $this->setAuthor($meta->getAttribute('content'));
-                continue;
+            if ($elementProperty) {
+                if (preg_match($propertyPattern, $elementProperty, $matches)) {
+                    for ($i = count($matches) - 1; $i >= 0; $i--) {
+                        // Convert to lowercase, and remove any whitespace
+                        // so we can match below.
+                        $name = preg_replace('/\s/', '', mb_strtolower($matches[$i]));
+                        // multiple authors
+                        $values[$name] = trim($content);
+                    }
+                }
             }
 
-            $name = null;
-            if (preg_match($namePattern, $elementName)) {
+            if (!$matches && $elementName && preg_match($namePattern, $elementName)) {
                 $name = $elementName;
-            } elseif (preg_match($propertyPattern, $elementProperty)) {
-                $name = $elementProperty;
-            }
-
-            if ($name) {
-                $content = $meta->getAttribute('content');
                 if ($content) {
-                    // Convert to lowercase and remove any whitespace
-                    // so we can match below.
-                    $name = preg_replace('/\s/', '', strtolower($name));
+                    // Convert to lowercase, remove any whitespace, and convert dots
+                    // to colons so we can match below.
+                    $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($name));
                     $values[$name] = trim($content);
                 }
             }
         }
-        if (array_key_exists('description', $values)) {
-            $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description']));
-            $this->setExcerpt($values['description']);
-        } elseif (array_key_exists('og:description', $values)) {
-            // Use facebook open graph description.
-            $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description']));
-            $this->setExcerpt($values['og:description']);
-        } elseif (array_key_exists('twitter:description', $values)) {
-            // Use twitter cards description.
-            $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description']));
-            $this->setExcerpt($values['twitter:description']);
-        }
 
-        $this->setTitle($this->getArticleTitle());
+        // get title
+        /*
+         * This is a very convoluted way of extracting the first matching key of the $values array
+         * against a set of options.
+         *
+         * This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s.
+         * Will probably replace it with ??s after dropping support of PHP5.6
+         */
+
+        $key = current(array_intersect([
+            'dc:title',
+            'dcterm:title',
+            'og:title',
+            'weibo:article:title',
+            'weibo:webpage:title',
+            'title',
+            'twitter:title'
+        ], array_keys($values)));
+
+        $this->setTitle(isset($values[$key]) ? trim($values[$key]) : null);
 
         if (!$this->getTitle()) {
-            if (array_key_exists('og:title', $values)) {
-                // Use facebook open graph title.
-                $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title']));
-                $this->setTitle($values['og:title']);
-            } elseif (array_key_exists('twitter:title', $values)) {
-                // Use twitter cards title.
-                $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title']));
-                $this->setTitle($values['twitter:title']);
-            }
+            $this->setTitle($this->getArticleTitle());
         }
 
-        if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
-            if (array_key_exists('og:image', $values)) {
-                $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image']));
-                $this->setImage($values['og:image']);
-            } else {
-                $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image']));
-                $this->setImage($values['twitter:image']);
-            }
-        }
+        // get author
+        $key = current(array_intersect([
+            'dc:creator',
+            'dcterm:creator',
+            'author'
+        ], array_keys($values)));
+
+        $this->setAuthor(isset($values[$key]) ? $values[$key] : null);
+
+        // get description
+        $key = current(array_intersect([
+            'dc:description',
+            'dcterm:description',
+            'og:description',
+            'weibo:article:description',
+            'weibo:webpage:description',
+            'description',
+            'twitter:description'
+        ], array_keys($values)));
+
+        $this->setExcerpt(isset($values[$key]) ? $values[$key] : null);
+
+        // get main image
+        $key = current(array_intersect([
+            'og:image',
+            'twitter:image'
+        ], array_keys($values)));
+
+        $this->setImage(isset($values[$key]) ? $values[$key] : null);
     }
 
     /**
@@ -453,7 +468,7 @@ class Readability
             return null;
         }
 
-        $curTitle = $originalTitle;
+        $curTitle = $originalTitle = trim($originalTitle);
         $titleHadHierarchicalSeparators = false;
 
         /*
@@ -623,8 +638,6 @@ class Readability
          */
 
         while ($node) {
-            $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
-
             // Remove DOMComments nodes as we don't need them and mess up children counting
             if ($node->nodeType === XML_COMMENT_NODE) {
                 $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
@@ -632,6 +645,14 @@ class Readability
                 continue;
             }
 
+            $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
+
+            if (!$node->isProbablyVisible()) {
+                $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
+                $node = NodeUtility::removeAndGetNext($node);
+                continue;
+            }
+
             // Check to see if this node is a byline, and remove it if it is.
             if ($this->checkByline($node, $matchString)) {
                 $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
@@ -671,13 +692,35 @@ class Readability
 
             // Turn all divs that don't have children block level elements into p's
             if ($node->nodeName === 'div') {
+                // Put phrasing content into paragraphs.
+                $p = null;
+                $childNode = $node->firstChild;
+                while ($childNode) {
+                    $nextSibling = $childNode->nextSibling;
+                    if ($childNode->isPhrasingContent()) {
+                        if ($p !== null) {
+                            $p->appendChild($childNode);
+                        } elseif (!$childNode->isWhitespace()) {
+                            $p = $this->dom->createElement('p');
+                            $node->replaceChild($p, $childNode);
+                            $p->appendChild($childNode);
+                        }
+                    } elseif ($p !== null) {
+                        while ($p->lastChild && $p->lastChild->isWhitespace()) {
+                            $p->removeChild($p->lastChild);
+                        }
+                        $p = null;
+                    }
+                    $childNode = $nextSibling;
+                }
+
                 /*
                  * Sites like http://mobile.slate.com encloses each paragraph with a DIV
                  * element. DIVs with only a P element inside and no text content can be
                  * safely converted into plain P elements to avoid confusing the scoring
                  * algorithm with DIVs with are, in practice, paragraphs.
                  */
-                if ($node->hasSinglePNode()) {
+                if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                     $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                     $pNode = $node->getChildren(true)[0];
                     $node->parentNode->replaceChild($pNode, $node);
@@ -687,16 +730,6 @@ class Readability
                     $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                     $node = NodeUtility::setNodeTag($node, 'p');
                     $elementsToScore[] = $node;
-                } else {
-                    // EXPERIMENTAL
-                    foreach ($node->getChildren() as $child) {
-                        /** @var $child DOMNode */
-                        if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
-                            $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
-                            $newNode = $node->createNode($child, 'p');
-                            $child->parentNode->replaceChild($newNode, $child);
-                        }
-                    }
                 }
             }
 
@@ -751,7 +784,7 @@ class Readability
         if (gettype($text) == 'string') {
             $byline = trim($text);
 
-            return (mb_strlen($byline) > 0) && (mb_strlen($text) < 100);
+            return (mb_strlen($byline) > 0) && (mb_strlen($byline) < 100);
         }
 
         return false;
@@ -764,15 +797,10 @@ class Readability
      */
     private function removeScripts(DOMDocument $dom)
     {
-        $toRemove = ['script', 'noscript'];
-
-        foreach ($toRemove as $tag) {
-            while ($script = $dom->getElementsByTagName($tag)) {
-                if ($script->item(0)) {
-                    $script->item(0)->parentNode->removeChild($script->item(0));
-                } else {
-                    break;
-                }
+        foreach (['script', 'noscript'] as $tag) {
+            $nodes = $dom->getElementsByTagName($tag);
+            foreach (iterator_to_array($nodes) as $node) {
+                NodeUtility::removeNode($node);
             }
         }
     }
@@ -786,15 +814,7 @@ class Readability
     {
         $this->logger->info('[PrepDocument] Preparing document for parsing...');
 
-        /*
-         * DOMNodeList must be converted to an array before looping over it.
-         * This is done to avoid node shifting when removing nodes.
-         *
-         * Reverse traversing cannot be done here because we need to find brs that are right next to other brs.
-         * (If we go the other way around we need to search for previous nodes forcing the creation of new functions
-         * that will be used only here)
-         */
-        foreach (iterator_to_array($dom->getElementsByTagName('br')) as $br) {
+        foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
             $next = $br->nextSibling;
 
             /*
@@ -831,12 +851,16 @@ class Readability
                 while ($next) {
                     // If we've hit another <br><br>, we're done adding children to this <p>.
                     if ($next->nodeName === 'br') {
-                        $nextElem = NodeUtility::nextElement($next);
+                        $nextElem = NodeUtility::nextElement($next->nextSibling);
                         if ($nextElem && $nextElem->nodeName === 'br') {
                             break;
                         }
                     }
 
+                    if (!$next->isPhrasingContent()) {
+                        break;
+                    }
+
                     $this->logger->debug('[PrepDocument] Replacing BR with a P node...');
 
                     // Otherwise, make this node a child of the new <p>.
@@ -844,6 +868,14 @@ class Readability
                     $p->appendChild($next);
                     $next = $sibling;
                 }
+
+                while ($p->lastChild && $p->lastChild->isWhitespace()) {
+                    $p->removeChild($p->lastChild);
+                }
+
+                if ($p->parentNode->tagName === 'p') {
+                    NodeUtility::setNodeTag($p->parentNode, 'div');
+                }
             }
         }
 
@@ -853,7 +885,7 @@ class Readability
         for ($i = 0; $i < $length; $i++) {
             $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
             $font = $fonts->item($length - 1 - $i);
-            NodeUtility::setNodeTag($font, 'span', true);
+            NodeUtility::setNodeTag($font, 'span');
         }
     }
 
@@ -989,7 +1021,9 @@ class Readability
             // and whose scores are quite closed with current `topCandidate` node.
             $alternativeCandidateAncestors = [];
             for ($i = 1; $i < count($topCandidates); $i++) {
-                if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) {
+                // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
+                // we have to use max() and replace zero with a low value like 0.1
+                if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                     array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
                 }
             }
@@ -997,7 +1031,9 @@ class Readability
             $MINIMUM_TOPCANDIDATES = 3;
             if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
                 $parentOfTopCandidate = $topCandidate->parentNode;
-                while ($parentOfTopCandidate->nodeName !== 'body') {
+
+                // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
+                while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                     $listsContainingThisAncestor = 0;
                     for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
                         $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
@@ -1027,8 +1063,7 @@ class Readability
             $scoreThreshold = $lastScore / 3;
 
             /* @var DOMElement $parentOfTopCandidate */
-            // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
-            while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
+            while ($parentOfTopCandidate->nodeName !== 'body') {
                 $parentScore = $parentOfTopCandidate->contentScore;
                 if ($parentScore < $scoreThreshold) {
                     break;
@@ -1175,6 +1210,7 @@ class Readability
         $this->_clean($article, 'h1');
         $this->_clean($article, 'footer');
         $this->_clean($article, 'link');
+        $this->_clean($article, 'aside');
 
         // Clean out elements have "share" in their id/class combinations from final top candidates,
         // which means we don't remove the top candidates even they have "share".
@@ -1227,6 +1263,22 @@ class Readability
             }
         }
 
+        // Remove single-cell tables
+        foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
+            /** @var DOMNode $table */
+            $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table;
+            if ($tbody->hasSingleTagInsideElement('tr')) {
+                $row = $tbody->firstChild;
+                if ($row->hasSingleTagInsideElement('td')) {
+                    $cell = $row->firstChild;
+                    $cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
+                        return $node->isPhrasingContent() && $carry;
+                    }, true)) ? 'p' : 'div');
+                    $table->parentNode->replaceChild($cell, $table);
+                }
+            }
+        }
+
         return $article;
     }
 
@@ -1374,6 +1426,7 @@ class Readability
 
     /**
      * @param DOMDocument $article
+     * @param string $tag Tag to clean conditionally
      *
      * @return void
      */
@@ -1398,7 +1451,9 @@ class Readability
             $node = $DOMNodeList->item($length - 1 - $i);
 
             // First check if we're in a data table, in which case don't remove us.
-            if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
+            if ($node->hasAncestorTag('table', -1, function ($node) {
+                return $node->isReadabilityDataTable();
+            })) {
                 continue;
             }
 
@@ -1439,10 +1494,10 @@ class Readability
                 $contentLength = mb_strlen($node->getTextContent(true));
 
                 $haveToRemove =
-                    ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
+                    ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) ||
                     (!$isList && $li > $p) ||
                     ($input > floor($p / 3)) ||
-                    (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
+                    (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
                     (!$isList && $weight < 25 && $linkDensity > 0.2) ||
                     ($weight >= 25 && $linkDensity > 0.5) ||
                     (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
@@ -1477,7 +1532,7 @@ class Readability
             // Allow youtube and vimeo videos through as people usually want to see those.
             if ($isEmbed) {
                 $attributeValues = [];
-                foreach ($item->attributes as $name => $value) {
+                foreach ($item->attributes as $value) {
                     $attributeValues[] = $value->nodeValue;
                 }
                 $attributeValues = implode('|', $attributeValues);
author	Andrew Dolgov <[email protected]>	2019-02-13 14:49:27 +0300
committer	Andrew Dolgov <[email protected]>	2019-02-13 14:49:27 +0300
commit	8b6ef90385874cefcb904e59801b3c0482805849 (patch)
tree	cf40ecb7a13dc4c5a25e3d506185872355dec5a7 /vendor
parent	874a2d21704c41e625b4b7ad565b2326ce5b95cd (diff)