2 files changed, 291 insertions, 38 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6f76a25..27dd8e5 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -17,6 +17,7 @@ class HTMLParser
     private $dom = null;
 
     /**
+     * TODO Make this an object? Instead of a dumb array
      * @var array
      */
     private $metadata = [];
@@ -25,7 +26,7 @@ class HTMLParser
      * @var array
      */
     private $regexps = [
-        'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
+        'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
         'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
         'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
         'byline' => '/byline|author|dateline|writtenby|p-author/i',
@@ -184,6 +185,7 @@ class HTMLParser
             'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
             'article' => $result,
             'html' => $result->C14N(),
+            'dir' => isset($this->metadata['articleDir']) ? $this->metadata['articleDir'] : null,
         ];
     }
 
@@ -536,16 +538,83 @@ class HTMLParser
      */
     private function getTitle()
     {
+        $originalTitle = null;
+
         if (isset($this->metadata['title'])) {
-            return $this->metadata['title'];
+            $originalTitle = $this->metadata['title'];
+        } else {
+            $titleTag = $this->dom->getElementsByTagName('title');
+            if ($titleTag->length > 0) {
+                $originalTitle = $titleTag->item(0)->nodeValue;
+            }
         }
 
-        $title = $this->dom->getElementsByTagName('title');
-        if ($title->length > 0) {
-            return $title->item(0)->nodeValue;
+        if ($originalTitle === null) {
+            return null;
+        }
+
+        $curTitle = $originalTitle;
+        $titleHadHierarchicalSeparators = false;
+
+        /*
+         * If there's a separator in the title, first remove the final part.
+         * The "u" modifier makes PCRE treat "»" as one UTF-8 character; in byte
+         * mode it is two bytes, so " » " could never match and could be split.
+         */
+        if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
+            $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
+            $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);
+
+            // If the resulting title is too short (fewer than 3 words), remove
+            // the first part instead:
+            if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
+            }
+        } elseif (strpos($curTitle, ': ') !== false) {
+            // Check if we have a heading (h1 or h2) containing this exact string,
+            // so we could assume it's the full title.
+            $match = false;
+            for ($i = 1; $i <= 2; $i++) {
+                foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
+                    if ($hTag->nodeValue === $curTitle) {
+                        $match = true;
+                    }
+                }
+            }
+
+            // If we don't, let's extract the title out of the original title string.
+            if (!$match) {
+                $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
+
+                // If the title is now too short, try the first colon instead:
+                if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                    $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+                }
+            }
+        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+
+            if ($hOnes->length === 1) {
+                $curTitle = $hOnes->item(0)->nodeValue;
+            }
+        }
+
+        $curTitle = trim($curTitle);
+
+        /*
+         * If we now have 4 words or fewer as our title, and either no
+         * 'hierarchical' separators (\, /, > or ») were found in the original
+         * title or we decreased the number of words by more than 1 word, use
+         * the original title.
+         */
+        $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
+
+        if ($curTitleWordCount <= 4 &&
+            (!$titleHadHierarchicalSeparators || $curTitleWordCount !== count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1)) {
+            $curTitle = $originalTitle;
         }
 
-        return null;
+        return $curTitle;
     }
 
     /**
@@ -589,6 +658,16 @@ class HTMLParser
                 }
             }
 
+            // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
+            if (($node->tagNameEqualsTo('div') || $node->tagNameEqualsTo('section') || $node->tagNameEqualsTo('header') ||
+                    $node->tagNameEqualsTo('h1') || $node->tagNameEqualsTo('h2') || $node->tagNameEqualsTo('h3') ||
+                    $node->tagNameEqualsTo('h4') || $node->tagNameEqualsTo('h5') || $node->tagNameEqualsTo('h6')) &&
+                $node->isElementWithoutContent()) {
+                $node = $node->removeAndGetNext($node);
+                continue;
+            }
+
+
             if (in_array(strtolower($node->getTagName()), $this->defaultTagsToScore)) {
                 $elementsToScore[] = $node;
             }
@@ -612,12 +691,9 @@ class HTMLParser
                     // EXPERIMENTAL
                     foreach ($node->getChildren() as $child) {
                         /** @var Readability $child */
-                        if ($child->isText()) {
-                            // Check if there's actual content on the node.
-                            if (trim($child->getTextContent())) {
-                                $newNode = $node->createNode($child, 'p');
-                                $child->replaceChild($newNode);
-                            }
+                        if ($child->isText() && mb_strlen(trim($child->getTextContent())) > 0) {
+                            $newNode = $node->createNode($child, 'p');
+                            $child->replaceChild($newNode);
                         }
                     }
                 }
@@ -737,6 +813,7 @@ class HTMLParser
 
         $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
         $neededToCreateTopCandidate = false;
+        $parentOfTopCandidate = null;
 
         /*
          * If we still have no top candidate, just use the body as a last resort.
@@ -745,8 +822,6 @@ class HTMLParser
 
         if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) {
             // Move all of the page's children into topCandidate
-            $neededToCreateTopCandidate = true;
-
             $topCandidate = new DOMDocument('1.0', 'utf-8');
             $topCandidate->encoding = 'UTF-8';
             $topCandidate->appendChild($topCandidate->createElement('div', ''));
@@ -765,6 +840,31 @@ class HTMLParser
             //TODO on the original code, $topCandidate is added to the page variable, which holds the whole HTML
             // Should be done this here also? (line 823 in readability.js)
         } elseif ($topCandidate) {
+            // Find a better top candidate node if it contains (at least three) nodes which belong to the `topCandidates`
+            // array and whose scores are quite close to that of the current `topCandidate` node.
+            $alternativeCandidateAncestors = [];
+            for ($i = 0; $i < count($topCandidates) - 1; $i++) {
+                if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) {
+                    $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(5);
+                }
+            }
+
+            $MINIMUM_TOPCANDIDATES = 3;
+            if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
+                $parentOfTopCandidate = $topCandidate->getParent();
+                while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
+                    $listsContainingThisAncestor = 0;
+                    for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
+                        $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
+                    }
+                    if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
+                        $topCandidate = $parentOfTopCandidate;
+                        break;
+                    }
+                    $parentOfTopCandidate = $parentOfTopCandidate->getParent();
+                }
+            }
+
             /*
              * Because of our bonus system, parents of candidates might have scores
              * themselves. They get half of the node. There won't be nodes with higher
@@ -781,8 +881,8 @@ class HTMLParser
             // The scores shouldn't get too low.
             $scoreThreshold = $lastScore / 3;
 
-            while ($parentOfTopCandidate) {
-                /* @var Readability $parentOfTopCandidate */
+            /* @var Readability $parentOfTopCandidate */
+            while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
                 $parentScore = $parentOfTopCandidate->getContentScore();
                 if ($parentScore < $scoreThreshold) {
                     break;
@@ -796,6 +896,14 @@ class HTMLParser
                 $lastScore = $parentOfTopCandidate->getContentScore();
                 $parentOfTopCandidate = $parentOfTopCandidate->getParent();
             }
+
+            // If the top candidate is the only child, use parent instead. This will help sibling
+            // joining logic when adjacent content is actually located in parent's sibling node.
+            $parentOfTopCandidate = $topCandidate->getParent();
+            while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren()) === 1) {
+                $topCandidate = $parentOfTopCandidate;
+                $parentOfTopCandidate = $topCandidate->getParent();
+            }
         }
 
         /*
@@ -808,7 +916,9 @@ class HTMLParser
         $articleContent->createElement('div');
 
         $siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2);
-        $siblings = $topCandidate->getParent()->getChildren();
+        // Keep potential top candidate's parent node to try to get text direction of it later.
+        $parentOfTopCandidate = $topCandidate->getParent();
+        $siblings = $parentOfTopCandidate->getChildren();
 
         $hasContent = false;
 
@@ -866,6 +976,16 @@ class HTMLParser
         $articleContent = $this->prepArticle($articleContent);
 
         if ($hasContent) {
+            // Find out text direction from ancestors of final top candidate.
+            $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
+            foreach ($ancestors as $ancestor) {
+                $articleDir = $ancestor->getAttribute('dir');
+                if ($articleDir) {
+                    $this->metadata['articleDir'] = $articleDir;
+                    break;
+                }
+            }
+
             return $articleContent;
         } else {
             return false;
@@ -881,8 +1001,14 @@ class HTMLParser
      */
     public function prepArticle(DOMDocument $article)
     {
+        // Check for data tables before we continue, to avoid removing items in
+        // those tables, which will often be isolated even though they're
+        // visually linked to other content-ful elements (text, images, etc.).
+        $this->_markDataTables($article);
+
         // Clean out junk from the article content
         $this->_cleanConditionally($article, 'form');
+        $this->_cleanConditionally($article, 'fieldset');
         $this->_clean($article, 'object');
         $this->_clean($article, 'embed');
         $this->_clean($article, 'h1');
@@ -891,13 +1017,29 @@ class HTMLParser
         // Readability.js cleans styles on prepDocument but we do it here.
         $this->_clean($article, 'style');
 
-        // If there is only one h2, they are probably using it as a header
-        // and not a subheader, so remove it since we already have a header.
-        if ($article->getElementsByTagName('h2')->length === 1) {
-            $this->_clean($article, 'h2');
+        /*
+         * If there is only one h2 and its text content substantially equals article title,
+         * they are probably using it as a header and not a subheader,
+         * so remove it since we already extract the title separately.
+         */
+        $h2 = $article->getElementsByTagName('h2');
+        if ($h2->length === 1) {
+            $lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->metadata['title'])) / mb_strlen($this->metadata['title']);
+            if (abs($lengthSimilarRate) < 0.5 &&
+                ($lengthSimilarRate > 0 ?
+                    strpos($h2->item(0)->textContent, $this->metadata['title']) !== false :
+                    strpos($this->metadata['title'], $h2->item(0)->textContent) !== false
+                )
+            ) {
+                $this->_clean($article, 'h2');
+            }
         }
 
         $this->_clean($article, 'iframe');
+        $this->_clean($article, 'input');
+        $this->_clean($article, 'textarea');
+        $this->_clean($article, 'select');
+        $this->_clean($article, 'button');
         $this->_cleanHeaders($article);
 
         // Do these last as the previous stuff may have removed junk
@@ -924,6 +1066,98 @@ class HTMLParser
     }
 
     /**
+     * Flag every table of $article as a 'data' (as opposed to 'layout') table by
+     * setting a boolean readabilityDataTable property on it, using similar checks as
+     * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+     *
+     * TODO To be moved to Readability. WARNING: check if we actually keep the "readabilityDataTable" param and
+     * maybe switch to a readability data-tag?
+     *
+     * @param DOMDocument $article
+     *
+     * @return void
+     */
+    public function _markDataTables(DOMDocument $article)
+    {
+        $tables = $article->getElementsByTagName('table');
+        foreach ($tables as $table) {
+            /** @var \DOMElement $table */
+            // An explicit presentation role marks a layout table.
+            if ($table->getAttribute('role') === 'presentation') {
+                $table->readabilityDataTable = false;
+                continue;
+            }
+            // Compare as strings: a loose "==" would also accept '0.0', ' 0' or '00'.
+            if ($table->getAttribute('datatable') === '0') {
+                $table->readabilityDataTable = false;
+                continue;
+            }
+            // Any non-empty summary counts; a truthiness test would reject summary="0".
+            if ($table->getAttribute('summary') !== '') {
+                $table->readabilityDataTable = true;
+                continue;
+            }
+
+            $caption = $table->getElementsByTagName('caption');
+            if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
+                $table->readabilityDataTable = true;
+                continue;
+            }
+
+            // If the table has a descendant with any of these tags, consider a data table:
+            foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
+                if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
+                    $table->readabilityDataTable = true;
+                    continue 2;
+                }
+            }
+
+            // Nested tables indicate a layout table:
+            if ($table->getElementsByTagName('table')->length > 0) {
+                $table->readabilityDataTable = false;
+                continue;
+            }
+
+            $sizeInfo = $this->_getRowAndColumnCount($table);
+            if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
+                $table->readabilityDataTable = true;
+                continue;
+            }
+            // Now just go by size entirely:
+            $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10;
+        }
+    }
+
+    /**
+     * Count the rows and columns of a table, honouring rowspan and colspan attributes.
+     * @param \DOMElement $table
+     *
+     * @return array ['rows' => int, 'columns' => int]
+     */
+    public function _getRowAndColumnCount(\DOMElement $table)
+    {
+        $rows = $columns = 0;
+        $trs = $table->getElementsByTagName('tr');
+        foreach ($trs as $tr) {
+            /** @var \DOMElement $tr */
+            // "||" yields a bool in PHP, so the span is cast and defaulted to 1 explicitly.
+            $rows += max(1, (int)$tr->getAttribute('rowspan'));
+
+            // Now look for column-related info; only <td> cells are counted.
+            $columnsInThisRow = 0;
+            $cells = $tr->getElementsByTagName('td');
+            foreach ($cells as $cell) {
+                /** @var \DOMElement $cell */
+                // A missing, zero or non-numeric colspan counts as a single column.
+                $columnsInThisRow += max(1, (int)$cell->getAttribute('colspan'));
+            }
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+    /**
      * TODO To be moved to Readability.
      *
      * @param DOMDocument $article
@@ -992,6 +1226,12 @@ class HTMLParser
             $node = $DOMNodeList->item($length - 1 - $i);
 
             $node = new Readability($node);
+
+            // First check if we're in a data table, in which case don't remove us.
+            if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) {
+                continue;
+            }
+
             $weight = $node->getClassWeight();
 
             if ($weight < 0) {
@@ -1025,11 +1265,10 @@ class HTMLParser
                 $contentLength = mb_strlen($node->getTextContent(true));
 
                 $haveToRemove =
-                    // Make an exception for elements with no p's and exactly 1 img.
-                    ($img > $p && $node->hasAncestorTag($node, 'figure')) ||
+                    ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
                     (!$isList && $li > $p) ||
                     ($input > floor($p / 3)) ||
-                    (!$isList && $contentLength < 25 && ($img === 0 || $img > 2)) ||
+                    (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
                     (!$isList && $weight < 25 && $linkDensity > 0.2) ||
                     ($weight >= 25 && $linkDensity > 0.5) ||
                     (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
@@ -1047,8 +1286,8 @@ class HTMLParser
      *
      * TODO To be moved to Readability
      *
-     * @param Element
-     * @param string tag to clean
+     * @param $article DOMDocument
+     * @param $tag string tag to clean
      *
      * @return void
      **/
@@ -1126,7 +1365,7 @@ class HTMLParser
      * Checks if the node is a byline.
      *
      * @param Readability $node
-     * @param string      $matchString
+     * @param string $matchString
      *
      * @return bool
      */
diff --git a/src/Readability.php b/src/Readability.php
index c55e0ad..b0a5830 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -13,7 +13,7 @@ use League\HTMLToMarkdown\Element;
 class Readability extends Element implements ReadabilityInterface
 {
     /**
-     * @var \DOMNode
+     * @var \DOMNode|\DOMElement
      */
     protected $node;
 
@@ -50,9 +50,6 @@ class Readability extends Element implements ReadabilityInterface
          * An if must be added before calling the getAttribute function, because if we reach the DOMDocument
          * by getting the node parents we'll get a undefined function fatal error
          */
-        $score = 0;
-
-        // Check if the getAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
         if (method_exists($node, 'getAttribute')) {
             if ($node->hasAttribute('data-readability')) {
                 // Node was initialized previously. Restoring score and setting flag.
@@ -253,7 +250,7 @@ class Readability extends Element implements ReadabilityInterface
     {
         // Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
         if (method_exists($this->node, 'setAttribute')) {
-            $this->contentScore = (float) $score;
+            $this->contentScore = (float)$score;
 
             // Set score in an attribute of the tag to prevent losing it while creating new Readability objects.
             $this->node->setAttribute('data-readability', $this->contentScore);
@@ -286,7 +283,7 @@ class Readability extends Element implements ReadabilityInterface
      * element with the new tag name and importing it to the main DOMDocument.
      *
      * @param string $value
-     * @param bool   $importAttributes
+     * @param bool $importAttributes
      */
     public function setNodeTag($value, $importAttributes = false)
     {
@@ -343,7 +340,7 @@ class Readability extends Element implements ReadabilityInterface
      * for parents.
      *
      * @param Readability $originalNode
-     * @param bool        $ignoreSelfAndKids
+     * @param bool $ignoreSelfAndKids
      *
      * @return Readability
      */
@@ -419,7 +416,7 @@ class Readability extends Element implements ReadabilityInterface
      * Creates a new node based on the text content of the original node.
      *
      * @param Readability $originalNode
-     * @param string      $tagName
+     * @param string $tagName
      *
      * @return Readability
      */
@@ -466,8 +463,8 @@ class Readability extends Element implements ReadabilityInterface
      * provided one.
      *
      * @param Readability $node
-     * @param string      $tagName
-     * @param int         $maxDepth
+     * @param string $tagName
+     * @param int $maxDepth
      *
      * @return bool
      */
@@ -475,7 +472,7 @@ class Readability extends Element implements ReadabilityInterface
     {
         $depth = 0;
         while ($node->getParent()) {
-            if ($depth > $maxDepth) {
+            if ($maxDepth > 0 && $depth > $maxDepth) {
                 return false;
             }
             if ($node->getParent()->tagNameEqualsTo($tagName)) {
@@ -489,6 +486,8 @@ class Readability extends Element implements ReadabilityInterface
     }
 
     /**
+     * Returns the children of the current node
+     *
      * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
      *
      * @return array
@@ -507,4 +506,19 @@ class Readability extends Element implements ReadabilityInterface
 
         return $ret;
     }
+
+
+    /**
+     * Determines if the node is an element holding nothing but whitespace and/or dividing lines (br, hr).
+     *
+     * @return bool
+     */
+    public function isElementWithoutContent()
+    {
+        return $this->node instanceof \DOMElement &&
+            mb_strlen(trim($this->node->textContent)) === 0 &&
+            count(array_filter(iterator_to_array($this->node->childNodes), function ($child) {
+                return $child instanceof \DOMElement; // whitespace text nodes and comments are not content
+            })) === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length;
+    }
 }