Adding log messages everywhere

author: Andres Rey <[email protected]> 2017-12-10 19:47:07 +0000
committer: Andres Rey <[email protected]> 2017-12-10 19:47:07 +0000
commit: 8b496d68788694b34c6fe898c380bf181981019d (patch)
tree: b1cf4962b6f7e33ddbf132cc677822df8afacef1
parent: c284ab9bae14994fd792900a77f61001da6dade5 (diff)
1 files changed, 110 insertions, 1 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 48aff05..8df0189 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -124,10 +124,13 @@ class Readability
      */
     public function parse($html)
     {
+        $this->logger->info('*** Starting parse process...');
+
         $this->dom = $this->loadHTML($html);
 
         // Checking for minimum HTML to work with.
         if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
+            $this->logger->emergency('No body tag present or body tag empty');
             throw new ParseException('Invalid or incomplete HTML.');
         }
 
@@ -139,6 +142,7 @@ class Readability
             $root = $root->firstChild;
 
             $elementsToScore = $this->getNodes($root);
+            $this->logger->debug(sprintf('Elements to score: %s', count($elementsToScore)), $elementsToScore);
 
             $result = $this->rateNodes($elementsToScore);
 
@@ -154,17 +158,24 @@ class Readability
             foreach ($result->getElementsByTagName('p') as $p) {
                 $length += mb_strlen($p->textContent);
             }
+
+            $this->logger->info(sprintf('Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
+
             if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->configuration->getWordThreshold()) {
                 $this->dom = $this->loadHTML($html);
                 $root = $this->dom->getElementsByTagName('body')->item(0);
 
                 if ($this->configuration->getStripUnlikelyCandidates()) {
+                    $this->logger->debug('Threshold not passed, trying again setting StripUnlikelyCandidates as false');
                     $this->configuration->setStripUnlikelyCandidates(false);
                 } elseif ($this->configuration->getWeightClasses()) {
+                    $this->logger->debug('Threshold not passed, trying again setting WeightClasses as false');
                     $this->configuration->setWeightClasses(false);
                 } elseif ($this->configuration->getCleanConditionally()) {
+                    $this->logger->debug('Threshold not passed, trying again setting CleanConditionally as false');
                     $this->configuration->setCleanConditionally(false);
                 } else {
+                    $this->logger->emergency('Could not parse text, giving up.');
                     throw new ParseException('Could not parse text.');
                 }
             } else {
@@ -178,6 +189,7 @@ class Readability
         // first paragraph as the excerpt. This can be used for displaying a preview of
         // the article's content.
         if (!$this->getExcerpt()) {
+            $this->logger->debug('No excerpt text found on metadata, extracting first p node and using it as excerpt.');
             $paragraphs = $result->getElementsByTagName('p');
             if ($paragraphs->length > 0) {
                 $this->setExcerpt(trim($paragraphs->item(0)->textContent));
@@ -186,6 +198,8 @@ class Readability
 
         $this->setContent($result->C14N());
 
+        $this->logger->info('*** Parse successful :)');
+
         return true;
     }
 
@@ -203,6 +217,8 @@ class Readability
      */
     private function loadHTML($html)
     {
+        $this->logger->debug('Loading HTML...');
+
         // To avoid throwing a gazillion of errors on malformed HTMLs
         libxml_use_internal_errors(true);
 
@@ -214,11 +230,13 @@ class Readability
         }
 
         if ($this->configuration->getNormalizeEntities()) {
+            $this->logger->debug('Normalized entities via mb_convert_encoding.');
             // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
             $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
         }
 
         if ($this->configuration->getSummonCthulhu()) {
+            $this->logger->debug('Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
             $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
         }
 
@@ -230,6 +248,8 @@ class Readability
 
         $this->prepDocument($dom);
 
+        $this->logger->debug('Loaded HTML successfully.');
+
         return $dom;
     }
 
@@ -238,6 +258,8 @@ class Readability
      */
     private function getMetadata()
     {
+        $this->logger->debug('Retrieving metadata...');
+
         $values = [];
         // Match "description", or Twitter's "twitter:description" (Cards)
         // in name attribute.
@@ -252,6 +274,7 @@ class Readability
             $elementProperty = $meta->getAttribute('property');
 
             if (in_array('author', [$elementName, $elementProperty])) {
+                $this->logger->info(sprintf('[Metadata] Found author: %s', $meta->getAttribute('content')));
                 $this->setAuthor($meta->getAttribute('content'));
                 continue;
             }
@@ -274,12 +297,15 @@ class Readability
             }
         }
         if (array_key_exists('description', $values)) {
+            $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: %s', $values['description']));
             $this->setExcerpt($values['description']);
         } elseif (array_key_exists('og:description', $values)) {
             // Use facebook open graph description.
+            $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: %s', $values['og:description']));
             $this->setExcerpt($values['og:description']);
         } elseif (array_key_exists('twitter:description', $values)) {
             // Use twitter cards description.
+            $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: %s', $values['twitter:description']));
             $this->setExcerpt($values['twitter:description']);
         }
 
@@ -288,15 +314,23 @@ class Readability
         if (!$this->getTitle()) {
             if (array_key_exists('og:title', $values)) {
                 // Use facebook open graph title.
+                $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: %s', $values['og:title']));
                 $this->setTitle($values['og:title']);
             } elseif (array_key_exists('twitter:title', $values)) {
                 // Use twitter cards title.
+                $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: %s', $values['twitter:title']));
                 $this->setTitle($values['twitter:title']);
             }
         }
 
         if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
-            $this->setImage(array_key_exists('og:image', $values) ? $values['og:image'] : $values['twitter:image']);
+            if (array_key_exists('og:image', $values)) {
+                $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: %s', $values['og:image']));
+                $this->setImage($values['og:image']);
+            } else {
+                $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: %s', $values['twitter:image']));
+                $this->setImage($values['twitter:image']);
+            }
         }
     }
 
@@ -376,8 +410,10 @@ class Readability
         if ($this->getTitle()) {
             $originalTitle = $this->getTitle();
         } else {
+            $this->logger->debug('Could not find title in metadata, searching for the title tag...');
             $titleTag = $this->dom->getElementsByTagName('title');
             if ($titleTag->length > 0) {
+                $this->logger->info(sprintf('Using title tag as article title: %s', $titleTag->item(0)->nodeValue));
                 $originalTitle = $titleTag->item(0)->nodeValue;
             }
         }
@@ -399,10 +435,13 @@ class Readability
             $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle);
             $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
 
+            $this->logger->info(sprintf('Found hierarchical separators in title, new title is: %s', $curTitle));
+
             // If the resulting title is too short (3 words or fewer), remove
             // the first part instead:
             if (count(preg_split('/\s+/', $curTitle)) < 3) {
                 $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
+                $this->logger->info(sprintf('Title too short, using the first part of the title instead: %s', $curTitle));
             }
         } elseif (strpos($curTitle, ': ') !== false) {
             // Check if we have an heading containing this exact string, so we
@@ -421,9 +460,12 @@ class Readability
             if (!$match) {
                 $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
 
+                $this->logger->info(sprintf('Title has a colon in the middle, new title is: %s', $curTitle));
+
                 // If the title is now too short, try the first colon instead:
                 if (count(preg_split('/\s+/', $curTitle)) < 3) {
                     $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+                    $this->logger->info(sprintf('Title too short, using the first part of the title instead: %s', $curTitle));
                 }
             }
         } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
@@ -431,6 +473,7 @@ class Readability
 
             if ($hOnes->length === 1) {
                 $curTitle = $hOnes->item(0)->nodeValue;
+                $this->logger->info(sprintf('Using title from an H1 node: %s', $curTitle));
             }
         }
 
@@ -448,6 +491,8 @@ class Readability
         if ($curTitleWordCount <= 4 &&
             (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
             $curTitle = $originalTitle;
+
+            $this->logger->info(sprintf('Using title from an H1 node: %s', $curTitle));
         }
 
         return $curTitle;
@@ -518,6 +563,8 @@ class Readability
      */
     private function getNodes($node)
     {
+        $this->logger->info('Retrieving nodes...');
+
         $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();
 
         $elementsToScore = [];
@@ -531,14 +578,18 @@ class Readability
         while ($node) {
             $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
 
+            $this->logger->debug(sprintf('Match string from class and id is: %s', $matchString));
+
             // Remove DOMComments nodes as we don't need them and mess up children counting
             if ($node->nodeType === XML_COMMENT_NODE) {
+                $this->logger->debug(sprintf('Found comment node, removing... Node content was: %s', substr($node->nodeValue, 0, 128)));
                 $node = NodeUtility::removeAndGetNext($node);
                 continue;
             }
 
             // Check to see if this node is a byline, and remove it if it is.
             if ($this->checkByline($node, $matchString)) {
+                $this->logger->debug(sprintf('Found byline, removing... Node content was: %s', substr($node->nodeValue, 0, 128)));
                 $node = NodeUtility::removeAndGetNext($node);
                 continue;
             }
@@ -551,6 +602,7 @@ class Readability
                     $node->nodeName !== 'body' &&
                     $node->nodeName !== 'a'
                 ) {
+                    $this->logger->debug(sprintf('Removing unlikely candidate. Node content was: %s', substr($node->nodeValue, 0, 128)));
                     $node = NodeUtility::removeAndGetNext($node);
                     continue;
                 }
@@ -562,11 +614,13 @@ class Readability
                     $node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||
                     $node->nodeName === 'p') &&
                 $node->isElementWithoutContent()) {
+                $this->logger->debug(sprintf('Removing empty \'%s\' node.', $node->nodeName));
                 $node = NodeUtility::removeAndGetNext($node);
                 continue;
             }
 
             if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
+                $this->logger->debug(sprintf('Adding node to score list, node content is: %s', substr($node->nodeValue, 0, 128)));
                 $elementsToScore[] = $node;
             }
 
@@ -579,11 +633,13 @@ class Readability
                  * algorithm with DIVs with are, in practice, paragraphs.
                  */
                 if ($node->hasSinglePNode()) {
+                    $this->logger->debug(sprintf('Found DIV with a single P node, removing DIV. Node content is: %s', substr($node->nodeValue, 0, 128)));
                     $pNode = $node->getChildren(true)[0];
                     $node->parentNode->replaceChild($pNode, $node);
                     $node = $pNode;
                     $elementsToScore[] = $node;
                 } elseif (!$node->hasSingleChildBlockElement()) {
+                    $this->logger->debug(sprintf('Found DIV with a single child block element, converting to a P node. Node content is: %s', substr($node->nodeValue, 0, 128)));
                     $node = NodeUtility::setNodeTag($node, 'p');
                     $elementsToScore[] = $node;
                 } else {
@@ -591,6 +647,7 @@ class Readability
                     foreach ($node->getChildren() as $child) {
                         /** @var $child DOMNode */
                         if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
+                            $this->logger->debug(sprintf('Found DIV a text node inside, converting to a P node. Node content is: %s', substr($node->nodeValue, 0, 128)));
                             $newNode = $node->createNode($child, 'p');
                             $child->parentNode->replaceChild($newNode, $child);
                         }
@@ -628,6 +685,7 @@ class Readability
         $rel = $node->getAttribute('rel');
 
         if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
+            $this->logger->info(sprintf('Found article author: %s', $node->getTextContent()));
             $this->setAuthor(trim($node->getTextContent()));
 
             return true;
@@ -681,6 +739,8 @@ class Readability
      */
     private function prepDocument(DOMDocument $dom)
     {
+        $this->logger->info('Preparing document for parsing...');
+
         /*
          * DOMNodeList must be converted to an array before looping over it.
          * This is done to avoid node shifting when removing nodes.
@@ -704,6 +764,8 @@ class Readability
              * (which will be replaced with a <p> later).
              */
             while (($next = NodeUtility::nextElement($next)) && ($next->nodeName === 'br')) {
+                $this->logger->debug('Removing chain of BR nodes...');
+
                 $replaced = true;
                 $brSibling = $next->nextSibling;
                 $next->parentNode->removeChild($next);
@@ -730,6 +792,8 @@ class Readability
                         }
                     }
 
+                    $this->logger->debug('Replacing BR with a P node...');
+
                     // Otherwise, make this node a child of the new <p>.
                     $sibling = $next->nextSibling;
                     $p->appendChild($next);
@@ -742,6 +806,7 @@ class Readability
         $fonts = $dom->getElementsByTagName('font');
         $length = $fonts->length;
         for ($i = 0; $i < $length; $i++) {
+            $this->logger->debug('Converting font tag into a span tag.');
             $font = $fonts->item($length - 1 - $i);
             NodeUtility::setNodeTag($font, 'span', true);
         }
@@ -756,6 +821,8 @@ class Readability
      */
     private function rateNodes($nodes)
     {
+        $this->logger->info('Rating nodes...');
+
         $candidates = [];
 
         /** @var DOMElement $node */
@@ -785,8 +852,11 @@ class Readability
             // For every 100 characters in this paragraph, add another point. Up to 3 points.
             $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
 
+            $this->logger->debug(sprintf('Node score %s, content: %s', $contentScore, substr($node->nodeValue, 0, 128)));
+
             /** @var $ancestor DOMElement */
             foreach ($ancestors as $level => $ancestor) {
+                $this->logger->debug('Found ancestor, initializing and adding it as a candidate...');
                 if (!$ancestor->isInitialized()) {
                     $ancestor->initializeNode($this->configuration->getWeightClasses());
                     $candidates[] = $ancestor;
@@ -809,6 +879,8 @@ class Readability
 
                 $currentScore = $ancestor->contentScore;
                 $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);
+
+                $this->logger->debug(sprintf('Ancestor score %s, value: %s', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
             }
         }
 
@@ -850,6 +922,8 @@ class Readability
          */
 
         if ($topCandidate === null || $topCandidate->nodeName === 'body') {
+            $this->logger->info('No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
+
             // Move all of the page's children into topCandidate
             $topCandidate = new DOMDocument('1.0', 'utf-8');
             $topCandidate->encoding = 'UTF-8';
@@ -865,6 +939,7 @@ class Readability
             // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
             $topCandidate = $topCandidate->firstChild;
         } elseif ($topCandidate) {
+            $this->logger->info('Found top candidate');
             // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
             // and whose scores are quite closed with current `topCandidate` node.
             $alternativeCandidateAncestors = [];
@@ -916,6 +991,7 @@ class Readability
                 if ($parentScore > $lastScore) {
                     // Alright! We found a better parent to use.
                     $topCandidate = $parentOfTopCandidate;
+                    $this->logger->info('Found a better top candidate.');
                     break;
                 }
                 $lastScore = $parentOfTopCandidate->contentScore;
@@ -937,6 +1013,8 @@ class Readability
          * that we removed, etc.
          */
 
+        $this->logger->info('Creating final article content document...');
+
         $articleContent = new DOMDocument('1.0', 'utf-8');
         $articleContent->createElement('div');
 
@@ -949,9 +1027,13 @@ class Readability
 
         /** @var DOMElement $sibling */
         foreach ($siblings as $sibling) {
+            $this->logger->info('Adding top candidate siblings...');
+
             $append = false;
 
             if ($sibling === $topCandidate) {
+                $this->logger->debug('Sibling is equal to the top candidate, adding to the final article...');
+
                 $append = true;
             } else {
                 $contentBonus = 0;
@@ -975,6 +1057,8 @@ class Readability
             }
 
             if ($append) {
+                $this->logger->debug(sprintf('Appending sibling to final article, content is: %s', substr($sibling->nodeValue, 0, 128)));
+
                 $hasContent = true;
 
                 if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
@@ -1007,6 +1091,7 @@ class Readability
                 $articleDir = $ancestor->getAttribute('dir');
                 if ($articleDir) {
                     $this->setDirection($articleDir);
+                    $this->logger->debug(sprintf('Found article direction: %s', $articleDir));
                     break;
                 }
             }
@@ -1026,6 +1111,8 @@ class Readability
      */
     public function prepArticle(DOMDocument $article)
     {
+        $this->logger->info('Preparing final article...');
+
         $this->_cleanStyles($article);
         $this->_clean($article, 'style');
 
@@ -1064,6 +1151,7 @@ class Readability
                     $titlesMatch = strpos($this->getTitle(), $h2->item(0)->textContent) !== false;
                 }
                 if ($titlesMatch) {
+                    $this->logger->info('Found title repeated in an H2 node, removing...');
                     $this->_clean($article, 'h2');
                 }
             }
@@ -1087,6 +1175,7 @@ class Readability
         foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
             $next = $br->nextSibling;
             if ($next && $next->nodeName === 'p') {
+                $this->logger->debug('Removing br node next to a p node.');
                 $br->parentNode->removeChild($br);
             }
         }
@@ -1161,6 +1250,8 @@ class Readability
      **/
     public function _cleanStyles($node)
     {
+        $this->logger->info('Cleaning styles...');
+
         if (property_exists($node, 'tagName') && $node->tagName === 'svg') {
             return;
         }
@@ -1201,6 +1292,7 @@ class Readability
         $next = NodeUtility::getNextNode($node);
         while ($next && $next !== $endOfSearchMarkerNode) {
             if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) {
+                $this->logger->debug(sprintf('Removing matched node with regex: %s, node content was: %s', $regex, substr($next->nodeValue, 0, 128)));
                 $next = NodeUtility::removeAndGetNext($next);
             } else {
                 $next = NodeUtility::getNextNode($next);
@@ -1229,6 +1321,7 @@ class Readability
             $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
 
             if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) {
+                $this->logger->debug(sprintf('Removing extra paragraph. Text content was: %s', substr($paragraph->textContent, 0, 128)));
                 $paragraph->parentNode->removeChild($paragraph);
             }
         }
@@ -1270,6 +1363,8 @@ class Readability
             }
 
             if ($weight < 0) {
+                $this->logger->debug(sprintf('Removing tag %s with 0 or less weight', $tag));
+
                 NodeUtility::removeNode($node);
                 continue;
             }
@@ -1308,6 +1403,8 @@ class Readability
                     (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
 
                 if ($haveToRemove) {
+                    $this->logger->debug(sprintf('Removing tag %s.', $tag));
+
                     NodeUtility::removeNode($node);
                 }
             }
@@ -1350,6 +1447,8 @@ class Readability
                     continue;
                 }
             }
+            $this->logger->debug(sprintf('Removing node %s.', $item->tagName));
+
             NodeUtility::removeNode($item);
         }
     }
@@ -1373,6 +1472,8 @@ class Readability
                 }
 
                 if ($weight < 0) {
+                    $this->logger->debug(sprintf('Removing H node with 0 or less weight. Content was: %s', substr($header->nodeValue, 0, 128)));
+
                     NodeUtility::removeNode($header);
                 }
             }
@@ -1386,6 +1487,8 @@ class Readability
      */
     public function postProcessContent(DOMDocument $article)
     {
+        $this->logger->info('PostProcessing content...');
+
         // Readability cannot open relative uris so we convert them to absolute uris.
         if ($this->configuration->getFixRelativeURLs()) {
             foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
@@ -1395,9 +1498,13 @@ class Readability
                     // Replace links with javascript: URIs with text content, since
                     // they won't work after scripts have been removed from the page.
                     if (strpos($href, 'javascript:') === 0) {
+                        $this->logger->debug(sprintf('Removing \'javascript:\' link. Content is: %s', substr($link->textContent, 0, 128)));
+
                         $text = $article->createTextNode($link->textContent);
                         $link->parentNode->replaceChild($text, $link);
                     } else {
+                        $this->logger->debug(sprintf('Converting link to absolute URI: %s', substr($href, 0, 128)));
+
                         $link->setAttribute('href', $this->toAbsoluteURI($href));
                     }
                 }
@@ -1418,6 +1525,8 @@ class Readability
                 $src = array_filter($url);
                 $src = reset($src);
                 if ($src) {
+                    $this->logger->debug(sprintf('Converting image URL to absolute URI: %s', substr($src, 0, 128)));
+
                     $img->setAttribute('src', $this->toAbsoluteURI($src));
                 }
             }
author	Andres Rey <[email protected]>	2017-12-10 19:47:07 +0000
committer	Andres Rey <[email protected]>	2017-12-10 19:47:07 +0000
commit	8b496d68788694b34c6fe898c380bf181981019d (patch)
tree	b1cf4962b6f7e33ddbf132cc677822df8afacef1
parent	c284ab9bae14994fd792900a77f61001da6dade5 (diff)