Merge pull request #18 from andreskrey/development

Prepare for release v0.2.0
author: Andres Rey <[email protected]> 2017-03-10 11:10:46 +0000
committer: GitHub <[email protected]> 2017-03-10 11:10:46 +0000
commit: 10c528c0b98164be5da0bfd097b0190e26e6de5f (patch)
tree: 30adee993e1c52abc398e7f6e2a0b0723c73ecf9 /src
parent: 2a493bcc6cd8c175c531a26b6c9b061e911dcf39 (diff)
parent: a2d10aa920780447c946ac64efa7c095c854bff6 (diff)
2 files changed, 74 insertions, 14 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 0313b2a..bc9aa9f 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -29,18 +29,13 @@ class HTMLParser
     /**
      * @var array
      */
-    private $title = [];
-
-    /**
-     * @var array
-     */
     private $regexps = [
         'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
         'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
         'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
         'byline' => '/byline|author|dateline|writtenby|p-author/i',
         'replaceFonts' => '/<(\/?)font[^>]*>/gi',
-        'normalize' => '/\s{2,}/g',
+        'normalize' => '/\s{2,}/',
         'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
         'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
         'prevLink' => '/(prev|earl|old|new|<|«)/i',
@@ -104,6 +99,7 @@ class HTMLParser
             'weightClasses' => true,
             'removeReadabilityTags' => true,
             'fixRelativeURLs' => false,
+            'substituteEntities' => true,
             'originalURL' => 'http://fakehost',
         ];
 
@@ -137,7 +133,9 @@ class HTMLParser
 
         $this->metadata = $this->getMetadata();
 
-        $this->title = $this->getTitle();
+        $this->metadata['image'] = $this->getMainImage();
+
+        $this->metadata['title'] = $this->getTitle();
 
         // Checking for minimum HTML to work with.
         if (!($root = $this->dom->getElementsByTagName('body')->item(0))) {
@@ -162,7 +160,11 @@ class HTMLParser
 
             // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
             // that doesn't really count to the quality of the result.
-            if ($result && mb_strlen($result->textContent) < 500) {
+            $length = 0;
+            foreach ($result->getElementsByTagName('p') as $p) {
+                $length += mb_strlen($p->textContent);
+            }
+            if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
                 $root = $this->backupdom->getElementsByTagName('body')->item(0);
 
                 if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
@@ -205,6 +207,11 @@ class HTMLParser
      */
     private function loadHTML($html)
     {
+        if (!$this->getConfig()->getOption('substituteEntities')) {
+            // Keep the original HTML entities
+            $this->dom->substituteEntities = false;
+        }
+
         // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
         $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
         $this->dom->encoding = 'UTF-8';
@@ -293,6 +300,15 @@ class HTMLParser
                 }
             }
         }
+
+        // Replace font tags with span
+        $fonts = $this->dom->getElementsByTagName('font');
+        $length = $fonts->length;
+        for ($i = 0; $i < $length; $i++) {
+            $font = $fonts->item($length - 1 - $i);
+            $span = new Readability($font);
+            $span->setNodeTag('span', true);
+        }
     }
 
     public function postProcessContent(DOMDocument $article)
@@ -436,12 +452,40 @@ class HTMLParser
 
         if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
             $metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image'];
+        } else {
+            $metadata['image'] = null;
         }
 
         return $metadata;
     }
 
     /**
+     * Tries to get the main article image. Returns the image already stored by getMetadata when there is one;
+     * otherwise returns the href of the first <link> element whose rel is img_src or image_src.
+     *
+     * @return bool|string URL of the top image or false if unsuccessful.
+     */
+    public function getMainImage()
+    {
+        if ($this->metadata['image'] !== null) {
+            return $this->metadata['image'];
+        }
+
+        foreach ($this->dom->getElementsByTagName('link') as $link) {
+            /** @var \DOMElement $link */
+            /*
+             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
+             * finally check for the existence of the href attribute, which should hold the image url.
+             */
+            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
+                return $link->getAttribute('href');
+            }
+        }
+
+        return false;
+    }
+
+    /**
      * Get the density of links as a percentage of the content
      * This is the amount of text that is inside a link divided by the total text in the node.
      *
@@ -493,6 +537,7 @@ class HTMLParser
      * Gets nodes from the root element.
      *
      * @param $node Readability
+     *
      * @return array
      */
     private function getNodes(Readability $node)
@@ -586,7 +631,7 @@ class HTMLParser
                 continue;
             }
             // Discard nodes with less than 25 characters, without blank space
-            if (mb_strlen($node->getValue(true)) < 25) {
+            if (mb_strlen($node->getTextContent(true)) < 25) {
                 continue;
             }
 
@@ -601,10 +646,10 @@ class HTMLParser
             $contentScore = 1;
 
             // Add points for any commas within this paragraph.
-            $contentScore += count(explode(',', $node->getValue(true)));
+            $contentScore += count(explode(',', $node->getTextContent(true)));
 
             // For every 100 characters in this paragraph, add another point. Up to 3 points.
-            $contentScore += min(floor(mb_strlen($node->getValue(true)) / 100), 3);
+            $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
 
             // Initialize and score ancestors.
             /** @var Readability $ancestor */
@@ -1066,13 +1111,20 @@ class HTMLParser
      * Checks if the node is a byline.
      *
      * @param Readability $node
-     * @param string $matchString
+     * @param string      $matchString
      *
      * @return bool
      */
     private function checkByline($node, $matchString)
     {
-        if ($this->getConfig()->getOption('articleByLine')) {
+        if (!$this->getConfig()->getOption('articleByLine')) {
+            return false;
+        }
+
+        /*
+         * Check if the byline is already set
+         */
+        if (isset($this->metadata['byline'])) {
             return false;
         }
 
diff --git a/src/Readability.php b/src/Readability.php
index 027858f..c55e0ad 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -286,8 +286,9 @@ class Readability extends Element implements ReadabilityInterface
      * element with the new tag name and importing it to the main DOMDocument.
      *
      * @param string $value
+     * @param bool   $importAttributes
      */
-    public function setNodeTag($value)
+    public function setNodeTag($value, $importAttributes = false)
     {
         $new = new \DOMDocument();
         $new->appendChild($new->createElement($value));
@@ -298,6 +299,13 @@ class Readability extends Element implements ReadabilityInterface
             $new->firstChild->appendChild($import);
         }
 
+        if ($importAttributes) {
+            // Import attributes from the original node.
+            foreach ($this->node->attributes as $attribute) {
+                $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
+            }
+        }
+
         // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
         $import = $this->node->ownerDocument->importNode($new->firstChild, true);
         $this->node->parentNode->replaceChild($import, $this->node);
author	Andres Rey <[email protected]>	2017-03-10 11:10:46 +0000
committer	GitHub <[email protected]>	2017-03-10 11:10:46 +0000
commit	10c528c0b98164be5da0bfd097b0190e26e6de5f (patch)
tree	30adee993e1c52abc398e7f6e2a0b0723c73ecf9 /src
parent	2a493bcc6cd8c175c531a26b6c9b061e911dcf39 (diff)
parent	a2d10aa920780447c946ac64efa7c095c854bff6 (diff)