Import getArticleTitle function (originally "getTitle")

author: Andres Rey <[email protected]> 2017-11-29 20:31:28 +0000
committer: Andres Rey <[email protected]> 2017-11-29 20:31:28 +0000
commit: dd8f887f835016a28a55f5e7f18a9a72d3c10778 (patch)
tree: 72b04cc355376ff909237fb412c6d3dff0617818 /src
parent: d4c22a6625fbcfdfa9d3a4c7cf65561181e07b4c (diff)
1 files changed, 91 insertions, 1 deletions
diff --git a/src/Readability.php b/src/Readability.php
index abeffbb..f3eac77 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -267,7 +267,7 @@ class Readability
             $metadata['excerpt'] = $values['twitter:description'];
         }
 
-        $metadata['title'] = $this->getTitle();
+        $metadata['title'] = $this->getArticleTitle();
 
         if (!$metadata['title']) {
             if (array_key_exists('og:title', $values)) {
@@ -354,6 +354,96 @@ class Readability
         return $imgUrl;
     }
 
+    /**
+     * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
+     *
+     * @return string|null
+     */
+    private function getArticleTitle()
+    {
+        $originalTitle = null;
+
+        if (isset($this->metadata['title'])) {
+            $originalTitle = $this->metadata['title'];
+        } else {
+            $titleTag = $this->dom->getElementsByTagName('title');
+            if ($titleTag->length > 0) {
+                $originalTitle = $titleTag->item(0)->nodeValue;
+            }
+        }
+
+        if ($originalTitle === null) {
+            return null;
+        }
+
+        $curTitle = $originalTitle;
+        $titleHadHierarchicalSeparators = false;
+
+        /*
+         * If there's a separator in the title, first remove the final part
+         *
+         * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false
+         * I can assure you it works properly if you let the code run.
+         */
+        if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) {
+            $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle);
+            $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
+
+            // If the resulting title is too short (3 words or fewer), remove
+            // the first part instead:
+            if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
+            }
+        } elseif (strpos($curTitle, ': ') !== false) {
+            // Check if we have an heading containing this exact string, so we
+            // could assume it's the full title.
+            $match = false;
+            for ($i = 1; $i <= 2; $i++) {
+                foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
+                    // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
+                    if (trim($hTag->nodeValue) === trim($curTitle)) {
+                        $match = true;
+                    }
+                }
+            }
+
+            // If we don't, let's extract the title out of the original title string.
+            if (!$match) {
+                $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
+
+                // If the title is now too short, try the first colon instead:
+                if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                    $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+                }
+            }
+        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+
+            if ($hOnes->length === 1) {
+                $curTitle = $hOnes->item(0)->nodeValue;
+            }
+        }
+
+        $curTitle = trim($curTitle);
+
+        /*
+         * If we now have 4 words or fewer as our title, and either no
+         * 'hierarchical' separators (\, /, > or ») were found in the original
+         * title or we decreased the number of words by more than 1 word, use
+         * the original title.
+         */
+        $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
+        $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/', '', $originalTitle))) - 1;
+
+        if ($curTitleWordCount <= 4 &&
+            (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
+            $curTitle = $originalTitle;
+        }
+
+        return $curTitle;
+    }
+
+
     private function toAbsoluteURI($uri)
     {
         list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());
author	Andres Rey <[email protected]>	2017-11-29 20:31:28 +0000
committer	Andres Rey <[email protected]>	2017-11-29 20:31:28 +0000
commit	dd8f887f835016a28a55f5e7f18a9a72d3c10778 (patch)
tree	72b04cc355376ff909237fb412c6d3dff0617818 /src
parent	d4c22a6625fbcfdfa9d3a4c7cf65561181e07b4c (diff)