From 34a1349b2df4938c086c98a1efd856e34465b573 Mon Sep 17 00:00:00 2001
From: Andres Rey <andreskrey@gmail.com>
Date: Sat, 4 Nov 2017 17:51:36 +0000
Subject: Add hierarchical separators detection on titles

---
 README.md          |  2 +-
 src/HTMLParser.php | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 74 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 8da8a17..8de2711 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ Readability parses all the text with DOMDocument, scans the text nodes and gives
 
 ## Code porting
 
-Current version follows the latest version of readability.js as of [05 May 2015](https://github.com/mozilla/readability/commit/f0edc77cb58ef52890e3065cf2b0e334d940feb2).
+Current version follows the latest version of readability.js as of [05 May 2017](https://github.com/mozilla/readability/commit/f0edc77cb58ef52890e3065cf2b0e334d940feb2).
  
 ### TO-DOs of the current port:
 
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 1a5f75e..2372f63 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -531,16 +531,83 @@ class HTMLParser
      */
     private function getTitle()
     {
+        $originalTitle = null;
+        // Titles are trimmed as they are read so the fallback to the original title is trimmed too.
         if (isset($this->metadata['title'])) {
-            return $this->metadata['title'];
+            $originalTitle = trim($this->metadata['title']);
+        } else {
+            $titleTag = $this->dom->getElementsByTagName('title');
+            if ($titleTag->length > 0) {
+                $originalTitle = trim($titleTag->item(0)->nodeValue);
+            }
         }
 
-        $title = $this->dom->getElementsByTagName('title');
-        if ($title->length > 0) {
-            return $title->item(0)->nodeValue;
+        if ($originalTitle === null) {
+            return null;
+        }
+
+        $curTitle = $originalTitle;
+        $titleHadHierarchicalSeparators = false;
+
+        /*
+         * If there's a separator in the title, first remove the final part.
+         *
+         * The "u" modifier matters: "»" is multi-byte in UTF-8, so in byte mode " » " never
+         * matches as a separator and the replacements below can split the character in half.
+         */
+        if (preg_match('/ [\|\-\\\\\/>»] /u', $curTitle)) {
+            $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /u', $curTitle);
+            $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/u', '$1', $originalTitle);
+
+            // If the resulting title is too short (fewer than 3 words), remove
+            // the first part instead:
+            if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/u', '$1', $originalTitle);
+            }
+        } elseif (strpos($curTitle, ': ') !== false) {
+            // Check if we have a heading (h1 or h2) containing this exact string, so we
+            // could assume it's the full title.
+            $match = false;
+            for ($i = 1; $i <= 2; $i++) {
+                foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
+                    if ($hTag->nodeValue === $curTitle) {
+                        $match = true;
+                    }
+                }
+            }
+
+            // If we don't, let's extract the title out of the original title string.
+            if (!$match) {
+                $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
+
+                // If the title is now too short (fewer than 3 words), try the first colon instead:
+                if (count(preg_split('/\s+/', $curTitle)) < 3) {
+                    $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+                }
+            }
+        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+            if ($hOnes->length === 1) {
+                $curTitle = $hOnes->item(0)->nodeValue;
+            }
+        }
+
+        $curTitle = trim($curTitle);
+
+        /*
+         * If we now have 4 words or fewer as our title, and either no
+         * 'hierarchical' separators (\, /, > or ») were found in the original
+         * title or we decreased the number of words by more than 1 word, use
+         * the original title.
+         */
+        $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
+
+        if ($curTitleWordCount <= 4 &&
+            (!$titleHadHierarchicalSeparators || $curTitleWordCount !== count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1)) {
+            $curTitle = $originalTitle;
         }
 
-        return null;
+        return $curTitle;
     }
 
     /**
@@ -1121,7 +1188,7 @@ class HTMLParser
      * Checks if the node is a byline.
      *
      * @param Readability $node
-     * @param string      $matchString
+     * @param string $matchString
      *
      * @return bool
      */
-- 
cgit v1.2.3