diff options
author | Andres Rey <[email protected]> | 2017-11-29 20:31:28 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-11-29 20:31:28 +0000 |
commit | dd8f887f835016a28a55f5e7f18a9a72d3c10778 (patch) | |
tree | 72b04cc355376ff909237fb412c6d3dff0617818 /src | |
parent | d4c22a6625fbcfdfa9d3a4c7cf65561181e07b4c (diff) |
Import getArticleTitle function (originally "getTitle")
Diffstat (limited to 'src')
-rw-r--r-- | src/Readability.php | 92 |
1 files changed, 91 insertions, 1 deletions
diff --git a/src/Readability.php b/src/Readability.php index abeffbb..f3eac77 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -267,7 +267,7 @@ class Readability $metadata['excerpt'] = $values['twitter:description']; } - $metadata['title'] = $this->getTitle(); + $metadata['title'] = $this->getArticleTitle(); if (!$metadata['title']) { if (array_key_exists('og:title', $values)) { @@ -354,6 +354,96 @@ class Readability return $imgUrl; } + /** + * Returns the title of the html. Prioritizes the title from the metadata against the title tag. + * + * @return string|null + */ + private function getArticleTitle() + { + $originalTitle = null; + + if (isset($this->metadata['title'])) { + $originalTitle = $this->metadata['title']; + } else { + $titleTag = $this->dom->getElementsByTagName('title'); + if ($titleTag->length > 0) { + $originalTitle = $titleTag->item(0)->nodeValue; + } + } + + if ($originalTitle === null) { + return null; + } + + $curTitle = $originalTitle; + $titleHadHierarchicalSeparators = false; + + /* + * If there's a separator in the title, first remove the final part + * + * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false + * I can assure you it works properly if you let the code run. + */ + if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) { + $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle); + $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle); + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (count(preg_split('/\s+/', $curTitle)) < 3) { + $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle); + } + } elseif (strpos($curTitle, ': ') !== false) { + // Check if we have an heading containing this exact string, so we + // could assume it's the full title. + $match = false; + for ($i = 1; $i <= 2; $i++) { + foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) { + // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs + if (trim($hTag->nodeValue) === trim($curTitle)) { + $match = true; + } + } + } + + // If we don't, let's extract the title out of the original title string. + if (!$match) { + $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1); + + // If the title is now too short, try the first colon instead: + if (count(preg_split('/\s+/', $curTitle)) < 3) { + $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1); + } + } + } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { + $hOnes = $this->dom->getElementsByTagName('h1'); + + if ($hOnes->length === 1) { + $curTitle = $hOnes->item(0)->nodeValue; + } + } + + $curTitle = trim($curTitle); + + /* + * If we now have 4 words or fewer as our title, and either no + * 'hierarchical' separators (\, /, > or ») were found in the original + * title or we decreased the number of words by more than 1 word, use + * the original title. + */ + $curTitleWordCount = count(preg_split('/\s+/', $curTitle)); + $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/', '', $originalTitle))) - 1; + + if ($curTitleWordCount <= 4 && + (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) { + $curTitle = $originalTitle; + } + + return $curTitle; + } + + private function toAbsoluteURI($uri) { list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL()); |