summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-29 20:31:28 +0000
committerAndres Rey <[email protected]>2017-11-29 20:31:28 +0000
commitdd8f887f835016a28a55f5e7f18a9a72d3c10778 (patch)
tree72b04cc355376ff909237fb412c6d3dff0617818 /src
parentd4c22a6625fbcfdfa9d3a4c7cf65561181e07b4c (diff)
Import getArticleTitle function (originally "getTitle")
Diffstat (limited to 'src')
-rw-r--r--src/Readability.php92
1 files changed, 91 insertions, 1 deletions
diff --git a/src/Readability.php b/src/Readability.php
index abeffbb..f3eac77 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -267,7 +267,7 @@ class Readability
$metadata['excerpt'] = $values['twitter:description'];
}
- $metadata['title'] = $this->getTitle();
+ $metadata['title'] = $this->getArticleTitle();
if (!$metadata['title']) {
if (array_key_exists('og:title', $values)) {
@@ -354,6 +354,96 @@ class Readability
return $imgUrl;
}
+ /**
+ * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
+ *
+ * @return string|null
+ */
+ private function getArticleTitle()
+ {
+ $originalTitle = null;
+
+ if (isset($this->metadata['title'])) {
+ $originalTitle = $this->metadata['title'];
+ } else {
+ $titleTag = $this->dom->getElementsByTagName('title');
+ if ($titleTag->length > 0) {
+ $originalTitle = $titleTag->item(0)->nodeValue;
+ }
+ }
+
+ if ($originalTitle === null) {
+ return null;
+ }
+
+ $curTitle = $originalTitle;
+ $titleHadHierarchicalSeparators = false;
+
+ /*
+ * If there's a separator in the title, first remove the final part
+ *
+ * Sanity warning: if you eval this match in PHPStorm's "Evaluate expression" box, it will return false
+ * I can assure you it works properly if you let the code run.
+ */
+ if (preg_match('/ [\|\-\\\\\/>»] /i', $curTitle)) {
+ $titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle);
+ $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
+
+ // If the resulting title is too short (3 words or fewer), remove
+ // the first part instead:
+ if (count(preg_split('/\s+/', $curTitle)) < 3) {
+ $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
+ }
+ } elseif (strpos($curTitle, ': ') !== false) {
+ // Check if we have an heading containing this exact string, so we
+ // could assume it's the full title.
+ $match = false;
+ for ($i = 1; $i <= 2; $i++) {
+ foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
+ // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
+ if (trim($hTag->nodeValue) === trim($curTitle)) {
+ $match = true;
+ }
+ }
+ }
+
+ // If we don't, let's extract the title out of the original title string.
+ if (!$match) {
+ $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
+
+ // If the title is now too short, try the first colon instead:
+ if (count(preg_split('/\s+/', $curTitle)) < 3) {
+ $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
+ }
+ }
+ } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
+ $hOnes = $this->dom->getElementsByTagName('h1');
+
+ if ($hOnes->length === 1) {
+ $curTitle = $hOnes->item(0)->nodeValue;
+ }
+ }
+
+ $curTitle = trim($curTitle);
+
+ /*
+ * If we now have 4 words or fewer as our title, and either no
+ * 'hierarchical' separators (\, /, > or ») were found in the original
+ * title or we decreased the number of words by more than 1 word, use
+ * the original title.
+ */
+ $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
+ $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/', '', $originalTitle))) - 1;
+
+ if ($curTitleWordCount <= 4 &&
+ (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
+ $curTitle = $originalTitle;
+ }
+
+ return $curTitle;
+ }
+
+
private function toAbsoluteURI($uri)
{
list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());