From d6615c87857eed1d486c0f90c1a9d909ff262d13 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sat, 10 Mar 2018 18:43:31 +0000 Subject: Failsafe for weird titles --- src/Readability.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/Readability.php b/src/Readability.php index 9a29313..c17911c 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -468,6 +468,10 @@ class Readability if (count(preg_split('/\s+/', $curTitle)) < 3) { $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1); $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle)); + } else if (count(preg_split('/\s+/', substr($curTitle, 0, strpos($curTitle, ':')))) > 5) { + // But if we have too many words before the colon there's something weird + // with the titles and the H tags so let's just use the original title instead + $curTitle = $originalTitle; } } } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) { @@ -1485,7 +1489,7 @@ class Readability * * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes * here so no need to filter those. - * + * * @param DOMDocument|DOMNode $node * * @return void -- cgit v1.2.3