diff options
author | Andres Rey <[email protected]> | 2018-03-10 18:43:31 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2018-03-10 18:43:31 +0000 |
commit | d6615c87857eed1d486c0f90c1a9d909ff262d13 (patch) | |
tree | 25e1232183fe22dfc9683a0486db87a58b8e3071 /src | |
parent | 746dd0bcf5f3b0e685d842252c620c01faff19b9 (diff) |
Failsafe for weird titles
Diffstat (limited to 'src')
-rw-r--r-- | src/Readability.php | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/src/Readability.php b/src/Readability.php
index 9a29313..c17911c 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -468,6 +468,10 @@ class Readability
                 if (count(preg_split('/\s+/', $curTitle)) < 3) {
                     $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                     $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
+                } else if (count(preg_split('/\s+/', substr($curTitle, 0, strpos($curTitle, ':')))) > 5) {
+                    // But if we have too many words before the colon there's something weird
+                    // with the titles and the H tags so let's just use the original title instead
+                    $curTitle = $originalTitle;
                 }
             }
         } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
@@ -1485,7 +1489,7 @@ class Readability
      *
      * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
      * here so no need to filter those.
-     * 
+     *
      * @param DOMDocument|DOMNode $node
      *
      * @return void
```