diff options
author | FiveFilters.org <[email protected]> | 2021-08-25 17:13:28 +0200 |
---|---|---|
committer | FiveFilters.org <[email protected]> | 2021-08-25 17:13:28 +0200 |
commit | 9f43d27942f91be48ca7180d874af22a29d6febb (patch) | |
tree | 755ea7f82951bd83508b7be72e9158d63ab8348d /src/Readability.php | |
parent | 346bbf878d8d982b9b097fffaed711d669e6c8b2 (diff) |
Separating title-like and low-scoring headers
https://github.com/mozilla/readability/commit/2e620c232ec5b189207da5b7123470e8c8872f54
Diffstat (limited to 'src/Readability.php')
-rw-r--r-- | src/Readability.php | 30 |
1 file changed, 28 insertions, 2 deletions
```diff
diff --git a/src/Readability.php b/src/Readability.php
index 9b277db..cf2faaf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -188,6 +188,7 @@ class Readability
         $this->getMainImage();
 
         while (true) {
+            $this->logger->debug('Starting parse loop');
             $root = $root->firstChild;
 
             $elementsToScore = $this->getNodes($root);
@@ -853,6 +854,8 @@ class Readability
 
         $elementsToScore = [];
 
+        $shouldRemoveTitleHeader = true;
+
         /*
          * First, node prepping. Trash nodes that look cruddy (like ones with the
          * class name "comment", etc), and turn divs into P tags where they have been
@@ -882,6 +885,13 @@ class Readability
                 continue;
             }
 
+            if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) {
+                $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent()));
+                $shouldRemoveTitleHeader = false;
+                $node = NodeUtility::removeAndGetNext($node);
+                continue;
+            }
+
             // Remove unlikely candidates
             if ($stripUnlikelyCandidates) {
                 if (
@@ -2106,9 +2116,9 @@ class Readability
             if ($this->configuration->getWeightClasses()) {
                 $weight = $header->getClassWeight();
             }
-            $heading = $header->getTextContent(false);
+            $shouldRemove = $weight < 0;
 
-            if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) {
+            if ($shouldRemove) {
                 $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
 
                 NodeUtility::removeNode($header);
@@ -2117,6 +2127,22 @@ class Readability
     }
 
     /**
+     * Check if this node is an H1 or H2 element whose content is mostly
+     * the same as the article title.
+     *
+     * @param DOMNode the node to check.
+     * @return boolean indicating whether this is a title-like header.
+     */
+    private function headerDuplicatesTitle($node) {
+        if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') {
+            return false;
+        }
+        $heading = $node->getTextContent(false);
+        $this->logger->debug(sprintf('Evaluating similarity of header: %s"', $heading));
+        return $this->textSimilarity($this->title, $heading) > 0.75;
+    }
+
+    /**
      * Removes the class="" attribute from every element in the given
      * subtree.
      *
```