From 9f43d27942f91be48ca7180d874af22a29d6febb Mon Sep 17 00:00:00 2001 From: "FiveFilters.org" Date: Wed, 25 Aug 2021 17:13:28 +0200 Subject: Separating title-like and low-scoring headers https://github.com/mozilla/readability/commit/2e620c232ec5b189207da5b7123470e8c8872f54 --- src/Readability.php | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'src/Readability.php') diff --git a/src/Readability.php b/src/Readability.php index 9b277db..cf2faaf 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -188,6 +188,7 @@ class Readability $this->getMainImage(); while (true) { + $this->logger->debug('Starting parse loop'); $root = $root->firstChild; $elementsToScore = $this->getNodes($root); @@ -853,6 +854,8 @@ class Readability $elementsToScore = []; + $shouldRemoveTitleHeader = true; + /* * First, node prepping. Trash nodes that look cruddy (like ones with the * class name "comment", etc), and turn divs into P tags where they have been @@ -882,6 +885,13 @@ class Readability continue; } + if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) { + $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent())); + $shouldRemoveTitleHeader = false; + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Remove unlikely candidates if ($stripUnlikelyCandidates) { if ( @@ -2106,9 +2116,9 @@ class Readability if ($this->configuration->getWeightClasses()) { $weight = $header->getClassWeight(); } - $heading = $header->getTextContent(false); + $shouldRemove = $weight < 0; - if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) { + if ($shouldRemove) { $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. 
Content was: \'%s\'', substr($header->nodeValue, 0, 128))); NodeUtility::removeNode($header); @@ -2116,6 +2126,22 @@ class Readability } } + /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param DOMNode $node the node to check. + * @return bool indicating whether this is a title-like header (similarity to the title above 0.75). + */ + private function headerDuplicatesTitle($node) { + if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') { + return false; + } + $heading = $node->getTextContent(false); + $this->logger->debug(sprintf('Evaluating similarity of header: %s', $heading)); + return $this->textSimilarity($this->title, $heading) > 0.75; + } + /** * Removes the class="" attribute from every element in the given * subtree. -- cgit v1.2.3