From 9f43d27942f91be48ca7180d874af22a29d6febb Mon Sep 17 00:00:00 2001 From: "FiveFilters.org" Date: Wed, 25 Aug 2021 17:13:28 +0200 Subject: Separating title-like and low-scoring headers https://github.com/mozilla/readability/commit/2e620c232ec5b189207da5b7123470e8c8872f54 --- src/Readability.php | 30 ++++++++++++++++++++++++++++-- test/test-pages/v8-blog/expected.html | 4 +++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 9b277db..cf2faaf 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -188,6 +188,7 @@ class Readability $this->getMainImage(); while (true) { + $this->logger->debug('Starting parse loop'); $root = $root->firstChild; $elementsToScore = $this->getNodes($root); @@ -853,6 +854,8 @@ class Readability $elementsToScore = []; + $shouldRemoveTitleHeader = true; + /* * First, node prepping. Trash nodes that look cruddy (like ones with the * class name "comment", etc), and turn divs into P tags where they have been @@ -882,6 +885,13 @@ class Readability continue; } + if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) { + $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent())); + $shouldRemoveTitleHeader = false; + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Remove unlikely candidates if ($stripUnlikelyCandidates) { if ( @@ -2106,9 +2116,9 @@ class Readability if ($this->configuration->getWeightClasses()) { $weight = $header->getClassWeight(); } - $heading = $header->getTextContent(false); + $shouldRemove = $weight < 0; - if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) { + if ($shouldRemove) { $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128))); NodeUtility::removeNode($header); @@ -2116,6 +2126,22 @@ class Readability } } + /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param DOMNode the node to check. + * @return boolean indicating whether this is a title-like header. + */ + private function headerDuplicatesTitle($node) { + if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') { + return false; + } + $heading = $node->getTextContent(false); + $this->logger->debug(sprintf('Evaluating similarity of header: %s"', $heading)); + return $this->textSimilarity($this->title, $heading) > 0.75; + } + /** * Removes the class="" attribute from every element in the given * subtree. diff --git a/test/test-pages/v8-blog/expected.html b/test/test-pages/v8-blog/expected.html index 76a479d..ba11321 100644 --- a/test/test-pages/v8-blog/expected.html +++ b/test/test-pages/v8-blog/expected.html @@ -2,7 +2,9 @@

Emscripten has always focused first and foremost on compiling to the Web and other JavaScript environments like Node.js. But as WebAssembly starts to be used without JavaScript, new use cases are appearing, and so we've been working on support for emitting standalone Wasm files from Emscripten, that do not depend on the Emscripten JS runtime! This post explains why that's interesting.

- +

+ Using standalone mode in Emscripten # +

First, let's see what you can do with this new feature! Similar to this post let's start with a "hello world" type program that exports a single function that adds two numbers:

-- cgit v1.2.3