summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFiveFilters.org <[email protected]>2021-08-25 17:13:28 +0200
committerFiveFilters.org <[email protected]>2021-08-25 17:13:28 +0200
commit9f43d27942f91be48ca7180d874af22a29d6febb (patch)
tree755ea7f82951bd83508b7be72e9158d63ab8348d
parent346bbf878d8d982b9b097fffaed711d669e6c8b2 (diff)
Separating title-like and low-scoring headers
https://github.com/mozilla/readability/commit/2e620c232ec5b189207da5b7123470e8c8872f54
-rw-r--r--src/Readability.php30
-rw-r--r--test/test-pages/v8-blog/expected.html4
2 files changed, 31 insertions, 3 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 9b277db..cf2faaf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -188,6 +188,7 @@ class Readability
$this->getMainImage();
while (true) {
+ $this->logger->debug('Starting parse loop');
$root = $root->firstChild;
$elementsToScore = $this->getNodes($root);
@@ -853,6 +854,8 @@ class Readability
$elementsToScore = [];
+ $shouldRemoveTitleHeader = true;
+
/*
* First, node prepping. Trash nodes that look cruddy (like ones with the
* class name "comment", etc), and turn divs into P tags where they have been
@@ -882,6 +885,13 @@ class Readability
continue;
}
+ if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) {
+ $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent()));
+ $shouldRemoveTitleHeader = false;
+ $node = NodeUtility::removeAndGetNext($node);
+ continue;
+ }
+
// Remove unlikely candidates
if ($stripUnlikelyCandidates) {
if (
@@ -2106,9 +2116,9 @@ class Readability
if ($this->configuration->getWeightClasses()) {
$weight = $header->getClassWeight();
}
- $heading = $header->getTextContent(false);
+ $shouldRemove = $weight < 0;
- if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) {
+ if ($shouldRemove) {
$this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
NodeUtility::removeNode($header);
@@ -2117,6 +2127,22 @@ class Readability
}
/**
+ * Check if this node is an H1 or H2 element whose text content is mostly
+ * the same as the article title (textSimilarity() score above 0.75).
+ *
+ * @param DOMNode $node The node to check.
+ * @return bool Whether this is a title-like header.
+ */
+ private function headerDuplicatesTitle($node) {
+ if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') {
+ return false;
+ }
+ $heading = $node->getTextContent(false);
+ $this->logger->debug(sprintf('Evaluating similarity of header: %s', $heading));
+ return $this->textSimilarity($this->title, $heading) > 0.75;
+ }
+
+ /**
* Removes the class="" attribute from every element in the given
* subtree.
*
diff --git a/test/test-pages/v8-blog/expected.html b/test/test-pages/v8-blog/expected.html
index 76a479d..ba11321 100644
--- a/test/test-pages/v8-blog/expected.html
+++ b/test/test-pages/v8-blog/expected.html
@@ -2,7 +2,9 @@
<p>
Emscripten has always focused first and foremost on compiling to the Web and other JavaScript environments like Node.js. But as WebAssembly starts to be used <em>without</em> JavaScript, new use cases are appearing, and so we've been working on support for emitting <a href="https://github.com/emscripten-core/emscripten/wiki/WebAssembly-Standalone"><strong>standalone Wasm</strong></a> files from Emscripten, that do not depend on the Emscripten JS runtime! This post explains why that's interesting.
</p>
-
+ <h2 id="using-standalone-mode-in-emscripten">
+ Using standalone mode in Emscripten <a href="#using-standalone-mode-in-emscripten">#</a>
+ </h2>
<p>
First, let's see what you can do with this new feature! Similar to <a href="https://hacks.mozilla.org/2018/01/shrinking-webassembly-and-javascript-code-sizes-in-emscripten/">this post</a> let's start with a "hello world" type program that exports a single function that adds two numbers:
</p>