diff options
-rw-r--r-- | src/Readability.php | 30 | ||||
-rw-r--r-- | test/test-pages/v8-blog/expected.html | 4 |
2 files changed, 31 insertions, 3 deletions
diff --git a/src/Readability.php b/src/Readability.php index 9b277db..cf2faaf 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -188,6 +188,7 @@ class Readability $this->getMainImage(); while (true) { + $this->logger->debug('Starting parse loop'); $root = $root->firstChild; $elementsToScore = $this->getNodes($root); @@ -853,6 +854,8 @@ class Readability $elementsToScore = []; + $shouldRemoveTitleHeader = true; + /* * First, node prepping. Trash nodes that look cruddy (like ones with the * class name "comment", etc), and turn divs into P tags where they have been @@ -882,6 +885,13 @@ class Readability continue; } + if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) { + $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent())); + $shouldRemoveTitleHeader = false; + $node = NodeUtility::removeAndGetNext($node); + continue; + } + // Remove unlikely candidates if ($stripUnlikelyCandidates) { if ( @@ -2106,9 +2116,9 @@ class Readability if ($this->configuration->getWeightClasses()) { $weight = $header->getClassWeight(); } - $heading = $header->getTextContent(false); + $shouldRemove = $weight < 0; - if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) { + if ($shouldRemove) { $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128))); NodeUtility::removeNode($header); @@ -2117,6 +2127,22 @@ class Readability } /** + * Check if this node is an H1 or H2 element whose content is mostly + * the same as the article title. + * + * @param DOMNode the node to check. + * @return boolean indicating whether this is a title-like header. + */ + private function headerDuplicatesTitle($node) { + if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') { + return false; + } + $heading = $node->getTextContent(false); + $this->logger->debug(sprintf('Evaluating similarity of header: %s"', $heading)); + return $this->textSimilarity($this->title, $heading) > 0.75; + } + + /** * Removes the class="" attribute from every element in the given * subtree. * diff --git a/test/test-pages/v8-blog/expected.html b/test/test-pages/v8-blog/expected.html index 76a479d..ba11321 100644 --- a/test/test-pages/v8-blog/expected.html +++ b/test/test-pages/v8-blog/expected.html @@ -2,7 +2,9 @@ <p> Emscripten has always focused first and foremost on compiling to the Web and other JavaScript environments like Node.js. But as WebAssembly starts to be used <em>without</em> JavaScript, new use cases are appearing, and so we've been working on support for emitting <a href="https://github.com/emscripten-core/emscripten/wiki/WebAssembly-Standalone"><strong>standalone Wasm</strong></a> files from Emscripten, that do not depend on the Emscripten JS runtime! This post explains why that's interesting. </p> - + <h2 id="using-standalone-mode-in-emscripten"> + Using standalone mode in Emscripten <a href="#using-standalone-mode-in-emscripten">#</a> + </h2> <p> First, let's see what you can do with this new feature! Similar to <a href="https://hacks.mozilla.org/2018/01/shrinking-webassembly-and-javascript-code-sizes-in-emscripten/">this post</a> let's start with a "hello world" type program that exports a single function that adds two numbers: </p> |