summary | refs | log | tree | commit | diff
path: root/src/Readability.php
diff options
context:
space:
mode:
author: FiveFilters.org <[email protected]> 2021-08-25 17:13:28 +0200
committer: FiveFilters.org <[email protected]> 2021-08-25 17:13:28 +0200
commit: 9f43d27942f91be48ca7180d874af22a29d6febb (patch)
tree: 755ea7f82951bd83508b7be72e9158d63ab8348d /src/Readability.php
parent: 346bbf878d8d982b9b097fffaed711d669e6c8b2 (diff)
Separating title-like and low-scoring headers
https://github.com/mozilla/readability/commit/2e620c232ec5b189207da5b7123470e8c8872f54
Diffstat (limited to 'src/Readability.php')
-rw-r--r-- src/Readability.php 30
1 files changed, 28 insertions, 2 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 9b277db..cf2faaf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -188,6 +188,7 @@ class Readability
$this->getMainImage();
while (true) {
+ $this->logger->debug('Starting parse loop');
$root = $root->firstChild;
$elementsToScore = $this->getNodes($root);
@@ -853,6 +854,8 @@ class Readability
$elementsToScore = [];
+ $shouldRemoveTitleHeader = true;
+
/*
* First, node prepping. Trash nodes that look cruddy (like ones with the
* class name "comment", etc), and turn divs into P tags where they have been
@@ -882,6 +885,13 @@ class Readability
continue;
}
+ if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) {
+ $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent()));
+ $shouldRemoveTitleHeader = false;
+ $node = NodeUtility::removeAndGetNext($node);
+ continue;
+ }
+
// Remove unlikely candidates
if ($stripUnlikelyCandidates) {
if (
@@ -2106,9 +2116,9 @@ class Readability
if ($this->configuration->getWeightClasses()) {
$weight = $header->getClassWeight();
}
- $heading = $header->getTextContent(false);
+ $shouldRemove = $weight < 0;
- if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) {
+ if ($shouldRemove) {
$this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
NodeUtility::removeNode($header);
@@ -2117,6 +2127,22 @@ class Readability
}
/**
+ * Check if this node is an H1 or H2 element whose content is mostly
+ * the same as the article title.
+ *
+ * @param DOMNode $node the node to check.
+ * @return bool indicating whether this is a title-like header.
+ */
+ private function headerDuplicatesTitle($node) {
+ if ($node->nodeName !== 'h1' && $node->nodeName !== 'h2') {
+ return false;
+ }
+ $heading = $node->getTextContent(false);
+ $this->logger->debug(sprintf('Evaluating similarity of header: %s', $heading));
+ return $this->textSimilarity($this->title, $heading) > 0.75;
+ }
+
+ /**
* Removes the class="" attribute from every element in the given
* subtree.
*