summaryrefslogtreecommitdiff
path: root/src/Readability.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Readability.php')
-rw-r--r--src/Readability.php57
1 files changed, 51 insertions, 6 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 8e22f6f..e80386e 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -970,6 +970,30 @@ class Readability
}
/**
+ * Measures how much of $textB is also present in $textA, word by word.
+ *
+ * Both texts are lower-cased and split into tokens with the 'tokenize' regexp. The tokens of
+ * $textB that never occur in $textA are joined with single spaces; the length of that string
+ * relative to the length of all tokens of $textB joined the same way is the distance.
+ * The result is 1 minus that distance: 1 = every word of $textB occurs in $textA,
+ * 0 = none does, or one of the texts yields no tokens at all.
+ *
+ * @param string $textA Reference text
+ * @param string $textB Text compared against the reference
+ *
+ * @return float|int Similarity between 0 (completely different text) and 1 (same text)
+ */
+ private function textSimilarity(string $textA, string $textB) {
+ // PREG_SPLIT_NO_EMPTY discards empty pieces only; a bare array_filter() would also discard the token "0".
+ $tokensA = preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textA), -1, PREG_SPLIT_NO_EMPTY);
+ $tokensB = preg_split(NodeUtility::$regexps['tokenize'], mb_strtolower($textB), -1, PREG_SPLIT_NO_EMPTY);
+ // Covers both an empty token list and a false result from a failed preg_split().
+ if (!$tokensA || !$tokensB) {
+ return 0;
+ }
+ $uniqTokensB = array_filter($tokensB, function ($token) use ($tokensA) {
+ // Strict comparison keeps numeric-looking tokens such as "1e3" and "1000" distinct.
+ return !in_array($token, $tokensA, true);
+ });
+ // The separator must come first: implode(array, string) is rejected since PHP 8.0.
+ $distanceB = mb_strlen(implode(' ', $uniqTokensB)) / mb_strlen(implode(' ', $tokensB));
+ return 1 - $distanceB;
+ }
+
+ /**
* Checks if the node is a byline.
*
* @param DOMNode $node
@@ -1571,7 +1595,6 @@ class Readability
$this->_cleanConditionally($article, 'fieldset');
$this->_clean($article, 'object');
$this->_clean($article, 'embed');
- $this->_clean($article, 'h1');
$this->_clean($article, 'footer');
$this->_clean($article, 'link');
$this->_clean($article, 'aside');
@@ -1592,6 +1615,7 @@ class Readability
* they are probably using it as a header and not a subheader,
* so remove it since we already extract the title separately.
*/
+ /*
$h2 = $article->getElementsByTagName('h2');
if ($h2->length === 1) {
$lengthSimilarRate = (mb_strlen($h2->item(0)->textContent) - mb_strlen($this->getTitle())) / max(mb_strlen($this->getTitle()), 1);
@@ -1608,6 +1632,7 @@ class Readability
}
}
}
+ */
$this->_clean($article, 'iframe');
$this->_clean($article, 'input');
@@ -1622,6 +1647,11 @@ class Readability
$this->_cleanConditionally($article, 'ul');
$this->_cleanConditionally($article, 'div');
+ // Replace every H1 with an H2: the only H1 should be the title, which is displayed separately
+ foreach (iterator_to_array($article->getElementsByTagName('h1')) as $h1) {
+ NodeUtility::setNodeTag($h1, 'h2');
+ }
+
$this->_cleanExtraParagraphs($article);
foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
@@ -1871,6 +1901,19 @@ class Readability
}
}
+ /**
+ * Ratio between the text length of the nodes that _getAllNodesWithTag() returns
+ * for the given tags and the text length of the element itself.
+ *
+ * @param mixed $e Element to measure; must provide getTextContent()
+ * @param array $tags Tag names whose text is counted
+ *
+ * @return float|int 0 when the element has no text, otherwise matched length / total length
+ */
+ private function getTextDensity($e, array $tags) {
+ $totalLength = mb_strlen($e->getTextContent(true));
+ if ($totalLength === 0) {
+ // Nothing to divide by: an element without text has no density.
+ return 0;
+ }
+ $taggedLength = 0;
+ foreach ($this->_getAllNodesWithTag($e, $tags) as $taggedNode) {
+ $taggedLength += mb_strlen($taggedNode->getTextContent(true));
+ }
+ return $taggedLength / $totalLength;
+ }
+
/**
* @param DOMDocument $article
* @param string $tag Tag to clean conditionally
@@ -1953,6 +1996,7 @@ class Readability
$img = $node->getElementsByTagName('img')->length;
$li = $node->getElementsByTagName('li')->length - 100;
$input = $node->getElementsByTagName('input')->length;
+ $headingDensity = $this->getTextDensity($node, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);
$embedCount = 0;
$embeds = $this->_getAllNodesWithTag($node, ['object', 'embed', 'iframe']);
@@ -1979,7 +2023,7 @@ class Readability
($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) ||
(!$isList && $li > $p) ||
($input > floor($p / 3)) ||
- (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
+ (!$isList && $headingDensity < 0.9 && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
(!$isList && $weight < 25 && $linkDensity > 0.2) ||
($weight >= 25 && $linkDensity > 0.5) ||
(($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
@@ -2047,7 +2091,7 @@ class Readability
}
/**
- * Clean out spurious headers from an Element. Checks things like classnames and link density.
+ * Clean out spurious headers from an Element.
*
* @param DOMDocument $article
*
@@ -2055,15 +2099,16 @@ class Readability
**/
public function _cleanHeaders(DOMDocument $article)
{
- $headers = $this->_getAllNodesWithTag($article, ['h1', 'h2']);
+ $headingNodes = $this->_getAllNodesWithTag($article, ['h1', 'h2']);
/** @var $header DOMElement */
- foreach ($headers as $header) {
+ foreach ($headingNodes as $header) {
$weight = 0;
if ($this->configuration->getWeightClasses()) {
$weight = $header->getClassWeight();
}
+ $heading = $header->getTextContent(false);
- if ($weight < 0) {
+ if (($this->textSimilarity($this->title, $heading) > 0.75) || $weight < 0) {
$this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
NodeUtility::removeNode($header);