summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-10-19 16:30:59 +0100
committerAndres Rey <[email protected]>2016-10-19 16:30:59 +0100
commitc34c30605a37514550e747d5d48187a160259036 (patch)
treefab23444361292e2754868337f88509ae4293e8f /src
parentcc4a384a0c9b6929388a56de21e0bd8f5958dadc (diff)
Progress over content scoring
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php57
-rw-r--r--src/Readability.php40
-rw-r--r--src/ReadabilityInterface.php9
3 files changed, 95 insertions, 11 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index c396043..41dab21 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -146,6 +146,22 @@ class HTMLParser
}
/**
+ * Get the density of links as a fraction of the content (0 = no link text, 1 = all link text).
+ * This is the amount of text that is inside a link divided by the total text in the node.
+ *
+ * @param Readability $readability
+ *
+ * @return int
+ */
+ public function getLinkDensity($readability)
+ {
+ $text = $readability->getTextContent();
+
+
+ return 1;
+ }
+
+ /**
* Returns the title of the html. Prioritizes the title from the metadata against the title tag.
*
* @return string|null
@@ -235,9 +251,46 @@ class HTMLParser
$contentScore += min(floor(strlen($node->getValue()) / 100), 3);
// Initialize and score ancestors.
- foreach ($ancestors as $ancestor) {
+ foreach ($ancestors as $level => $ancestor) {
$readability = new Readability($ancestor);
- $candidates[] = $readability->initializeNode();
+ $readability = $readability->initializeNode();
+
+ /*
+ * Node score divider:
+ * - parent: 1 (no division)
+ * - grandparent: 2
+ * - great grandparent+: ancestor level * 3
+ */
+
+ if ($level === 0) {
+ $scoreDivider = 1;
+ } else if ($level === 1) {
+ $scoreDivider = 2;
+ } else {
+ $scoreDivider = $level * 3;
+ }
+
+ $currentScore = $readability->getContentScore();
+ $readability->setContentScore($currentScore + ($contentScore / $scoreDivider));
+
+ $candidates[] = $readability;
+ }
+
+ /*
+ * After we've calculated scores, loop through all of the possible
+ * candidate nodes we found and find the one with the highest score.
+ */
+
+ $topCandidates = [];
+ foreach ($candidates as $candidate) {
+ /*
+ * Scale the final candidates score based on link density. Good content
+ * should have a relatively small link density (5% or less) and be mostly
+ * unaffected by this operation.
+ */
+
+ $candidate->setContentScore($candidate->getContentScore() * (1 - $this->getLinkDensity($candidate)));
+
}
}
}
diff --git a/src/Readability.php b/src/Readability.php
index bab7c42..1141a29 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -7,7 +7,7 @@ class Readability implements ReadabilityInterface
/**
* @var int
*/
- protected $score = 0;
+ protected $contentScore = 0;
/**
* @var null
@@ -41,13 +41,13 @@ class Readability implements ReadabilityInterface
{
switch ($this->node->getTagName()) {
case 'div':
- $this->score += 5;
+ $this->contentScore += 5;
break;
case 'pre':
case 'td':
case 'blockquote':
- $this->score += 3;
+ $this->contentScore += 3;
break;
case 'address':
@@ -58,7 +58,7 @@ class Readability implements ReadabilityInterface
case 'dt':
case 'li':
case 'form':
- $this->score -= 3;
+ $this->contentScore -= 3;
break;
case 'h1':
@@ -68,11 +68,11 @@ class Readability implements ReadabilityInterface
case 'h5':
case 'h6':
case 'th':
- $this->score -= 5;
+ $this->contentScore -= 5;
break;
}
- $this->score += $this->getClassWeight();
+ $this->contentScore += $this->getClassWeight();
return $this;
}
@@ -122,8 +122,32 @@ class Readability implements ReadabilityInterface
*
* @return int
*/
- public function getScore()
+ public function getContentScore()
{
- return $this->score;
+ return $this->contentScore;
+ }
+
+ /**
+ * Sets the content score of the Readability object and returns the newly stored value.
+ *
+ * @param int $score
+ *
+ * @return int
+ */
+ public function setContentScore($score)
+ {
+ $this->contentScore = $score;
+
+ return $this->contentScore;
+ }
+
+ /**
+ * Returns the full text of the node.
+ *
+ * @return string
+ */
+ public function getTextContent()
+ {
+ return $this->node->getChildrenAsString();
}
}
diff --git a/src/ReadabilityInterface.php b/src/ReadabilityInterface.php
index 21b78e5..21048eb 100644
--- a/src/ReadabilityInterface.php
+++ b/src/ReadabilityInterface.php
@@ -12,7 +12,7 @@ interface ReadabilityInterface
/**
* @return int
*/
- public function getScore();
+ public function getContentScore();
/**
* @return Readability
@@ -23,4 +23,11 @@ interface ReadabilityInterface
* @return int
*/
public function getClassWeight();
+
+ /**
+ * @param int $score
+ *
+ * @return int
+ */
+ public function setContentScore($score);
}