diff options
author | Andres Rey <[email protected]> | 2016-10-19 16:30:59 +0100 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-10-19 16:30:59 +0100 |
commit | c34c30605a37514550e747d5d48187a160259036 (patch) | |
tree | fab23444361292e2754868337f88509ae4293e8f /src | |
parent | cc4a384a0c9b6929388a56de21e0bd8f5958dadc (diff) |
Progress over content scoring
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 57 | ||||
-rw-r--r-- | src/Readability.php | 40 | ||||
-rw-r--r-- | src/ReadabilityInterface.php | 9 |
3 files changed, 95 insertions, 11 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index c396043..41dab21 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -146,6 +146,22 @@ class HTMLParser } /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * + * @param Readability $readability + * + * @return int + */ + public function getLinkDensity($readability) + { + $text = $readability->getTextContent(); + + + return 1; + } + + /** * Returns the title of the html. Prioritizes the title from the metadata against the title tag. * * @return string|null @@ -235,9 +251,46 @@ class HTMLParser $contentScore += min(floor(strlen($node->getValue()) / 100), 3); // Initialize and score ancestors. - foreach ($ancestors as $ancestor) { + foreach ($ancestors as $level => $ancestor) { $readability = new Readability($ancestor); - $candidates[] = $readability->initializeNode(); + $readability = $readability->initializeNode(); + + /* + * Node score divider: + * - parent: 1 (no division) + * - grandparent: 2 + * - great grandparent+: ancestor level * 3 + */ + + if ($level === 0) { + $scoreDivider = 1; + } else if ($level === 1) { + $scoreDivider = 2; + } else { + $scoreDivider = $level * 3; + } + + $currentScore = $readability->getContentScore(); + $readability->setContentScore($currentScore + ($contentScore / $scoreDivider)); + + $candidates[] = $readability; + } + + /* + * After we've calculated scores, loop through all of the possible + * candidate nodes we found and find the one with the highest score. + */ + + $topCandidates = []; + foreach ($candidates as $candidate) { + /* + * Scale the final candidates score based on link density. Good content + * should have a relatively small link density (5% or less) and be mostly + * unaffected by this operation. + */ + + $candidate->setContentScore($candidate->getContentScore() * (1 - $this->getLinkDensity($candidate))); + } } } diff --git a/src/Readability.php b/src/Readability.php index bab7c42..1141a29 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -7,7 +7,7 @@ class Readability implements ReadabilityInterface /** * @var int */ - protected $score = 0; + protected $contentScore = 0; /** * @var null @@ -41,13 +41,13 @@ class Readability implements ReadabilityInterface { switch ($this->node->getTagName()) { case 'div': - $this->score += 5; + $this->contentScore += 5; break; case 'pre': case 'td': case 'blockquote': - $this->score += 3; + $this->contentScore += 3; break; case 'address': @@ -58,7 +58,7 @@ class Readability implements ReadabilityInterface case 'dt': case 'li': case 'form': - $this->score -= 3; + $this->contentScore -= 3; break; case 'h1': @@ -68,11 +68,11 @@ class Readability implements ReadabilityInterface case 'h5': case 'h6': case 'th': - $this->score -= 5; + $this->contentScore -= 5; break; } - $this->score += $this->getClassWeight(); + $this->contentScore += $this->getClassWeight(); return $this; } @@ -122,8 +122,32 @@ class Readability implements ReadabilityInterface * * @return int */ - public function getScore() + public function getContentScore() { - return $this->score; + return $this->contentScore; + } + + /** + * Returns the current score of the Readability object. + * + * @param int $score + * + * @return int + */ + public function setContentScore($score) + { + $this->contentScore = $score; + + return $this->contentScore; + } + + /** + * Returns the full text of the node. + * + * @return string + */ + public function getTextContent() + { + return $this->node->getChildrenAsString(); } } diff --git a/src/ReadabilityInterface.php b/src/ReadabilityInterface.php index 21b78e5..21048eb 100644 --- a/src/ReadabilityInterface.php +++ b/src/ReadabilityInterface.php @@ -12,7 +12,7 @@ interface ReadabilityInterface /** * @return int */ - public function getScore(); + public function getContentScore(); /** * @return Readability @@ -23,4 +23,11 @@ interface ReadabilityInterface * @return int */ public function getClassWeight(); + + /** + * @param int $score + * + * @return int + */ + public function setContentScore($score); } |