summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-28 20:47:09 +0000
committerAndres Rey <[email protected]>2017-11-28 20:47:09 +0000
commitb06bbf1374f09db17504d17b59e714a5be21df6f (patch)
tree0d31d72932fd0b67763baf44677ec1f63afb361a /src
parentf09e346ecb9f6977161ec6196bc796bab5ac463e (diff)
Add initialization function and property
Diffstat (limited to 'src')
-rw-r--r--src/NodeClass/NodeClassTrait.php79
-rw-r--r--src/Readability.php29
2 files changed, 73 insertions, 35 deletions
diff --git a/src/NodeClass/NodeClassTrait.php b/src/NodeClass/NodeClassTrait.php
index b2c0fcc..467b170 100644
--- a/src/NodeClass/NodeClassTrait.php
+++ b/src/NodeClass/NodeClassTrait.php
@@ -10,7 +10,12 @@ trait NodeClassTrait
/**
* @var int
*/
- protected $contentScore = 0;
+ public $contentScore = 0;
+
+ /**
+ * @var bool
+ */
+ public $initialized = false;
/**
* @var array
@@ -21,6 +26,57 @@ trait NodeClassTrait
];
/**
+ * Initializer. Calculates the current score of the node and returns a full Readability object.
+ *
+ * @return self
+ */
+ public function initializeNode()
+ {
+ if (!$this->initialized) {
+ $contentScore = 0;
+
+ switch ($this->nodeName) {
+ case 'div':
+ $contentScore += 5;
+ break;
+
+ case 'pre':
+ case 'td':
+ case 'blockquote':
+ $contentScore += 3;
+ break;
+
+ case 'address':
+ case 'ol':
+ case 'ul':
+ case 'dl':
+ case 'dd':
+ case 'dt':
+ case 'li':
+ case 'form':
+ $contentScore -= 3;
+ break;
+
+ case 'h1':
+ case 'h2':
+ case 'h3':
+ case 'h4':
+ case 'h5':
+ case 'h6':
+ case 'th':
+ $contentScore -= 5;
+ break;
+ }
+
+ $this->contentScore = $contentScore + $this->getClassWeight();
+
+ $this->initialized = true;
+ }
+
+ return $this;
+ }
+
+ /**
* Placeholder for getAttribute method. Some nodes have the getAttribute method, some don't.
*
* @param $attributeName string Attribute to retrieve
@@ -175,27 +231,6 @@ trait NodeClassTrait
}
/**
- * Returns the current score of the Readability object.
- *
- * @return int
- */
- public function getContentScore()
- {
- return $this->contentScore;
- }
-
- /**
- * Returns the current score of the Readability object.
- *
- * @param int $score
- */
- public function setContentScore($score)
- {
- $this->contentScore = $score;
- }
-
-
- /**
* Returns the full text of the node.
*
* @param bool $normalize Normalize white space?
diff --git a/src/Readability.php b/src/Readability.php
index 17e73d4..01d428a 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -680,9 +680,12 @@ class Readability
// For every 100 characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
- /** @var DOMElement $level */
+ /** @var $ancestor DOMElement */
foreach ($ancestors as $level => $ancestor) {
- $candidates[] = $ancestor;
+ if (!$ancestor->initialized) {
+ $ancestor->initializeNode();
+ $candidates[] = $ancestor;
+ }
/*
* Node score divider:
@@ -699,8 +702,8 @@ class Readability
$scoreDivider = $level * 3;
}
- $currentScore = $ancestor->getContentScore();
- $ancestor->setContentScore($currentScore + ($contentScore / $scoreDivider));
+ $currentScore = $ancestor->contentScore;
+ $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);
}
}
@@ -718,12 +721,12 @@ class Readability
* unaffected by this operation.
*/
- $candidate->setContentScore($candidate->getContentScore() * (1 - $candidate->getLinkDensity()));
+ $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());
for ($i = 0; $i < $this->configuration->getMaxTopCandidates(); $i++) {
$aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;
- if (!$aTopCandidate || $candidate->getContentScore() > $aTopCandidate->getContentScore()) {
+ if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
array_splice($topCandidates, $i, 0, [$candidate]);
if (count($topCandidates) > $this->configuration->getMaxTopCandidates()) {
array_pop($topCandidates);
@@ -762,7 +765,7 @@ class Readability
// and whose scores are quite closed with current `topCandidate` node.
$alternativeCandidateAncestors = [];
for ($i = 1; $i < count($topCandidates); $i++) {
- if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) {
+ if ($topCandidates[$i]->contentScore / $topCandidate->contentScore >= 0.75) {
array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
}
}
@@ -794,14 +797,14 @@ class Readability
*/
$parentOfTopCandidate = $topCandidate->parentNode;
- $lastScore = $topCandidate->getContentScore();
+ $lastScore = $topCandidate->contentScore;
// The scores shouldn't get too low.
$scoreThreshold = $lastScore / 3;
/* @var DOMElement $parentOfTopCandidate */
while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
- $parentScore = $parentOfTopCandidate->getContentScore();
+ $parentScore = $parentOfTopCandidate->contentScore;
if ($parentScore < $scoreThreshold) {
break;
}
@@ -811,7 +814,7 @@ class Readability
$topCandidate = $parentOfTopCandidate;
break;
}
- $lastScore = $parentOfTopCandidate->getContentScore();
+ $lastScore = $parentOfTopCandidate->contentScore;
$parentOfTopCandidate = $parentOfTopCandidate->parentNode;
}
@@ -833,7 +836,7 @@ class Readability
$articleContent = new DOMDocument('1.0', 'utf-8');
$articleContent->createElement('div');
- $siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2);
+ $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
// Keep potential top candidate's parent node to try to get text direction of it later.
$parentOfTopCandidate = $topCandidate->parentNode;
$siblings = $parentOfTopCandidate->getChildren();
@@ -851,9 +854,9 @@ class Readability
// Give a bonus if sibling nodes and top candidates have the example same classname
if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
- $contentBonus += $topCandidate->getContentScore() * 0.2;
+ $contentBonus += $topCandidate->contentScore * 0.2;
}
- if ($sibling->getContentScore() + $contentBonus >= $siblingScoreThreshold) {
+ if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
$append = true;
} elseif ($sibling->tagNameEqualsTo('p')) {
$linkDensity = $siblings->getLinkDensity();