summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-11-22 15:39:21 +0000
committerAndres Rey <[email protected]>2016-11-22 15:39:21 +0000
commitcba167695482cca7492303e3de1f59333c0127b6 (patch)
tree1d8e9900642a16476ff15751ee1ef09f3c20cad5 /src
parent37dba87cce8498abaa4ef4c1892a83585abd2c43 (diff)
Removed old reference to elementsToScore, switched the moment when elements are initialized
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php24
-rw-r--r--src/Readability.php43
2 files changed, 32 insertions, 35 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 4d0a271..bd7774b 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -29,11 +29,6 @@ class HTMLParser
/**
* @var array
*/
- private $elementsToScore = [];
-
- /**
- * @var array
- */
private $regexps = [
'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
@@ -136,9 +131,9 @@ class HTMLParser
$root = new Readability($root->firstChild);
- $this->getNodes($root);
+ $elementsToScore = $this->getNodes($root);
- $result = $this->rateNodes($this->elementsToScore);
+ $result = $this->rateNodes($elementsToScore);
// Todo, fix return, check for values, maybe create a function to create the return object
return [
@@ -278,6 +273,8 @@ class HTMLParser
{
$stripUnlikelyCandidates = $this->getConfig()->getOption('stripUnlikelyCandidates');
+ $elementsToScore = [];
+
/*
* First, node prepping. Trash nodes that look cruddy (like ones with the
* class name "comment", etc), and turn divs into P tags where they have been
@@ -307,7 +304,7 @@ class HTMLParser
}
if (in_array(strtolower($node->getTagName()), $this->defaultTagsToScore)) {
- $this->elementsToScore[] = $node;
+ $elementsToScore[] = $node;
}
// Turn all divs that don't have children block level elements into p's
@@ -324,7 +321,7 @@ class HTMLParser
$node = $pNode;
} elseif (!$this->hasSingleChildBlockElement($node)) {
$node->setNodeTag('p');
- $this->elementsToScore[] = $node;
+ $elementsToScore[] = $node;
} else {
// EXPERIMENTAL
foreach ($node->getChildren() as $child) {
@@ -339,6 +336,8 @@ class HTMLParser
$node = $node->getNextNode($node);
}
+
+ return $elementsToScore;
}
/**
@@ -381,7 +380,10 @@ class HTMLParser
// Initialize and score ancestors.
/** @var Readability $ancestor */
foreach ($ancestors as $level => $ancestor) {
- // No need to initialize the ancestor since getNodeAncestors() already initializes them.
+ if (!$ancestor->isInitialized()) {
+ $ancestor->initializeNode();
+ $candidates[] = $ancestor;
+ }
/*
* Node score divider:
@@ -400,8 +402,6 @@ class HTMLParser
$currentScore = $ancestor->getContentScore();
$ancestor->setContentScore($currentScore + ($contentScore / $scoreDivider));
-
- $candidates[] = $ancestor;
}
}
diff --git a/src/Readability.php b/src/Readability.php
index f7a2e3d..7c04044 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -44,30 +44,27 @@ class Readability extends Element implements ReadabilityInterface
{
parent::__construct($node);
- if (get_class($node) !== 'DOMText') {
- /*
- * Restore the score if the object has been already scored.
- *
- * An if must be added before calling the getAttribute function, because if we reach the DOMDocument
- * by getting the node parents we'll get a undefined function fatal error
- */
- $score = 0;
-
- // Check if the getAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
- if (method_exists($node, 'getAttribute')) {
- $hasScore = $node->getAttribute('data-readability');
- if ($hasScore !== '') {
- // Node was initialized previously. Restoring score and setting flag.
- $this->initialized = true;
- $score = $hasScore;
- } else {
- // Fresh, uninitialized node.
- $score = 0;
- }
+ /*
+ * Restore the score if the object has been already scored.
+ *
+ * A guard is needed before calling the getAttribute function, because if we reach the DOMDocument
+ * by walking up the node's parents we would get an undefined function fatal error
+ */
+ $score = 0;
+
+ // Check if the getAttribute method exists, as some elements lack it (and calling it anyway throws an exception)
+ if (method_exists($node, 'getAttribute')) {
+ if ($node->hasAttribute('data-readability')) {
+ // Node was initialized previously. Restoring score and setting flag.
+ $this->initialized = true;
+ $score = $node->getAttribute('data-readability');
+ } else {
+ // Fresh, uninitialized node.
+ $score = 0;
}
-
- $this->setContentScore($score);
}
+
+ $this->setContentScore($score);
}
/**
@@ -102,7 +99,7 @@ class Readability extends Element implements ReadabilityInterface
$node = $this->getParent();
while ($node) {
- $ancestors[] = $node->initializeNode();
+ $ancestors[] = $node;
$level++;
if ($level >= $maxLevel) {
break;