summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-05 20:11:03 +0000
committerAndres Rey <[email protected]>2017-11-05 20:11:03 +0000
commit47f9826bd9b6191644bfce9266a6b032bdb80137 (patch)
tree3d2e7306b509c2fe9c7ea27c808288e157b09fae /src
parentf66b5ac10474ccfb880439a78d10b46f1abfe45a (diff)
Find a better top candidate node if it contains (at least three) nodes which belong to the `topCandidates` array and whose scores are quite close to that of the current `topCandidate` node.
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php28
1 files changed, 26 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 636796c..25fc8f9 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -806,6 +806,7 @@ class HTMLParser
$topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
$neededToCreateTopCandidate = false;
+ $parentOfTopCandidate = null;
/*
* If we still have no top candidate, just use the body as a last resort.
@@ -814,8 +815,6 @@ class HTMLParser
if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) {
// Move all of the page's children into topCandidate
- $neededToCreateTopCandidate = true;
-
$topCandidate = new DOMDocument('1.0', 'utf-8');
$topCandidate->encoding = 'UTF-8';
$topCandidate->appendChild($topCandidate->createElement('div', ''));
@@ -834,6 +833,31 @@ class HTMLParser
//TODO on the original code, $topCandidate is added to the page variable, which holds the whole HTML
// Should be done this here also? (line 823 in readability.js)
} elseif ($topCandidate) {
+ // Find a better top candidate node if it contains (at least three) nodes which belong to the `topCandidates` array
+ // and whose scores are quite close to that of the current `topCandidate` node.
+ $alternativeCandidateAncestors = [];
+ for ($i = 0; $i < count($topCandidates) - 1; $i++) {
+ if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) {
+ $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(5);
+ }
+ }
+
+ $MINIMUM_TOPCANDIDATES = 3;
+ if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
+ $parentOfTopCandidate = $topCandidate->getParent();
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
+ $listsContainingThisAncestor = 0;
+ for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
+ $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex]);
+ }
+ if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
+ $topCandidate = $parentOfTopCandidate;
+ break;
+ }
+ $parentOfTopCandidate = $parentOfTopCandidate->getParent();
+ }
+ }
+
/*
* Because of our bonus system, parents of candidates might have scores
* themselves. They get half of the node. There won't be nodes with higher