summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-11-05 23:55:33 +0000
committer Andres Rey <[email protected]> 2017-11-05 23:55:33 +0000
commit 56d2c4c1d9a3457be5dcf9ccf7f9d2b31c467e1e (patch)
tree 7d660d8dbb20d25eb3d57cb3c584ccd418c83623
parent 47f9826bd9b6191644bfce9266a6b032bdb80137 (diff)
If the top candidate is the only child, use parent instead. This will help sibling joining logic when adjacent content is actually located in parent's sibling node.
-rw-r--r-- src/HTMLParser.php 12
1 files changed, 10 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 25fc8f9..b6d58ef 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -874,8 +874,8 @@ class HTMLParser
// The scores shouldn't get too low.
$scoreThreshold = $lastScore / 3;
- while ($parentOfTopCandidate) {
- /* @var Readability $parentOfTopCandidate */
+ /* @var Readability $parentOfTopCandidate */
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
$parentScore = $parentOfTopCandidate->getContentScore();
if ($parentScore < $scoreThreshold) {
break;
@@ -889,6 +889,14 @@ class HTMLParser
$lastScore = $parentOfTopCandidate->getContentScore();
$parentOfTopCandidate = $parentOfTopCandidate->getParent();
}
+
+ // If the top candidate is the only child, use parent instead. This will help sibling
+ // joining logic when adjacent content is actually located in parent's sibling node.
+ $parentOfTopCandidate = $topCandidate->getParent();
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren()) === 1) {
+ $topCandidate = $parentOfTopCandidate;
+ $parentOfTopCandidate = $topCandidate->getParent();
+ }
}
/*