summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-11-11 23:40:17 +0000
committer Andres Rey <[email protected]> 2017-11-11 23:40:17 +0000
commit c54c64e937aa7e49111618776784f0f8acfdd3c6 (patch)
tree 60bf20a57d34aeec244984a6a6bde21303e6b2ff
parent b13495916b35efbd548b4c174630fdc16ac9f447 (diff)
Minor fix when getting alternative top candidate ancestors + Remove DOMComments
-rw-r--r-- src/HTMLParser.php 12
1 files changed, 9 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index cea379c..8c3c204 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -643,6 +643,12 @@ class HTMLParser
while ($node) {
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
+ // Remove DOMComment nodes, as we don't need them and they mess up children counting
+ if ($node->nodeTypeEqualsTo(XML_COMMENT_NODE)) {
+ $node = $node->removeAndGetNext($node);
+ continue;
+ }
+
// Check to see if this node is a byline, and remove it if it is.
if ($this->checkByline($node, $matchString)) {
$node = $node->removeAndGetNext($node);
@@ -847,9 +853,9 @@ class HTMLParser
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
// and whose scores are quite closed with current `topCandidate` node.
$alternativeCandidateAncestors = [];
- for ($i = 0; $i < count($topCandidates) - 1; $i++) {
+ for ($i = 0; $i < count($topCandidates); $i++) {
if ($topCandidates[$i]->getContentScore() / $topCandidate->getContentScore() >= 0.75) {
- $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(5);
+ $alternativeCandidateAncestors[$i] = $topCandidates[$i]->getNodeAncestors(false);
}
}
@@ -904,7 +910,7 @@ class HTMLParser
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
$parentOfTopCandidate = $topCandidate->getParent();
- while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren()) === 1) {
+ while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren(true)) === 1) {
$topCandidate = $parentOfTopCandidate;
$parentOfTopCandidate = $topCandidate->getParent();
}