From d6556fbc040b23e553ad0af5976d069491c881e8 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 21 Oct 2016 23:36:25 +0100 Subject: Added function to search for topCandidate --- src/HTMLParser.php | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ src/Readability.php | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 2f54114..4e74b05 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -339,5 +339,54 @@ class HTMLParser $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null; $neededToCreateTopCandidate = false; + + /* + * If we still have no top candidate, just use the body as a last resort. + * We also have to copy the body node so it is something we can modify. + */ + + if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) { + //TODO + } elseif ($topCandidate) { + /* + * Because of our bonus system, parents of candidates might have scores + * themselves. They get half of the node. There won't be nodes with higher + * scores than our topCandidate, but if we see the score going *up* in the first + * few steps up the tree, that's a decent sign that there might be more content + * lurking in other places that we want to unify in. The sibling stuff + * below does some of that - but only if we've looked high enough up the DOM + * tree. + */ + + $parentOfTopCandidate = $topCandidate->getParent(); + $lastScore = $topCandidate->getContentScore(); + + // The scores shouldn't get too low. + $scoreThreshold = $lastScore / 3; + + while ($parentOfTopCandidate) { + /** @var Readability $parentOfTopCandidate */ + $parentScore = $parentOfTopCandidate->getContentScore(); + if ($parentScore < $scoreThreshold) { + break; + } + + if ($parentScore > $lastScore) { + // Alright! We found a better parent to use. + $topCandidate = $parentOfTopCandidate; + break; + } + $lastScore = $parentOfTopCandidate->getContentScore(); + $parentOfTopCandidate = $parentOfTopCandidate->getParent(); + } + } + + /* + * Now that we have the top candidate, look through its siblings for content + * that might also be related. Things like preambles, content split by ads + * that we removed, etc. + */ + + } } diff --git a/src/Readability.php b/src/Readability.php index a15b664..8a40a7c 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -244,6 +244,6 @@ class Readability extends Element implements ReadabilityInterface */ public function getTextContent() { - return $this->node->getChildrenAsString(); + return $this->getChildrenAsString(); } } -- cgit v1.2.3