diff options
author | Andres Rey <[email protected]> | 2016-10-21 23:36:25 +0100 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-10-21 23:36:25 +0100 |
commit | d6556fbc040b23e553ad0af5976d069491c881e8 (patch) | |
tree | 62a65b364350216ba955d1acefd40ea54fce7c74 /src | |
parent | 0c11d557755ddbbafeeccd71e2d70b5c1d1458ab (diff) |
Added function to search for topCandidate
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 49 | ||||
-rw-r--r-- | src/Readability.php | 2 |
2 files changed, 50 insertions, 1 deletion
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 2f54114..4e74b05 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -339,5 +339,54 @@ class HTMLParser $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null; $neededToCreateTopCandidate = false; + + /* + * If we still have no top candidate, just use the body as a last resort. + * We also have to copy the body node so it is something we can modify. + */ + + if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) { + //TODO + } elseif ($topCandidate) { + /* + * Because of our bonus system, parents of candidates might have scores + * themselves. They get half of the node. There won't be nodes with higher + * scores than our topCandidate, but if we see the score going *up* in the first + * few steps up the tree, that's a decent sign that there might be more content + * lurking in other places that we want to unify in. The sibling stuff + * below does some of that - but only if we've looked high enough up the DOM + * tree. + */ + + $parentOfTopCandidate = $topCandidate->getParent(); + $lastScore = $topCandidate->getContentScore(); + + // The scores shouldn't get too low. + $scoreThreshold = $lastScore / 3; + + while ($parentOfTopCandidate) { + /** @var Readability $parentOfTopCandidate */ + $parentScore = $parentOfTopCandidate->getContentScore(); + if ($parentScore < $scoreThreshold) { + break; + } + + if ($parentScore > $lastScore) { + // Alright! We found a better parent to use. + $topCandidate = $parentOfTopCandidate; + break; + } + $lastScore = $parentOfTopCandidate->getContentScore(); + $parentOfTopCandidate = $parentOfTopCandidate->getParent(); + } + } + + /* + * Now that we have the top candidate, look through its siblings for content + * that might also be related. Things like preambles, content split by ads + * that we removed, etc. 
+ */ + + } } diff --git a/src/Readability.php b/src/Readability.php index a15b664..8a40a7c 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -244,6 +244,6 @@ class Readability extends Element implements ReadabilityInterface */ public function getTextContent() { - return $this->node->getChildrenAsString(); + return $this->getChildrenAsString(); } } |