summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-10-21 23:36:25 +0100
committerAndres Rey <[email protected]>2016-10-21 23:36:25 +0100
commitd6556fbc040b23e553ad0af5976d069491c881e8 (patch)
tree62a65b364350216ba955d1acefd40ea54fce7c74 /src
parent0c11d557755ddbbafeeccd71e2d70b5c1d1458ab (diff)
Added function to search for topCandidate
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php49
-rw-r--r--src/Readability.php2
2 files changed, 50 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 2f54114..4e74b05 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -339,5 +339,54 @@ class HTMLParser
$topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
$neededToCreateTopCandidate = false;
+
+ /*
+ * If we still have no top candidate, just use the body as a last resort.
+ * We also have to copy the body node so it is something we can modify.
+ */
+
+ if ($topCandidate === null || $topCandidate->tagNameEqualsTo('body')) {
+ // TODO: fall back to a modifiable copy of the body as the top candidate (not implemented yet).
+ } elseif ($topCandidate) {
+ /*
+ * Because of our bonus system, parents of candidates might have scores
+ * themselves. They get half of the node's score. There won't be nodes with higher
+ * scores than our topCandidate, but if we see the score going *up* in the first
+ * few steps up the tree, that's a decent sign that there might be more content
+ * lurking in other places that we want to unify in. The sibling stuff
+ * below does some of that - but only if we've looked high enough up the DOM
+ * tree.
+ */
+
+ $parentOfTopCandidate = $topCandidate->getParent();
+ $lastScore = $topCandidate->getContentScore();
+
+ // Stop climbing once a parent's score falls below a third of the top candidate's score.
+ $scoreThreshold = $lastScore / 3;
+
+ while ($parentOfTopCandidate) {
+ /** @var Readability $parentOfTopCandidate */
+ $parentScore = $parentOfTopCandidate->getContentScore();
+ if ($parentScore < $scoreThreshold) {
+ break;
+ }
+
+ if ($parentScore > $lastScore) {
+ // Alright! We found a better parent to use.
+ $topCandidate = $parentOfTopCandidate;
+ break;
+ }
+ $lastScore = $parentOfTopCandidate->getContentScore();
+ $parentOfTopCandidate = $parentOfTopCandidate->getParent();
+ }
+ }
+
+ /*
+ * Now that we have the top candidate, look through its siblings for content
+ * that might also be related. Things like preambles, content split by ads
+ * that we removed, etc.
+ */
+
+
}
}
diff --git a/src/Readability.php b/src/Readability.php
index a15b664..8a40a7c 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -244,6 +244,6 @@ class Readability extends Element implements ReadabilityInterface
*/
public function getTextContent()
{
- return $this->node->getChildrenAsString();
+ return $this->getChildrenAsString();
}
}