summary refs log tree commit diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-12-12 00:43:18 +0000
committer Andres Rey <[email protected]> 2016-12-12 00:43:18 +0000
commit a58b71b728e820fe68fae2b9477e13d8ab8f5a7f (patch)
tree e504a225f88122985d0a7463ad641eccfe856151
parent 1fb1351943e27ca9d690a9e87c70ef4e1812a8ba (diff)
Added recursion to re-run the algorithm in case no quality content is found.
-rw-r--r-- src/HTMLParser.php 38
-rw-r--r-- src/Readability.php 7
2 files changed, 39 insertions, 6 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index d262519..d9db75c 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -101,6 +101,7 @@ class HTMLParser
'articleByLine' => null,
'stripUnlikelyCandidates' => true,
'cleanConditionally' => true,
+ 'weightClasses' => true,
'removeReadabilityTags' => true
];
@@ -136,11 +137,40 @@ class HTMLParser
return false;
}
- $root = new Readability($root->firstChild);
+ while (true) {
+ $root = new Readability($root->firstChild);
- $elementsToScore = $this->getNodes($root);
+ $elementsToScore = $this->getNodes($root);
+
+ $result = $this->rateNodes($elementsToScore);
+
+ /*
+ * Now that we've gone through the full algorithm, check to see if
+ * we got any meaningful content. If we didn't, we may need to re-run
+ * grabArticle with different flags set. This gives us a higher likelihood of
+ * finding the content, and the sieve approach gives us a higher likelihood of
+ * finding the -right- content.
+ */
+
+ // TODO: Find a better way to measure the resulting text. textContent usually includes alt text,
+ // titles and similar strings that don't really count toward the quality of the result.
+ if ($result && mb_strlen($result->textContent) < 500) {
+ $root = $this->backupdom->getElementsByTagName('body')->item(0);
+
+ if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
+ $this->getConfig()->setOption('stripUnlikelyCandidates', false);
+ } elseif ($this->getConfig()->getOption('weightClasses')) {
+ $this->getConfig()->setOption('weightClasses', false);
+ } elseif ($this->getConfig()->getOption('cleanConditionally')) {
+ $this->getConfig()->setOption('cleanConditionally', false);
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
- $result = $this->rateNodes($elementsToScore);
// Todo, fix return, check for values, maybe create a function to create the return object
return [
@@ -473,7 +503,7 @@ class HTMLParser
$topCandidate = new DOMDocument();
$topCandidate->appendChild($topCandidate->createElement('div', ''));
- $kids = $this->backupdom->getElementsByTagName('body')->item(0)->childNodes;
+ $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;
// Cannot be foreached, don't ask me why.
for ($i = 0; $i < $kids->length; $i++) {
diff --git a/src/Readability.php b/src/Readability.php
index 9255bc6..8d6ecc4 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -198,8 +198,11 @@ class Readability extends Element implements ReadabilityInterface
*/
public function getClassWeight()
{
- // if(!Config::FLAG_WEIGHT_CLASSES) return 0;
-
+// TODO: Implement this. Need a way to get the HTMLParser config from within Readability.
+// if (!$this->getConfig()->getOption('weightClasses')) {
+// return 0;
+// }
+//
$weight = 0;
// Look for a special classname