From a58b71b728e820fe68fae2b9477e13d8ab8f5a7f Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Mon, 12 Dec 2016 00:43:18 +0000 Subject: Added recursion to re-run the algorithm in case no quality content is found. --- src/HTMLParser.php | 38 ++++++++++++++++++++++++++++++++++---- src/Readability.php | 7 +++++-- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/HTMLParser.php b/src/HTMLParser.php index d262519..d9db75c 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -101,6 +101,7 @@ class HTMLParser 'articleByLine' => null, 'stripUnlikelyCandidates' => true, 'cleanConditionally' => true, + 'weightClasses' => true, 'removeReadabilityTags' => true ]; @@ -136,11 +137,40 @@ class HTMLParser return false; } - $root = new Readability($root->firstChild); + while (true) { + $root = new Readability($root->firstChild); - $elementsToScore = $this->getNodes($root); + $elementsToScore = $this->getNodes($root); + + $result = $this->rateNodes($elementsToScore); + + /* + * Now that we've gone through the full algorithm, check to see if + * we got any meaningful content. If we didn't, we may need to re-run + * grabArticle with different flags set. This gives us a higher likelihood of + * finding the content, and the sieve approach gives us a higher likelihood of + * finding the -right- content. + */ + + // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff + // that doesn't really count to the quality of the result. + if ($result && mb_strlen($result->textContent) < 500) { + $root = $this->backupdom->getElementsByTagName('body')->item(0); + + if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { + $this->getConfig()->setOption('stripUnlikelyCandidates', false); + } elseif ($this->getConfig()->getOption('weightClasses')) { + $this->getConfig()->setOption('weightClasses', false); + } elseif ($this->getConfig()->getOption('cleanConditionally')) { + $this->getConfig()->setOption('cleanConditionally', false); + } else { + break; + } + } else { + break; + } + } - $result = $this->rateNodes($elementsToScore); // Todo, fix return, check for values, maybe create a function to create the return object return [ @@ -473,7 +503,7 @@ class HTMLParser $topCandidate = new DOMDocument(); $topCandidate->appendChild($topCandidate->createElement('div', '')); - $kids = $this->backupdom->getElementsByTagName('body')->item(0)->childNodes; + $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes; // Cannot be foreached, don't ask me why. for ($i = 0; $i < $kids->length; $i++) { diff --git a/src/Readability.php b/src/Readability.php index 9255bc6..8d6ecc4 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -198,8 +198,11 @@ class Readability extends Element implements ReadabilityInterface */ public function getClassWeight() { - // if(!Config::FLAG_WEIGHT_CLASSES) return 0; - +// TODO To implement. How to get config from html parser from readability +// if ($this->getConfig()->getOption('weightClasses')) { +// return 0; +// } +// $weight = 0; // Look for a special classname -- cgit v1.2.3