summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2018-03-10 19:06:28 +0000
committerAndres Rey <[email protected]>2018-03-10 19:06:28 +0000
commit4318edfb099ea92a414aaf933f870763499f0b50 (patch)
treed71df49eb6ce4f662a9143416fef504cfeb8769a
parente17ca3d5a8cbb03432dc17ba11bb5e777ba39533 (diff)
Save attempts across different runs and try to return at least something before giving up.
-rw-r--r--src/Readability.php63
1 files changed, 46 insertions, 17 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 2ec3c4a..0431635 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -78,6 +78,13 @@ class Readability
private $logger;
/**
+ * Collection of attempted text extractions.
+ *
+ * @var array
+ */
+ private $attempts = [];
+
+ /**
* @var array
*/
private $defaultTagsToScore = [
@@ -162,47 +169,69 @@ class Readability
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
- if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->configuration->getWordThreshold()) {
+ $parseSuccessful = true;
+
+ if ($result && $length < $this->configuration->getWordThreshold()) {
$this->dom = $this->loadHTML($html);
$root = $this->dom->getElementsByTagName('body')->item(0);
+ $parseSuccessful = false;
if ($this->configuration->getStripUnlikelyCandidates()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
$this->configuration->setStripUnlikelyCandidates(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} elseif ($this->configuration->getWeightClasses()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
$this->configuration->setWeightClasses(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} elseif ($this->configuration->getCleanConditionally()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
$this->configuration->setCleanConditionally(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} else {
- $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
+
+ // No luck after removing flags, just return the longest text we found during the different loops
+ usort($this->attempts, function ($a, $b) {
+ return $a['textLength'] < $b['textLength'];
+ });
+
+ // But first check if we actually have something
+ if (!$this->attempts[0]['textLength']) {
+ $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
- throw new ParseException('Could not parse text.');
+ throw new ParseException('Could not parse text.');
+ }
+
+ $result = $this->attempts[0]['articleContent'];
+ $parseSuccessful = true;
+ break;
}
} else {
break;
}
}
- $result = $this->postProcessContent($result);
-
- // If we haven't found an excerpt in the article's metadata, use the article's
- // first paragraph as the excerpt. This can be used for displaying a preview of
- // the article's content.
- if (!$this->getExcerpt()) {
- $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
- $paragraphs = $result->getElementsByTagName('p');
- if ($paragraphs->length > 0) {
- $this->setExcerpt(trim($paragraphs->item(0)->textContent));
+ if ($parseSuccessful) {
+ $result = $this->postProcessContent($result);
+
+ // If we haven't found an excerpt in the article's metadata, use the article's
+ // first paragraph as the excerpt. This can be used for displaying a preview of
+ // the article's content.
+ if (!$this->getExcerpt()) {
+ $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
+ $paragraphs = $result->getElementsByTagName('p');
+ if ($paragraphs->length > 0) {
+ $this->setExcerpt(trim($paragraphs->item(0)->textContent));
+ }
}
- }
- $this->setContent($result);
+ $this->setContent($result);
- $this->logger->info('*** Parse successful :)');
+ $this->logger->info('*** Parse successful :)');
- return true;
+ return true;
+ }
}
/**