summaryrefslogtreecommitdiff
path: root/src/Readability.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/Readability.php')
-rw-r--r--src/Readability.php100
1 files changed, 79 insertions, 21 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 91e703c..ef49763 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -78,6 +78,13 @@ class Readability
private $logger;
/**
+ * Collection of attempted text extractions.
+ *
+ * @var array
+ */
+ private $attempts = [];
+
+ /**
* @var array
*/
private $defaultTagsToScore = [
@@ -155,54 +162,76 @@ class Readability
* finding the -right- content.
*/
- $length = 0;
- foreach ($result->getElementsByTagName('p') as $p) {
- $length += mb_strlen($p->textContent);
- }
+ $length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
$this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
- if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->configuration->getWordThreshold()) {
+ $parseSuccessful = true;
+
+ if ($result && $length < $this->configuration->getWordThreshold()) {
$this->dom = $this->loadHTML($html);
$root = $this->dom->getElementsByTagName('body')->item(0);
+ $parseSuccessful = false;
if ($this->configuration->getStripUnlikelyCandidates()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
$this->configuration->setStripUnlikelyCandidates(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} elseif ($this->configuration->getWeightClasses()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
$this->configuration->setWeightClasses(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} elseif ($this->configuration->getCleanConditionally()) {
$this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
$this->configuration->setCleanConditionally(false);
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
} else {
- $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
+ $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');
+ $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
+
+ // No luck after removing flags, just return the longest text we found during the different loops
+ usort($this->attempts, function ($a, $b) {
+ return $a['textLength'] < $b['textLength'];
+ });
- throw new ParseException('Could not parse text.');
+ // But first check if we actually have something
+ if (!$this->attempts[0]['textLength']) {
+ $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
+
+ throw new ParseException('Could not parse text.');
+ }
+
+ $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
+
+ $result = $this->attempts[0]['articleContent'];
+ $parseSuccessful = true;
+ break;
}
} else {
break;
}
}
- $result = $this->postProcessContent($result);
-
- // If we haven't found an excerpt in the article's metadata, use the article's
- // first paragraph as the excerpt. This can be used for displaying a preview of
- // the article's content.
- if (!$this->getExcerpt()) {
- $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
- $paragraphs = $result->getElementsByTagName('p');
- if ($paragraphs->length > 0) {
- $this->setExcerpt(trim($paragraphs->item(0)->textContent));
+ if ($parseSuccessful) {
+ $result = $this->postProcessContent($result);
+
+ // If we haven't found an excerpt in the article's metadata, use the article's
+ // first paragraph as the excerpt. This can be used for displaying a preview of
+ // the article's content.
+ if (!$this->getExcerpt()) {
+ $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
+ $paragraphs = $result->getElementsByTagName('p');
+ if ($paragraphs->length > 0) {
+ $this->setExcerpt(trim($paragraphs->item(0)->textContent));
+ }
}
- }
- $this->setContent($result);
+ $this->setContent($result);
- $this->logger->info('*** Parse successful :)');
+ $this->logger->info('*** Parse successful :)');
- return true;
+ return true;
+ }
}
/**
@@ -468,6 +497,10 @@ class Readability
if (count(preg_split('/\s+/', $curTitle)) < 3) {
$curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
$this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
+ } else if (count(preg_split('/\s+/', substr($curTitle, 0, strpos($curTitle, ':')))) > 5) {
+ // But if we have too many words before the colon there's something weird
+ // with the titles and the H tags so let's just use the original title instead
+ $curTitle = $originalTitle;
}
}
} elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
@@ -1129,6 +1162,7 @@ class Readability
$this->_clean($article, 'embed');
$this->_clean($article, 'h1');
$this->_clean($article, 'footer');
+ $this->_clean($article, 'link');
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
@@ -1480,6 +1514,28 @@ class Readability
}
/**
+ * Removes the class="" attribute from every element in the given
+ * subtree.
+ *
+ * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
+ * here so no need to filter those.
+ *
+ * @param DOMDocument|DOMNode $node
+ *
+ * @return void
+ **/
+ public function _cleanClasses($node)
+ {
+ if ($node->getAttribute('class') !== '') {
+ $node->removeAttribute('class');
+ }
+
+ for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) {
+ $this->_cleanClasses($node);
+ }
+ }
+
+ /**
* @param DOMDocument $article
*
* @return DOMDocument
@@ -1532,6 +1588,8 @@ class Readability
}
}
+ $this->_cleanClasses($article);
+
return $article;
}