'<',
'gt' => '>',
'amp' => '&',
'quot' => '"',
'apos' => '\'',
];
/**
 * Readability constructor.
 *
 * Keeps a reference to the given configuration and to the logger that the
 * configuration exposes through getLogger().
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    // Store the configuration first, then take the logger from that same object.
    $this->configuration = $configuration;
    $this->logger = $configuration->getLogger();
}
/**
 * Main parse function.
 *
 * Loads the HTML, collects metadata and the main image, and then runs the
 * scoring loop. While the extracted text is shorter than the configured
 * character threshold, the HTML is reloaded and one configuration flag is
 * switched off per pass (StripUnlikelyCandidates, then WeightClasses, then
 * CleanConditionally). Once every flag is off, the longest attempt recorded so
 * far is used instead.
 *
 * Note that the flags are switched off on the configuration object itself.
 *
 * @param string $html
 *
 * @throws ParseException When the document has no body (or an empty one), or
 *                        when none of the attempts produced any text
 *
 * @return bool True when content was extracted and stored, false when the
 *              scoring pass produced no result
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();
    $this->getMainImage();

    while (true) {
        $this->logger->debug('Starting parse loop');
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * the extraction with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         *
         * The length is only measured when there is a result: reading textContent
         * from a falsy result would raise a warning and pass null to preg_replace.
         */
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));

        // Stop looping when there is nothing to retry on, or when the threshold has been met.
        if (!$result || $length >= $this->configuration->getCharThreshold()) {
            break;
        }

        // Threshold not met: start over from a freshly loaded document.
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);

        // Every retry path records the current attempt, so it is stored once here.
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
            $result = $this->attempts[0]['articleContent'];

            break;
        }
    }

    if (!$result) {
        $this->logger->info('*** Parse failed :(');

        return false;
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
/**
* Creates a DOM Document object and loads the provided HTML on it.
*
* Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
* Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
* because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
* objects and ruining the backup.
*
* @param string $html
*
* @return DOMDocument
*/
private function loadHTML($html)
{
$this->logger->debug('[Loading] Loading HTML...');
// To avoid throwing a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);
//$html = preg_replace('/(
]*>[ \n\r\t]*){2,}/i', '
', $html); if ($this->configuration->getParser() === 'html5') { $this->logger->debug('[Loading] Using HTML5 parser...'); $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]); $dom = $html5->loadHTML($html); //TODO: Improve this so it looks inside