summary refs log tree commit diff
path: root/src/Readability.php
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-12-10 20:47:36 +0000
committer Andres Rey <[email protected]> 2017-12-10 20:47:36 +0000
commit 2a2c4129f969cb9f3af83cb5bd6f807d8e2792cb (patch)
tree 884263c3d817b9d9742bac490c49f5b3db89988b /src/Readability.php
parent 8b496d68788694b34c6fe898c380bf181981019d (diff)
Improve logging messages
Diffstat (limited to 'src/Readability.php')
-rw-r--r-- src/Readability.php 140
1 files changed, 68 insertions, 72 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 8df0189..7160fbe 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -142,7 +142,7 @@ class Readability
$root = $root->firstChild;
$elementsToScore = $this->getNodes($root);
- $this->logger->debug(sprintf('Elements to score: %s', count($elementsToScore)), $elementsToScore);
+ $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));
$result = $this->rateNodes($elementsToScore);
@@ -159,23 +159,23 @@ class Readability
$length += mb_strlen($p->textContent);
}
- $this->logger->info(sprintf('Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
+ $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getWordThreshold()));
if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->configuration->getWordThreshold()) {
$this->dom = $this->loadHTML($html);
$root = $this->dom->getElementsByTagName('body')->item(0);
if ($this->configuration->getStripUnlikelyCandidates()) {
- $this->logger->debug('Threshold not passed, trying again setting StripUnlikelyCandidates as false');
+ $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
$this->configuration->setStripUnlikelyCandidates(false);
} elseif ($this->configuration->getWeightClasses()) {
- $this->logger->debug('Threshold not passed, trying again setting WeightClasses as false');
+ $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
$this->configuration->setWeightClasses(false);
} elseif ($this->configuration->getCleanConditionally()) {
- $this->logger->debug('Threshold not passed, trying again setting CleanConditionally as false');
+ $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
$this->configuration->setCleanConditionally(false);
} else {
- $this->logger->emergency('Could not parse text, giving up.');
+ $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
throw new ParseException('Could not parse text.');
}
} else {
@@ -189,7 +189,7 @@ class Readability
// first paragraph as the excerpt. This can be used for displaying a preview of
// the article's content.
if (!$this->getExcerpt()) {
- $this->logger->debug('No excerpt text found on metadata, extracting first p node and using it as excerpt.');
+ $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
$paragraphs = $result->getElementsByTagName('p');
if ($paragraphs->length > 0) {
$this->setExcerpt(trim($paragraphs->item(0)->textContent));
@@ -217,7 +217,7 @@ class Readability
*/
private function loadHTML($html)
{
- $this->logger->debug('Loading HTML...');
+ $this->logger->debug('[Loading] Loading HTML...');
// To avoid throwing a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);
@@ -230,13 +230,13 @@ class Readability
}
if ($this->configuration->getNormalizeEntities()) {
- $this->logger->debug('Normalized entities via mb_convert_encoding.');
+ $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
// Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
}
if ($this->configuration->getSummonCthulhu()) {
- $this->logger->debug('Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
+ $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
$html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
}
@@ -248,7 +248,7 @@ class Readability
$this->prepDocument($dom);
- $this->logger->debug('Loaded HTML successfully.');
+ $this->logger->debug('[Loading] Loaded HTML successfully.');
return $dom;
}
@@ -258,7 +258,7 @@ class Readability
*/
private function getMetadata()
{
- $this->logger->debug('Retrieving metadata...');
+ $this->logger->debug('[Metadata] Retrieving metadata...');
$values = [];
// Match "description", or Twitter's "twitter:description" (Cards)
@@ -274,7 +274,7 @@ class Readability
$elementProperty = $meta->getAttribute('property');
if (in_array('author', [$elementName, $elementProperty])) {
- $this->logger->info(sprintf('[Metadata] Found author: %s', $meta->getAttribute('content')));
+ $this->logger->info(sprintf('[Metadata] Found author: \'%s\'', $meta->getAttribute('content')));
$this->setAuthor($meta->getAttribute('content'));
continue;
}
@@ -297,15 +297,15 @@ class Readability
}
}
if (array_key_exists('description', $values)) {
- $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: %s', $values['description']));
+ $this->logger->info(sprintf('[Metadata] Found excerpt in \'description\' tag: \'%s\'', $values['description']));
$this->setExcerpt($values['description']);
} elseif (array_key_exists('og:description', $values)) {
// Use facebook open graph description.
- $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: %s', $values['og:description']));
+ $this->logger->info(sprintf('[Metadata] Found excerpt in \'og:description\' tag: \'%s\'', $values['og:description']));
$this->setExcerpt($values['og:description']);
} elseif (array_key_exists('twitter:description', $values)) {
// Use twitter cards description.
- $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: %s', $values['twitter:description']));
+ $this->logger->info(sprintf('[Metadata] Found excerpt in \'twitter:description\' tag: \'%s\'', $values['twitter:description']));
$this->setExcerpt($values['twitter:description']);
}
@@ -314,21 +314,21 @@ class Readability
if (!$this->getTitle()) {
if (array_key_exists('og:title', $values)) {
// Use facebook open graph title.
- $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: %s', $values['og:title']));
+ $this->logger->info(sprintf('[Metadata] Found title in \'og:title\' tag: \'%s\'', $values['og:title']));
$this->setTitle($values['og:title']);
} elseif (array_key_exists('twitter:title', $values)) {
// Use twitter cards title.
- $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: %s', $values['twitter:title']));
+ $this->logger->info(sprintf('[Metadata] Found title in \'twitter:title\' tag: \'%s\'', $values['twitter:title']));
$this->setTitle($values['twitter:title']);
}
}
if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) {
if (array_key_exists('og:image', $values)) {
- $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: %s', $values['og:image']));
+ $this->logger->info(sprintf('[Metadata] Found main image in \'og:image\' tag: \'%s\'', $values['og:image']));
$this->setImage($values['og:image']);
} else {
- $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: %s', $values['twitter:image']));
+ $this->logger->info(sprintf('[Metadata] Found main image in \'twitter:image\' tag: \'%s\'', $values['twitter:image']));
$this->setImage($values['twitter:image']);
}
}
@@ -410,10 +410,10 @@ class Readability
if ($this->getTitle()) {
$originalTitle = $this->getTitle();
} else {
- $this->logger->debug('Could not find title in metadata, searching for the title tag...');
+ $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
$titleTag = $this->dom->getElementsByTagName('title');
if ($titleTag->length > 0) {
- $this->logger->info(sprintf('Using title tag as article title: %s', $titleTag->item(0)->nodeValue));
+ $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
$originalTitle = $titleTag->item(0)->nodeValue;
}
}
@@ -435,13 +435,13 @@ class Readability
$titleHadHierarchicalSeparators = (bool)preg_match('/ [\\\\\/>»] /', $curTitle);
$curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/i', '$1', $originalTitle);
- $this->logger->info(sprintf('Found hierarchical separators in title, new title is: %s', $curTitle));
+ $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));
// If the resulting title is too short (3 words or fewer), remove
// the first part instead:
if (count(preg_split('/\s+/', $curTitle)) < 3) {
$curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/i', '$1', $originalTitle);
- $this->logger->info(sprintf('Title too short, using the first part of the title instead: %s', $curTitle));
+ $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
}
} elseif (strpos($curTitle, ': ') !== false) {
// Check if we have an heading containing this exact string, so we
@@ -460,12 +460,12 @@ class Readability
if (!$match) {
$curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);
- $this->logger->info(sprintf('Title has a colon in the middle, new title is: %s', $curTitle));
+ $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));
// If the title is now too short, try the first colon instead:
if (count(preg_split('/\s+/', $curTitle)) < 3) {
$curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
- $this->logger->info(sprintf('Title too short, using the first part of the title instead: %s', $curTitle));
+ $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
}
}
} elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
@@ -473,7 +473,7 @@ class Readability
if ($hOnes->length === 1) {
$curTitle = $hOnes->item(0)->nodeValue;
- $this->logger->info(sprintf('Using title from an H1 node: %s', $curTitle));
+ $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
}
}
@@ -492,7 +492,7 @@ class Readability
(!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
$curTitle = $originalTitle;
- $this->logger->info(sprintf('Using title from an H1 node: %s', $curTitle));
+ $this->logger->info(sprintf('Using title from an H1 node: \'%s\'', $curTitle));
}
return $curTitle;
@@ -563,7 +563,7 @@ class Readability
*/
private function getNodes($node)
{
- $this->logger->info('Retrieving nodes...');
+ $this->logger->info('[Get Nodes] Retrieving nodes...');
$stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();
@@ -578,18 +578,16 @@ class Readability
while ($node) {
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
- $this->logger->debug(sprintf('Match string from class and id is: %s', $matchString));
-
// Remove DOMComments nodes as we don't need them and mess up children counting
if ($node->nodeType === XML_COMMENT_NODE) {
- $this->logger->debug(sprintf('Found comment node, removing... Node content was: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
$node = NodeUtility::removeAndGetNext($node);
continue;
}
// Check to see if this node is a byline, and remove it if it is.
if ($this->checkByline($node, $matchString)) {
- $this->logger->debug(sprintf('Found byline, removing... Node content was: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
$node = NodeUtility::removeAndGetNext($node);
continue;
}
@@ -602,7 +600,7 @@ class Readability
$node->nodeName !== 'body' &&
$node->nodeName !== 'a'
) {
- $this->logger->debug(sprintf('Removing unlikely candidate. Node content was: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
$node = NodeUtility::removeAndGetNext($node);
continue;
}
@@ -614,13 +612,13 @@ class Readability
$node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||
$node->nodeName === 'p') &&
$node->isElementWithoutContent()) {
- $this->logger->debug(sprintf('Removing empty \'%s\' node.', $node->nodeName));
+ $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
$node = NodeUtility::removeAndGetNext($node);
continue;
}
if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
- $this->logger->debug(sprintf('Adding node to score list, node content is: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$elementsToScore[] = $node;
}
@@ -633,13 +631,13 @@ class Readability
* algorithm with DIVs with are, in practice, paragraphs.
*/
if ($node->hasSinglePNode()) {
- $this->logger->debug(sprintf('Found DIV with a single P node, removing DIV. Node content is: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$pNode = $node->getChildren(true)[0];
$node->parentNode->replaceChild($pNode, $node);
$node = $pNode;
$elementsToScore[] = $node;
} elseif (!$node->hasSingleChildBlockElement()) {
- $this->logger->debug(sprintf('Found DIV with a single child block element, converting to a P node. Node content is: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$node = NodeUtility::setNodeTag($node, 'p');
$elementsToScore[] = $node;
} else {
@@ -647,7 +645,7 @@ class Readability
foreach ($node->getChildren() as $child) {
/** @var $child DOMNode */
if ($child->nodeType === XML_TEXT_NODE && mb_strlen(trim($child->getTextContent())) > 0) {
- $this->logger->debug(sprintf('Found DIV a text node inside, converting to a P node. Node content is: %s', substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Get Nodes] Found DIV a text node inside, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$newNode = $node->createNode($child, 'p');
$child->parentNode->replaceChild($newNode, $child);
}
@@ -685,7 +683,7 @@ class Readability
$rel = $node->getAttribute('rel');
if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
- $this->logger->info(sprintf('Found article author: %s', $node->getTextContent()));
+ $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
$this->setAuthor(trim($node->getTextContent()));
return true;
@@ -739,7 +737,7 @@ class Readability
*/
private function prepDocument(DOMDocument $dom)
{
- $this->logger->info('Preparing document for parsing...');
+ $this->logger->info('[PrepDocument] Preparing document for parsing...');
/*
* DOMNodeList must be converted to an array before looping over it.
@@ -764,7 +762,7 @@ class Readability
* (which will be replaced with a <p> later).
*/
while (($next = NodeUtility::nextElement($next)) && ($next->nodeName === 'br')) {
- $this->logger->debug('Removing chain of BR nodes...');
+ $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');
$replaced = true;
$brSibling = $next->nextSibling;
@@ -792,7 +790,7 @@ class Readability
}
}
- $this->logger->debug('Replacing BR with a P node...');
+ $this->logger->debug('[PrepDocument] Replacing BR with a P node...');
// Otherwise, make this node a child of the new <p>.
$sibling = $next->nextSibling;
@@ -806,7 +804,7 @@ class Readability
$fonts = $dom->getElementsByTagName('font');
$length = $fonts->length;
for ($i = 0; $i < $length; $i++) {
- $this->logger->debug('Converting font tag into a span tag.');
+ $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
$font = $fonts->item($length - 1 - $i);
NodeUtility::setNodeTag($font, 'span', true);
}
@@ -821,7 +819,7 @@ class Readability
*/
private function rateNodes($nodes)
{
- $this->logger->info('Rating nodes...');
+ $this->logger->info('[Rating] Rating nodes...');
$candidates = [];
@@ -852,11 +850,11 @@ class Readability
// For every 100 characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3);
- $this->logger->debug(sprintf('Node score %s, content: %s', $contentScore, substr($node->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));
/** @var $ancestor DOMElement */
foreach ($ancestors as $level => $ancestor) {
- $this->logger->debug('Found ancestor, initializing and adding it as a candidate...');
+ $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
if (!$ancestor->isInitialized()) {
$ancestor->initializeNode($this->configuration->getWeightClasses());
$candidates[] = $ancestor;
@@ -880,7 +878,7 @@ class Readability
$currentScore = $ancestor->contentScore;
$ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);
- $this->logger->debug(sprintf('Ancestor score %s, value: %s', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
}
}
@@ -922,7 +920,7 @@ class Readability
*/
if ($topCandidate === null || $topCandidate->nodeName === 'body') {
- $this->logger->info('No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
+ $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');
// Move all of the page's children into topCandidate
$topCandidate = new DOMDocument('1.0', 'utf-8');
@@ -939,7 +937,7 @@ class Readability
// Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
$topCandidate = $topCandidate->firstChild;
} elseif ($topCandidate) {
- $this->logger->info('Found top candidate');
+ $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
// and whose scores are quite closed with current `topCandidate` node.
$alternativeCandidateAncestors = [];
@@ -991,7 +989,7 @@ class Readability
if ($parentScore > $lastScore) {
// Alright! We found a better parent to use.
$topCandidate = $parentOfTopCandidate;
- $this->logger->info('Found a better top candidate.');
+ $this->logger->info('[Rating] Found a better top candidate.');
break;
}
$lastScore = $parentOfTopCandidate->contentScore;
@@ -1013,7 +1011,7 @@ class Readability
* that we removed, etc.
*/
- $this->logger->info('Creating final article content document...');
+ $this->logger->info('[Rating] Creating final article content document...');
$articleContent = new DOMDocument('1.0', 'utf-8');
$articleContent->createElement('div');
@@ -1025,14 +1023,14 @@ class Readability
$hasContent = false;
+ $this->logger->info('[Rating] Adding top candidate siblings...');
+
/** @var DOMElement $sibling */
foreach ($siblings as $sibling) {
- $this->logger->info('Adding top candidate siblings...');
-
$append = false;
if ($sibling === $topCandidate) {
- $this->logger->debug('Sibling is equal to the top candidate, adding to the final article...');
+ $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');
$append = true;
} else {
@@ -1057,7 +1055,7 @@ class Readability
}
if ($append) {
- $this->logger->debug(sprintf('Appending sibling to final article, content is: %s', substr($sibling->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));
$hasContent = true;
@@ -1091,7 +1089,7 @@ class Readability
$articleDir = $ancestor->getAttribute('dir');
if ($articleDir) {
$this->setDirection($articleDir);
- $this->logger->debug(sprintf('Found article direction: %s', $articleDir));
+ $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
break;
}
}
@@ -1111,7 +1109,7 @@ class Readability
*/
public function prepArticle(DOMDocument $article)
{
- $this->logger->info('Preparing final article...');
+ $this->logger->info('[PrepArticle] Preparing final article...');
$this->_cleanStyles($article);
$this->_clean($article, 'style');
@@ -1151,7 +1149,7 @@ class Readability
$titlesMatch = strpos($this->getTitle(), $h2->item(0)->textContent) !== false;
}
if ($titlesMatch) {
- $this->logger->info('Found title repeated in an H2 node, removing...');
+ $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
$this->_clean($article, 'h2');
}
}
@@ -1175,7 +1173,7 @@ class Readability
foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
$next = $br->nextSibling;
if ($next && $next->nodeName === 'p') {
- $this->logger->debug('Removing br node next to a p node.');
+ $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
$br->parentNode->removeChild($br);
}
}
@@ -1250,8 +1248,6 @@ class Readability
**/
public function _cleanStyles($node)
{
- $this->logger->info('Cleaning styles...');
-
if (property_exists($node, 'tagName') && $node->tagName === 'svg') {
return;
}
@@ -1292,7 +1288,7 @@ class Readability
$next = NodeUtility::getNextNode($node);
while ($next && $next !== $endOfSearchMarkerNode) {
if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) {
- $this->logger->debug(sprintf('Removing matched node with regex: %s, node content was: %s', $regex, substr($next->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $next->getAttribute('class'), $next->getAttribute('id')));
$next = NodeUtility::removeAndGetNext($next);
} else {
$next = NodeUtility::getNextNode($next);
@@ -1321,7 +1317,7 @@ class Readability
$totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) {
- $this->logger->debug(sprintf('Removing extra paragraph. Text content was: %s', substr($paragraph->textContent, 0, 128)));
+ $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
$paragraph->parentNode->removeChild($paragraph);
}
}
@@ -1363,7 +1359,7 @@ class Readability
}
if ($weight < 0) {
- $this->logger->debug(sprintf('Removing tag %s with 0 or less weight', $tag));
+ $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));
NodeUtility::removeNode($node);
continue;
@@ -1403,7 +1399,7 @@ class Readability
(($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
if ($haveToRemove) {
- $this->logger->debug(sprintf('Removing tag %s.', $tag));
+ $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));
NodeUtility::removeNode($node);
}
@@ -1447,7 +1443,7 @@ class Readability
continue;
}
}
- $this->logger->debug(sprintf('Removing node %s.', $item->tagName));
+ $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));
NodeUtility::removeNode($item);
}
@@ -1472,7 +1468,7 @@ class Readability
}
if ($weight < 0) {
- $this->logger->debug(sprintf('Removing H node with 0 or less weight. Content was: %s', substr($header->nodeValue, 0, 128)));
+ $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));
NodeUtility::removeNode($header);
}
@@ -1487,7 +1483,7 @@ class Readability
*/
public function postProcessContent(DOMDocument $article)
{
- $this->logger->info('PostProcessing content...');
+ $this->logger->info('[PostProcess] PostProcessing content...');
// Readability cannot open relative uris so we convert them to absolute uris.
if ($this->configuration->getFixRelativeURLs()) {
@@ -1498,12 +1494,12 @@ class Readability
// Replace links with javascript: URIs with text content, since
// they won't work after scripts have been removed from the page.
if (strpos($href, 'javascript:') === 0) {
- $this->logger->debug(sprintf('Removing \'javascript:\' link. Content is: %s', substr($link->textContent, 0, 128)));
+ $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));
$text = $article->createTextNode($link->textContent);
$link->parentNode->replaceChild($text, $link);
} else {
- $this->logger->debug(sprintf('Converting link to absolute URI: %s', substr($href, 0, 128)));
+ $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));
$link->setAttribute('href', $this->toAbsoluteURI($href));
}
@@ -1525,7 +1521,7 @@ class Readability
$src = array_filter($url);
$src = reset($src);
if ($src) {
- $this->logger->debug(sprintf('Converting image URL to absolute URI: %s', substr($src, 0, 128)));
+ $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));
$img->setAttribute('src', $this->toAbsoluteURI($src));
}