From 550fe35fbf3bbefa3257ddb7ec32afaea48b6726 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Wed, 28 Dec 2016 16:23:07 -0300 Subject: Removed the private var title since it wasn't used --- src/HTMLParser.php | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index a8c28ff..6f7afe7 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -26,11 +26,6 @@ class HTMLParser */ private $metadata = []; - /** - * @var array - */ - private $title = []; - /** * @var array */ @@ -137,7 +132,7 @@ class HTMLParser $this->metadata = $this->getMetadata(); - $this->title = $this->getTitle(); + $this->metadata['title'] = $this->getTitle(); // Checking for minimum HTML to work with. if (!($root = $this->dom->getElementsByTagName('body')->item(0))) { @@ -162,7 +157,11 @@ class HTMLParser // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff // that doesn't really count to the quality of the result. 
- if ($result && mb_strlen($result->textContent) < 500) { + $length = 0; + foreach($result->getElementsByTagName('p') as $p){ + $length += mb_strlen($p->textContent); + } + if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { $root = $this->backupdom->getElementsByTagName('body')->item(0); if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { -- cgit v1.2.3 From ae573a6fe5d369c014b427b813fdc0bdfc20de54 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sun, 12 Feb 2017 20:13:02 +0000 Subject: Extract top image when og:image and twitter:image are missing on the HTML --- src/HTMLParser.php | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 6f7afe7..5866bd9 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -132,6 +132,8 @@ class HTMLParser $this->metadata = $this->getMetadata(); + $this->metadata['image'] = $this->getMainImage(); + $this->metadata['title'] = $this->getTitle(); // Checking for minimum HTML to work with. @@ -158,7 +160,7 @@ class HTMLParser // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff // that doesn't really count to the quality of the result. $length = 0; - foreach($result->getElementsByTagName('p') as $p){ + foreach ($result->getElementsByTagName('p') as $p) { $length += mb_strlen($p->textContent); } if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { @@ -431,11 +433,39 @@ class HTMLParser if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) { $metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image']; + } else { + $metadata['image'] = null; } return $metadata; } + /** + * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't + * find a correct image. 
+ * + * @return bool|string URL of the top image or false if unsuccessful. + */ + public function getMainImage() + { + if ($this->metadata['image'] !== null) { + return $this->metadata['image']; + } + + foreach ($this->dom->getElementsByTagName('link') as $link) { + /** @var \DOMElement $link */ + /* + * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and + * finally check for the existence of the href attribute, which should hold the image url. + */ + if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')){ + return $link->getAttribute('href'); + } + } + + return false; + } + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. -- cgit v1.2.3 From 0f99c53420f05187dc3af9fe04f1129e14dfe96e Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Tue, 21 Feb 2017 18:40:42 +0000 Subject: Fixed test cases and added function to replace font tags with span + param to setNodeTag to keep attributes from original node. 
--- src/HTMLParser.php | 11 ++++++++++- src/Readability.php | 20 ++++++++++++++------ 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5866bd9..5fb27b8 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -290,6 +290,15 @@ class HTMLParser } } } + + // Replace font tags with span + $fonts = $this->dom->getElementsByTagName('font'); + $length = $fonts->length; + for ($i = 0; $i < $length; $i++) { + $font = $fonts->item($length - 1 - $i); + $span = new Readability($font); + $span->setNodeTag('span', true); + } } public function postProcessContent(DOMDocument $article) @@ -458,7 +467,7 @@ class HTMLParser * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and * finally check for the existence of the href attribute, which should hold the image url. */ - if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')){ + if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) { return $link->getAttribute('href'); } } diff --git a/src/Readability.php b/src/Readability.php index 027858f..44633b2 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -253,7 +253,7 @@ class Readability extends Element implements ReadabilityInterface { // Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception) if (method_exists($this->node, 'setAttribute')) { - $this->contentScore = (float) $score; + $this->contentScore = (float)$score; // Set score in an attribute of the tag to prevent losing it while creating new Readability objects. 
$this->node->setAttribute('data-readability', $this->contentScore); @@ -286,8 +286,9 @@ class Readability extends Element implements ReadabilityInterface * element with the new tag name and importing it to the main DOMDocument. * * @param string $value + * @param bool $importAttributes */ - public function setNodeTag($value) + public function setNodeTag($value, $importAttributes = false) { $new = new \DOMDocument(); $new->appendChild($new->createElement($value)); @@ -298,6 +299,13 @@ class Readability extends Element implements ReadabilityInterface $new->firstChild->appendChild($import); } + if ($importAttributes) { + // Import attributes from the original node. + foreach ($this->node->attributes as $attribute) { + $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); + } + } + // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. $import = $this->node->ownerDocument->importNode($new->firstChild, true); $this->node->parentNode->replaceChild($import, $this->node); @@ -335,7 +343,7 @@ class Readability extends Element implements ReadabilityInterface * for parents. * * @param Readability $originalNode - * @param bool $ignoreSelfAndKids + * @param bool $ignoreSelfAndKids * * @return Readability */ @@ -411,7 +419,7 @@ class Readability extends Element implements ReadabilityInterface * Creates a new node based on the text content of the original node. * * @param Readability $originalNode - * @param string $tagName + * @param string $tagName * * @return Readability */ @@ -458,8 +466,8 @@ class Readability extends Element implements ReadabilityInterface * provided one. 
* * @param Readability $node - * @param string $tagName - * @param int $maxDepth + * @param string $tagName + * @param int $maxDepth * * @return bool */ -- cgit v1.2.3 From 3b73cde640956aa08cee59a9be44d941a819b5e6 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 3 Mar 2017 16:58:42 +0000 Subject: Functions to normalize space and disable substitute entities --- src/HTMLParser.php | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5fb27b8..1b1a516 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -35,7 +35,7 @@ class HTMLParser 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/g', + 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', @@ -99,6 +99,8 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, + 'normalizeSpaces' => false, + 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -202,6 +204,11 @@ class HTMLParser */ private function loadHTML($html) { + if (!$this->getConfig()->getOption('substituteEntities')) { + // Keep the original HTML entities + $this->dom->substituteEntities = false; + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('' . 
$html); $this->dom->encoding = 'UTF-8'; @@ -334,6 +341,12 @@ class HTMLParser } } + if ($this->getConfig()->getOption('normalizeSpaces')) { + foreach ($article->getElementsByTagName('p') as $node) { + $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); + } + } + return $article; } -- cgit v1.2.3 From a0d42e6578d641f83443b863a333e7b1a9d50357 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Tue, 7 Mar 2017 15:41:24 +0000 Subject: Fuck this, we are not going to normalize blank space. --- src/HTMLParser.php | 7 ------- 1 file changed, 7 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 1b1a516..1ef4489 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -99,7 +99,6 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, - 'normalizeSpaces' => false, 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -341,12 +340,6 @@ class HTMLParser } } - if ($this->getConfig()->getOption('normalizeSpaces')) { - foreach ($article->getElementsByTagName('p') as $node) { - $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); - } - } - return $article; } -- cgit v1.2.3 From 77c9754c302fa3112bb4f739913df9f2785fc8df Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Thu, 9 Mar 2017 10:00:57 +0000 Subject: Fixed small mistake when getting the articleByLine. 
Corrected test case --- src/HTMLParser.php | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 1ef4489..5d02d05 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -1112,7 +1112,14 @@ class HTMLParser */ private function checkByline($node, $matchString) { - if ($this->getConfig()->getOption('articleByLine')) { + if (!$this->getConfig()->getOption('articleByLine')) { + return false; + } + + /* + * Check if the byline is already set + */ + if (isset($this->metadata['byline'])) { return false; } -- cgit v1.2.3 From a9b9bd9aa00322c988847de6de91f37ba5e89034 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 10 Mar 2017 10:48:44 +0000 Subject: Fixed all test cases and bugs, now 100% of our tests pass. BREAK OUT THE CHAMPAGNE! --- src/HTMLParser.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5d02d05..0b407ae 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -626,7 +626,7 @@ class HTMLParser continue; } // Discard nodes with less than 25 characters, without blank space - if (mb_strlen($node->getValue(true)) < 25) { + if (mb_strlen($node->getTextContent(true)) < 25) { continue; } @@ -641,10 +641,10 @@ class HTMLParser $contentScore = 1; // Add points for any commas within this paragraph. - $contentScore += count(explode(',', $node->getValue(true))); + $contentScore += count(explode(',', $node->getTextContent(true))); // For every 100 characters in this paragraph, add another point. Up to 3 points. - $contentScore += min(floor(mb_strlen($node->getValue(true)) / 100), 3); + $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3); // Initialize and score ancestors. 
/** @var Readability $ancestor */ -- cgit v1.2.3 From 52b15698db988be358bc36bcca3ac8facd6003e2 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 10 Mar 2017 10:54:32 +0000 Subject: Apply fixes from StyleCI --- src/HTMLParser.php | 3 ++- src/Readability.php | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 0b407ae..3226904 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -533,6 +533,7 @@ class HTMLParser * Gets nodes from the root element. * * @param $node Readability + * * @return array */ private function getNodes(Readability $node) @@ -1106,7 +1107,7 @@ class HTMLParser * Checks if the node is a byline. * * @param Readability $node - * @param string $matchString + * @param string $matchString * * @return bool */ diff --git a/src/Readability.php b/src/Readability.php index 44633b2..c55e0ad 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -253,7 +253,7 @@ class Readability extends Element implements ReadabilityInterface { // Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception) if (method_exists($this->node, 'setAttribute')) { - $this->contentScore = (float)$score; + $this->contentScore = (float) $score; // Set score in an attribute of the tag to prevent losing it while creating new Readability objects. $this->node->setAttribute('data-readability', $this->contentScore); @@ -286,7 +286,7 @@ class Readability extends Element implements ReadabilityInterface * element with the new tag name and importing it to the main DOMDocument. * * @param string $value - * @param bool $importAttributes + * @param bool $importAttributes */ public function setNodeTag($value, $importAttributes = false) { @@ -343,7 +343,7 @@ class Readability extends Element implements ReadabilityInterface * for parents. 
* * @param Readability $originalNode - * @param bool $ignoreSelfAndKids + * @param bool $ignoreSelfAndKids * * @return Readability */ @@ -419,7 +419,7 @@ class Readability extends Element implements ReadabilityInterface * Creates a new node based on the text content of the original node. * * @param Readability $originalNode - * @param string $tagName + * @param string $tagName * * @return Readability */ @@ -466,8 +466,8 @@ class Readability extends Element implements ReadabilityInterface * provided one. * * @param Readability $node - * @param string $tagName - * @param int $maxDepth + * @param string $tagName + * @param int $maxDepth * * @return bool */ -- cgit v1.2.3