diff options
author | Andres Rey <[email protected]> | 2017-03-10 11:10:46 +0000 |
---|---|---|
committer | GitHub <[email protected]> | 2017-03-10 11:10:46 +0000 |
commit | 10c528c0b98164be5da0bfd097b0190e26e6de5f (patch) | |
tree | 30adee993e1c52abc398e7f6e2a0b0723c73ecf9 /src | |
parent | 2a493bcc6cd8c175c531a26b6c9b061e911dcf39 (diff) | |
parent | a2d10aa920780447c946ac64efa7c095c854bff6 (diff) |
Merge pull request #18 from andreskrey/development
Prepare for release v0.2.0
Diffstat (limited to 'src')
-rw-r--r-- | src/HTMLParser.php | 78 | ||||
-rw-r--r-- | src/Readability.php | 10 |
2 files changed, 74 insertions, 14 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 0313b2a..bc9aa9f 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -29,18 +29,13 @@ class HTMLParser /** * @var array */ - private $title = []; - - /** - * @var array - */ private $regexps = [ 'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i', 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/g', + 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', @@ -104,6 +99,7 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, + 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -137,7 +133,9 @@ class HTMLParser $this->metadata = $this->getMetadata(); - $this->title = $this->getTitle(); + $this->metadata['image'] = $this->getMainImage(); + + $this->metadata['title'] = $this->getTitle(); // Checking for minimum HTML to work with. if (!($root = $this->dom->getElementsByTagName('body')->item(0))) { @@ -162,7 +160,11 @@ class HTMLParser // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff // that doesn't really count to the quality of the result. - if ($result && mb_strlen($result->textContent) < 500) { + $length = 0; + foreach ($result->getElementsByTagName('p') as $p) { + $length += mb_strlen($p->textContent); + } + if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { $root = $this->backupdom->getElementsByTagName('body')->item(0); if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { @@ -205,6 +207,11 @@ class HTMLParser */ private function loadHTML($html) { + if (!$this->getConfig()->getOption('substituteEntities')) { + // Keep the original HTML entities + $this->dom->substituteEntities = false; + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); $this->dom->encoding = 'UTF-8'; @@ -293,6 +300,15 @@ class HTMLParser } } } + + // Replace font tags with span + $fonts = $this->dom->getElementsByTagName('font'); + $length = $fonts->length; + for ($i = 0; $i < $length; $i++) { + $font = $fonts->item($length - 1 - $i); + $span = new Readability($font); + $span->setNodeTag('span', true); + } } public function postProcessContent(DOMDocument $article) @@ -436,12 +452,40 @@ class HTMLParser if (array_key_exists('og:image', $values) || array_key_exists('twitter:image', $values)) { $metadata['image'] = ($values['og:image']) ? $values['og:image'] : $values['twitter:image']; + } else { + $metadata['image'] = null; } return $metadata; } /** + * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't + * find a correct image. + * + * @return bool|string URL of the top image or false if unsuccessful. + */ + public function getMainImage() + { + if ($this->metadata['image'] !== null) { + return $this->metadata['image']; + } + + foreach ($this->dom->getElementsByTagName('link') as $link) { + /** @var \DOMElement $link */ + /* + * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and + * finally check for the existence of the href attribute, which should hold the image url. + */ + if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) { + return $link->getAttribute('href'); + } + } + + return false; + } + + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. * @@ -493,6 +537,7 @@ class HTMLParser * Gets nodes from the root element. * * @param $node Readability + * * @return array */ private function getNodes(Readability $node) @@ -586,7 +631,7 @@ class HTMLParser continue; } // Discard nodes with less than 25 characters, without blank space - if (mb_strlen($node->getValue(true)) < 25) { + if (mb_strlen($node->getTextContent(true)) < 25) { continue; } @@ -601,10 +646,10 @@ class HTMLParser $contentScore = 1; // Add points for any commas within this paragraph. - $contentScore += count(explode(',', $node->getValue(true))); + $contentScore += count(explode(',', $node->getTextContent(true))); // For every 100 characters in this paragraph, add another point. Up to 3 points. - $contentScore += min(floor(mb_strlen($node->getValue(true)) / 100), 3); + $contentScore += min(floor(mb_strlen($node->getTextContent(true)) / 100), 3); // Initialize and score ancestors. /** @var Readability $ancestor */ @@ -1066,13 +1111,20 @@ class HTMLParser * Checks if the node is a byline. * * @param Readability $node - * @param string $matchString + * @param string $matchString * * @return bool */ private function checkByline($node, $matchString) { - if ($this->getConfig()->getOption('articleByLine')) { + if (!$this->getConfig()->getOption('articleByLine')) { + return false; + } + + /* + * Check if the byline is already set + */ + if (isset($this->metadata['byline'])) { return false; } diff --git a/src/Readability.php b/src/Readability.php index 027858f..c55e0ad 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -286,8 +286,9 @@ class Readability extends Element implements ReadabilityInterface * element with the new tag name and importing it to the main DOMDocument. * * @param string $value + * @param bool $importAttributes */ - public function setNodeTag($value) + public function setNodeTag($value, $importAttributes = false) { $new = new \DOMDocument(); $new->appendChild($new->createElement($value)); @@ -298,6 +299,13 @@ class Readability extends Element implements ReadabilityInterface $new->firstChild->appendChild($import); } + if ($importAttributes) { + // Import attributes from the original node. + foreach ($this->node->attributes as $attribute) { + $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); + } + } + // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. $import = $this->node->ownerDocument->importNode($new->firstChild, true); $this->node->parentNode->replaceChild($import, $this->node); |