diff options
Diffstat (limited to 'src/Readability.php')
-rw-r--r-- | src/Readability.php | 84 |
1 files changed, 67 insertions, 17 deletions
diff --git a/src/Readability.php b/src/Readability.php index 40baa1c..ab856f7 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -8,6 +8,7 @@ use andreskrey\Readability\Nodes\DOM\DOMNode; use andreskrey\Readability\Nodes\DOM\DOMText; use andreskrey\Readability\Nodes\NodeUtility; use Psr\Log\LoggerInterface; +use \Masterminds\HTML5; /** * Class Readability. @@ -71,6 +72,14 @@ class Readability protected $direction = null; /** + * Base URI + * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml + * + * @var string|null + */ + protected $baseURI = null; + + /** * Configuration object. * * @var Configuration @@ -254,26 +263,46 @@ class Readability // To avoid throwing a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); - $dom = new DOMDocument('1.0', 'utf-8'); + //$html = preg_replace('/(<br[^>]*>[ \n\r\t]*){2,}/i', '</p><p>', $html); + + if ($this->configuration->getParser() === 'html5') { + $this->logger->debug('[Loading] Using HTML5 parser...'); + $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]); + $dom = $html5->loadHTML($html); + //TODO: Improve this so it looks inside <html><head><base>, not just any <base> + $base = $dom->getElementsByTagName('base'); + if ($base->length > 0) { + $base = $base->item(0); + $base = $base->getAttribute('href'); + if ($base != '') { + $this->baseURI = $base; + } + } + } else { + $this->logger->debug('[Loading] Using libxml parser...'); + $dom = new DOMDocument('1.0', 'utf-8'); + if ($this->configuration->getNormalizeEntities()) { + $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.'); + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + } if (!$this->configuration->getSubstituteEntities()) { // Keep the original HTML entities $dom->substituteEntities = false; } - if ($this->configuration->getNormalizeEntities()) { - $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.'); - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - if ($this->configuration->getSummonCthulhu()) { $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘'); $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); } // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + if ($this->configuration->getParser() !== 'html5') { + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $this->baseURI = $dom->baseURI; + } $dom->encoding = 'UTF-8'; $this->removeScripts($dom); @@ -611,13 +640,13 @@ class Readability public function getPathInfo($url) { // Check for base URLs - if ($this->dom->baseURI !== null) { - if (substr($this->dom->baseURI, 0, 1) === '/') { + if ($this->baseURI !== null) { + if (substr($this->baseURI, 0, 1) === '/') { // URLs starting with '/' override completely the URL defined in the link - $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . $this->dom->baseURI; + $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . $this->baseURI; } else { // Otherwise just prepend the base to the actual path - $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/' . rtrim($this->dom->baseURI, '/') . '/'; + $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/' . rtrim($this->baseURI, '/') . '/'; } } else { $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/'; @@ -774,8 +803,9 @@ class Readability } $rel = $node->getAttribute('rel'); + $itemprop = $node->getAttribute("itemprop"); - if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { + if ($rel === 'author' || ($itemprop && strpos($itemprop, 'author') !== false) || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent())); $this->setAuthor(trim($node->getTextContent())); @@ -886,6 +916,10 @@ class Readability $p->removeChild($p->lastChild); } + while ($p && $p->firstChild && $p->firstChild->isWhitespace()) { + $p->removeChild($p->firstChild); + } + if ($p->parentNode->tagName === 'p') { NodeUtility::setNodeTag($p->parentNode, 'div'); } @@ -1270,7 +1304,7 @@ class Readability $this->_cleanExtraParagraphs($article); foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) { - $next = $br->nextSibling; + $next = NodeUtility::nextElement($br->nextSibling); if ($next && $next->nodeName === 'p') { $this->logger->debug('[PrepArticle] Removing br node next to a p node.'); $br->parentNode->removeChild($br); @@ -1525,6 +1559,17 @@ class Readability } } + public function _getAllNodesWithTag($node, array $tagNames) { + $nodes = []; + foreach ($tagNames as $tag) { + $nodeList = $node->getElementsByTagName($tag); + foreach ($nodeList as $n) { + $nodes[] = $n; + } + } + return $nodes; + } + /** * Clean a node of all elements of type "tag". * (Unless it's a youtube/vimeo video. People love movies.). @@ -1556,8 +1601,8 @@ class Readability continue; } - // Then check the elements inside this element for the same. - if (preg_match(NodeUtility::$regexps['videos'], $item->C14N())) { + // For embed with <object> tag, check inner HTML as well. + if ($item->tagName === 'object' && preg_match(NodeUtility::$regexps['videos'], $item->C14N())) { continue; } } @@ -1703,7 +1748,12 @@ class Readability */ public function getContent() { - return ($this->content instanceof DOMDocument) ? $this->content->C14N() : null; + if ($this->content instanceof DOMDocument) { + $html5 = new HTML5(['disable_html_ns' => true]); + return $html5->saveHTML($this->content); + } else { + return null; + } } /** |