From 72a26f10493ceb677fdfc50823d44ca759301175 Mon Sep 17 00:00:00 2001
From: Pedro Amorim
Date: Thu, 16 Nov 2017 16:54:48 +0100
Subject: Add getImages()

Get all image URLs of the current DOM at once.
---
 src/HTMLParser.php | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 77c68cf..a492307 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -187,6 +187,7 @@ class HTMLParser
             'title' => isset($this->metadata['title']) ? $this->metadata['title'] : null,
             'author' => isset($this->metadata['author']) ? $this->metadata['author'] : null,
             'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
+            'images' => $this->getImages(),
             'article' => $result,
             'html' => $result->C14N(),
             'dir' => isset($this->metadata['articleDir']) ? $this->metadata['articleDir'] : null,
@@ -338,10 +339,7 @@ class HTMLParser
 
     public function postProcessContent(DOMDocument $article)
     {
-        $url = $this->getConfig()->getOption('originalURL');
-        $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/';
-        $scheme = parse_url($pathBase, PHP_URL_SCHEME);
-        $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);
+        list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->getConfig()->getOption('originalURL'));
 
         // Readability cannot open relative uris so we convert them to absolute uris.
         if ($this->getConfig()->getOption('fixRelativeURLs')) {
@@ -403,6 +401,23 @@ class HTMLParser
         return $pathBase . $uri;
     }
 
+    /**
+     * Break a URL down into the parts used to resolve relative URIs.
+     *
+     * @param string $url
+     *
+     * @return array [$pathBase, $scheme, $prePath]: $pathBase is scheme://host followed by the
+     *               directory of the URL path and a trailing slash; $prePath is scheme://host
+     */
+    public function getPathInfo($url)
+    {
+        $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/';
+        $scheme = parse_url($pathBase, PHP_URL_SCHEME);
+        $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);
+
+        return [$pathBase, $scheme, $prePath];
+    }
+
     private function nextElement($node)
     {
         $next = $node;
@@ -510,6 +525,47 @@ class HTMLParser
         return false;
     }
 
+    /**
+     * Collect the URLs of all images known for the current document.
+     *
+     * The metadata image, when present, comes first, followed by the src attribute of every
+     * img element in the DOM. If the fixRelativeURLs option is set, each URL is passed through
+     * toAbsoluteURI() using the originalURL option. Empty values and duplicates are removed.
+     *
+     * When no DOM is available, only the metadata image (if any) is returned, unmodified.
+     *
+     * @return array Sequentially indexed list of image URLs
+     */
+    public function getImages()
+    {
+        $result = [];
+        if (!empty($this->metadata['image'])) {
+            $result[] = $this->metadata['image'];
+        }
+        if (null == $this->dom) {
+            return $result;
+        }
+
+        foreach ($this->dom->getElementsByTagName('img') as $img) {
+            if ($src = $img->getAttribute('src')) {
+                $result[] = $src;
+            }
+        }
+
+        if ($this->getConfig()->getOption('fixRelativeURLs')) {
+            list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->getConfig()->getOption('originalURL'));
+            // Assign by key rather than by reference so no dangling reference outlives the loop.
+            foreach ($result as $key => $imgSrc) {
+                $result[$key] = $this->toAbsoluteURI($imgSrc, $pathBase, $scheme, $prePath);
+            }
+        }
+
+        // array_filter() and array_unique() keep the original keys; reindex so callers get a plain list.
+        $result = array_values(array_unique(array_filter($result)));
+
+        return $result;
+    }
+
     /**
      * Get the density of links as a percentage of the content
      * This is the amount of text that is inside a link divided by the total text in the node.
-- 
cgit v1.2.3