summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/HTMLParser.php51
1 files changed, 47 insertions, 4 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 77c68cf..a492307 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -187,6 +187,7 @@ class HTMLParser
'title' => isset($this->metadata['title']) ? $this->metadata['title'] : null,
'author' => isset($this->metadata['author']) ? $this->metadata['author'] : null,
'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
+ 'images' => $this->getImages(),
'article' => $result,
'html' => $result->C14N(),
'dir' => isset($this->metadata['articleDir']) ? $this->metadata['articleDir'] : null,
@@ -338,10 +339,7 @@ class HTMLParser
public function postProcessContent(DOMDocument $article)
{
- $url = $this->getConfig()->getOption('originalURL');
- $pathBase = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . dirname(parse_url($url, PHP_URL_PATH)) . '/';
- $scheme = parse_url($pathBase, PHP_URL_SCHEME);
- $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);
+ list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->getConfig()->getOption('originalURL'));
// Readability cannot open relative uris so we convert them to absolute uris.
if ($this->getConfig()->getOption('fixRelativeURLs')) {
@@ -403,6 +401,20 @@ class HTMLParser
return $pathBase . $uri;
}
+ /**
+  * Derive the base locations used to resolve relative URIs against a page URL.
+  *
+  * The URL is parsed once. The directory part is everything up to and including the
+  * last "/" of the path, so "/a/b.html" and "/a/" both yield "/a/", and a URL without
+  * a path yields "/". An explicit port is kept as part of the host.
+  *
+  * @param string $url Absolute URL of the document being processed
+  *
+  * @return array [$pathBase, $scheme, $prePath], e.g. for "http://example.com/a/b.html":
+  *               ["http://example.com/a/", "http", "http://example.com"]
+  */
+ public function getPathInfo($url)
+ {
+     // parse_url() returns false on seriously malformed input; treat that as "no parts".
+     $parts = parse_url((string) $url);
+     if (!is_array($parts)) {
+         $parts = [];
+     }
+
+     $scheme = isset($parts['scheme']) ? $parts['scheme'] : '';
+     $host = isset($parts['host']) ? $parts['host'] : '';
+     if (isset($parts['port'])) {
+         // Without the port, relative links would resolve against the wrong origin.
+         $host .= ':' . $parts['port'];
+     }
+
+     // Cut at the last slash instead of using dirname(): dirname('/index.html') is '/'
+     // (which produced "host//") and dirname('/a/b/') is '/a' (which lost a directory).
+     $path = isset($parts['path']) ? $parts['path'] : '/';
+     $lastSlash = strrpos($path, '/');
+     $directory = (false === $lastSlash) ? '/' : substr($path, 0, $lastSlash + 1);
+
+     $prePath = $scheme . '://' . $host;
+     $pathBase = $prePath . $directory;
+
+     return [$pathBase, $scheme, $prePath];
+ }
+
+
private function nextElement($node)
{
$next = $node;
@@ -511,6 +523,37 @@ class HTMLParser
}
/**
+  * Collect the URLs of the images associated with the document.
+  *
+  * The list starts with the metadata image (when one is set), followed by the
+  * non-empty `src` attribute of every <img> element found in the parsed DOM.
+  * When the `fixRelativeURLs` option is enabled, each entry is passed through
+  * toAbsoluteURI() using the path info derived from the `originalURL` option.
+  *
+  * @return array Unique, non-empty image URLs as a sequentially indexed list
+  */
+ public function getImages()
+ {
+     $result = [];
+     if (!empty($this->metadata['image'])) {
+         $result[] = $this->metadata['image'];
+     }
+
+     // Without a parsed document there are no <img> elements to inspect.
+     // The loose comparison is kept on purpose: the type of $this->dom is not visible here.
+     if (null == $this->dom) {
+         return $result;
+     }
+
+     foreach ($this->dom->getElementsByTagName('img') as $img) {
+         $src = $img->getAttribute('src');
+         if ($src) {
+             $result[] = $src;
+         }
+     }
+
+     if ($this->getConfig()->getOption('fixRelativeURLs')) {
+         list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->getConfig()->getOption('originalURL'));
+         // Write back by key rather than iterating by reference, so no dangling
+         // reference to the last element survives the loop.
+         foreach ($result as $index => $imgSrc) {
+             $result[$index] = $this->toAbsoluteURI($imgSrc, $pathBase, $scheme, $prePath);
+         }
+     }
+
+     // array_filter() and array_unique() both preserve keys; reindex so that removing a
+     // duplicate does not leave gaps (a gapped array would JSON-encode as an object).
+     return array_values(array_unique(array_filter($result)));
+ }
+
+ /**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
*