summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-12-23 07:12:14 -0300
committerAndres Rey <[email protected]>2016-12-23 07:12:14 -0300
commitb4bd54c469c9d9e24adbfadec50932fb319230a7 (patch)
tree2fab8fdc21b85a0329fcee6de3d006ac51043d18 /src
parentd024c2d80a38d721d6d1d0fd812a706a5fc22562 (diff)
New function to solve relative URLs
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php73
1 files changed, 72 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 58da4b7..2a300dc 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -102,7 +102,9 @@ class HTMLParser
'stripUnlikelyCandidates' => true,
'cleanConditionally' => true,
'weightClasses' => true,
- 'removeReadabilityTags' => true
+ 'removeReadabilityTags' => true,
+ 'fixRelativeURLs' => true,
+ 'originalURL' => 'http://fakehost'
];
$this->environment = Environment::createDefaultEnvironment($defaults);
@@ -176,6 +178,7 @@ class HTMLParser
}
}
+ $result = $this->postProcessContent($result);
// Todo, fix return, check for values, maybe create a function to create the return object
return [
@@ -282,6 +285,74 @@ class HTMLParser
}
}
+ /**
+  * Post-processes the extracted article by rewriting relative link and image
+  * URLs as absolute ones, resolved against the 'originalURL' option.
+  *
+  * Nothing is changed unless the 'fixRelativeURLs' option is enabled. Anchors
+  * whose href is a "javascript:" URI are replaced by a plain text node holding
+  * their text content, since such links cannot work once scripts are removed.
+  *
+  * @param DOMDocument $article Document to process; it is modified in place.
+  *
+  * @return DOMDocument The same document instance that was passed in.
+  */
+ public function postProcessContent(DOMDocument $article)
+ {
+     if (!$this->getConfig()->getOption('fixRelativeURLs')) {
+         return $article;
+     }
+
+     // Parse the original URL once; parse_url() returns false on seriously
+     // malformed input, in which case every component falls back to a default.
+     $parts = parse_url($this->getConfig()->getOption('originalURL'));
+     if (!is_array($parts)) {
+         $parts = [];
+     }
+     $scheme = isset($parts['scheme']) ? $parts['scheme'] : '';
+     $host = isset($parts['host']) ? $parts['host'] : '';
+     // Keep an explicit port, otherwise rewritten links point at the wrong origin.
+     $port = isset($parts['port']) ? ':' . $parts['port'] : '';
+     $path = isset($parts['path']) ? $parts['path'] : '/';
+
+     $prePath = $scheme . '://' . $host . $port;
+
+     // The base directory is everything up to and including the last "/" of
+     // the path. dirname() is not used because it yields "//" for "/page" and
+     // drops the final directory for paths that already end in "/".
+     $lastSlash = strrpos($path, '/');
+     $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);
+     $pathBase = $prePath . $directory;
+
+     // getElementsByTagName() returns a live list; copy it first so replacing
+     // a link below does not make the loop skip the following element.
+     foreach (iterator_to_array($article->getElementsByTagName('a'), false) as $link) {
+         /** @var \DOMElement $link */
+         $href = $link->getAttribute('href');
+         if ($href === '') {
+             continue;
+         }
+
+         // URI schemes are case-insensitive, hence stripos().
+         if (stripos($href, 'javascript:') === 0) {
+             // Create the text node through the link's owner document: the
+             // ownerDocument of a DOMDocument itself is null.
+             $text = $link->ownerDocument->createTextNode($link->textContent);
+             $link->parentNode->replaceChild($text, $link);
+         } else {
+             $link->setAttribute('href', $this->toAbsoluteURI($href, $pathBase, $scheme, $prePath));
+         }
+     }
+
+     foreach ($article->getElementsByTagName('img') as $img) {
+         /** @var \DOMElement $img */
+         $src = $img->getAttribute('src');
+         if ($src !== '') {
+             $img->setAttribute('src', $this->toAbsoluteURI($src, $pathBase, $scheme, $prePath));
+         }
+     }
+
+     return $article;
+ }
+
+ /**
+  * Converts a possibly relative URI into an absolute one.
+  *
+  * @param string $uri      URI as found in the document.
+  * @param string $pathBase Base directory URL; expected to end with "/".
+  * @param string $scheme   Scheme (without "://") used for "//host/..." URIs.
+  * @param string $prePath  Scheme and host prefix used for "/path" URIs.
+  *
+  * @return string The absolute URI, or $uri unchanged when it already has a
+  *                scheme or is a fragment-only ("#...") reference.
+  */
+ private function toAbsoluteURI($uri, $pathBase, $scheme, $prePath)
+ {
+     // URIs that already carry a scheme, and fragment-only references, are
+     // passed through untouched.
+     $hasScheme = preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri);
+     if ($hasScheme || strncmp($uri, '#', 1) === 0) {
+         return $uri;
+     }
+
+     // Scheme-relative ("//host/path"): only the scheme has to be prepended.
+     if (strncmp($uri, '//', 2) === 0) {
+         return $scheme . ':' . $uri;
+     }
+
+     // Host-relative ("/path"): append to the scheme + host prefix.
+     if (strncmp($uri, '/', 1) === 0) {
+         return $prePath . $uri;
+     }
+
+     // Everything else is relative to the base directory, which already ends
+     // with "/"; a leading "./" is redundant and dropped first.
+     $relative = strncmp($uri, './', 2) === 0 ? substr($uri, 2) : $uri;
+
+     return $pathBase . $relative;
+ }
+
+
private function nextElement($node)
{
$next = $node;