diff options
Diffstat (limited to 'classes/urlhelper.php')
-rw-r--r-- | classes/urlhelper.php | 105 |
1 files changed, 85 insertions, 20 deletions
diff --git a/classes/urlhelper.php b/classes/urlhelper.php index b2c1331b6..2dfb22a5d 100644 --- a/classes/urlhelper.php +++ b/classes/urlhelper.php @@ -6,16 +6,39 @@ class UrlHelper { "tel" ]; + const EXTRA_SCHEMES_BY_CONTENT_TYPE = [ + "application/x-bittorrent" => [ "magnet" ], + ]; + + // TODO: class properties can be switched to PHP typing if/when the minimum PHP_VERSION is raised to 7.4.0+ + /** @var string */ static $fetch_last_error; + + /** @var int */ static $fetch_last_error_code; + + /** @var string */ static $fetch_last_error_content; + + /** @var string */ static $fetch_last_content_type; + + /** @var string */ static $fetch_last_modified; + + /** @var string */ static $fetch_effective_url; + + /** @var string */ static $fetch_effective_ip_addr; + + /** @var bool */ static $fetch_curl_used; - static function build_url($parts) { + /** + * @param array<string, string|int> $parts + */ + static function build_url(array $parts): string { $tmp = $parts['scheme'] . "://" . $parts['host']; if (isset($parts['path'])) $tmp .= $parts['path']; @@ -33,13 +56,29 @@ class UrlHelper { * @param string $rel_url Possibly relative URL in the document * @param string $owner_element Owner element tag name (i.e. "a") (optional) * @param string $owner_attribute Owner attribute (i.e. "href") (optional) + * @param string $content_type URL content type as specified by enclosures, etc. * - * @return string Absolute URL + * @return false|string Absolute URL or false on failure (either during URL parsing or validation) */ - public static function rewrite_relative($base_url, $rel_url, string $owner_element = "", string $owner_attribute = "") { + public static function rewrite_relative($base_url, + $rel_url, + string $owner_element = "", + string $owner_attribute = "", + string $content_type = "") { $rel_parts = parse_url($rel_url); + /** + * If parse_url failed to parse $rel_url return false to match the current "invalid thing" behavior + * of UrlHelper::validate(). + * + * TODO: There are many places where a string return value is assumed. We should either update those + * to account for the possibility of failure, or look into updating this function's return values. + */ + if ($rel_parts === false) { + return false; + } + if (!empty($rel_parts['host']) && !empty($rel_parts['scheme'])) { return self::validate($rel_url); @@ -51,8 +90,13 @@ class UrlHelper { $owner_element == "a" && $owner_attribute == "href") { return $rel_url; + // allow some extra schemes for links with feed-specified content type i.e. enclosures + } else if ($content_type && + isset(self::EXTRA_SCHEMES_BY_CONTENT_TYPE[$content_type]) && + in_array($rel_parts["scheme"], self::EXTRA_SCHEMES_BY_CONTENT_TYPE[$content_type])) { + return $rel_url; // allow limited subset of inline base64-encoded images for IMG elements - } else if ($rel_parts["scheme"] ?? "" == "data" && + } else if (($rel_parts["scheme"] ?? "") == "data" && preg_match('%^image/(webp|gif|jpg|png|svg);base64,%', $rel_parts["path"]) && $owner_element == "img" && $owner_attribute == "src") { @@ -60,8 +104,8 @@ class UrlHelper { } else { $base_parts = parse_url($base_url); - $rel_parts['host'] = $base_parts['host']; - $rel_parts['scheme'] = $base_parts['scheme']; + $rel_parts['host'] = $base_parts['host'] ?? ""; + $rel_parts['scheme'] = $base_parts['scheme'] ?? ""; if (isset($rel_parts['path'])) { @@ -80,8 +124,10 @@ class UrlHelper { } } - // extended filtering involves validation for safe ports and loopback - static function validate($url, $extended_filtering = false) { + /** extended filtering involves validation for safe ports and loopback + * @return false|string false if something went wrong, otherwise the URL string + */ + static function validate(string $url, bool $extended_filtering = false) { $url = clean($url); @@ -107,6 +153,11 @@ class UrlHelper { } else { $tokens['host'] = idn_to_ascii($tokens['host']); } + + // if `idn_to_ascii` failed + if ($tokens['host'] === false) { + return false; + } } } @@ -138,7 +189,10 @@ class UrlHelper { return $url; } - static function resolve_redirects($url, $timeout, $nest = 0) { + /** + * @return false|string + */ + static function resolve_redirects(string $url, int $timeout, int $nest = 0) { // too many redirects if ($nest > 10) @@ -162,8 +216,12 @@ class UrlHelper { $context = stream_context_create($context_options); + // PHP 8 changed the second param from int to bool, but we still support PHP >= 7.1.0 + // @phpstan-ignore-next-line $headers = get_headers($url, 0, $context); } else { + // PHP 8 changed the second param from int to bool, but we still support PHP >= 7.1.0 + // @phpstan-ignore-next-line $headers = get_headers($url, 0); } @@ -185,12 +243,16 @@ class UrlHelper { return false; } - // TODO: max_size currently only works for CURL transfers + /** + * @param array<string, bool|int|string>|string $options + * @return false|string false if something went wrong, otherwise string contents + */ + // TODO: max_size currently only works for CURL transfers // TODO: multiple-argument way is deprecated, first parameter is a hash now public static function fetch($options /* previously: 0: $url , 1: $type = false, 2: $login = false, 3: $pass = false, 4: $post_query = false, 5: $timeout = false, 6: $timestamp = 0, 7: $useragent = false*/) { - self::$fetch_last_error = false; + self::$fetch_last_error = ""; self::$fetch_last_error_code = -1; self::$fetch_last_error_content = ""; self::$fetch_last_content_type = ""; @@ -239,6 +301,8 @@ class UrlHelper { $url = ltrim($url, ' '); $url = str_replace(' ', '%20', $url); + Debug::log("[UrlHelper] fetching: $url", Debug::LOG_EXTENDED); + $url = self::validate($url, true); if (!$url) { @@ -275,15 +339,15 @@ class UrlHelper { curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout ? $timeout : Config::get(Config::FILE_FETCH_CONNECT_TIMEOUT)); curl_setopt($ch, CURLOPT_TIMEOUT, $timeout ? $timeout : Config::get(Config::FILE_FETCH_TIMEOUT)); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir") && $followlocation); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $followlocation); curl_setopt($ch, CURLOPT_MAXREDIRS, 20); curl_setopt($ch, CURLOPT_BINARYTRANSFER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_HEADER, true); curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY); - curl_setopt($ch, CURLOPT_USERAGENT, $useragent ? $useragent : - SELF_USER_AGENT); + curl_setopt($ch, CURLOPT_USERAGENT, $useragent ? $useragent : Config::get_user_agent()); curl_setopt($ch, CURLOPT_ENCODING, ""); + curl_setopt($ch, CURLOPT_COOKIEJAR, "/dev/null"); if ($http_referrer) curl_setopt($ch, CURLOPT_REFERER, $http_referrer); @@ -298,7 +362,7 @@ class UrlHelper { //Debug::log("[curl progressfunction] $downloaded $max_size", Debug::$LOG_EXTENDED); if ($downloaded > $max_size) { - Debug::log("curl: reached max size of $max_size bytes requesting $url, aborting.", Debug::LOG_VERBOSE); + Debug::log("[UrlHelper] fetch error: curl reached max size of $max_size bytes downloading $url, aborting.", Debug::LOG_VERBOSE); return 1; } @@ -307,10 +371,6 @@ class UrlHelper { } - if (!ini_get("open_basedir")) { - curl_setopt($ch, CURLOPT_COOKIEJAR, "/dev/null"); - } - if (Config::get(Config::HTTP_PROXY)) { curl_setopt($ch, CURLOPT_PROXY, Config::get(Config::HTTP_PROXY)); } @@ -374,6 +434,8 @@ class UrlHelper { if (curl_errno($ch) != 0) { self::$fetch_last_error .= "; " . curl_errno($ch) . " " . curl_error($ch); + } else { + self::$fetch_last_error = "HTTP Code: $http_code "; } self::$fetch_last_error_content = $contents; @@ -510,7 +572,10 @@ class UrlHelper { } } - public static function url_to_youtube_vid($url) { + /** + * @return false|string false if the provided URL didn't match expected patterns, otherwise the video ID string + */ + public static function url_to_youtube_vid(string $url) { $url = str_replace("youtube.com", "youtube-nocookie.com", $url); $regexps = [ |