diff options
author | Andrew Dolgov <[email protected]> | 2020-09-22 09:04:33 +0300 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2020-09-22 09:04:33 +0300 |
commit | 74568df4ff7b7788991636f6fb2ed62012f85c3b (patch) | |
tree | 673bcb01157b38e4b38f8f4c8227012e8a750e18 /classes/urlhelper.php | |
parent | d04ac399ff284e9747e3fb55e87d05e0a5b8d85f (diff) |
remove a lot of stuff from global context (functions.php), add a few helper classes instead
Diffstat (limited to 'classes/urlhelper.php')
-rw-r--r-- | classes/urlhelper.php | 474 |
1 files changed, 474 insertions, 0 deletions
diff --git a/classes/urlhelper.php b/classes/urlhelper.php new file mode 100644 index 000000000..c57d0b9d9 --- /dev/null +++ b/classes/urlhelper.php @@ -0,0 +1,474 @@ +<?php +class UrlHelper { + static function build_url($parts) { + $tmp = $parts['scheme'] . "://" . $parts['host'] . $parts['path']; + + if (isset($parts['query'])) $tmp .= '?' . $parts['query']; + if (isset($parts['fragment'])) $tmp .= '#' . $parts['fragment']; + + return $tmp; + } + + /** + * Converts a (possibly) relative URL to a absolute one. + * + * @param string $url Base URL (i.e. from where the document is) + * @param string $rel_url Possibly relative URL in the document + * + * @return string Absolute URL + */ + public static function rewrite_relative($url, $rel_url) { + + $rel_parts = parse_url($rel_url); + + if ($rel_parts['host'] && $rel_parts['scheme']) { + return UrlHelper::validate($rel_url); + } else if (strpos($rel_url, "//") === 0) { + # protocol-relative URL (rare but they exist) + return UrlHelper::validate("https:" . $rel_url); + } else if (strpos($rel_url, "magnet:") === 0) { + # allow magnet links + return $rel_url; + } else { + $parts = parse_url($url); + + $rel_parts['host'] = $parts['host']; + $rel_parts['scheme'] = $parts['scheme']; + + if (strpos($rel_parts['path'], '/') !== 0) + $rel_parts['path'] = '/' . $rel_parts['path']; + + $rel_parts['path'] = str_replace("/./", "/", $rel_parts['path']); + $rel_parts['path'] = str_replace("//", "/", $rel_parts['path']); + + return UrlHelper::validate(UrlHelper::build_url($rel_parts)); + } + } + + // extended filtering involves validation for safe ports and loopback + static function validate($url, $extended_filtering = false) { + + $url = clean($url); + + # fix protocol-relative URLs + if (strpos($url, "//") === 0) + $url = "https:" . $url; + + if (filter_var($url, FILTER_VALIDATE_URL) === false) + return false; + + $tokens = parse_url($url); + + if (!$tokens['host']) + return false; + + if (!in_array(strtolower($tokens['scheme']), ['http', 'https'])) + return false; + + if ($extended_filtering) { + if (!in_array($tokens['port'], [80, 443, ''])) + return false; + + if (strtolower($tokens['host']) == 'localhost' || $tokens['host'] == '::1' || strpos($tokens['host'], '127.') === 0) + return false; + } + + //convert IDNA hostname to punycode if possible + if (function_exists("idn_to_ascii")) { + if (mb_detect_encoding($tokens['host']) != 'ASCII') { + $parts['host'] = idn_to_ascii($tokens['host']); + $url = UrlHelper::build_url($tokens); + } + } + + return $url; + } + + static function resolve_redirects($url, $timeout, $nest = 0) { + + // too many redirects + if ($nest > 10) + return false; + + if (version_compare(PHP_VERSION, '7.1.0', '>=')) { + $context_options = array( + 'http' => array( + 'header' => array( + 'Connection: close' + ), + 'method' => 'HEAD', + 'timeout' => $timeout, + 'protocol_version'=> 1.1) + ); + + if (defined('_HTTP_PROXY')) { + $context_options['http']['request_fulluri'] = true; + $context_options['http']['proxy'] = _HTTP_PROXY; + } + + $context = stream_context_create($context_options); + + $headers = get_headers($url, 0, $context); + } else { + $headers = get_headers($url, 0); + } + + if (is_array($headers)) { + $headers = array_reverse($headers); // last one is the correct one + + foreach($headers as $header) { + if (stripos($header, 'Location:') === 0) { + $url = UrlHelper::rewrite_relative($url, trim(substr($header, strlen('Location:')))); + + return resolve_redirects($url, $timeout, $nest + 1); + } + } + + return $url; + } + + // request failed? + return false; + } + + // TODO: max_size currently only works for CURL transfers + // TODO: multiple-argument way is deprecated, first parameter is a hash now + public static function fetch($options /* previously: 0: $url , 1: $type = false, 2: $login = false, 3: $pass = false, + 4: $post_query = false, 5: $timeout = false, 6: $timestamp = 0, 7: $useragent = false*/) { + + global $fetch_last_error; + global $fetch_last_error_code; + global $fetch_last_error_content; + global $fetch_last_content_type; + global $fetch_last_modified; + global $fetch_effective_url; + global $fetch_effective_ip_addr; + global $fetch_curl_used; + global $fetch_domain_hits; + + $fetch_last_error = false; + $fetch_last_error_code = -1; + $fetch_last_error_content = ""; + $fetch_last_content_type = ""; + $fetch_curl_used = false; + $fetch_last_modified = ""; + $fetch_effective_url = ""; + $fetch_effective_ip_addr = ""; + + if (!is_array($fetch_domain_hits)) + $fetch_domain_hits = []; + + if (!is_array($options)) { + + // falling back on compatibility shim + $option_names = [ "url", "type", "login", "pass", "post_query", "timeout", "last_modified", "useragent" ]; + $tmp = []; + + for ($i = 0; $i < func_num_args(); $i++) { + $tmp[$option_names[$i]] = func_get_arg($i); + } + + $options = $tmp; + + /*$options = array( + "url" => func_get_arg(0), + "type" => @func_get_arg(1), + "login" => @func_get_arg(2), + "pass" => @func_get_arg(3), + "post_query" => @func_get_arg(4), + "timeout" => @func_get_arg(5), + "timestamp" => @func_get_arg(6), + "useragent" => @func_get_arg(7) + ); */ + } + + $url = $options["url"]; + $type = isset($options["type"]) ? $options["type"] : false; + $login = isset($options["login"]) ? $options["login"] : false; + $pass = isset($options["pass"]) ? $options["pass"] : false; + $post_query = isset($options["post_query"]) ? $options["post_query"] : false; + $timeout = isset($options["timeout"]) ? $options["timeout"] : false; + $last_modified = isset($options["last_modified"]) ? $options["last_modified"] : ""; + $useragent = isset($options["useragent"]) ? $options["useragent"] : false; + $followlocation = isset($options["followlocation"]) ? $options["followlocation"] : true; + $max_size = isset($options["max_size"]) ? $options["max_size"] : MAX_DOWNLOAD_FILE_SIZE; // in bytes + $http_accept = isset($options["http_accept"]) ? $options["http_accept"] : false; + $http_referrer = isset($options["http_referrer"]) ? $options["http_referrer"] : false; + + $url = ltrim($url, ' '); + $url = str_replace(' ', '%20', $url); + + $url = UrlHelper::validate($url, true); + + if (!$url) { + $fetch_last_error = "Requested URL failed extended validation."; + return false; + } + + $url_host = parse_url($url, PHP_URL_HOST); + $ip_addr = gethostbyname($url_host); + + if (!$ip_addr || strpos($ip_addr, "127.") === 0) { + $fetch_last_error = "URL hostname failed to resolve or resolved to a loopback address ($ip_addr)"; + return false; + } + + $fetch_domain_hits[$url_host] += 1; + + /*if ($fetch_domain_hits[$url_host] > MAX_FETCH_REQUESTS_PER_HOST) { + user_error("Exceeded fetch request quota for $url_host: " . $fetch_domain_hits[$url_host], E_USER_WARNING); + #return false; + }*/ + + if (!defined('NO_CURL') && function_exists('curl_init') && !ini_get("open_basedir")) { + + $fetch_curl_used = true; + + $ch = curl_init($url); + + $curl_http_headers = []; + + if ($last_modified && !$post_query) + array_push($curl_http_headers, "If-Modified-Since: $last_modified"); + + if ($http_accept) + array_push($curl_http_headers, "Accept: " . $http_accept); + + if (count($curl_http_headers) > 0) + curl_setopt($ch, CURLOPT_HTTPHEADER, $curl_http_headers); + + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout ? $timeout : FILE_FETCH_CONNECT_TIMEOUT); + curl_setopt($ch, CURLOPT_TIMEOUT, $timeout ? $timeout : FILE_FETCH_TIMEOUT); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, !ini_get("open_basedir") && $followlocation); + curl_setopt($ch, CURLOPT_MAXREDIRS, 20); + curl_setopt($ch, CURLOPT_BINARYTRANSFER, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_HEADER, true); + curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY); + curl_setopt($ch, CURLOPT_USERAGENT, $useragent ? $useragent : + SELF_USER_AGENT); + curl_setopt($ch, CURLOPT_ENCODING, ""); + + if ($http_referrer) + curl_setopt($ch, CURLOPT_REFERER, $http_referrer); + + if ($max_size) { + curl_setopt($ch, CURLOPT_NOPROGRESS, false); + curl_setopt($ch, CURLOPT_BUFFERSIZE, 16384); // needed to get 5 arguments in progress function? + + // holy shit closures in php + // download & upload are *expected* sizes respectively, could be zero + curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function($curl_handle, $download_size, $downloaded, $upload_size, $uploaded) use( &$max_size) { + Debug::log("[curl progressfunction] $downloaded $max_size", Debug::$LOG_EXTENDED); + + return ($downloaded > $max_size) ? 1 : 0; // if max size is set, abort when exceeding it + }); + + } + + if (!ini_get("open_basedir")) { + curl_setopt($ch, CURLOPT_COOKIEJAR, "/dev/null"); + } + + if (defined('_HTTP_PROXY')) { + curl_setopt($ch, CURLOPT_PROXY, _HTTP_PROXY); + } + + if ($post_query) { + curl_setopt($ch, CURLOPT_POST, true); + curl_setopt($ch, CURLOPT_POSTFIELDS, $post_query); + } + + if ($login && $pass) + curl_setopt($ch, CURLOPT_USERPWD, "$login:$pass"); + + $ret = @curl_exec($ch); + + $headers_length = curl_getinfo($ch, CURLINFO_HEADER_SIZE); + $headers = explode("\r\n", substr($ret, 0, $headers_length)); + $contents = substr($ret, $headers_length); + + foreach ($headers as $header) { + if (strstr($header, ": ") !== false) { + list ($key, $value) = explode(": ", $header); + + if (strtolower($key) == "last-modified") { + $fetch_last_modified = $value; + } + } + + if (substr(strtolower($header), 0, 7) == 'http/1.') { + $fetch_last_error_code = (int) substr($header, 9, 3); + $fetch_last_error = $header; + } + } + + if (curl_errno($ch) === 23 || curl_errno($ch) === 61) { + curl_setopt($ch, CURLOPT_ENCODING, 'none'); + $contents = @curl_exec($ch); + } + + $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE); + $fetch_last_content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE); + + $fetch_effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + + if (!UrlHelper::validate($fetch_effective_url, true)) { + $fetch_last_error = "URL received after redirection failed extended validation."; + + return false; + } + + $fetch_effective_ip_addr = gethostbyname(parse_url($fetch_effective_url, PHP_URL_HOST)); + + if (!$fetch_effective_ip_addr || strpos($fetch_effective_ip_addr, "127.") === 0) { + $fetch_last_error = "URL hostname received after redirection failed to resolve or resolved to a loopback address ($fetch_effective_ip_addr)"; + + return false; + } + + $fetch_last_error_code = $http_code; + + if ($http_code != 200 || $type && strpos($fetch_last_content_type, "$type") === false) { + + if (curl_errno($ch) != 0) { + $fetch_last_error .= "; " . curl_errno($ch) . " " . curl_error($ch); + } + + $fetch_last_error_content = $contents; + curl_close($ch); + return false; + } + + if (!$contents) { + $fetch_last_error = curl_errno($ch) . " " . curl_error($ch); + curl_close($ch); + return false; + } + + curl_close($ch); + + $is_gzipped = RSSUtils::is_gzipped($contents); + + if ($is_gzipped) { + $tmp = @gzdecode($contents); + + if ($tmp) $contents = $tmp; + } + + return $contents; + } else { + + $fetch_curl_used = false; + + if ($login && $pass){ + $url_parts = array(); + + preg_match("/(^[^:]*):\/\/(.*)/", $url, $url_parts); + + $pass = urlencode($pass); + + if ($url_parts[1] && $url_parts[2]) { + $url = $url_parts[1] . "://$login:$pass@" . $url_parts[2]; + } + } + + // TODO: should this support POST requests or not? idk + + $context_options = array( + 'http' => array( + 'header' => array( + 'Connection: close' + ), + 'method' => 'GET', + 'ignore_errors' => true, + 'timeout' => $timeout ? $timeout : FILE_FETCH_TIMEOUT, + 'protocol_version'=> 1.1) + ); + + if (!$post_query && $last_modified) + array_push($context_options['http']['header'], "If-Modified-Since: $last_modified"); + + if ($http_accept) + array_push($context_options['http']['header'], "Accept: $http_accept"); + + if ($http_referrer) + array_push($context_options['http']['header'], "Referer: $http_referrer"); + + if (defined('_HTTP_PROXY')) { + $context_options['http']['request_fulluri'] = true; + $context_options['http']['proxy'] = _HTTP_PROXY; + } + + $context = stream_context_create($context_options); + + $old_error = error_get_last(); + + $fetch_effective_url = resolve_redirects($url, $timeout ? $timeout : FILE_FETCH_CONNECT_TIMEOUT); + + if (!UrlHelper::validate($fetch_effective_url, true)) { + $fetch_last_error = "URL received after redirection failed extended validation."; + + return false; + } + + $fetch_effective_ip_addr = gethostbyname(parse_url($fetch_effective_url, PHP_URL_HOST)); + + if (!$fetch_effective_ip_addr || strpos($fetch_effective_ip_addr, "127.") === 0) { + $fetch_last_error = "URL hostname received after redirection failed to resolve or resolved to a loopback address ($fetch_effective_ip_addr)"; + + return false; + } + + $data = @file_get_contents($url, false, $context); + + if (isset($http_response_header) && is_array($http_response_header)) { + foreach ($http_response_header as $header) { + if (strstr($header, ": ") !== false) { + list ($key, $value) = explode(": ", $header); + + $key = strtolower($key); + + if ($key == 'content-type') { + $fetch_last_content_type = $value; + // don't abort here b/c there might be more than one + // e.g. if we were being redirected -- last one is the right one + } else if ($key == 'last-modified') { + $fetch_last_modified = $value; + } else if ($key == 'location') { + $fetch_effective_url = $value; + } + } + + if (substr(strtolower($header), 0, 7) == 'http/1.') { + $fetch_last_error_code = (int) substr($header, 9, 3); + $fetch_last_error = $header; + } + } + } + + if ($fetch_last_error_code != 200) { + $error = error_get_last(); + + if ($error['message'] != $old_error['message']) { + $fetch_last_error .= "; " . $error["message"]; + } + + $fetch_last_error_content = $data; + + return false; + } + + $is_gzipped = RSSUtils::is_gzipped($data); + + if ($is_gzipped) { + $tmp = @gzdecode($data); + + if ($tmp) $data = $tmp; + } + + return $data; + } + } + +} |