loadHTML('' . $res); $xpath = new DOMXPath($doc); // is it a good idea to possibly rewrite urls to our own prefix? // $rewrite_base_url = $site_url ? $site_url : Config::get_self_url(); $rewrite_base_url = "http://domain.invalid/"; $entries = $xpath->query('(//a[@href]|//img[@src]|//source[@srcset|@src])'); foreach ($entries as $entry) { if ($entry->hasAttribute('href')) { $entry->setAttribute('href', self::rewrite_relative($rewrite_base_url, $entry->getAttribute('href'))); $entry->setAttribute('rel', 'noopener noreferrer'); $entry->setAttribute("target", "_blank"); } if ($entry->hasAttribute('src')) { $entry->setAttribute('src', self::rewrite_relative($rewrite_base_url, $entry->getAttribute('src'))); } if ($entry->nodeName == 'img') { $entry->setAttribute('referrerpolicy', 'no-referrer'); $entry->setAttribute('loading', 'lazy'); } if ($entry->hasAttribute('srcset')) { $entry->removeAttribute('srcset'); } } $allowed_elements = array('a', 'abbr', 'address', 'acronym', 'audio', 'article', 'aside', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'body', 'br', 'caption', 'cite', 'center', 'code', 'col', 'colgroup', 'data', 'dd', 'del', 'details', 'description', 'dfn', 'div', 'dl', 'font', 'dt', 'em', 'footer', 'figure', 'figcaption', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'html', 'i', 'img', 'ins', 'kbd', 'li', 'main', 'mark', 'nav', 'noscript', 'ol', 'p', 'picture', 'pre', 'q', 'ruby', 'rp', 'rt', 's', 'samp', 'section', 'small', 'source', 'span', 'strike', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'track', 'tt', 'u', 'ul', 'var', 'wbr', 'video', 'xml:namespace' ); $disallowed_attributes = array('id', 'style', 'class', 'width', 'height', 'allow'); $doc->removeChild($doc->firstChild); //remove doctype $doc = self::strip_harmful_tags($doc, $allowed_elements, $disallowed_attributes); $res = $doc->saveHTML(); /* strip everything outside of ... */ $res_frag = array(); if (preg_match('/(.*)<\/body>/is', $res, $res_frag)) { return $res_frag[1]; } else { return $res; } } /** * @param array $allowed_elements * @param array $disallowed_attributes * */ private static function strip_harmful_tags(DOMDocument $doc, array $allowed_elements, array $disallowed_attributes) : DOMDocument { $xpath = new DOMXPath($doc); $entries = $xpath->query('//*'); foreach ($entries as $entry) { if (!in_array($entry->nodeName, $allowed_elements)) { $entry->parentNode->removeChild($entry); } if ($entry->hasAttributes()) { $attrs_to_remove = array(); foreach ($entry->attributes as $attr) { if (strpos($attr->nodeName, 'on') === 0) { array_push($attrs_to_remove, $attr); } if (strpos($attr->nodeName, "data-") === 0) { array_push($attrs_to_remove, $attr); } if ($attr->nodeName == 'href' && stripos($attr->value, 'javascript:') === 0) { array_push($attrs_to_remove, $attr); } if (in_array($attr->nodeName, $disallowed_attributes)) { array_push($attrs_to_remove, $attr); } } foreach ($attrs_to_remove as $attr) { $entry->removeAttributeNode($attr); } } } return $doc; } // extended filtering involves validation for safe ports and loopback /** @return string|bool */ static function validate(string $url, bool $extended_filtering = false) : mixed { $url = trim($url); # fix protocol-relative URLs if (strpos($url, "//") === 0) $url = "https:" . $url; $tokens = parse_url($url); // this isn't really necessary because filter_var(... FILTER_VALIDATE_URL) requires host and scheme // as per https://php.watch/versions/7.3/filter-var-flag-deprecation but it might save time if (empty($tokens['host'])) return false; if (!in_array(strtolower($tokens['scheme']), ['http', 'https'])) return false; //convert IDNA hostname to punycode if possible if (function_exists("idn_to_ascii")) { if (mb_detect_encoding($tokens['host']) != 'ASCII') { if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) { $tokens['host'] = idn_to_ascii($tokens['host'], IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46); } else { $tokens['host'] = idn_to_ascii($tokens['host']); } } } // separate set of tokens with urlencoded 'path' because filter_var() rightfully fails on non-latin characters // (used for validation only, we actually request the original URL, in case of urlencode breaking it) $tokens_filter_var = $tokens; if ($tokens['path'] ?? false) { $tokens_filter_var['path'] = implode("/", array_map("rawurlencode", array_map("rawurldecode", explode("/", $tokens['path'])))); } $url = self::build_url($tokens); $url_filter_var = self::build_url($tokens_filter_var); if (filter_var($url_filter_var, FILTER_VALIDATE_URL) === false) return false; if ($extended_filtering) { if (!in_array($tokens['port'] ?? '', [80, 443, ''])) return false; if (strtolower($tokens['host']) == 'localhost' || $tokens['host'] == '::1' || strpos($tokens['host'], '127.') === 0) return false; } return $url; } /** @param array $parts */ static function build_url(array $parts) : string { $tmp = $parts['scheme'] . "://" . $parts['host']; if (isset($parts['path'])) $tmp .= $parts['path']; if (isset($parts['query'])) $tmp .= '?' . $parts['query']; if (isset($parts['fragment'])) $tmp .= '#' . $parts['fragment']; return $tmp; } /** @return string|bool */ static function resolve_redirects(string $url, int $timeout, int $nest = 0) : mixed { // too many redirects if ($nest > 10) return false; if (version_compare(PHP_VERSION, '7.1.0', '>=')) { $context_options = array( 'http' => array( 'header' => array( 'Connection: close' ), 'method' => 'HEAD', 'timeout' => $timeout, 'protocol_version'=> 1.1) ); $context = stream_context_create($context_options); $headers = get_headers($url, false, $context); } else { $headers = get_headers($url, false); } if (is_array($headers)) { $headers = array_reverse($headers); // last one is the correct one foreach($headers as $header) { if (stripos($header, 'Location:') === 0) { $url = self::rewrite_relative($url, trim(substr($header, strlen('Location:')))); return self::resolve_redirects($url, $timeout, $nest + 1); } } return $url; } // request failed? return false; } }