From 50ddfaf18b0b9aae7768facb39a3314d23961835 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Sun, 14 Mar 2021 15:20:59 +0300 Subject: sanitize retrieved wiktionary content (just in case) --- classes/sanitizer.php | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 classes/sanitizer.php (limited to 'classes') diff --git a/classes/sanitizer.php b/classes/sanitizer.php new file mode 100644 index 0000000..cf68632 --- /dev/null +++ b/classes/sanitizer.php @@ -0,0 +1,252 @@ +loadHTML('' . $res); + $xpath = new DOMXPath($doc); + + // is it a good idea to possibly rewrite urls to our own prefix? + // $rewrite_base_url = $site_url ? $site_url : Config::get_self_url(); + $rewrite_base_url = "http://domain.invalid/"; + + $entries = $xpath->query('(//a[@href]|//img[@src]|//source[@srcset|@src])'); + + foreach ($entries as $entry) { + + if ($entry->hasAttribute('href')) { + $entry->setAttribute('href', + self::rewrite_relative($rewrite_base_url, $entry->getAttribute('href'))); + + $entry->setAttribute('rel', 'noopener noreferrer'); + $entry->setAttribute("target", "_blank"); + } + + if ($entry->hasAttribute('src')) { + $entry->setAttribute('src', + self::rewrite_relative($rewrite_base_url, $entry->getAttribute('src'))); + } + + if ($entry->nodeName == 'img') { + $entry->setAttribute('referrerpolicy', 'no-referrer'); + $entry->setAttribute('loading', 'lazy'); + } + + if ($entry->hasAttribute('srcset')) { + $entry->removeAttribute('srcset'); + } + } + + $allowed_elements = array('a', 'abbr', 'address', 'acronym', 'audio', 'article', 'aside', + 'b', 'bdi', 'bdo', 'big', 'blockquote', 'body', 'br', + 'caption', 'cite', 'center', 'code', 'col', 'colgroup', + 'data', 'dd', 'del', 'details', 'description', 'dfn', 'div', 'dl', 'font', + 'dt', 'em', 'footer', 'figure', 'figcaption', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'html', 'i', + 'img', 'ins', 'kbd', 'li', 'main', 'mark', 'nav', 'noscript', + 'ol', 'p', 
'picture', 'pre', 'q', 'ruby', 'rp', 'rt', 's', 'samp', 'section',
			'small', 'source', 'span', 'strike', 'strong', 'sub', 'summary',
			'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time',
			'tr', 'track', 'tt', 'u', 'ul', 'var', 'wbr', 'video', 'xml:namespace' );

		$disallowed_attributes = array('id', 'style', 'class', 'width', 'height', 'allow');

		$doc->removeChild($doc->firstChild); //remove doctype
		$doc = self::strip_harmful_tags($doc, $allowed_elements, $disallowed_attributes);

		$res = $doc->saveHTML();

		/* strip everything outside of ... */

		$res_frag = array();
		if (preg_match('/(.*)<\/body>/is', $res, $res_frag)) {
			return $res_frag[1];
		} else {
			return $res;
		}
	}

	/**
	 * Removes every element that is not whitelisted and strips risky
	 * attributes from the elements that remain.
	 *
	 * An attribute is dropped when its name starts with "on" or "data-",
	 * when it is listed in $disallowed_attributes, or when it is a href/src
	 * whose value is a javascript: URL.
	 *
	 * @param DOMDocument $doc document to clean; it is modified in place
	 * @param array $allowed_elements element names that are allowed to stay
	 * @param array $disallowed_attributes attribute names that are always removed
	 * @return DOMDocument the document instance that was passed in
	 */
	private static function strip_harmful_tags($doc, $allowed_elements, $disallowed_attributes) {
		$xpath = new DOMXPath($doc);
		$entries = $xpath->query('//*');

		foreach ($entries as $entry) {
			if (!in_array($entry->nodeName, $allowed_elements, true)) {
				if ($entry->parentNode) {
					$entry->parentNode->removeChild($entry);
				}

				// the element is gone, cleaning its attributes would be wasted work
				continue;
			}

			if (!$entry->hasAttributes()) {
				continue;
			}

			// collect first, remove afterwards: the attribute map must not be
			// modified while it is being iterated
			$attrs_to_remove = array();

			foreach ($entry->attributes as $attr) {
				$name = strtolower($attr->nodeName);

				$is_script_url = false;

				if ($name === 'href' || $name === 'src') {
					// browsers ignore leading whitespace/control characters and
					// embedded tabs/newlines when they parse the scheme, so the
					// check runs against a copy with those bytes removed
					$compact = (string) preg_replace('/[\x00-\x20]+/', '', $attr->value);
					$is_script_url = stripos($compact, 'javascript:') === 0;
				}

				// single combined condition so every attribute is queued at most once
				if (strpos($name, 'on') === 0
						|| strpos($name, 'data-') === 0
						|| $is_script_url
						|| in_array($name, $disallowed_attributes, true)) {

					$attrs_to_remove[] = $attr;
				}
			}

			foreach ($attrs_to_remove as $attr) {
				$entry->removeAttributeNode($attr);
			}
		}

		return $doc;
	}

	// extended filtering involves validation for safe ports and loopback
	static function validate($url, $extended_filtering = false) {

		$url = trim($url);

		# fix protocol-relative URLs
		if (strpos($url, "//") === 0)
			$url = "https:" . $url;

		$tokens = parse_url($url);

		// this isn't really necessary because filter_var(...
// FILTER_VALIDATE_URL) requires host and scheme
		// as per https://php.watch/versions/7.3/filter-var-flag-deprecation but it might save time
		if (empty($tokens['host']))
			return false;

		// input such as "host:8080" parses to a host without any scheme, so the
		// key has to be checked before it is read
		if (empty($tokens['scheme']) || !in_array(strtolower($tokens['scheme']), ['http', 'https'], true))
			return false;

		//convert IDNA hostname to punycode if possible
		if (function_exists("idn_to_ascii")) {
			if (mb_detect_encoding($tokens['host']) !== 'ASCII') {
				if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) {
					$ascii_host = idn_to_ascii($tokens['host'], IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
				} else {
					$ascii_host = idn_to_ascii($tokens['host']);
				}

				// idn_to_ascii() returns false when the hostname cannot be converted
				if ($ascii_host === false)
					return false;

				$tokens['host'] = $ascii_host;
			}
		}

		// separate set of tokens with urlencoded 'path' because filter_var() rightfully fails on non-latin characters
		// (used for validation only, we actually request the original URL, in case of urlencode breaking it)
		$tokens_filter_var = $tokens;

		if ($tokens['path'] ?? false) {
			$tokens_filter_var['path'] = implode("/",
				array_map("rawurlencode",
					array_map("rawurldecode",
						explode("/", $tokens['path']))));
		}

		$url = self::build_url($tokens);
		$url_filter_var = self::build_url($tokens_filter_var);

		if (filter_var($url_filter_var, FILTER_VALIDATE_URL) === false)
			return false;

		if ($extended_filtering) {
			// parse_url() reports the port as an integer and omits the key when
			// the URL has no explicit port; only the default web ports pass
			if (isset($tokens['port']) && !in_array($tokens['port'], [80, 443], true))
				return false;

			// parse_url() keeps the square brackets around IPv6 literals, so they
			// are removed before comparing against the loopback address
			$bare_host = strtolower(trim($tokens['host'], '[]'));

			if ($bare_host === 'localhost' || $bare_host === '::1' || strpos($bare_host, '127.') === 0)
				return false;
		}

		return $url;
	}

	/**
	 * Reassembles a URL string from an array shaped like the result of parse_url().
	 *
	 * 'scheme' and 'host' are required; 'port', 'path', 'query' and 'fragment'
	 * are appended when present. 'user' and 'pass' are not emitted.
	 *
	 * @param array $parts URL components keyed as in parse_url()
	 * @return string the rebuilt URL
	 */
	static function build_url($parts) {
		$tmp = $parts['scheme'] . "://" . $parts['host'];

		// keep an explicit port, otherwise the rebuilt URL points at a different endpoint
		if (isset($parts['port'])) $tmp .= ':' . $parts['port'];

		if (isset($parts['path'])) $tmp .= $parts['path'];
		if (isset($parts['query'])) $tmp .= '?' . $parts['query'];
		if (isset($parts['fragment'])) $tmp .= '#' .
$parts['fragment'];

		return $tmp;
	}

	/**
	 * Follows HTTP redirects for a URL and returns the address it ends up at.
	 *
	 * Response headers are fetched with get_headers(); on PHP 7.1 and newer a
	 * HEAD request with the given timeout is used. When a Location header is
	 * present, its target is resolved against the current URL and the method
	 * calls itself again, giving up after more than 10 nested calls.
	 *
	 * @param string $url URL to start from
	 * @param int|float $timeout request timeout handed to the HTTP stream context
	 * @param int $nest current recursion depth, callers normally leave it at 0
	 * @return string|false final URL, or false on failure or too many redirects
	 */
	static function resolve_redirects($url, $timeout, $nest = 0) {

		// too many redirects
		if ($nest > 10)
			return false;

		if (version_compare(PHP_VERSION, '7.1.0', '>=')) {
			$context_options = array(
				'http' => array(
					'header' => array(
						'Connection: close'
					),
					'method' => 'HEAD',
					'timeout' => $timeout,
					'protocol_version'=> 1.1)
				);

			$context = stream_context_create($context_options);

			$headers = get_headers($url, 0, $context);
		} else {
			// get_headers() only accepts a context argument since PHP 7.1
			$headers = get_headers($url, 0);
		}

		// request failed?
		if (!is_array($headers))
			return false;

		$headers = array_reverse($headers); // last one is the correct one

		foreach ($headers as $header) {
			if (stripos($header, 'Location:') === 0) {
				$url = self::rewrite_relative($url, trim(substr($header, strlen('Location:'))));

				// bail out if the redirect target could not be turned into a
				// usable URL instead of requesting an empty address
				if (!$url)
					return false;

				return self::resolve_redirects($url, $timeout, $nest + 1);
			}
		}

		return $url;
	}

}
-- cgit v1.2.3