Browse Source

DOMDocument: remove the old meta-charset Unicode hacks and replace them with a shorter XML-preamble UTF-8 hack (on loadHTML() calls where it makes sense)
af_readability: possibly better charset hack for non-Unicode pages

Andrew Dolgov 1 year ago
parent
commit
671f4cee65

+ 1 - 1
classes/handler/public.php

@@ -329,7 +329,7 @@ class Handler_Public extends Handler {
 		if (!$og_image) {
 			$tmpdoc = new DOMDocument();
 
-			if (@$tmpdoc->loadHTML(mb_substr($content, 0, 131070))) {
+			if (@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . mb_substr($content, 0, 131070))) {
 				$tmpxpath = new DOMXPath($tmpdoc);
 				$imgs = $tmpxpath->query("//img");
 

+ 1 - 5
classes/rssutils.php

@@ -1200,12 +1200,8 @@ class RSSUtils {
 	static function cache_media($html, $site_url) {
 		libxml_use_internal_errors(true);
 
-		$charset_hack = '<head>
-			<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-		</head>';
-
 		$doc = new DOMDocument();
-		$doc->loadHTML($charset_hack . $html);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $html);
 		$xpath = new DOMXPath($doc);
 
 		$entries = $xpath->query('(//img[@src])|(//video/source[@src])|(//audio/source[@src])');

+ 5 - 15
include/functions.php

@@ -562,7 +562,7 @@
 			libxml_use_internal_errors(true);
 
 			$doc = new DOMDocument();
-			$doc->loadHTML($html);
+			$doc->loadHTML('<?xml encoding="UTF-8">' . $html);
 			$xpath = new DOMXPath($doc);
 
 			$base = $xpath->query('/html/head/base[@href]');
@@ -1518,14 +1518,10 @@
 	// plugins work on original source URLs used before caching
 
 	function rewrite_cached_urls($str) {
-		$charset_hack = '<head>
-				<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-			</head>';
-
 		$res = trim($str); if (!$res) return '';
 
 		$doc = new DOMDocument();
-		$doc->loadHTML($charset_hack . $res);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $res);
 		$xpath = new DOMXPath($doc);
 
 		$entries = $xpath->query('(//img[@src]|//picture/source[@src]|//video[@poster]|//video/source[@src]|//audio/source[@src])');
@@ -1580,16 +1576,10 @@
 
 		$res = trim($str); if (!$res) return '';
 
-		$charset_hack = '<head>
-				<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-			</head>';
-
-		$res = trim($res); if (!$res) return '';
-
 		libxml_use_internal_errors(true);
 
 		$doc = new DOMDocument();
-		$doc->loadHTML($charset_hack . $res);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $res);
 		$xpath = new DOMXPath($doc);
 
 		$rewrite_base_url = $site_url ? $site_url : get_self_url_prefix();
@@ -2115,7 +2105,7 @@
 		libxml_use_internal_errors(true);
 
 		$doc = new DOMDocument();
-		$doc->loadHTML($content);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
 		$xpath = new DOMXPath($doc);
 		$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
 			'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
@@ -2136,7 +2126,7 @@
 	}
 
 	function is_html($content) {
-		return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 100)) !== 0;
+		return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
 	}
 
 	function url_is_html($url, $login = false, $pass = false) {

+ 1 - 5
plugins/af_fsckportal/init.php

@@ -19,11 +19,7 @@ class Af_Fsckportal extends Plugin {
 
 			$doc = new DOMDocument();
 
-			$charset_hack = '<head>
-				<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-			</head>';
-
-			@$doc->loadHTML($charset_hack . $article["content"]);
+			@$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
 
 			if ($doc) {
 				$xpath = new DOMXPath($doc);

+ 3 - 8
plugins/af_readability/init.php

@@ -172,14 +172,10 @@ class Af_Readability extends Plugin {
 			if (!$tmpdoc->loadHTML($tmp))
 				return false;
 
+			// this is the worst hack yet :(
 			if (strtolower($tmpdoc->encoding) != 'utf-8') {
-				$tmpxpath = new DOMXPath($tmpdoc);
-
-				foreach ($tmpxpath->query("//meta") as $elem) {
-					$elem->parentNode->removeChild($elem);
-				}
-
-				$tmp = $tmpdoc->saveHTML();
+				$tmp = preg_replace("/<meta.*?charset.*?\/>/i", "", $tmp);
+				$tmp = mb_convert_encoding($tmp, 'utf-8', $tmpdoc->encoding);
 			}
 
 			try {
@@ -210,7 +206,6 @@ class Af_Readability extends Plugin {
 			} catch (Exception $e) {
 				return false;
 			}
-
 		}
 
 		return false;

+ 2 - 6
plugins/af_tumblr_1280/init.php

@@ -25,12 +25,8 @@ class Af_Tumblr_1280 extends Plugin {
 		if (!function_exists("curl_init") || ini_get("open_basedir"))
 			return $article;
 
-		$charset_hack = '<head>
-			<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-		</head>';
-
 		$doc = new DOMDocument();
-		$doc->loadHTML($charset_hack . $article["content"]);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
 
 		$found = false;
 
@@ -92,4 +88,4 @@ class Af_Tumblr_1280 extends Plugin {
 		return 2;
 	}
 
-}
+}

+ 1 - 1
plugins/af_zz_imgproxy/init.php

@@ -155,7 +155,7 @@ class Af_Zz_ImgProxy extends Plugin {
 		$proxy_all = $this->host->get($this, "proxy_all");
 
 		$doc = new DOMDocument();
-		if (@$doc->loadHTML($article["content"])) {
+		if (@$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"])) {
 			$xpath = new DOMXPath($doc);
 			$imgs = $xpath->query("//img[@src]");
 

+ 1 - 5
plugins/cache_starred_images/init.php

@@ -190,12 +190,8 @@ class Cache_Starred_Images extends Plugin implements IHandler {
             return;
         }
 
-		$charset_hack = '<head>
-			<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-		</head>';
-
 		$doc = new DOMDocument();
-		$doc->loadHTML($charset_hack . $content);
+		$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
 		$xpath = new DOMXPath($doc);
 
 		$entries = $xpath->query('(//img[@src])|(//video/source[@src])');