summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andrew Dolgov <[email protected]> 2019-03-21 21:08:02 +0300
committer Andrew Dolgov <[email protected]> 2019-03-21 21:08:02 +0300
commit 671f4cee657f36881eeeea7e5d314034252e3ee7 (patch)
tree 2dc84b4a15b591e94366b37f39418f6e70a8e593
parent 3bd3324e5a9171e6cca20b44d1569da41a1d4874 (diff)
domdocument: remove old meta charset unicode hacks, replace with shorter xml preamble utf8 hack (on loadhtml where it makes sense)
af_readability: better (?) charset hack for non-unicode pages
-rwxr-xr-x classes/handler/public.php 2
-rwxr-xr-x classes/rssutils.php 6
-rwxr-xr-x include/functions.php 20
-rw-r--r-- plugins/af_fsckportal/init.php 6
-rwxr-xr-x plugins/af_readability/init.php 11
-rwxr-xr-x plugins/af_tumblr_1280/init.php 8
-rwxr-xr-x plugins/af_zz_imgproxy/init.php 2
-rwxr-xr-x plugins/cache_starred_images/init.php 6
8 files changed, 15 insertions, 46 deletions
diff --git a/classes/handler/public.php b/classes/handler/public.php
index f73427cbf..318cecd72 100755
--- a/classes/handler/public.php
+++ b/classes/handler/public.php
@@ -329,7 +329,7 @@ class Handler_Public extends Handler {
if (!$og_image) {
$tmpdoc = new DOMDocument();
- if (@$tmpdoc->loadHTML(mb_substr($content, 0, 131070))) {
+ if (@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . mb_substr($content, 0, 131070))) {
$tmpxpath = new DOMXPath($tmpdoc);
$imgs = $tmpxpath->query("//img");
diff --git a/classes/rssutils.php b/classes/rssutils.php
index 82a6963e8..5aff2f8a3 100755
--- a/classes/rssutils.php
+++ b/classes/rssutils.php
@@ -1200,12 +1200,8 @@ class RSSUtils {
static function cache_media($html, $site_url) {
libxml_use_internal_errors(true);
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
$doc = new DOMDocument();
- $doc->loadHTML($charset_hack . $html);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
$xpath = new DOMXPath($doc);
$entries = $xpath->query('(//img[@src])|(//video/source[@src])|(//audio/source[@src])');
diff --git a/include/functions.php b/include/functions.php
index 5ebd4e0ff..d59e79126 100755
--- a/include/functions.php
+++ b/include/functions.php
@@ -562,7 +562,7 @@
libxml_use_internal_errors(true);
$doc = new DOMDocument();
- $doc->loadHTML($html);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
$xpath = new DOMXPath($doc);
$base = $xpath->query('/html/head/base[@href]');
@@ -1518,14 +1518,10 @@
// plugins work on original source URLs used before caching
function rewrite_cached_urls($str) {
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
$res = trim($str); if (!$res) return '';
$doc = new DOMDocument();
- $doc->loadHTML($charset_hack . $res);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $res);
$xpath = new DOMXPath($doc);
$entries = $xpath->query('(//img[@src]|//picture/source[@src]|//video[@poster]|//video/source[@src]|//audio/source[@src])');
@@ -1580,16 +1576,10 @@
$res = trim($str); if (!$res) return '';
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
- $res = trim($res); if (!$res) return '';
-
libxml_use_internal_errors(true);
$doc = new DOMDocument();
- $doc->loadHTML($charset_hack . $res);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $res);
$xpath = new DOMXPath($doc);
$rewrite_base_url = $site_url ? $site_url : get_self_url_prefix();
@@ -2115,7 +2105,7 @@
libxml_use_internal_errors(true);
$doc = new DOMDocument();
- $doc->loadHTML($content);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $content);
$xpath = new DOMXPath($doc);
$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
@@ -2136,7 +2126,7 @@
}
function is_html($content) {
- return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 100)) !== 0;
+ return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
}
function url_is_html($url, $login = false, $pass = false) {
diff --git a/plugins/af_fsckportal/init.php b/plugins/af_fsckportal/init.php
index 0fa58e9ed..04b77a15a 100644
--- a/plugins/af_fsckportal/init.php
+++ b/plugins/af_fsckportal/init.php
@@ -19,11 +19,7 @@ class Af_Fsckportal extends Plugin {
$doc = new DOMDocument();
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
- @$doc->loadHTML($charset_hack . $article["content"]);
+ @$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
if ($doc) {
$xpath = new DOMXPath($doc);
diff --git a/plugins/af_readability/init.php b/plugins/af_readability/init.php
index 117646c30..32c54a2c7 100755
--- a/plugins/af_readability/init.php
+++ b/plugins/af_readability/init.php
@@ -172,14 +172,10 @@ class Af_Readability extends Plugin {
if (!$tmpdoc->loadHTML($tmp))
return false;
+ // this is the worst hack yet :(
if (strtolower($tmpdoc->encoding) != 'utf-8') {
- $tmpxpath = new DOMXPath($tmpdoc);
-
- foreach ($tmpxpath->query("//meta") as $elem) {
- $elem->parentNode->removeChild($elem);
- }
-
- $tmp = $tmpdoc->saveHTML();
+ $tmp = preg_replace("/<meta.*?charset.*?\/>/i", "", $tmp);
+ $tmp = mb_convert_encoding($tmp, 'utf-8', $tmpdoc->encoding);
}
try {
@@ -210,7 +206,6 @@ class Af_Readability extends Plugin {
} catch (Exception $e) {
return false;
}
-
}
return false;
diff --git a/plugins/af_tumblr_1280/init.php b/plugins/af_tumblr_1280/init.php
index 8aba0e652..5d7f366a4 100755
--- a/plugins/af_tumblr_1280/init.php
+++ b/plugins/af_tumblr_1280/init.php
@@ -25,12 +25,8 @@ class Af_Tumblr_1280 extends Plugin {
if (!function_exists("curl_init") || ini_get("open_basedir"))
return $article;
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
$doc = new DOMDocument();
- $doc->loadHTML($charset_hack . $article["content"]);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"]);
$found = false;
@@ -92,4 +88,4 @@ class Af_Tumblr_1280 extends Plugin {
return 2;
}
-}
\ No newline at end of file
+}
diff --git a/plugins/af_zz_imgproxy/init.php b/plugins/af_zz_imgproxy/init.php
index 2cd8fcaf0..b172d4563 100755
--- a/plugins/af_zz_imgproxy/init.php
+++ b/plugins/af_zz_imgproxy/init.php
@@ -155,7 +155,7 @@ class Af_Zz_ImgProxy extends Plugin {
$proxy_all = $this->host->get($this, "proxy_all");
$doc = new DOMDocument();
- if (@$doc->loadHTML($article["content"])) {
+ if (@$doc->loadHTML('<?xml encoding="UTF-8">' . $article["content"])) {
$xpath = new DOMXPath($doc);
$imgs = $xpath->query("//img[@src]");
diff --git a/plugins/cache_starred_images/init.php b/plugins/cache_starred_images/init.php
index a1916e226..714d4cb9b 100755
--- a/plugins/cache_starred_images/init.php
+++ b/plugins/cache_starred_images/init.php
@@ -190,12 +190,8 @@ class Cache_Starred_Images extends Plugin implements IHandler {
return;
}
- $charset_hack = '<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
- </head>';
-
$doc = new DOMDocument();
- $doc->loadHTML($charset_hack . $content);
+ $doc->loadHTML('<?xml encoding="UTF-8">' . $content);
$xpath = new DOMXPath($doc);
$entries = $xpath->query('(//img[@src])|(//video/source[@src])');