From 4ad04ee227dd7d704f417aaf9d6762f5cfdf4c1f Mon Sep 17 00:00:00 2001
From: Andrew Dolgov
Date: Tue, 29 Oct 2013 12:15:26 +0400
Subject: report all libxml errors in updater debug output

force utf8 encoding if devforceupdate is on
parser: try to convert non-unicode feeds with specified encoding to utf8 before trying to remove dangling utf8 characters in case of utf8-related libxml errors because doing so produces garbage content
---
 classes/feedparser.php | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'classes/feedparser.php')

diff --git a/classes/feedparser.php b/classes/feedparser.php
index 1c97e496b..de6c56542 100644
--- a/classes/feedparser.php
+++ b/classes/feedparser.php
@@ -13,6 +13,16 @@ class FeedParser {
 	const FEED_RSS = 1;
 	const FEED_ATOM = 2;
 
+	function normalize_encoding($data) {
+		if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
+			$data = mb_convert_encoding($data, 'UTF-8', $matches[2]);
+
+			$data = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3] , $data);
+		}
+
+		return $data;
+	}
+
 	function __construct($data) {
 		libxml_use_internal_errors(true);
 		libxml_clear_errors();
@@ -25,19 +35,15 @@ class FeedParser {
 
 		// libxml compiled without iconv?
 		if ($error && $error->code == 32) {
-			if (preg_match('/^(<\?xml[\t\n\r ].*?encoding[\t\n\r ]*=[\t\n\r ]*["\'])(.+?)(["\'].*?\?>)/s', $data, $matches) === 1) {
-				$data = mb_convert_encoding($data, 'UTF-8', $matches[2]);
-
-				$data = preg_replace('/^<\?xml[\t\n\r ].*?\?>/s', $matches[1] . "UTF-8" . $matches[3] , $data);
+			$data = $this->normalize_encoding($data);
 
-				if ($data) {
-					libxml_clear_errors();
+			if ($data) {
+				libxml_clear_errors();
 
-					$this->doc = new DOMDocument();
-					$this->doc->loadXML($data);
+				$this->doc = new DOMDocument();
+				$this->doc->loadXML($data);
 
-					$error = libxml_get_last_error();
-				}
+				$error = libxml_get_last_error();
 			}
 		}
 
@@ -45,6 +51,9 @@ class FeedParser {
 		if ($error) {
 			foreach (libxml_get_errors() as $err) {
 				if ($err->code == 9) {
+					// if the source feed is not in utf8, next conversion will fail
+					$data = $this->normalize_encoding($data);
+
 					// remove dangling bytes
 					$data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
 
-- 
cgit v1.2.3