diff options
author | Andrew Dolgov <[email protected]> | 2013-04-19 13:17:28 +0400 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2013-04-19 13:17:28 +0400 |
commit | ebec81a6fb2dff0b2fe6b569b021e057995ee6c7 (patch) | |
tree | 9acdc3fd78b47d19f98827b76dcf096dec814429 /include | |
parent | 1367bc3f5e0f99f5b900bcd3ea9e7512b7c84388 (diff) |
subscribe: verify XML before adding to the database; fetch: try to work around entity problems if initial parsing fails
Diffstat (limited to 'include')
-rw-r--r-- | include/functions.php | 19 | ||||
-rw-r--r-- | include/rssfuncs.php | 29 |
2 files changed, 47 insertions, 1 deletion
diff --git a/include/functions.php b/include/functions.php index 4cc8f134d..8ac5753c9 100644 --- a/include/functions.php +++ b/include/functions.php @@ -1558,6 +1558,7 @@ * Here you should call extractfeedurls in rpc-backend * to get all possible feeds. * 5 - Couldn't download the URL content. + * 6 - Content is an invalid XML. */ function subscribe_to_feed($url, $cat_id = 0, $auth_login = '', $auth_pass = '') { @@ -1588,6 +1589,18 @@ $url = key($feedUrls); } + libxml_use_internal_errors(true); + $doc = new DOMDocument(); + $doc->loadXML(html_entity_decode($contents)); + $error = libxml_get_last_error(); + libxml_clear_errors(); + + if ($error) { + $error_message = format_libxml_error($error); + + return array("code" => 6, "message" => $error_message); + } + if ($cat_id == "0" || !$cat_id) { $cat_qpart = "NULL"; } else { @@ -4203,4 +4216,10 @@ return LABEL_BASE_INDEX - 1 + abs($feed); } + function format_libxml_error($error) { + return T_sprintf("LibXML error %s at line %d (column %d): %s", + $error->code, $error->line, $error->column, + $error->message); + } + ?> diff --git a/include/rssfuncs.php b/include/rssfuncs.php index 31d35bf8e..47d622169 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -316,6 +316,25 @@ _debug("update_rss_feed: fetch done."); } + $error = verify_feed_xml($feed_data); + + if ($error) { + if ($debug_enabled) { + _debug("update_rss_feed: error verifying XML, code: " . 
$error->code); + } + + if ($error->code == 26) { + if ($debug_enabled) { + _debug("update_rss_feed: got error 26, trying to decode entities..."); + } + + $feed_data = html_entity_decode($feed_data, ENT_COMPAT, 'UTF-8'); + + $error = verify_feed_xml($feed_data); + + if ($error) $feed_data = ''; + } + } } if (!$feed_data) { @@ -559,7 +578,7 @@ _debug("update_rss_feed: date $entry_timestamp [$entry_timestamp_fmt]"); } - $entry_title = html_entity_decode($item->get_title()); + $entry_title = html_entity_decode($item->get_title(), ENT_COMPAT, 'UTF-8'); $entry_link = rewrite_relative_url($site_url, $item->get_link()); @@ -1421,5 +1440,13 @@ mb_strtolower(strip_tags($title), 'utf-8')); } + function verify_feed_xml($feed_data) { + libxml_use_internal_errors(true); + $doc = new DOMDocument(); + $doc->loadXML($feed_data); + $error = libxml_get_last_error(); + libxml_clear_errors(); + return $error; + } ?> |