summary | refs | log | tree | commit | diff
path: root/magpierss
diff options
context:
space:
mode:
author Andrew Dolgov <[email protected]> 2007-08-21 15:15:50 +0100
committer Andrew Dolgov <[email protected]> 2007-08-21 15:15:50 +0100
commit 2a479dced03735c9e6062bf0366e2774ca253300 (patch)
tree e50007da96c63f9466fbecf942ce89e4cfe58220 /magpierss
parent 999703d15607c880d07671b57d6a0c5547821c83 (diff)
rework feed content mangling algorithm
Diffstat (limited to 'magpierss')
-rw-r--r-- magpierss/rss_fetch.inc 30
-rw-r--r-- magpierss/rss_parse.inc 57
2 files changed, 57 insertions, 30 deletions
diff --git a/magpierss/rss_fetch.inc b/magpierss/rss_fetch.inc
index 77a1a704f..126dc630a 100644
--- a/magpierss/rss_fetch.inc
+++ b/magpierss/rss_fetch.inc
@@ -279,33 +279,6 @@ function _fetch_remote_file ($url, $headers = "" ) {
}
-function _convert_entities ($string) {
- # Source: http://www.w3.org/TR/REC-html40/sgml/entities.html
- $html_entities = array(
- "&nbsp", "&iexcl", "&cent", "&pound", "&curren", "&yen", "&brvbar", "&sect", "&uml", "&copy",
- "&ordf", "&laquo", "&not", "&shy", "&reg", "&macr", "&deg", "&plusmn", "&sup2", "&sup3",
- "&acute", "&micro", "&para", "&middot", "&cedil", "&sup1", "&ordm", "&raquo", "&frac14", "&frac12",
- "&frac34", "&iquest", "&Agrave", "&Aacute", "&Acirc", "&Atilde", "&Auml", "&Aring", "&AElig", "&Ccedil",
- "&Egrave", "&Eacute", "&Ecirc", "&Euml", "&Igrave", "&Iacute", "&Icirc", "&Iuml", "&ETH", "&Ntilde",
- "&Ograve", "&Oacute", "&Ocirc", "&Otilde", "&Ouml", "&times", "&Oslash", "&Ugrave", "&Uacute", "&Ucirc",
- "&Uuml", "&Yacute", "&THORN", "&szlig", "&agrave", "&aacute", "&acirc", "&atilde", "&auml", "&aring",
- "&aelig", "&ccedil", "&egrave", "&eacute", "&ecirc", "&euml", "&igrave", "&iacute", "&icirc", "&iuml",
- "&eth", "&ntilde", "&ograve", "&oacute", "&ocirc", "&otilde", "&ouml", "&divide", "&oslash", "&ugrave",
- "&uacute", "&ucirc", "&uuml", "&yacute", "&thorn", "&yuml",);
- $numeric_entities = array(
- "&#160;", "&#161;", "&#162;", "&#163;", "&#164;", "&#165;", "&#166;", "&#167;", "&#168;", "&#169;",
- "&#170;", "&#171;", "&#172;", "&#173;", "&#174;", "&#175;", "&#176;", "&#177;", "&#178;", "&#179;",
- "&#180;", "&#181;", "&#182;", "&#183;", "&#184;", "&#185;", "&#186;", "&#187;", "&#188;", "&#189;",
- "&#190;", "&#191;", "&#192;", "&#193;", "&#194;", "&#195;", "&#196;", "&#197;", "&#198;", "&#199;",
- "&#200;", "&#201;", "&#202;", "&#203;", "&#204;", "&#205;", "&#206;", "&#207;", "&#208;", "&#209;",
- "&#210;", "&#211;", "&#212;", "&#213;", "&#214;", "&#215;", "&#216;", "&#217;", "&#218;", "&#219;",
- "&#220;", "&#221;", "&#222;", "&#223;", "&#224;", "&#225;", "&#226;", "&#227;", "&#228;", "&#229;",
- "&#230;", "&#231;", "&#232;", "&#233;", "&#234;", "&#235;", "&#236;", "&#237;", "&#238;", "&#239;",
- "&#240;", "&#241;", "&#242;", "&#243;", "&#244;", "&#245;", "&#246;", "&#247;", "&#248;", "&#249;",
- "&#250;", "&#251;", "&#252;", "&#253;", "&#254;", "&#255;");
- return str_replace($html_entities, $numeric_entities, $string);
-}
-
/*=======================================================================*\
Function: _response_to_rss
Purpose: parse an HTTP response object into an RSS object
@@ -313,8 +286,7 @@ function _convert_entities ($string) {
Output: parsed RSS object (see rss_parse)
\*=======================================================================*/
function _response_to_rss ($resp) {
- $converted_source = _convert_entities($resp->results);
- $rss = new MagpieRSS( $converted_source, MAGPIE_OUTPUT_ENCODING, "UTF-8", false);
+ $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, "UTF-8", false);
// if RSS parsed successfully
if ( $rss and !$rss->ERROR) {
diff --git a/magpierss/rss_parse.inc b/magpierss/rss_parse.inc
index 66e5e65f2..3aff57a50 100644
--- a/magpierss/rss_parse.inc
+++ b/magpierss/rss_parse.inc
@@ -23,6 +23,35 @@
define('RSS', 'RSS');
define('ATOM', 'Atom');
+/**
+ * Rewrite HTML 4 Latin-1 named character references (&nbsp; ... &yuml;)
+ * as the equivalent numeric references (&#160; ... &#255;).
+ *
+ * Only complete references, including the terminating ";", are replaced.
+ * Matching the bare "&name" prefix would leave the original ";" behind
+ * (producing "&#160;;") and would corrupt longer names that merely start
+ * with a listed one (e.g. "&notin;"). Matching is case-sensitive, so
+ * "&Agrave;" and "&agrave;" map to different code points. Any other text,
+ * including references outside this table, is returned unchanged.
+ *
+ * @param string $string text that may contain named character references
+ * @return string the text with the listed references made numeric
+ */
+function _convert_entities ($string) {
+    # Source: http://www.w3.org/TR/REC-html40/sgml/entities.html
+    # Names are listed in code point order; the first one (nbsp) is 160.
+    $entity_names = array(
+        "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy",
+        "ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3",
+        "acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12",
+        "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
+        "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde",
+        "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc",
+        "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring",
+        "aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
+        "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave",
+        "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml");
+
+    # Build "&name;" => "&#NNN;" pairs; the list position gives the offset
+    # from code point 160.
+    $replacements = array();
+    foreach ($entity_names as $offset => $name) {
+        $replacements["&" . $name . ";"] = "&#" . (160 + $offset) . ";";
+    }
+
+    # strtr() with an array replaces each match once and never rescans
+    # its own output.
+    return strtr($string, $replacements);
+}
+
+
+
require_once (MAGPIE_DIR . 'rss_utils.inc');
/**
@@ -149,12 +178,14 @@ class MagpieRSS {
$enc = mb_detect_encoding($string);
}
+ # try to fix XML, pass 1
+
$source = mb_convert_encoding($source, "UTF-8", $enc);
list($parser, $source) = $this->create_parser($source,
$output_encoding, $input_encoding, $detect_encoding);
- $this->parser = $parser;
+ $this->parser = $parser;
xml_set_object( $this->parser, $this );
xml_set_element_handler($this->parser,
@@ -163,6 +194,30 @@ class MagpieRSS {
xml_set_character_data_handler( $this->parser, 'feed_cdata' );
$status = xml_parse( $this->parser, $source);
+
+ # try to fix XML, pass 2
+
+ if (! $status) {
+ $errorcode = xml_get_error_code( $this->parser );
+ if ( $errorcode != XML_ERROR_NONE ) {
+
+ $source = _convert_entities($source);
+
+ list($parser, $source) = $this->create_parser($source,
+ $output_encoding, $input_encoding, $detect_encoding);
+
+ $this->parser = $parser;
+
+ xml_set_object( $this->parser, $this );
+ xml_set_element_handler($this->parser,
+ 'feed_start_element', 'feed_end_element' );
+
+ xml_set_character_data_handler( $this->parser, 'feed_cdata' );
+
+ $status = xml_parse( $this->parser, $source);
+
+ }
+ }
}
}