From 2a479dced03735c9e6062bf0366e2774ca253300 Mon Sep 17 00:00:00 2001
From: Andrew Dolgov
Date: Tue, 21 Aug 2007 15:15:50 +0100
Subject: rework feed content mangling algorithm

---
 magpierss/rss_parse.inc | 57 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

(limited to 'magpierss/rss_parse.inc')

diff --git a/magpierss/rss_parse.inc b/magpierss/rss_parse.inc
index 66e5e65f2..3aff57a50 100644
--- a/magpierss/rss_parse.inc
+++ b/magpierss/rss_parse.inc
@@ -23,6 +23,35 @@
 define('RSS', 'RSS');
 define('ATOM', 'Atom');
 
+function _convert_entities ($string) {
+	# Source: http://www.w3.org/TR/REC-html40/sgml/entities.html
+	$html_entities = array(
+		"&nbsp;", "&iexcl;", "&cent;", "&pound;", "&curren;", "&yen;", "&brvbar;", "&sect;", "&uml;", "&copy;",
+		"&ordf;", "&laquo;", "&not;", "&shy;", "&reg;", "&macr;", "&deg;", "&plusmn;", "&sup2;", "&sup3;",
+		"&acute;", "&micro;", "&para;", "&middot;", "&cedil;", "&sup1;", "&ordm;", "&raquo;", "&frac14;", "&frac12;",
+		"&frac34;", "&iquest;", "&Agrave;", "&Aacute;", "&Acirc;", "&Atilde;", "&Auml;", "&Aring;", "&AElig;", "&Ccedil;",
+		"&Egrave;", "&Eacute;", "&Ecirc;", "&Euml;", "&Igrave;", "&Iacute;", "&Icirc;", "&Iuml;", "&ETH;", "&Ntilde;",
+		"&Ograve;", "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;", "&times;", "&Oslash;", "&Ugrave;", "&Uacute;", "&Ucirc;",
+		"&Uuml;", "&Yacute;", "&THORN;", "&szlig;", "&agrave;", "&aacute;", "&acirc;", "&atilde;", "&auml;", "&aring;",
+		"&aelig;", "&ccedil;", "&egrave;", "&eacute;", "&ecirc;", "&euml;", "&igrave;", "&iacute;", "&icirc;", "&iuml;",
+		"&eth;", "&ntilde;", "&ograve;", "&oacute;", "&ocirc;", "&otilde;", "&ouml;", "&divide;", "&oslash;", "&ugrave;",
+		"&uacute;", "&ucirc;", "&uuml;", "&yacute;", "&thorn;", "&yuml;",);
+	$numeric_entities = array(
+		"&#160;", "&#161;", "&#162;", "&#163;", "&#164;", "&#165;", "&#166;", "&#167;", "&#168;", "&#169;",
+		"&#170;", "&#171;", "&#172;", "&#173;", "&#174;", "&#175;", "&#176;", "&#177;", "&#178;", "&#179;",
+		"&#180;", "&#181;", "&#182;", "&#183;", "&#184;", "&#185;", "&#186;", "&#187;", "&#188;", "&#189;",
+		"&#190;", "&#191;", "&#192;", "&#193;", "&#194;", "&#195;", "&#196;", "&#197;", "&#198;", "&#199;",
+		"&#200;", "&#201;", "&#202;", "&#203;", "&#204;", "&#205;", "&#206;", "&#207;", "&#208;", "&#209;",
+		"&#210;", "&#211;", "&#212;", "&#213;", "&#214;", "&#215;", "&#216;", "&#217;", "&#218;", "&#219;",
+		"&#220;", "&#221;", "&#222;", "&#223;", "&#224;", "&#225;", "&#226;", "&#227;", "&#228;", "&#229;",
+		"&#230;", "&#231;", "&#232;", "&#233;", "&#234;", "&#235;", "&#236;", "&#237;", "&#238;", "&#239;",
+		"&#240;", "&#241;", "&#242;", "&#243;", "&#244;", "&#245;", "&#246;", "&#247;", "&#248;", "&#249;",
+		"&#250;", "&#251;", "&#252;", "&#253;", "&#254;", "&#255;");
+	return str_replace($html_entities, $numeric_entities, $string);
+}
+
+
+
 require_once (MAGPIE_DIR . 'rss_utils.inc');
 
 /**
@@ -149,12 +178,14 @@ class MagpieRSS {
 				$enc = mb_detect_encoding($string);
 			}
 
+			# try fix XML, pass 1
+
 			$source = mb_convert_encoding($source, "UTF-8", $enc);
 
 		list($parser, $source) = $this->create_parser($source,
 				$output_encoding, $input_encoding, $detect_encoding);
 
-		$this->parser = $parser;
+		$this->parser = $parser;
 
 		xml_set_object( $this->parser, $this );
 		xml_set_element_handler($this->parser,
@@ -163,6 +194,30 @@ class MagpieRSS {
 		xml_set_character_data_handler( $this->parser, 'feed_cdata' );
 
 		$status = xml_parse( $this->parser, $source);
+
+		# try to fix XML, pass 2
+
+		if (! $status) {
+			$errorcode = xml_get_error_code( $this->parser );
+			if ( $errorcode != XML_ERROR_NONE ) {
+
+				$source = _convert_entities($source);
+
+				list($parser, $source) = $this->create_parser($source,
+					$output_encoding, $input_encoding, $detect_encoding);
+
+				$this->parser = $parser;
+
+				xml_set_object( $this->parser, $this );
+				xml_set_element_handler($this->parser,
+					'feed_start_element', 'feed_end_element' );
+
+				xml_set_character_data_handler( $this->parser, 'feed_cdata' );
+
+				$status = xml_parse( $this->parser, $source);
+
+			}
+		}
 		}
 	}
 
-- 
cgit v1.2.3