diff options
Diffstat (limited to 'lib/htmlpurifier/library/HTMLPurifier/Lexer')
-rw-r--r--[-rwxr-xr-x] | lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php | 76 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php | 14 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php | 37 | ||||
-rw-r--r--[-rwxr-xr-x] | lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php | 2 |
4 files changed, 95 insertions, 34 deletions
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php index 0db3974bf..82f377450 100755..100644 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php @@ -41,7 +41,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer // attempt to armor stray angled brackets that cannot possibly // form tags and thus are probably being used as emoticons - if ($config->get('Core', 'AggressivelyFixLt')) { + if ($config->get('Core.AggressivelyFixLt')) { $char = '[^a-z!\/]'; $comment = "/<!--(.*?)(-->|\z)/is"; $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); @@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } /** - * Recursive function that tokenizes a node, putting it into an accumulator. - * + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch * @param $node DOMNode to be tokenized. * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens, $collect = false) { + protected function tokenizeDOM($node, &$tokens) { + + $level = 0; + $nodes = array($level => array($node)); + $closingNodes = array(); + do { + while (!empty($nodes[$level])) { + $node = array_shift($nodes[$level]); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = array(); + foreach ($node->childNodes as $childNode) { + array_push($nodes[$level], $childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + /** + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns bool if the token needs an endtoken + */ + protected function createStartNode($node, &$tokens, $collect) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); - return; + return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of <script> and <style> tags $last = end($tokens); @@ -106,48 +140,44 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } } $tokens[] = $this->factory->createText($this->parseData($data)); - return; + return false; } elseif ($node->nodeType === XML_COMMENT_NODE) { // this is code is only invoked for comments in script/style in versions // of libxml pre-2.6.28 (regular comments, of course, are still // handled regularly) $tokens[] = $this->factory->createComment($node->data); - return; + return false; } elseif ( // not-well tested: there may be other nodes we have to grab $node->nodeType !== XML_ELEMENT_NODE ) { - return; + return false; } - $attr = $node->hasAttributes() ? - $this->transformAttrToAssoc($node->attributes) : - array(); + $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array(); // We still have to make sure that the element actually IS empty if (!$node->childNodes->length) { if ($collect) { $tokens[] = $this->factory->createEmpty($node->tagName, $attr); } + return false; } else { - if ($collect) { // don't wrap on first iteration + if ($collect) { $tokens[] = $this->factory->createStart( $tag_name = $node->tagName, // somehow, it get's dropped $attr ); } - foreach ($node->childNodes as $node) { - // remember, it's an accumulator. Otherwise, we'd have - // to use array_merge - $this->tokenizeDOM($node, $tokens, true); - } - if ($collect) { - $tokens[] = $this->factory->createEnd($tag_name); - } + return true; } + } + protected function createEndNode($node, &$tokens) { + $tokens[] = $this->factory->createEnd($node->tagName); } + /** * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array. * diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php index bfca4533d..456e6e190 100755..100644 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php @@ -33,7 +33,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // special normalization for script tags without any armor // our "armor" heurstic is a < sign any number of whitespaces after // the first script tag - if ($config->get('HTML', 'Trusted')) { + if ($config->get('HTML.Trusted')) { $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si', array($this, 'scriptCallback'), $html); } @@ -45,12 +45,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $array = array(); // result array // This is also treated to mean maintain *column* numbers too - $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); + $maintain_line_numbers = $config->get('Core.MaintainLineNumbers'); if ($maintain_line_numbers === null) { // automatically determine line numbering by checking // if error collection is on - $maintain_line_numbers = $config->get('Core', 'CollectErrors'); + $maintain_line_numbers = $config->get('Core.CollectErrors'); } if ($maintain_line_numbers) { @@ -67,10 +67,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $nl = "\n"; // how often to manually recalculate. This will ALWAYS be right, // but it's pretty wasteful. Set to 0 to turn off - $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); + $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval'); $e = false; - if ($config->get('Core', 'CollectErrors')) { + if ($config->get('Core.CollectErrors')) { $e =& $context->get('ErrorCollector'); } @@ -345,7 +345,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($string == '') return array(); // no attributes $e = false; - if ($config->get('Core', 'CollectErrors')) { + if ($config->get('Core.CollectErrors')) { $e =& $context->get('ErrorCollector'); } @@ -384,7 +384,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } } if ($value === false) $value = ''; - return array($key => $value); + return array($key => $this->parseData($value)); } // setup loop environment diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php index 57cffa82a..1d358c7b6 100755..100644 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php @@ -26,13 +26,20 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Internal accumulator array for SAX parsers. */ protected $tokens = array(); + protected $last_token_was_empty; + + private $parent_handler; + private $stack = array(); public function tokenizeHTML($string, $config, $context) { $this->tokens = array(); + $this->last_token_was_empty = false; $string = $this->normalize($string, $config, $context); + $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler')); + $parser = new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); @@ -44,6 +51,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $parser->parse($string); + restore_error_handler(); + return $this->tokens; } @@ -58,9 +67,11 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer } if ($closed) { $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); + $this->last_token_was_empty = true; } else { $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs); } + $this->stack[] = $name; return true; } @@ -71,10 +82,12 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer // HTMLSax3 seems to always send empty tags an extra close tag // check and ignore if you see it: // [TESTME] to make sure it doesn't overreach - if ($this->tokens[count($this->tokens)-1] instanceof HTMLPurifier_Token_Empty) { + if ($this->last_token_was_empty) { + $this->last_token_was_empty = false; return true; } $this->tokens[] = new HTMLPurifier_Token_End($name); + if (!empty($this->stack)) array_pop($this->stack); return true; } @@ -82,6 +95,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Data event handler, interface is defined by PEAR package. */ public function dataHandler(&$parser, $data) { + $this->last_token_was_empty = false; $this->tokens[] = new HTMLPurifier_Token_Text($data); return true; } @@ -91,7 +105,18 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer */ public function escapeHandler(&$parser, $data) { if (strpos($data, '--') === 0) { - $this->tokens[] = new HTMLPurifier_Token_Comment($data); + // remove trailing and leading double-dashes + $data = substr($data, 2); + if (strlen($data) >= 2 && substr($data, -2) == "--") { + $data = substr($data, 0, -2); + } + if (isset($this->stack[sizeof($this->stack) - 1]) && + $this->stack[sizeof($this->stack) - 1] == "style") { + $this->tokens[] = new HTMLPurifier_Token_Text($data); + } else { + $this->tokens[] = new HTMLPurifier_Token_Comment($data); + } + $this->last_token_was_empty = false; } // CDATA is handled elsewhere, but if it was handled here: //if (strpos($data, '[CDATA[') === 0) { @@ -101,6 +126,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer return true; } + /** + * An error handler that mutes strict errors + */ + public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) { + if ($errno == E_STRICT) return; + return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext); + } + } // vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php index fa1bf973e..faf00b829 100755..100644 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php @@ -125,8 +125,6 @@ class HTML5 { const EOF = 5; public function __construct($data) { - $data = str_replace("\r\n", "\n", $data); - $data = str_replace("\r", null, $data); $this->data = $data; $this->char = -1; |