From f4f0f80d2118437e5047ba266f92d7acb3c38fb7 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Mon, 11 Apr 2011 16:41:01 +0400 Subject: update HTMLPurifier; enable embedded flash video in articles --- .../library/HTMLPurifier/Lexer/DOMLex.php | 76 +++++++++++++++------- .../library/HTMLPurifier/Lexer/DirectLex.php | 14 ++-- .../library/HTMLPurifier/Lexer/PEARSax3.php | 37 ++++++++++- .../library/HTMLPurifier/Lexer/PH5P.php | 2 - 4 files changed, 95 insertions(+), 34 deletions(-) mode change 100755 => 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php mode change 100755 => 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php mode change 100755 => 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php mode change 100755 => 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php (limited to 'lib/htmlpurifier/library/HTMLPurifier/Lexer') diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php old mode 100755 new mode 100644 index 0db3974bf..82f377450 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php @@ -41,7 +41,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer // attempt to armor stray angled brackets that cannot possibly // form tags and thus are probably being used as emoticons - if ($config->get('Core', 'AggressivelyFixLt')) { + if ($config->get('Core.AggressivelyFixLt')) { $char = '[^a-z!\/]'; $comment = "/|\z)/is"; $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); @@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } /** - * Recursive function that tokenizes a node, putting it into an accumulator. - * + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch * @param $node DOMNode to be tokenized. * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. * @returns Tokens of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens, $collect = false) { + protected function tokenizeDOM($node, &$tokens) { + + $level = 0; + $nodes = array($level => array($node)); + $closingNodes = array(); + do { + while (!empty($nodes[$level])) { + $node = array_shift($nodes[$level]); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = array(); + foreach ($node->childNodes as $childNode) { + array_push($nodes[$level], $childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + /** + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns bool if the token needs an endtoken + */ + protected function createStartNode($node, &$tokens, $collect) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because // those should have been preprocessed if ($node->nodeType === XML_TEXT_NODE) { $tokens[] = $this->factory->createText($node->data); - return; + return false; } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { // undo libxml's special treatment of )#si', array($this, 'scriptCallback'), $html); } @@ -45,12 +45,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $array = array(); // result array // This is also treated to mean maintain *column* numbers too - $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); + $maintain_line_numbers = $config->get('Core.MaintainLineNumbers'); if ($maintain_line_numbers === null) { // automatically determine line numbering by checking // if error collection is on - $maintain_line_numbers = $config->get('Core', 'CollectErrors'); + $maintain_line_numbers = $config->get('Core.CollectErrors'); } if ($maintain_line_numbers) { @@ -67,10 +67,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $nl = "\n"; // how often to manually recalculate. This will ALWAYS be right, // but it's pretty wasteful. Set to 0 to turn off - $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); + $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval'); $e = false; - if ($config->get('Core', 'CollectErrors')) { + if ($config->get('Core.CollectErrors')) { $e =& $context->get('ErrorCollector'); } @@ -345,7 +345,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($string == '') return array(); // no attributes $e = false; - if ($config->get('Core', 'CollectErrors')) { + if ($config->get('Core.CollectErrors')) { $e =& $context->get('ErrorCollector'); } @@ -384,7 +384,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } } if ($value === false) $value = ''; - return array($key => $value); + return array($key => $this->parseData($value)); } // setup loop environment diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php old mode 100755 new mode 100644 index 57cffa82a..1d358c7b6 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php @@ -26,13 +26,20 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Internal accumulator array for SAX parsers. */ protected $tokens = array(); + protected $last_token_was_empty; + + private $parent_handler; + private $stack = array(); public function tokenizeHTML($string, $config, $context) { $this->tokens = array(); + $this->last_token_was_empty = false; $string = $this->normalize($string, $config, $context); + $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler')); + $parser = new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); @@ -44,6 +51,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $parser->parse($string); + restore_error_handler(); + return $this->tokens; } @@ -58,9 +67,11 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer } if ($closed) { $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); + $this->last_token_was_empty = true; } else { $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs); } + $this->stack[] = $name; return true; } @@ -71,10 +82,12 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer // HTMLSax3 seems to always send empty tags an extra close tag // check and ignore if you see it: // [TESTME] to make sure it doesn't overreach - if ($this->tokens[count($this->tokens)-1] instanceof HTMLPurifier_Token_Empty) { + if ($this->last_token_was_empty) { + $this->last_token_was_empty = false; return true; } $this->tokens[] = new HTMLPurifier_Token_End($name); + if (!empty($this->stack)) array_pop($this->stack); return true; } @@ -82,6 +95,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer * Data event handler, interface is defined by PEAR package. */ public function dataHandler(&$parser, $data) { + $this->last_token_was_empty = false; $this->tokens[] = new HTMLPurifier_Token_Text($data); return true; } @@ -91,7 +105,18 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer */ public function escapeHandler(&$parser, $data) { if (strpos($data, '--') === 0) { - $this->tokens[] = new HTMLPurifier_Token_Comment($data); + // remove trailing and leading double-dashes + $data = substr($data, 2); + if (strlen($data) >= 2 && substr($data, -2) == "--") { + $data = substr($data, 0, -2); + } + if (isset($this->stack[sizeof($this->stack) - 1]) && + $this->stack[sizeof($this->stack) - 1] == "style") { + $this->tokens[] = new HTMLPurifier_Token_Text($data); + } else { + $this->tokens[] = new HTMLPurifier_Token_Comment($data); + } + $this->last_token_was_empty = false; } // CDATA is handled elsewhere, but if it was handled here: //if (strpos($data, '[CDATA[') === 0) { @@ -101,6 +126,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer return true; } + /** + * An error handler that mutes strict errors + */ + public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) { + if ($errno == E_STRICT) return; + return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext); + } + } // vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php old mode 100755 new mode 100644 index fa1bf973e..faf00b829 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php @@ -125,8 +125,6 @@ class HTML5 { const EOF = 5; public function __construct($data) { - $data = str_replace("\r\n", "\n", $data); - $data = str_replace("\r", null, $data); $this->data = $data; $this->char = -1; -- cgit v1.2.3