update HTMLPurifier; enable embedded flash video in articles

author: Andrew Dolgov <[email protected]> 2011-04-11 16:41:01 +0400
committer: Andrew Dolgov <[email protected]> 2011-04-11 16:41:01 +0400
commit: f4f0f80d2118437e5047ba266f92d7acb3c38fb7 (patch)
tree: fb15f179dcd68b55613394ad864455f1796de555 /lib/htmlpurifier/library/HTMLPurifier/Lexer
parent: ad92c6ac62903f3bb37f16048fedff44a2eb540d (diff)
4 files changed, 95 insertions, 34 deletions
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
index 0db3974bf..82f377450 100755..100644
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
@@ -41,7 +41,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 
         // attempt to armor stray angled brackets that cannot possibly
         // form tags and thus are probably being used as emoticons
-        if ($config->get('Core', 'AggressivelyFixLt')) {
+        if ($config->get('Core.AggressivelyFixLt')) {
             $char = '[^a-z!\/]';
             $comment = "/<!--(.*?)(-->|\z)/is";
             $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
@@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     }
 
     /**
-     * Recursive function that tokenizes a node, putting it into an accumulator.
-     *
+     * Iterative function that tokenizes a node, putting it into an accumulator.
+     * To iterate is human, to recurse divine - L. Peter Deutsch
      * @param $node     DOMNode to be tokenized.
      * @param $tokens   Array-list of already tokenized tokens.
-     * @param $collect  Says whether or start and close are collected, set to
-     *                  false at first recursion because it's the implicit DIV
-     *                  tag you're dealing with.
      * @returns Tokens of node appended to previously passed tokens.
      */
-    protected function tokenizeDOM($node, &$tokens, $collect = false) {
+    protected function tokenizeDOM($node, &$tokens) {
+
+        $level = 0;
+        $nodes = array($level => array($node));
+        $closingNodes = array();
+        do {
+            while (!empty($nodes[$level])) {
+                $node = array_shift($nodes[$level]); // FIFO
+                $collect = $level > 0 ? true : false;
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                if ($needEndingTag) {
+                    $closingNodes[$level][] = $node;
+                }
+                if ($node->childNodes && $node->childNodes->length) {
+                    $level++;
+                    $nodes[$level] = array();
+                    foreach ($node->childNodes as $childNode) {
+                        array_push($nodes[$level], $childNode);
+                    }
+                }
+            }
+            $level--;
+            if ($level && isset($closingNodes[$level])) {
+                while($node = array_pop($closingNodes[$level])) {
+                    $this->createEndNode($node, $tokens);
+                }
+            }
+        } while ($level > 0);
+    }
 
+    /**
+     * @param $node  DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Says whether or start and close are collected, set to
+     *                    false at first recursion because it's the implicit DIV
+     *                    tag you're dealing with.
+     * @returns bool if the token needs an endtoken
+     */
+    protected function createStartNode($node, &$tokens, $collect) {
         // intercept non element nodes. WE MUST catch all of them,
         // but we're not getting the character reference nodes because
         // those should have been preprocessed
         if ($node->nodeType === XML_TEXT_NODE) {
             $tokens[] = $this->factory->createText($node->data);
-            return;
+            return false;
         } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
             // undo libxml's special treatment of <script> and <style> tags
             $last = end($tokens);
@@ -106,48 +140,44 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                 }
             }
             $tokens[] = $this->factory->createText($this->parseData($data));
-            return;
+            return false;
         } elseif ($node->nodeType === XML_COMMENT_NODE) {
             // this is code is only invoked for comments in script/style in versions
             // of libxml pre-2.6.28 (regular comments, of course, are still
             // handled regularly)
             $tokens[] = $this->factory->createComment($node->data);
-            return;
+            return false;
         } elseif (
             // not-well tested: there may be other nodes we have to grab
             $node->nodeType !== XML_ELEMENT_NODE
         ) {
-            return;
+            return false;
         }
 
-        $attr = $node->hasAttributes() ?
-            $this->transformAttrToAssoc($node->attributes) :
-            array();
+        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
 
         // We still have to make sure that the element actually IS empty
         if (!$node->childNodes->length) {
             if ($collect) {
                 $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
             }
+            return false;
         } else {
-            if ($collect) { // don't wrap on first iteration
+            if ($collect) {
                 $tokens[] = $this->factory->createStart(
                     $tag_name = $node->tagName, // somehow, it get's dropped
                     $attr
                 );
             }
-            foreach ($node->childNodes as $node) {
-                // remember, it's an accumulator. Otherwise, we'd have
-                // to use array_merge
-                $this->tokenizeDOM($node, $tokens, true);
-            }
-            if ($collect) {
-                $tokens[] = $this->factory->createEnd($tag_name);
-            }
+            return true;
         }
+    }
 
+    protected function createEndNode($node, &$tokens) {
+        $tokens[] = $this->factory->createEnd($node->tagName);
     }
 
+
     /**
      * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
      *
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php
index bfca4533d..456e6e190 100755..100644
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php
@@ -33,7 +33,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         // special normalization for script tags without any armor
         // our "armor" heurstic is a < sign any number of whitespaces after
         // the first script tag
-        if ($config->get('HTML', 'Trusted')) {
+        if ($config->get('HTML.Trusted')) {
             $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                 array($this, 'scriptCallback'), $html);
         }
@@ -45,12 +45,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         $array = array(); // result array
 
         // This is also treated to mean maintain *column* numbers too
-        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
+        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
 
         if ($maintain_line_numbers === null) {
             // automatically determine line numbering by checking
             // if error collection is on
-            $maintain_line_numbers = $config->get('Core', 'CollectErrors');
+            $maintain_line_numbers = $config->get('Core.CollectErrors');
         }
 
         if ($maintain_line_numbers) {
@@ -67,10 +67,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         $nl = "\n";
         // how often to manually recalculate. This will ALWAYS be right,
         // but it's pretty wasteful. Set to 0 to turn off
-        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
+        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
 
         $e = false;
-        if ($config->get('Core', 'CollectErrors')) {
+        if ($config->get('Core.CollectErrors')) {
             $e =& $context->get('ErrorCollector');
         }
 
@@ -345,7 +345,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         if ($string == '') return array(); // no attributes
 
         $e = false;
-        if ($config->get('Core', 'CollectErrors')) {
+        if ($config->get('Core.CollectErrors')) {
             $e =& $context->get('ErrorCollector');
         }
 
@@ -384,7 +384,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 }
             }
             if ($value === false) $value = '';
-            return array($key => $value);
+            return array($key => $this->parseData($value));
         }
 
         // setup loop environment
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
index 57cffa82a..1d358c7b6 100755..100644
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -26,13 +26,20 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
      * Internal accumulator array for SAX parsers.
      */
     protected $tokens = array();
+    protected $last_token_was_empty;
+
+    private $parent_handler;
+    private $stack = array();
 
     public function tokenizeHTML($string, $config, $context) {
 
         $this->tokens = array();
+        $this->last_token_was_empty = false;
 
         $string = $this->normalize($string, $config, $context);
 
+        $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler'));
+
         $parser = new XML_HTMLSax3();
         $parser->set_object($this);
         $parser->set_element_handler('openHandler','closeHandler');
@@ -44,6 +51,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
 
         $parser->parse($string);
 
+        restore_error_handler();
+
         return $this->tokens;
 
     }
@@ -58,9 +67,11 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         }
         if ($closed) {
             $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
+            $this->last_token_was_empty = true;
         } else {
             $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
         }
+        $this->stack[] = $name;
         return true;
     }
 
@@ -71,10 +82,12 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         // HTMLSax3 seems to always send empty tags an extra close tag
         // check and ignore if you see it:
         // [TESTME] to make sure it doesn't overreach
-        if ($this->tokens[count($this->tokens)-1] instanceof HTMLPurifier_Token_Empty) {
+        if ($this->last_token_was_empty) {
+            $this->last_token_was_empty = false;
             return true;
         }
         $this->tokens[] = new HTMLPurifier_Token_End($name);
+        if (!empty($this->stack)) array_pop($this->stack);
         return true;
     }
 
@@ -82,6 +95,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
      * Data event handler, interface is defined by PEAR package.
      */
     public function dataHandler(&$parser, $data) {
+        $this->last_token_was_empty = false;
         $this->tokens[] = new HTMLPurifier_Token_Text($data);
         return true;
     }
@@ -91,7 +105,18 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
      */
     public function escapeHandler(&$parser, $data) {
         if (strpos($data, '--') === 0) {
-            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+            // remove trailing and leading double-dashes
+            $data = substr($data, 2);
+            if (strlen($data) >= 2 && substr($data, -2) == "--") {
+                $data = substr($data, 0, -2);
+            }
+            if (isset($this->stack[sizeof($this->stack) - 1]) &&
+                $this->stack[sizeof($this->stack) - 1] == "style") {
+                $this->tokens[] = new HTMLPurifier_Token_Text($data);
+            } else {
+                $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+            }
+            $this->last_token_was_empty = false;
         }
         // CDATA is handled elsewhere, but if it was handled here:
         //if (strpos($data, '[CDATA[') === 0) {
@@ -101,6 +126,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         return true;
     }
 
+    /**
+     * An error handler that mutes strict errors
+     */
+    public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) {
+        if ($errno == E_STRICT) return;
+        return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext);
+    }
+
 }
 
 // vim: et sw=4 sts=4
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
index fa1bf973e..faf00b829 100755..100644
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
@@ -125,8 +125,6 @@ class HTML5 {
     const EOF      = 5;
 
     public function __construct($data) {
-        $data = str_replace("\r\n", "\n", $data);
-        $data = str_replace("\r", null, $data);
 
         $this->data = $data;
         $this->char = -1;
author	Andrew Dolgov <[email protected]>	2011-04-11 16:41:01 +0400
committer	Andrew Dolgov <[email protected]>	2011-04-11 16:41:01 +0400
commit	f4f0f80d2118437e5047ba266f92d7acb3c38fb7 (patch)
tree	fb15f179dcd68b55613394ad864455f1796de555 /lib/htmlpurifier/library/HTMLPurifier/Lexer
parent	ad92c6ac62903f3bb37f16048fedff44a2eb540d (diff)