From 010efc9b814b433bc60353caec185d905688a32b Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Tue, 5 Jun 2012 21:52:21 +0400 Subject: Revert "remove htmlpurifier" This reverts commit c21a462d52bd32737c32c29b060da03b38f1c2e6. --- .../library/HTMLPurifier/Lexer/DOMLex.php | 243 +++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php (limited to 'lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php') diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 000000000..82f377450 --- /dev/null +++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php @@ -0,0 +1,243 @@ + $this->factory = new HTMLPurifier_TokenFactory(); + } + + public function tokenizeHTML($html, $config, $context) { + + $html = $this->normalize($html, $config, $context); + + // attempt to armor stray angled brackets that cannot possibly + // form tags and thus are probably being used as emoticons + if ($config->get('Core.AggressivelyFixLt')) { + $char = '[^a-z!\/]'; + $comment = "/<!--(.*?)(-->|\z)/is"; + $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); + do { + $old = $html; + $html = preg_replace("/<($char)/i", '&lt;\\1', $html); + } while ($html !== $old); + $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments + } + + // preprocess html, essential for UTF-8 + $html = $this->wrapHTML($html, $config, $context); + + $doc = new DOMDocument(); + $doc->encoding = 'UTF-8'; // theoretically, the above has this covered + + set_error_handler(array($this, 'muteErrorHandler')); + $doc->loadHTML($html); + restore_error_handler(); + + $tokens = array(); + $this->tokenizeDOM( + $doc->getElementsByTagName('html')->item(0)-> // <html> + getElementsByTagName('body')->item(0)-> // <body> + getElementsByTagName('div')->item(0) // <div>
+ , $tokens); + return $tokens; + } + + /** + * Iteratively tokenizes a node and its descendants, appending the tokens to an accumulator. + * Uses an explicit per-depth queue ($nodes) and a per-depth stack of nodes awaiting an end token ($closingNodes) instead of recursion; a node's children are fully processed before its next sibling. + * The level-0 node is passed to createStartNode() with $collect = false, and createEndNode() is never called for it. + * To iterate is human, to recurse divine - L. Peter Deutsch + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens; passed by reference and appended to in place. + * @return void Nothing is returned; the tokens of the node are appended to the previously passed $tokens. + */ + protected function tokenizeDOM($node, &$tokens) { + + $level = 0; + $nodes = array($level => array($node)); + $closingNodes = array(); + do { + while (!empty($nodes[$level])) { + $node = array_shift($nodes[$level]); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = array(); + foreach ($node->childNodes as $childNode) { + array_push($nodes[$level], $childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + + /** + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or not start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns bool if the token needs an endtoken + */ + protected function createStartNode($node, &$tokens, $collect) { + // intercept non element nodes. WE MUST catch all of them, + // but we're not getting the character reference nodes because + // those should have been preprocessed + if ($node->nodeType === XML_TEXT_NODE) { + $tokens[] = $this->factory->createText($node->data); + return false; + } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { + // undo libxml's special treatment of