diff options
author | Andrew Dolgov <[email protected]> | 2012-06-05 21:52:21 +0400 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2012-06-05 21:52:21 +0400 |
commit | 010efc9b814b433bc60353caec185d905688a32b (patch) | |
tree | b2b4f62cbc2d10cf75386e992434be1f4013dc13 /lib/htmlpurifier/library/HTMLPurifier/Filter | |
parent | 705b97b7fca9ea70820af5fcd926f88903eaa430 (diff) |
Revert "remove htmlpurifier"
This reverts commit c21a462d52bd32737c32c29b060da03b38f1c2e6.
Diffstat (limited to 'lib/htmlpurifier/library/HTMLPurifier/Filter')
-rw-r--r-- | lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php | 288 | ||||
-rw-r--r-- | lib/htmlpurifier/library/HTMLPurifier/Filter/YouTube.php | 39 |
2 files changed, 327 insertions, 0 deletions
diff --git a/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php b/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php new file mode 100644 index 000000000..320aa4f16 --- /dev/null +++ b/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php @@ -0,0 +1,288 @@ +<?php + +// why is this a top level function? Because PHP 5.2.0 doesn't seem to +// understand how to interpret this filter if it's a static method. +// It's all really silly, but if we go this route it might be reasonable +// to coalesce all of these methods into one. +function htmlpurifier_filter_extractstyleblocks_muteerrorhandler() {} + +/** + * This filter extracts <style> blocks from input HTML, cleans them up + * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks') + * so they can be used elsewhere in the document. + * + * @note + * See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for + * sample usage. + * + * @note + * This filter can also be used on stylesheets not included in the + * document--something purists would probably prefer. Just directly + * call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS() + */ +class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter +{ + + public $name = 'ExtractStyleBlocks'; + private $_styleMatches = array(); + private $_tidy; + + private $_id_attrdef; + private $_class_attrdef; + private $_enum_attrdef; + + public function __construct() { + $this->_tidy = new csstidy(); + $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true); + $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident(); + $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus')); + } + + /** + * Save the contents of CSS blocks to style matches + * @param $matches preg_replace style $matches array + */ + protected function styleCallback($matches) { + $this->_styleMatches[] = $matches[1]; + } + + /** + * Removes inline <style> tags from HTML, saves them for later use + * @todo Extend to indicate non-text/css style blocks + */ + public function preFilter($html, $config, $context) { + $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl'); + if ($tidy !== null) $this->_tidy = $tidy; + $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html); + $style_blocks = $this->_styleMatches; + $this->_styleMatches = array(); // reset + $context->register('StyleBlocks', $style_blocks); // $context must not be reused + if ($this->_tidy) { + foreach ($style_blocks as &$style) { + $style = $this->cleanCSS($style, $config, $context); + } + } + return $html; + } + + /** + * Takes CSS (the stuff found in <style>) and cleans it. + * @warning Requires CSSTidy <http://csstidy.sourceforge.net/> + * @param $css CSS styling to clean + * @param $config Instance of HTMLPurifier_Config + * @param $context Instance of HTMLPurifier_Context + * @return Cleaned CSS + */ + public function cleanCSS($css, $config, $context) { + // prepare scope + $scope = $config->get('Filter.ExtractStyleBlocks.Scope'); + if ($scope !== null) { + $scopes = array_map('trim', explode(',', $scope)); + } else { + $scopes = array(); + } + // remove comments from CSS + $css = trim($css); + if (strncmp('<!--', $css, 4) === 0) { + $css = substr($css, 4); + } + if (strlen($css) > 3 && substr($css, -3) == '-->') { + $css = substr($css, 0, -3); + } + $css = trim($css); + set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler'); + $this->_tidy->parse($css); + restore_error_handler(); + $css_definition = $config->getDefinition('CSS'); + $html_definition = $config->getDefinition('HTML'); + $new_css = array(); + foreach ($this->_tidy->css as $k => $decls) { + // $decls are all CSS declarations inside an @ selector + $new_decls = array(); + foreach ($decls as $selector => $style) { + $selector = trim($selector); + if ($selector === '') continue; // should not happen + // Parse the selector + // Here is the relevant part of the CSS grammar: + // + // ruleset + // : selector [ ',' S* selector ]* '{' ... + // selector + // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? + // combinator + // : '+' S* + // : '>' S* + // simple_selector + // : element_name [ HASH | class | attrib | pseudo ]* + // | [ HASH | class | attrib | pseudo ]+ + // element_name + // : IDENT | '*' + // ; + // class + // : '.' IDENT + // ; + // attrib + // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* + // [ IDENT | STRING ] S* ]? ']' + // ; + // pseudo + // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] + // ; + // + // For reference, here are the relevant tokens: + // + // HASH #{name} + // IDENT {ident} + // INCLUDES == + // DASHMATCH |= + // STRING {string} + // FUNCTION {ident}\( + // + // And the lexical scanner tokens + // + // name {nmchar}+ + // nmchar [_a-z0-9-]|{nonascii}|{escape} + // nonascii [\240-\377] + // escape {unicode}|\\[^\r\n\f0-9a-f] + // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? + // ident -?{nmstart}{nmchar*} + // nmstart [_a-z]|{nonascii}|{escape} + // string {string1}|{string2} + // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" + // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' + // + // We'll implement a subset (in order to reduce attack + // surface); in particular: + // + // - No Unicode support + // - No escapes support + // - No string support (by proxy no attrib support) + // - element_name is matched against allowed + // elements (some people might find this + // annoying...) + // - Pseudo-elements one of :first-child, :link, + // :visited, :active, :hover, :focus + + // handle ruleset + $selectors = array_map('trim', explode(',', $selector)); + $new_selectors = array(); + foreach ($selectors as $sel) { + // split on +, > and spaces + $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); + // even indices are chunks, odd indices are + // delimiters + $nsel = null; + $delim = null; // guaranteed to be non-null after + // two loop iterations + for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { + $x = $basic_selectors[$i]; + if ($i % 2) { + // delimiter + if ($x === ' ') { + $delim = ' '; + } else { + $delim = ' ' . $x . ' '; + } + } else { + // simple selector + $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE); + $sdelim = null; + $nx = null; + for ($j = 0, $cc = count($components); $j < $cc; $j ++) { + $y = $components[$j]; + if ($j === 0) { + if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) { + $nx = $y; + } else { + // $nx stays null; this matters + // if we don't manage to find + // any valid selector content, + // in which case we ignore the + // outer $delim + } + } elseif ($j % 2) { + // set delimiter + $sdelim = $y; + } else { + $attrdef = null; + if ($sdelim === '#') { + $attrdef = $this->_id_attrdef; + } elseif ($sdelim === '.') { + $attrdef = $this->_class_attrdef; + } elseif ($sdelim === ':') { + $attrdef = $this->_enum_attrdef; + } else { + throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split'); + } + $r = $attrdef->validate($y, $config, $context); + if ($r !== false) { + if ($r !== true) { + $y = $r; + } + if ($nx === null) { + $nx = ''; + } + $nx .= $sdelim . $y; + } + } + } + if ($nx !== null) { + if ($nsel === null) { + $nsel = $nx; + } else { + $nsel .= $delim . $nx; + } + } else { + // delimiters to the left of invalid + // basic selector ignored + } + } + } + if ($nsel !== null) { + if (!empty($scopes)) { + foreach ($scopes as $s) { + $new_selectors[] = "$s $nsel"; + } + } else { + $new_selectors[] = $nsel; + } + } + } + if (empty($new_selectors)) continue; + $selector = implode(', ', $new_selectors); + foreach ($style as $name => $value) { + if (!isset($css_definition->info[$name])) { + unset($style[$name]); + continue; + } + $def = $css_definition->info[$name]; + $ret = $def->validate($value, $config, $context); + if ($ret === false) unset($style[$name]); + else $style[$name] = $ret; + } + $new_decls[$selector] = $style; + } + $new_css[$k] = $new_decls; + } + // remove stuff that shouldn't be used, could be reenabled + // after security risks are analyzed + $this->_tidy->css = $new_css; + $this->_tidy->import = array(); + $this->_tidy->charset = null; + $this->_tidy->namespace = null; + $css = $this->_tidy->print->plain(); + // we are going to escape any special characters <>& to ensure + // that no funny business occurs (i.e. </style> in a font-family prop). + if ($config->get('Filter.ExtractStyleBlocks.Escaping')) { + $css = str_replace( + array('<', '>', '&'), + array('\3C ', '\3E ', '\26 '), + $css + ); + } + return $css; + } + +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/library/HTMLPurifier/Filter/YouTube.php b/lib/htmlpurifier/library/HTMLPurifier/Filter/YouTube.php new file mode 100644 index 000000000..23df221ea --- /dev/null +++ b/lib/htmlpurifier/library/HTMLPurifier/Filter/YouTube.php @@ -0,0 +1,39 @@ +<?php + +class HTMLPurifier_Filter_YouTube extends HTMLPurifier_Filter +{ + + public $name = 'YouTube'; + + public function preFilter($html, $config, $context) { + $pre_regex = '#<object[^>]+>.+?'. + 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s'; + $pre_replace = '<span class="youtube-embed">\1</span>'; + return preg_replace($pre_regex, $pre_replace, $html); + } + + public function postFilter($html, $config, $context) { + $post_regex = '#<span class="youtube-embed">((?:v|cp)/[A-Za-z0-9\-_=]+)</span>#'; + return preg_replace_callback($post_regex, array($this, 'postFilterCallback'), $html); + } + + protected function armorUrl($url) { + return str_replace('--', '--', $url); + } + + protected function postFilterCallback($matches) { + $url = $this->armorUrl($matches[1]); + return '<object width="425" height="350" type="application/x-shockwave-flash" '. + 'data="http://www.youtube.com/'.$url.'">'. + '<param name="movie" value="http://www.youtube.com/'.$url.'"></param>'. + '<!--[if IE]>'. + '<embed src="http://www.youtube.com/'.$url.'"'. + 'type="application/x-shockwave-flash"'. + 'wmode="transparent" width="425" height="350" />'. + '<![endif]-->'. + '</object>'; + + } +} + +// vim: et sw=4 sts=4 |