From dd205fbad642ace6d0e33c8553f7d73404f140b4 Mon Sep 17 00:00:00 2001 From: Michael Kuhn Date: Sat, 28 Apr 2012 14:37:51 +0200 Subject: Update HTML Purifier to version 4.4.0. --- .../HTMLPurifier/Filter/ExtractStyleBlocks.php | 175 +++++++++++++++++++-- 1 file changed, 164 insertions(+), 11 deletions(-) (limited to 'lib/htmlpurifier/library/HTMLPurifier/Filter') diff --git a/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php b/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php index bbf78a663..320aa4f16 100644 --- a/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php +++ b/lib/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php @@ -1,5 +1,11 @@ blocks from input HTML, cleans them up * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks') @@ -21,8 +27,15 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter private $_styleMatches = array(); private $_tidy; + private $_id_attrdef; + private $_class_attrdef; + private $_enum_attrdef; + public function __construct() { $this->_tidy = new csstidy(); + $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true); + $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident(); + $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus')); } /** @@ -77,27 +90,166 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter $css = substr($css, 0, -3); } $css = trim($css); + set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler'); $this->_tidy->parse($css); + restore_error_handler(); $css_definition = $config->getDefinition('CSS'); + $html_definition = $config->getDefinition('HTML'); + $new_css = array(); foreach ($this->_tidy->css as $k => $decls) { // $decls are all CSS declarations inside an @ selector $new_decls = array(); foreach ($decls as $selector => $style) { $selector = trim($selector); if ($selector === '') continue; // should not happen - if ($selector[0] === '+') { - if ($selector !== '' && $selector[0] === '+') continue; - } - if (!empty($scopes)) { - $new_selector = array(); // because multiple ones are possible - $selectors = array_map('trim', explode(',', $selector)); - foreach ($scopes as $s1) { - foreach ($selectors as $s2) { - $new_selector[] = "$s1 $s2"; + // Parse the selector + // Here is the relevant part of the CSS grammar: + // + // ruleset + // : selector [ ',' S* selector ]* '{' ... + // selector + // : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]? + // combinator + // : '+' S* + // : '>' S* + // simple_selector + // : element_name [ HASH | class | attrib | pseudo ]* + // | [ HASH | class | attrib | pseudo ]+ + // element_name + // : IDENT | '*' + // ; + // class + // : '.' IDENT + // ; + // attrib + // : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S* + // [ IDENT | STRING ] S* ]? ']' + // ; + // pseudo + // : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ] + // ; + // + // For reference, here are the relevant tokens: + // + // HASH #{name} + // IDENT {ident} + // INCLUDES == + // DASHMATCH |= + // STRING {string} + // FUNCTION {ident}\( + // + // And the lexical scanner tokens + // + // name {nmchar}+ + // nmchar [_a-z0-9-]|{nonascii}|{escape} + // nonascii [\240-\377] + // escape {unicode}|\\[^\r\n\f0-9a-f] + // unicode \\{h}}{1,6}(\r\n|[ \t\r\n\f])? + // ident -?{nmstart}{nmchar*} + // nmstart [_a-z]|{nonascii}|{escape} + // string {string1}|{string2} + // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" + // string2 \'([^\n\r\f\\"]|\\{nl}|{escape})*\' + // + // We'll implement a subset (in order to reduce attack + // surface); in particular: + // + // - No Unicode support + // - No escapes support + // - No string support (by proxy no attrib support) + // - element_name is matched against allowed + // elements (some people might find this + // annoying...) + // - Pseudo-elements one of :first-child, :link, + // :visited, :active, :hover, :focus + + // handle ruleset + $selectors = array_map('trim', explode(',', $selector)); + $new_selectors = array(); + foreach ($selectors as $sel) { + // split on +, > and spaces + $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE); + // even indices are chunks, odd indices are + // delimiters + $nsel = null; + $delim = null; // guaranteed to be non-null after + // two loop iterations + for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) { + $x = $basic_selectors[$i]; + if ($i % 2) { + // delimiter + if ($x === ' ') { + $delim = ' '; + } else { + $delim = ' ' . $x . ' '; + } + } else { + // simple selector + $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE); + $sdelim = null; + $nx = null; + for ($j = 0, $cc = count($components); $j < $cc; $j ++) { + $y = $components[$j]; + if ($j === 0) { + if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) { + $nx = $y; + } else { + // $nx stays null; this matters + // if we don't manage to find + // any valid selector content, + // in which case we ignore the + // outer $delim + } + } elseif ($j % 2) { + // set delimiter + $sdelim = $y; + } else { + $attrdef = null; + if ($sdelim === '#') { + $attrdef = $this->_id_attrdef; + } elseif ($sdelim === '.') { + $attrdef = $this->_class_attrdef; + } elseif ($sdelim === ':') { + $attrdef = $this->_enum_attrdef; + } else { + throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split'); + } + $r = $attrdef->validate($y, $config, $context); + if ($r !== false) { + if ($r !== true) { + $y = $r; + } + if ($nx === null) { + $nx = ''; + } + $nx .= $sdelim . $y; + } + } + } + if ($nx !== null) { + if ($nsel === null) { + $nsel = $nx; + } else { + $nsel .= $delim . $nx; + } + } else { + // delimiters to the left of invalid + // basic selector ignored + } + } + } + if ($nsel !== null) { + if (!empty($scopes)) { + foreach ($scopes as $s) { + $new_selectors[] = "$s $nsel"; + } + } else { + $new_selectors[] = $nsel; } } - $selector = implode(', ', $new_selector); // now it's a string } + if (empty($new_selectors)) continue; + $selector = implode(', ', $new_selectors); foreach ($style as $name => $value) { if (!isset($css_definition->info[$name])) { unset($style[$name]); @@ -110,10 +262,11 @@ class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter } $new_decls[$selector] = $style; } - $this->_tidy->css[$k] = $new_decls; + $new_css[$k] = $new_decls; } // remove stuff that shouldn't be used, could be reenabled // after security risks are analyzed + $this->_tidy->css = $new_css; $this->_tidy->import = array(); $this->_tidy->charset = null; $this->_tidy->namespace = null; -- cgit v1.2.3