From dd205fbad642ace6d0e33c8553f7d73404f140b4 Mon Sep 17 00:00:00 2001 From: Michael Kuhn Date: Sat, 28 Apr 2012 14:37:51 +0200 Subject: Update HTML Purifier to version 4.4.0. --- .../library/HTMLPurifier/ChildDef/List.php | 120 +++++++++++++++++++++ .../library/HTMLPurifier/ChildDef/Table.php | 95 +++++++++++++++- 2 files changed, 210 insertions(+), 5 deletions(-) create mode 100644 lib/htmlpurifier/library/HTMLPurifier/ChildDef/List.php (limited to 'lib/htmlpurifier/library/HTMLPurifier/ChildDef') diff --git a/lib/htmlpurifier/library/HTMLPurifier/ChildDef/List.php b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/List.php new file mode 100644 index 000000000..cdaa2893a --- /dev/null +++ b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/List.php @@ -0,0 +1,120 @@ + true, 'ul' => true, 'ol' => true); + public function validateChildren($tokens_of_children, $config, $context) { + // Flag for subclasses + $this->whitespace = false; + + // if there are no tokens, delete parent node + if (empty($tokens_of_children)) return false; + + // the new set of children + $result = array(); + + // current depth into the nest + $nesting = 0; + + // a little sanity check to make sure it's not ALL whitespace + $all_whitespace = true; + + $seen_li = false; + $need_close_li = false; + + foreach ($tokens_of_children as $token) { + if (!empty($token->is_whitespace)) { + $result[] = $token; + continue; + } + $all_whitespace = false; // phew, we're not talking about whitespace + + if ($nesting == 1 && $need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + $nesting--; + $need_close_li = false; + } + + $is_child = ($nesting == 0); + + if ($token instanceof HTMLPurifier_Token_Start) { + $nesting++; + } elseif ($token instanceof HTMLPurifier_Token_End) { + $nesting--; + } + + if ($is_child) { + if ($token->name === 'li') { + // good + $seen_li = true; + } elseif ($token->name === 'ul' || $token->name === 'ol') { + // we want to tuck this into the previous li + $need_close_li = 
true; + $nesting++; + if (!$seen_li) { + // create a new li element + $result[] = new HTMLPurifier_Token_Start('li'); + } else { + // backtrack until found + while(true) { + $t = array_pop($result); + if ($t instanceof HTMLPurifier_Token_End) { + // XXX actually, these invariants could very plausibly be violated + // if we are doing silly things with modifying the set of allowed elements. + // FORTUNATELY, it doesn't make a difference, since the allowed + // elements are hard-coded here! + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + break; + } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh + if ($t->name !== 'li') { + trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + // XXX this should have a helper for it... + $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor); + break; + } else { + if (!$t->is_whitespace) { + trigger_error("Only whitespace present invariant violated in List ChildDef", E_USER_ERROR); + return false; + } + } + } + } + } else { + // start wrapping (this doesn't precisely mimic + // browser behavior, but what browsers do is kind of + // hard to mimic in a standards compliant way + // XXX Actually, this has no impact in practice, + // because this gets handled earlier. 
Arguably, + // we should rip out all of that processing + $result[] = new HTMLPurifier_Token_Start('li'); + $nesting++; + $seen_li = true; + $need_close_li = true; + } + } + $result[] = $token; + } + if ($need_close_li) { + $result[] = new HTMLPurifier_Token_End('li'); + } + if (empty($result)) return false; + if ($all_whitespace) { + return false; + } + if ($tokens_of_children == $result) return true; + return $result; + } +} + +// vim: et sw=4 sts=4 diff --git a/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php index 34f0227dd..9a93421a1 100644 --- a/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php +++ b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php @@ -1,7 +1,33 @@ <tr>s with a <tbody>. foreach ($tokens_of_children as $token) { $is_child = ($nesting == 0); @@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef // okay, let's stash the tokens away // first token tells us the type of the collection switch ($collection[$tag_index]->name) { - case 'tr': case 'tbody': + $tbody_mode = true; + case 'tr': $content[] = $collection; break; case 'caption': @@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef break; case 'thead': case 'tfoot': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a <thead> to + // the top. Ever. (Our scheme will float the + // first <thead> to the top.) So maybe + // <thead>s that are not first should be + // turned into <tbody>? Very tricky, indeed. + // access the appropriate variable, $thead or $tfoot $var = $collection[$tag_index]->name; if ($$var === false) { $$var = $collection; } else { - // transmutate the first and less entries into - // tbody tags, and then put into content + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. 
+ // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. $collection[$tag_index]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody'; $content[] = $collection; @@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($thead !== false) $ret = array_merge($ret, $thead); if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + if ($tbody_mode) { + // a little tricky, since the start of the collection may be + // whitespace + $inside_tbody = false; + foreach ($content as $token_array) { + // find the starting token + foreach ($token_array as $t) { + if ($t->name === 'tr' || $t->name === 'tbody') { + break; + } + } // iterator variable carries over + if ($t->name === 'tr') { + if ($inside_tbody) { + $ret = array_merge($ret, $token_array); + } else { + $ret[] = new HTMLPurifier_Token_Start('tbody'); + $ret = array_merge($ret, $token_array); + $inside_tbody = true; + } + } elseif ($t->name === 'tbody') { + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + $inside_tbody = false; + $ret = array_merge($ret, $token_array); + } else { + $ret = array_merge($ret, $token_array); + } + } else { + trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR); + } + } + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + } + } else { + foreach ($content as $token_array) { + // invariant: everything in here is <tr>s + $ret = array_merge($ret, $token_array); + } + } + if (!empty($collection) && $is_collecting == false){ // grab the trailing space $ret = array_merge($ret, $collection); -- cgit v1.2.3