diff options
Diffstat (limited to 'lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php')
-rw-r--r-- | lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php | 95 |
1 files changed, 90 insertions, 5 deletions
diff --git a/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php index 34f0227dd..9a93421a1 100644 --- a/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php +++ b/lib/htmlpurifier/library/HTMLPurifier/ChildDef/Table.php @@ -1,7 +1,33 @@ <?php /** - * Definition for tables + * Definition for tables. The general idea is to extract out all of the + * essential bits, and then reconstruct it later. + * + * This is a bit confusing, because the DTDs and the W3C + * validators seem to disagree on the appropriate definition. The + * DTD claims: + * + * (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+) + * + * But actually, the HTML4 spec then has this to say: + * + * The TBODY start tag is always required except when the table + * contains only one table body and no table head or foot sections. + * The TBODY end tag may always be safely omitted. + * + * So the DTD is kind of wrong. The validator is, unfortunately, kind + * of on crack. + * + * The definition changed again in XHTML1.1; and in my opinion, this + * formulation makes the most sense. + * + * caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ )) + * + * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode. + * If we encounter a thead, tfoot or tbody, we are placed in the former + * mode, and we *must* wrap any stray tr segments with a tbody. But if + * we don't run into any of them, just have tr tags is OK. */ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef { @@ -33,6 +59,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef $collection = array(); // collected nodes $tag_index = 0; // the first node might be whitespace, // so this tells us where the start tag is + $tbody_mode = false; // if true, then we need to wrap any stray + // <tr>s with a <tbody>. foreach ($tokens_of_children as $token) { $is_child = ($nesting == 0); @@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef // okay, let's stash the tokens away // first token tells us the type of the collection switch ($collection[$tag_index]->name) { - case 'tr': case 'tbody': + $tbody_mode = true; + case 'tr': $content[] = $collection; break; case 'caption': @@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef break; case 'thead': case 'tfoot': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a <thead> to + // the top. Ever. (Our scheme will float the + // first <thead> to the top.) So maybe + // <thead>s that are not first should be + // turned into <tbody>? Very tricky, indeed. + // access the appropriate variable, $thead or $tfoot $var = $collection[$tag_index]->name; if ($$var === false) { $$var = $collection; } else { - // transmutate the first and less entries into - // tbody tags, and then put into content + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. + // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. $collection[$tag_index]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody'; $content[] = $collection; @@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($thead !== false) $ret = array_merge($ret, $thead); if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + if ($tbody_mode) { + // a little tricky, since the start of the collection may be + // whitespace + $inside_tbody = false; + foreach ($content as $token_array) { + // find the starting token + foreach ($token_array as $t) { + if ($t->name === 'tr' || $t->name === 'tbody') { + break; + } + } // iterator variable carries over + if ($t->name === 'tr') { + if ($inside_tbody) { + $ret = array_merge($ret, $token_array); + } else { + $ret[] = new HTMLPurifier_Token_Start('tbody'); + $ret = array_merge($ret, $token_array); + $inside_tbody = true; + } + } elseif ($t->name === 'tbody') { + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + $inside_tbody = false; + $ret = array_merge($ret, $token_array); + } else { + $ret = array_merge($ret, $token_array); + } + } else { + trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR); + } + } + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + } + } else { + foreach ($content as $token_array) { + // invariant: everything in here is <tr>s + $ret = array_merge($ret, $token_array); + } + } + if (!empty($collection) && $is_collecting == false){ // grab the trailing space $ret = array_merge($ret, $collection); |