diff options
-rw-r--r-- | src/HTML5/Parser/EventHandler.php | 27 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 7 | ||||
-rw-r--r-- | test/HTML5/Parser/EventStack.php | 9 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 57 |
4 files changed, 80 insertions, 20 deletions
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php index 8282cb7..ebb30b2 100644 --- a/src/HTML5/Parser/EventHandler.php +++ b/src/HTML5/Parser/EventHandler.php @@ -27,7 +27,32 @@ interface EventHandler { /** * A start tag. * - * + * IMPORTANT: The parser watches the return value of this event. If this returns + * an integer, the parser will switch TEXTMODE patterns according to the int. + * + * This is how the Tree Builder can tell the Tokenizer when a certain tag should + * cause the parser to go into RAW text mode. + * + * The HTML5 standard requires that the builder is the one that initiates this + * step, and this is the only way short of a circular reference that we can + * do that. + * + * Example: if a startTag event for a `script` name is fired, and the startTag() + * implementation returns Tokenizer::TEXTMODE_RAW, then the tokenizer will + * switch into RAW text mode and consume data until it reaches a closing + * `script` tag. + * + * The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the + * closing tag is encountered. **This behavior may change.** + * + * @param string $name + * The tag name. + * @param array $attributes + * An array with all of the tag's attributes. + * @param boolean $selfClosing + * An indicator of whether or not this tag is self-closing (<foo/>) + * @return numeric + * One of the Tokenizer::TEXTMODE_* constants. */ public function startTag($name, $attributes = array(), $selfClosing = FALSE); /** diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 4f2f792..02bb328 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -303,11 +303,15 @@ class Tokenizer { } while (!$this->isTagEnd($selfClose)); - $this->events->startTag($name, $attributes, $selfClose); + $mode = $this->events->startTag($name, $attributes, $selfClose); // Should we do this? What does this buy that selfClose doesn't? 
if ($selfClose) { $this->events->endTag($name); } + elseif (is_int($mode)) { + //fprintf(STDOUT, "Event response says move into mode %d for tag %s", $mode, $name); + $this->setTextMode($mode, $name); + } $this->scanner->next(); @@ -816,6 +820,7 @@ class Tokenizer { } // If we get here, we hit the EOF. + $this->parseError("Unexpected EOF during text read."); return $buffer; } diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php index f197855..c9ac20e 100644 --- a/test/HTML5/Parser/EventStack.php +++ b/test/HTML5/Parser/EventStack.php @@ -4,6 +4,12 @@ namespace HTML5\Parser; /** * This testing class gathers events from a parser and builds a stack of events. * It is useful for checking the output of a tokenizer. + * + * IMPORTANT: + * + * The startTag event also kicks the parser into TEXTMODE_RAW when it encounters + * script or pre tags. This is to match the behavior required by the HTML5 spec, + * which says that the tree builder must tell the tokenizer when to switch states. 
*/ class EventStack implements EventHandler { protected $stack; @@ -42,6 +48,9 @@ class EventStack implements EventHandler { public function startTag($name, $attributes = array(), $selfClosing = FALSE) { $args = func_get_args(); $this->store('startTag', $args); + if ($name == 'pre' || $name == 'script') { + return Tokenizer::TEXTMODE_RAW; + } } public function endTag($name) { diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index 69f90b9..7ec4c76 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -395,38 +395,59 @@ class TokenizerTest extends \HTML5\Tests\TestCase { '<pre>hello</pre</pre>' => 'hello</pre', "<pre>\nhello</pre\n</pre>" => "\nhello</pre\n", '<pre>&</pre>' => '&', + '<pre><!--not a comment--></pre>' => '<!--not a comment-->', + '<pre><![CDATA[not a comment]]></pre>' => '<![CDATA[not a comment]]>', ); foreach ($good as $test => $expects) { - list($tok, $events) = $this->createTokenizer($test); - - $tok->setTextMode(Tokenizer::TEXTMODE_RAW, 'pre'); - $tok->parse(); - - //fprintf(STDOUT, "Test: %s\n", $test); - fprintf(STDOUT, "Test: %s %s\n", $test, print_r($events, TRUE)); - + $events = $this->parse($test); $this->assertEventEquals('startTag', 'pre', $events->get(0)); $this->assertEventEquals('text', $expects, $events->get(1)); $this->assertEventEquals('endTag', 'pre', $events->get(2)); } $bad = array( - '<pre>&</pre' => '&', + '<pre>&</pre' => '&</pre', + '<pre>Hello world' => 'Hello world', ); + foreach ($bad as $test => $expects) { + $events = $this->parse($test); + $this->assertEquals(4, $events->depth(), "Counting events for '$test': " . 
print_r($events, TRUE)); + $this->assertEventEquals('startTag', 'pre', $events->get(0)); + $this->assertEventError($events->get(1)); + $this->assertEventEquals('text', $expects, $events->get(2)); + } } public function testText() { - $good = array( - 'a<br>b', - '<a>test</a>', - 'a<![[ test ]]>b', - 'a&b', - 'a&b', - 'a& b& c', - ); - $this->markTestIncomplete("Need tag parsing first."); + $events = $this->parse('a<br>b'); + $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE)); + $this->assertEventEquals('text', 'a', $events->get(0)); + $this->assertEventEquals('startTag', 'br', $events->get(1)); + $this->assertEventEquals('text', 'b', $events->get(2)); + + $events = $this->parse('<a>Test</a>'); + $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE)); + $this->assertEventEquals('startTag', 'a', $events->get(0)); + $this->assertEventEquals('text', 'Test', $events->get(1)); + $this->assertEventEquals('endTag', 'a', $events->get(2)); + + $events = $this->parse('a<![CDATA[test]]>b'); + $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE)); + $this->assertEventEquals('text', 'a', $events->get(0)); + $this->assertEventEquals('cdata', 'test', $events->get(1)); + $this->assertEventEquals('text', 'b', $events->get(2)); + + $events = $this->parse('a<!--test-->b'); + $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE)); + $this->assertEventEquals('text', 'a', $events->get(0)); + $this->assertEventEquals('comment', 'test', $events->get(1)); + $this->assertEventEquals('text', 'b', $events->get(2)); + + $events = $this->parse('a&b'); + $this->assertEquals(2, $events->depth(), "Events: " . print_r($events, TRUE)); + $this->assertEventEquals('text', 'a&b', $events->get(0)); } // ================================================================ |