diff options
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 32 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 39 |
2 files changed, 48 insertions, 23 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index a23e1f4..b84f9bf 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -235,8 +235,18 @@ class Tokenizer { // Comment: if ($tok == '-' && $this->scanner->peek() == '-') { + $this->scanner->next(); // Consume the other '-' + $this->scanner->next(); // Next char. + return $this->comment(); } + elseif($tok == 'D') { + } + elseif($tok == '[') { + } + // FINISH + $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok); + $this->bogusComment('<!'); return TRUE; } @@ -472,10 +482,11 @@ class Tokenizer { return TRUE; } - protected function commentStart() { - } - protected function commentStartDash() { - } + /** + * Read a comment. + * + * Expects the first tok to be inside of the comment. + */ protected function comment() { $tok = $this->scanner->current(); $comment = ''; @@ -504,17 +515,18 @@ class Tokenizer { } protected function isCommentEnd() { + // EOF + if($this->scanner->current() === FALSE) { + // Hit the end. + $this->parseError("Unexpected EOF in a comment."); + return TRUE; + } + // If it doesn't start with -, not the end. if($this->scanner->current() != '-') { return FALSE; } - // EOF - if($this->scanner->Current() === FALSE) { - // Hit the end. - $this->events->parseError("Unexpected EOF in a comment."); - return TRUE; - } // Advance one, and test for '->' if ($this->scanner->next() == '-' diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index c057360..291bd0f 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -87,13 +87,18 @@ class TokenizerTest extends \HTML5\Tests\TestCase { } public function testBogusComment() { - $str = '</+this is a bogus comment. +>'; - $events = $this->parse($str . ' '); - $e0 = $events->get(0); - $this->assertEquals('error', $e0['name']); - $e1 = $events->get(1); - $this->assertEquals('comment', $e1['name']); - $this->assertEquals($str, $e1['data'][0]); + $bogus = array( + '</+this is a bogus comment. +>', + '<!+this is a bogus comment. !>', + ); + foreach ($bogus as $str) { + $events = $this->parse($str . ' '); + $e0 = $events->get(0); + $this->assertEquals('error', $e0['name']); + $e1 = $events->get(1); + $this->assertEquals('comment', $e1['name']); + $this->assertEquals($str, $e1['data'][0]); + } } public function testEndTag() { @@ -162,19 +167,27 @@ class TokenizerTest extends \HTML5\Tests\TestCase { '<!-- --$i -->' => ' --$i ', '<!----$i-->' => '--$i', '<!-- 1 > 0 -->' => ' 1 > 0 ', - '<!-- - Hello World. - -->' => "\nHello World\n", + "<!--\nHello World.\na-->" => "\nHello World.\na", '<!-- <!-- -->' => ' <!-- ', ); + foreach ($good as $test => $expected) { + $events = $this->parse($test); + $e1 = $events->get(0); + $this->assertEquals('comment', $e1['name'], 'Expected a comment for ' . $test); + $this->assertEquals($expected, $e1['data'][0]); + } + $fail = array( '<!-->' => '', '<!--Hello' => 'Hello', + "<!--\0Hello" => UTF8Utils::FFFD . 'Hello', ); + foreach ($fail as $test => $expected) { + $events = $this->parse($test); + $e0 = $events->get(0); + $this->assertEquals('error', $e0['name'], 'Expected an error for ' . $test . print_r($events, TRUE)); - foreach ($good as $test => $expected) { - $events = $this->parse($good); - $e1 = $events->get(0); + $e1 = $events->get(1); $this->assertEquals('comment', $e1['name'], 'Expected a comment for ' . $test); $this->assertEquals($expected, $e1['data'][0]); } |