diff options
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 79 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 44 |
2 files changed, 106 insertions, 17 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index d79f2c0..7411551 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -296,30 +296,75 @@ class Tokenizer { } /** + * Consume a tag name and body. * 8.2.4.10 */ protected function tagName() { - $name = $this->scanner->current(); - $tok = $this->scanner->next(); - switch ($tok) { - case "\n": - case "\t": - case "\f": - case ' ': - return $this->beforeAttribute(); - case '/': - return $this->selfClosingTag(); - case '>': + $tok = $this->scanner->current(); + if (!ctype_alpha($tok)) { + return FALSE; + } + + // We know this is at least one char. + $name = strtolower($this->scanner->charsUntil("/> \n\f\t")); + $attributes = array(); + $selfClose = FALSE; + do { + $this->attributes($attributes); + $this->scanner->whitespace(); + } + while (!$this->isTagEnd($selfClose)); + $this->events->startTag($name, $attributes, $selfClose); + // Should we do this? What does this buy that selfClose doesn't? + if ($selfClose) { + $this->events->endTag($name); } + + $this->scanner->next(); + + return TRUE; + } + + protected function isTagEnd(&$selfClose) { + $tok = $this->scanner->current(); + if ($tok == '/') { + $this->scanner->next(); + $this->scanner->whitespace(); + if ($this->scanner->current() == '>') { + $selfClose = TRUE; + $this->scanner->next(); + return TRUE; + } + // Basically, we skip the / token and go on. + // See 8.2.4.43. + $this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current()); + return FALSE; + } + + if ($this->scanner->current() == '>') { + return TRUE; + } + if ($this->scanner->current() === FALSE) { + $this->parseError("Unexpected EOF inside of tag."); + return TRUE; + } + return FALSE; - // tab, lf, ff, space -> before attr name - // / -> self-closing tag - // > -> current tag is done, data-state - // NULL parse error - // EOF -> parse error - // -> append to tagname + } + + + /** + * Parse attributes from inside of a tag. + */ + protected function attributes(&$attributes) { + $tok = $this->scanner->current(); + if ($tok == '/' || $tok == '>') { + return array(); + } + + return array(); } diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index c4c66e7..3065cbb 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -273,6 +273,50 @@ class TokenizerTest extends \HTML5\Tests\TestCase { } } + /** + * This tests just simple tags. + */ + public function testSimpleTags() { + $open = array( + '<foo>' => 'foo', + '<foo >' => 'foo', + "<foo\n\n\n\n>" => 'foo', + '<foo:bar>' => 'foo:bar', + ); + foreach ($open as $test => $expects) { + $events = $this->parse($test); + $this->assertEquals(2, $events->depth(), "Counting events for '$test'" . print_r($events, TRUE)); + $this->assertEventEquals('startTag', $expects, $events->get(0)); + } + $selfClose= array( + '<foo/>' => 'foo', + '<foo />' => 'foo', + "<foo\n\n\n\n/>" => 'foo', + '<foo:bar/>' => 'foo:bar', + ); + foreach ($selfClose as $test => $expects) { + $events = $this->parse($test); + $this->assertEquals(3, $events->depth(), "Counting events for '$test'" . print_r($events, TRUE)); + $this->assertEventEquals('startTag', $expects, $events->get(0)); + $this->assertEventEquals('endTag', $expects, $events->get(1)); + } + + $bad = array( + '<foo' => 'foo', + '<foo ' => 'foo', + '<foo/' => 'foo', + '<foo /' => 'foo', + ); + + foreach ($bad as $test => $expects) { + $events = $this->parse($test); + //fprintf(STDOUT, $test . PHP_EOL); + $this->assertEquals(3, $events->depth(), "Counting events for '$test': " . print_r($events, TRUE)); + $this->assertEventError($events->get(0)); + $this->assertEventEquals('startTag', $expects, $events->get(1)); + } + } + public function testText() { $good = array( 'a<br>b', |