summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 79
-rw-r--r-- test/HTML5/Parser/TokenizerTest.php 44
2 files changed, 106 insertions, 17 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index d79f2c0..7411551 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -296,30 +296,75 @@ class Tokenizer {
}
/**
+ * Consume a tag name and the rest of the tag, emitting startTag (and endTag when self-closed). Returns FALSE, emitting nothing, if the current char is not alphabetic; TRUE otherwise.
* 8.2.4.10
*/
protected function tagName() {
- $name = $this->scanner->current();
- $tok = $this->scanner->next();
- switch ($tok) {
- case "\n":
- case "\t":
- case "\f":
- case ' ':
- return $this->beforeAttribute();
- case '/':
- return $this->selfClosingTag();
- case '>':
+ $tok = $this->scanner->current();
+ if (!ctype_alpha($tok)) {
+ return FALSE;
+ }
+
+ // current() passed ctype_alpha() above, so the name should be at least one char; it is lowercased.
+ $name = strtolower($this->scanner->charsUntil("/> \n\f\t"));
+ $attributes = array();
+ $selfClose = FALSE;
+ do {
+ $this->attributes($attributes);
+ $this->scanner->whitespace();
+ }
+ while (!$this->isTagEnd($selfClose));
+ $this->events->startTag($name, $attributes, $selfClose);
+ // Open question: is a separate endTag event needed when startTag already receives $selfClose?
+ if ($selfClose) {
+ $this->events->endTag($name);
}
+
+ $this->scanner->next();
+
+ return TRUE;
+ }
+
+ /** Detect the end of a tag; sets $selfClose on "/>". Never consumes '>': tagName() advances past it. */
+ protected function isTagEnd(&$selfClose) {
+ $tok = $this->scanner->current();
+ if ($tok == '/') {
+ $this->scanner->next();
+ $this->scanner->whitespace();
+ $tok = $this->scanner->current();
+ if ($tok == '>') {
+ $selfClose = TRUE;
+ return TRUE;
+ }
+ if ($tok !== FALSE) {
+ // Stray '/' (8.2.4.43): report it and keep scanning. EOF instead falls through below.
+ $this->parseError("Unexpected '%s' inside of a tag.", $tok);
+ return FALSE;
+ }
+ }
+ if ($tok == '>') {
+ return TRUE;
+ }
+ if ($tok === FALSE) {
+ $this->parseError("Unexpected EOF inside of tag.");
+ return TRUE;
+ }
return FALSE;
- // tab, lf, ff, space -> before attr name
- // / -> self-closing tag
- // > -> current tag is done, data-state
- // NULL parse error
- // EOF -> parse error
- // -> append to tagname
+ }
+
+
+ /**
+ * Parse attributes from inside of a tag. Stub: the text is skipped, not parsed (quotes are not understood).
+ */
+ protected function attributes(&$attributes) {
+ $tok = $this->scanner->current();
+ if ($tok !== '/' && $tok !== '>' && $tok !== FALSE) {
+ // Skip the unparsed text so the do/while in tagName() always advances.
+ $this->scanner->charsUntil('/>');
+ }
+ return array();
+ }
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index c4c66e7..3065cbb 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -273,6 +273,50 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
}
}
+ /**
+ * Tests attribute-less tags: plain open tags, self-closing tags, and tags cut off by EOF.
+ */
+ public function testSimpleTags() {
+ $open = array(
+ '<foo>' => 'foo',
+ '<foo >' => 'foo',
+ "<foo\n\n\n\n>" => 'foo',
+ '<foo:bar>' => 'foo:bar',
+ );
+ foreach ($open as $test => $expects) {
+ $events = $this->parse($test);
+ $this->assertEquals(2, $events->depth(), "Counting events for '$test'" . print_r($events, TRUE));
+ $this->assertEventEquals('startTag', $expects, $events->get(0));
+ }
+ $selfClose= array(
+ '<foo/>' => 'foo',
+ '<foo />' => 'foo',
+ "<foo\n\n\n\n/>" => 'foo',
+ '<foo:bar/>' => 'foo:bar',
+ );
+ foreach ($selfClose as $test => $expects) {
+ $events = $this->parse($test);
+ $this->assertEquals(3, $events->depth(), "Counting events for '$test'" . print_r($events, TRUE));
+ $this->assertEventEquals('startTag', $expects, $events->get(0));
+ $this->assertEventEquals('endTag', $expects, $events->get(1));
+ }
+
+ $bad = array(
+ '<foo' => 'foo',
+ '<foo ' => 'foo',
+ '<foo/' => 'foo',
+ '<foo /' => 'foo',
+ );
+
+ foreach ($bad as $test => $expects) {
+ $events = $this->parse($test);
+ // Truncated input must yield the parse error first, then the startTag.
+ $this->assertEquals(3, $events->depth(), "Counting events for '$test': " . print_r($events, TRUE));
+ $this->assertEventError($events->get(0));
+ $this->assertEventEquals('startTag', $expects, $events->get(1));
+ }
+ }
+
public function testText() {
$good = array(
'a<br>b',