summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Technosophos <[email protected]> 2013-04-15 17:26:04 -0500
committer: Technosophos <[email protected]> 2013-04-15 17:26:04 -0500
commit: 34718c58884fe5e7d645e4bf12d9c39d77cc8b4d (patch)
tree: 074b1b33fe3581907cacac04a0e8806dbf3a4612
parent: a4f2aabfb05e16d5a29f1256f82e94b5fbe73583 (diff)
First shot at DOCTYPE parsing and testing.
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 93
-rw-r--r-- test/HTML5/Parser/TokenizerTest.php 51
2 files changed, 138 insertions, 6 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 378af43..466cb5d 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -244,7 +244,7 @@ class Tokenizer {
}
// Doctype
elseif($tok == 'D') {
- return $this->doctype();
+ return $this->doctype('');
}
// CDATA section
elseif($tok == '[') {
@@ -411,6 +411,15 @@ class Tokenizer {
$this->scanner->unconsume(1);
return FALSE;
}
+
+ /**
+ * Parse a DOCTYPE.
+ *
+ * Parse a DOCTYPE declaration. This method has strong bearing on whether or
+ * not Quirksmode is enabled on the event handler.
+ *
+ * @todo This method is a little long. Should probably refactor.
+ */
protected function doctype() {
if ($this->scanner->current() != 'D') {
return FALSE;
@@ -427,7 +436,7 @@ class Tokenizer {
// EOF: die.
if ($tok === FALSE) {
- $this->events->doctype('html5','','', TRUE);
+ $this->events->doctype('html5',EventHandler::DOCTYPE_NONE,'', TRUE);
return $this->eof();
}
@@ -445,14 +454,88 @@ class Tokenizer {
// Lowercase ASCII, replace \0 with FFFD
$doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD));
- // If FALSE, emit a parse error.
+ $tok = $this->scanner->current();
+
+ // If FALSE, emit a parse error, DOCTYPE, and return.
+ if ($tok === FALSE) {
+ $this->parseError('Unexpected EOF in DOCTYPE declaration.');
+ $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, NULL, TRUE);
+ return TRUE;
+ }
+
+ // Short DOCTYPE, like <!DOCTYPE html>
+ if ($tok == '>') {
+ $this->events->doctype($doctypeName);
+ $this->scanner->next();
+ return TRUE;
+ }
+
+ $pub = strtoupper($this->scanner->getAsciiAlpha());
+ $white = strlen($this->scanner->whitespace());
+ $tok = $this->scanner->current();
+
+ // Get ID, and flag it as pub or system.
+ if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
+ // Get the sys ID.
+ $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
+ $id = $this->quotedString("\0>");
+ if ($id === FALSE) {
+ $this->events->doctype($doctypeName, $type, $pub, FALSE);
+ return FALSE;
+ }
+
+ // Well-formed complete DOCTYPE.
+ $this->scanner->whitespace();
+ if ($this->scanner->current() == '>') {
+ $this->events->doctype($doctypeName, $type, $id, FALSE);
+ return TRUE;
+ }
+
+ // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
+ // Throw away the junk, parse error, quirks mode, return TRUE.
+ $this->scanner->charsUntil(">");
+ $this->parseError("Malformed DOCTYPE.");
+ $this->events->doctype($doctypeName, $type, $id, TRUE);
+ return TRUE;
+ }
+
+ // Else it's a bogus DOCTYPE.
+ // Consume to > and trash.
+ $this->scanner->charsUntil('>');
- // Get pub and sys IDs
+ $this->parseError("Expected PUBLIC or SYSTEM. Got %s%s.", $pub);
+ $this->events->doctype($doctypeName, 0, NULL, TRUE);
+ return TRUE;
- // If >, end doctype
+ }
+ /**
+  * Utility for reading a quoted string.
+  *
+  * The scanner must be positioned on the opening quote (single or double).
+  * The closing quote is consumed when present; if the string is ended by
+  * one of the stop characters instead, a parse error is recorded and the
+  * text read so far is still returned.
+  *
+  * @param string $stopchars
+  *   Characters (in addition to a close-quote) that should stop the string.
+  *   E.g. sometimes '>' is higher precedence than '"' or "'".
+  * @return mixed
+  *   String if one is found (quotations omitted). FALSE if the current
+  *   character is not a single or double quote.
+  */
+ protected function quotedString($stopchars) {
+   $tok = $this->scanner->current();
+   // Compare $tok against BOTH quote characters. A bare "'" operand is a
+   // non-empty string and therefore always truthy, which would make every
+   // character look like an opening quote.
+   if ($tok === '"' || $tok === "'") {
+     $this->scanner->next();
+     // Read up to the matching quote or any of the caller's stop characters.
+     $ret = $this->scanner->charsUntil($tok . $stopchars);
+     if ($this->scanner->current() == $tok) {
+       // Step over the closing quote.
+       $this->scanner->next();
+     }
+     else {
+       // Parse error because no close quote.
+       $this->parseError("Expected %s, got %s", $tok, $this->scanner->current());
+     }
+     return $ret;
+   }
+   // Not positioned on a quote: there is no quoted string to read.
+   return FALSE;
}
+
/**
* Handle a CDATA section.
*/
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 26df4b5..349b713 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -17,7 +17,12 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
*/
public function assertEventEquals($type, $expects, $event) {
$this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, TRUE));
- $this->assertEquals($expects, $event['data'][0], "Event $type should equal $expects: " . print_r($event, TRUE));
+ if (is_array($expects)) {
+   // An array expectation is compared against the event's full data list.
+   // Render it with print_r(): interpolating an array into a string only
+   // yields "Array" and raises an "Array to string conversion" notice.
+   $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, TRUE) . ": " . print_r($event, TRUE));
+ }
+ else {
+   // A scalar expectation is compared against the first data item only.
+   $this->assertEquals($expects, $event['data'][0], "Event $type should equal $expects: " . print_r($event, TRUE));
+ }
}
/**
@@ -205,6 +210,50 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
}
}
+ /**
+  * Test DOCTYPE tokenization.
+  *
+  * Each table maps an input document to the data expected on the resulting
+  * 'doctype' event: name, ID type, ID, and the quirks-mode flag.
+  */
+ public function testDoctype() {
+   // Declarations that should produce a doctype event as the first event,
+   // with the quirks flag FALSE.
+   $good = array(
+     '<!DOCTYPE html>' => array('html', EventStack::DOCTYPE_NONE, NULL, FALSE),
+     "<!DOCTYPE\nhtml>" => array('html', EventStack::DOCTYPE_NONE, NULL, FALSE),
+     "<!DOCTYPE\fhtml>" => array('html', EventStack::DOCTYPE_NONE, NULL, FALSE),
+     '<!DOCTYPE html PUBLIC "foo bar">' => array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', FALSE),
+     "<!DOCTYPE html PUBLIC 'foo bar'>" => array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', FALSE),
+     '<!DOCTYPE html PUBLIC "foo bar" >' => array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', FALSE),
+     "<!DOCTYPE html \nPUBLIC\n'foo bar'>" => array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', FALSE),
+     '<!DOCTYPE html SYSTEM "foo bar">' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', FALSE),
+     "<!DOCTYPE html SYSTEM 'foo bar'>" => array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', FALSE),
+     '<!DOCTYPE html SYSTEM "foo/bar" >' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo/bar', FALSE),
+     "<!DOCTYPE html \nSYSTEM\n'foo bar'>" => array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', FALSE),
+   );
+
+   foreach ($good as $test => $expects) {
+     $events = $this->parse($test);
+     // Exactly two events are expected; the doctype event comes first.
+     $this->assertEquals(2, $events->depth(), "Counting events for '$test'");
+     $this->assertEventEquals('doctype', $expects, $events->get(0));
+   }
+
+   // Malformed or truncated declarations: a parse error is expected first,
+   // followed by a doctype event whose quirks flag is TRUE.
+   $bad = array(
+     '<!DOCTYPE>' => array(NULL, EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE >' => array(NULL, EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo' => array('foo', EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo PUB' => array('foo', EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo PUB>' => array('foo', EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo PUB "Looks good">' => array('foo', EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo SYSTME "Looks good"' => array('foo', EventStack::DOCTYPE_NONE, NULL, TRUE),
+     '<!DOCTYPE foo PUBLIC' => array('foo', EventStack::DOCTYPE_PUBLIC, NULL, TRUE),
+     '<!DOCTYPE foo PUBLIC>' => array('foo', EventStack::DOCTYPE_PUBLIC, NULL, TRUE),
+     '<!DOCTYPE foo SYSTEM' => array('foo', EventStack::DOCTYPE_SYSTEM, NULL, TRUE),
+     '<!DOCTYPE foo SYSTEM>' => array('foo', EventStack::DOCTYPE_SYSTEM, NULL, TRUE),
+     '<!DOCTYPE html SYSTEM "foo bar"' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', TRUE),
+     '<!DOCTYPE html SYSTEM "foo bar" more stuff>' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', TRUE),
+   );
+   foreach ($bad as $test => $expects) {
+     $events = $this->parse($test);
+     // Exactly three events are expected: the error, then the doctype.
+     $this->assertEquals(3, $events->depth(), "Counting events for '$test'");
+     $this->assertEventError($events->get(0));
+     $this->assertEventEquals('doctype', $expects, $events->get(1));
+   }
+ }
+
public function testText() {
$good = array(
'a<br>b',