summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matt Butcher <[email protected]> 2013-04-15 09:41:52 -0500
committer: Matt Butcher <[email protected]> 2013-04-15 09:41:52 -0500
commit: 56e13d972eadec82cda8c9c59b2f597c38e1aafb (patch)
tree: 819efa8761b84020f9f8c58fbb427f67dccd7f32
parent: efb1bb1327f3fcbfe5c1fa515b73e0804b7cf9df (diff)
UNFINISHED: DOCTYPE parser is in progress.
-rw-r--r-- README.md 5
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 30
2 files changed, 34 insertions, 1 deletion
diff --git a/README.md b/README.md
index 5bc8090..ee1837f 100644
--- a/README.md
+++ b/README.md
@@ -116,3 +116,8 @@ html5lib project.
This software is released under the MIT license. The original html5lib
library was also released under the MIT license.
+
+See LICENSE.txt
+
+Certain files contain copyright assertions by specific individuals
+involved with html5lib. Those have been retained where appropriate.
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index bf484bd..378af43 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -422,7 +422,35 @@ class Tokenizer {
return $this->bogusComment('<!' . $chars);
}
- // Now we need to parse the DOCTYPE.
+ $this->scanner->whitespace();
+ $tok = $this->scanner->current();
+
+ // EOF: die.
+ if ($tok === FALSE) {
+ $this->events->doctype('html5','','', TRUE);
+ return $this->eof();
+ }
+
+ $doctypeName = '';
+
+ // NULL char: convert.
+ if ($tok === "\0") {
+ $this->parseError("Unexpected NULL character in DOCTYPE.");
+ $doctypeName .= UTF8::FFFD;
+ $tok = $this->scanner->next();
+ }
+
+ $stop = " \n\f>";
+ $doctypeName = $this->scanner->charsUntil($stop);
+ // Lowercase ASCII, replace \0 with FFFD
+ $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD));
+
+ // If FALSE, emit a parse error.
+
+ // Get pub and sys IDs
+
+ // If >, end doctype
+
}
/**