summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/Tokenizer.php
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-10 08:18:45 -0500
committerMatt Butcher <[email protected]>2013-04-10 08:18:45 -0500
commitb5ba917c1e142f1a1e3d4a23e1ef3e1d2dcf50e8 (patch)
tree3d8910ad9433589c19e03c94a7d0c4be9c35c31d /src/HTML5/Parser/Tokenizer.php
parent03f992f08d559f5fdae774cdae4b8dff0ea401ed (diff)
HIGHLY EXPERIMENTAL: Writing a new parser.
Diffstat (limited to 'src/HTML5/Parser/Tokenizer.php')
-rw-r--r--src/HTML5/Parser/Tokenizer.php327
1 files changed, 327 insertions, 0 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
new file mode 100644
index 0000000..d201fd8
--- /dev/null
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -0,0 +1,327 @@
+<?php
+namespace HTML5\Parser;
+
+class Tokenizer {
+ protected $scanner; // Input scanner; set from the constructor's first argument.
+ protected $events; // Event handler; set from the constructor's second argument.
+ protected $tok; // Token most recently obtained from the scanner (via next() or current()).
+
+ /**
+ * Buffer for text.
+ *
+ * characterReference() is the method intended to append to this buffer.
+ */
+ protected $text = '';
+
+  /**
+   * Bind the tokenizer to its collaborators.
+   *
+   * Both arguments are stored as given; nothing is read from the scanner
+   * and nothing is sent to the event handler at construction time.
+   *
+   * @param mixed $scanner
+   *   Object the tokenizer pulls tokens from. This class calls next(),
+   *   current(), position(), unconsume(), getHex(), getNumeric() and
+   *   getAsciiAlpha() on it.
+   * @param mixed $eventHandler
+   *   Handler kept in $this->events for later use.
+   */
+  public function __construct($scanner, $eventHandler) {
+    $this->events = $eventHandler;
+    $this->scanner = $scanner;
+  }
+
+  /**
+   * 8.2.4.1
+   *
+   * Read the next token from the scanner and dispatch on it. Only the
+   * character-reference branch is implemented; the remaining branches are
+   * listed as notes at the end of the body.
+   */
+  public function consumeData() {
+    // Scan a token and remember it: characterReference() decides what to do
+    // by inspecting $this->tok, so the scanned value must not be discarded.
+    $this->tok = $this->scanner->next();
+
+    // Character Ref
+    $this->characterReference();
+
+    // TagOpen
+    // Null
+    // EOF
+    // Character
+  }
+
+  /**
+   * 8.2.4.2
+   *
+   * If the current token is '&', advance past it, consume the character
+   * reference that follows, and append the result to the text buffer.
+   * Any other token is left untouched.
+   *
+   * @param bool $inAttr
+   *   Passed through to consumeCharacterReference(); TRUE when the reference
+   *   occurs inside an attribute value.
+   */
+  protected function characterReference($inAttr = FALSE) {
+    if ($this->tok != '&') {
+      return;
+    }
+
+    $this->tok = $this->scanner->next();
+    // Append to the text buffer property. (The previous "$$this->text" was a
+    // variable-variable expression and never touched the buffer.)
+    $this->text .= $this->consumeCharacterReference($inAttr);
+  }
+
+  /**
+   * Consume a character reference whose leading '&' was already consumed.
+   *
+   * Looks at $this->tok and recognises three shapes:
+   * - a bare ampersand (next token is missing, whitespace, '&' or '<');
+   * - a numeric reference: '#' followed by digits, or '#x' / '#X' followed
+   *   by hex digits;
+   * - a named reference, read with the scanner's getAsciiAlpha().
+   * Numeric and named references must be terminated by ';'.
+   *
+   * @param bool $inAttribute
+   *   When TRUE, a reference without a trailing ';' is unconsumed and
+   *   reported as a literal '&' instead of raising an error.
+   *
+   * @return mixed
+   *   '&' for a bare ampersand or for an unterminated reference inside an
+   *   attribute. Otherwise the value that was read: the hexdec() result for
+   *   a hex reference, or whatever the scanner returned for a decimal or
+   *   named reference.
+   *   NOTE(review): the value is not yet decoded into a character.
+   *
+   * @throws ParseException
+   *   When a numeric reference has no digits, or when the reference is not
+   *   terminated by ';' outside an attribute.
+   *   NOTE(review): ParseException is not defined in this file; it is
+   *   assumed to exist in this namespace -- confirm.
+   */
+  protected function consumeCharacterReference($inAttribute = FALSE) {
+    $entity = '';
+    $start = $this->scanner->position();
+
+    // Whitespace: Ignore
+    switch ($this->tok) {
+      case NULL:
+      case "\t":
+      case "\n":
+      case "\f":
+      case ' ':
+      case '&':
+      case '<':
+        // Don't consume; just return. Spec says return nothing, but I
+        // think we have to append '&' to the string.
+        return '&';
+
+      case '#':
+        // Consume and read a number
+        $this->tok = $this->scanner->next();
+        // X[0-9a-fA-F]+;
+        // x[0-9a-fA-F]+;
+        if ($this->tok == 'x' || $this->tok == 'X') {
+          $hex = $this->scanner->getHex();
+          $this->tok = $this->scanner->current();
+          // empty() alone would also reject the valid digit string "0".
+          if (empty($hex) && $hex !== '0') {
+            throw new ParseException("Expected &#xHEX;, got &#x" . $this->tok);
+          }
+          $entity = hexdec($hex);
+        }
+        // [0-9]+;
+        else {
+          $entity = $this->scanner->getNumeric();
+          $this->tok = $this->scanner->current();
+          // Test the value that was actually read; as above, "0" is valid.
+          if (empty($entity) && $entity !== '0') {
+            throw new ParseException("Expected &#DIGITS;, got &#" . $this->tok);
+          }
+        }
+        break;
+
+      default:
+        // Attempt to consume a string up to a ';'.
+        // [a-zA-Z0-9]+;
+        $entity = $this->scanner->getAsciiAlpha();
+        $this->tok = $this->scanner->current();
+    }
+
+    // We have an entity. We're done here.
+    if ($this->tok == ';') {
+      return $entity;
+    }
+
+    // If in an attribute, then failing to match ; means unconsume the
+    // entire string. Otherwise, failure to match is an error.
+    if ($inAttribute) {
+      $this->scanner->unconsume($this->scanner->position() - $start);
+      return '&';
+    }
+
+    throw new ParseException("Expected &ENTITY;, got &ENTITY (no trailing ;)");
+  }
+
+ protected function rcdata() { // Placeholder: no code yet. The notes below list the inputs still to be handled.
+ // Ampersand
+ // <
+ // Null
+ // EOF
+ // Character
+ }
+
+ protected function rawtext() { // Placeholder: no code yet. The notes below describe the planned handling per input.
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ protected function scriptData() { // Placeholder: no code yet. The notes are the same as in rawtext().
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ /**
+ * 8.2.4.7
+ *
+ * Placeholder: no code yet. The body comments map each input to its
+ * planned handling.
+ */
+ protected function plaintext() {
+ // NULL -> parse error
+ // EOF -> eof
+ // -> Character data
+ }
+
+ /**
+ * 8.2.4.8
+ *
+ * Placeholder: no code yet. The body comments map each input to its
+ * planned handling.
+ */
+ protected function tagOpen() {
+ // ! -> markup declaration
+ // / -> end tagopen
+ // a-zA-Z -> tagname
+ // ? -> parse error
+ // -> Anything else is a parse error
+ }
+
+ /**
+ * 8.2.4.9
+ *
+ * Placeholder: no code yet. The body comments map each input to its
+ * planned handling.
+ */
+ protected function endTagOpen() {
+ // a-zA-Z -> tagname
+ // > -> parse error
+ // EOF -> parse error
+ // -> parse error
+ }
+
+ /**
+ * 8.2.4.10
+ *
+ * Placeholder: no code yet. The body comments map each input to its
+ * planned handling.
+ */
+ protected function tagName() {
+ // tab, lf, ff, space -> before attr name
+ // / -> self-closing tag
+ // > -> current tag is done, data-state
+ // NULL parse error
+ // EOF -> parse error
+ // -> append to tagname
+ }
+
+ /**
+ * 8.2.4.11
+ *
+ * Placeholder: no code yet. The body comments list the planned transitions.
+ */
+ protected function rcdataLessThan() {
+ // / -> empty the tmp buffer and go to end-tag
+ // -> rcdata
+ }
+
+ /**
+ * 8.2.4.12
+ *
+ * Placeholder: no code yet. The body comments list the planned transitions.
+ */
+ protected function rcdataEndTag() {
+ // A-Za-z: append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.13
+ *
+ * Placeholder: no code yet. The body comments list the planned transitions.
+ */
+ protected function rcdataEndTagName() {
+ // tab, lf, ff, space -> before attribute or treat as anything
+ // / -> self-closing tag
+ // > -> end tag, back to data
+ // A-Za-z -> append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.14
+ *
+ * Placeholder: no code yet. The body comments list the planned transitions.
+ */
+ protected function rawtextLessThan() {
+ // / -> rawtext endtag state
+ // -> rawtext
+ }
+
+ /**
+ * 8.2.4.15
+ *
+ * Placeholder: no code yet. The body comments list the planned transitions.
+ */
+ protected function rawtextEndTagOpen() {
+ // A-Za-z -> rawtext
+ // -> rawtext
+ }
+
+ protected function rawtextEndTagName() { // Placeholder: no code yet.
+ // tab, lf, ff, space -> before attr name
+ // TODO: the remaining inputs have not been listed yet.
+ }
+
+ protected function scriptLessThan(){ // Empty placeholder; every script* method in this run is an unimplemented stub.
+ }
+ protected function scriptEndTagOpen() {
+ }
+ protected function scriptEndTagName() {
+ }
+ protected function scriptEscapeStart() {
+ }
+ protected function scriptEscapeStartDash() {
+ }
+ protected function scriptEscaped() {
+ }
+ protected function scriptEscapedDash() {
+ }
+ protected function scriptEscapedDashDash() {
+ }
+ protected function scriptEscapedLessThan() {
+ }
+ protected function scriptEscapedEndTagOpen() {
+ }
+ protected function scriptEscapedEndTagName() {
+ }
+ protected function scriptDoubleEscapeStart() {
+ }
+ protected function scriptDoubleEscaped() {
+ }
+ protected function scriptDoubleEscapedDash() {
+ }
+ protected function scriptDoubleEscapedDashDash() {
+ }
+ protected function scriptDoubleEscapedLessThan() {
+ }
+ protected function scriptDoubleEscapeEnd() {
+ }
+ protected function beforeAttributeName() { // Empty placeholder; every attribute-related method in this run is an unimplemented stub.
+ }
+ protected function attributeName() {
+ }
+ protected function afterAttributeName() {
+ }
+ protected function beforeAttributeValue() {
+ }
+ protected function attributeValueDoubleQuote() {
+ }
+ protected function attributeValueSingleQuote() {
+ }
+ protected function attributeValueUnquoted() {
+ }
+ protected function characterReferenceInAttributeValue() {
+ }
+ protected function afterAttributeValueQuoted() {
+ }
+ protected function selfCloseingStartTag() { // NOTE(review): "Closeing" looks like a misspelling of "Closing"; rename before callers appear.
+ }
+ protected function bogusComment() { // Empty placeholder; every comment-related method in this run is an unimplemented stub.
+ }
+ protected function markupDeclarationOpen() {
+ }
+ protected function commentStart() {
+ }
+ protected function commentStartDash() {
+ }
+ protected function comment() {
+ }
+ protected function commentEndDash() {
+ }
+ protected function commentEnd() {
+ }
+ protected function commentEndBangState() { // NOTE(review): the only method here whose name carries a "State" suffix; consider aligning it with its siblings.
+ }
+ protected function doctype() { // Empty placeholder; every doctype method in this run is an unimplemented stub.
+ }
+ protected function beforeDoctype() {
+ }
+ protected function doctypeName() {
+ }
+ protected function afterDoctypeName() {
+ }
+ protected function doctypePublicKeyword() {
+ }
+ protected function beforeDoctypePublicId() {
+ }
+ protected function doctypePublicIdDoubleQuoted() {
+ }
+ protected function doctypePublicIdSingleQuoted() {
+ }
+ protected function afterDoctypePublicId() {
+ }
+ protected function betweenDoctypePublicAndSystem() {
+ }
+ protected function afterDoctypeSystemKeyword() {
+ }
+ protected function beforeDoctypeSystemIdentifier() {
+ }
+ protected function doctypeSystemIdDoubleQuoted() {
+ }
+ protected function doctypeSystemIdSingleQuoted() {
+ }
+ protected function afterDoctypeSystemId() {
+ }
+ protected function bogusDoctype() {
+ }
+ protected function cdataSection() { // Empty placeholder; not yet implemented.
+ }
+
+
+
+
+
+}