summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/Tokenizer.php
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-12 21:33:17 -0500
committerMatt Butcher <[email protected]>2013-04-12 21:33:17 -0500
commit3d8562c11dd5e7591ea29562c43fb74939836b83 (patch)
tree38e0e3e6aed4934c97777db9b18ab9bc9b91c9a3 /src/HTML5/Parser/Tokenizer.php
parentffcfa507b081cf132db5b90c26bfad66d79a4eb4 (diff)
CDATA handling is complete. DOCTYPE is begun.
Diffstat (limited to 'src/HTML5/Parser/Tokenizer.php')
-rw-r--r--src/HTML5/Parser/Tokenizer.php325
1 files changed, 189 insertions, 136 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 899e908..d8676c5 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -215,6 +215,9 @@ class Tokenizer {
return FALSE;
}
+ // Any buffered text data can go out now.
+ $this->flushBuffer();
+
$this->scanner->next();
return $this->markupDeclaration() ||
@@ -240,8 +243,10 @@ class Tokenizer {
return $this->comment();
}
elseif($tok == 'D') {
+ $this->doctype();
}
elseif($tok == '[') {
+ $this->cdataSection();
}
// FINISH
@@ -250,38 +255,6 @@ class Tokenizer {
return TRUE;
}
- protected function rcdata() {
- // Ampersand
- // <
- // Null
- // EOF
- // Character
- }
-
- protected function rawtext() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- protected function scriptData() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- /**
- * 8.2.4.7
- */
- protected function plaintext() {
- // NULL -> parse error
- // EOF -> eof
- // -> Character data
- }
-
-
/**
* Consume an end tag.
* 8.2.4.9
@@ -347,108 +320,6 @@ class Tokenizer {
// -> append to tagname
}
- /**
- * 8.2.4.11
- */
- protected function rcdataLessThan() {
- // / -> empty the tmp buffer and go to end-tag
- // ->rcdata
- }
-
- /**
- * 8.2.4.12
- */
- protected function rcdataEndTag() {
- // A-Za-z: append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.13
- */
- protected function rcdataEndTagName() {
- // tab, lf, ff, space -> before attribute or treat as anything
- // / -> self-closing tag
- // > -> end tag, back to data
- // A-Za-z -> append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.14
- */
- protected function rawtextLessThan() {
- // / -> rawtext endtag state
- // -> rawtext
- }
-
- /**
- * 8.2.4.15
- */
- protected function rawtextEndTagOpen() {
- // A-Za-z -> rawtext
- // ->rawtext
- }
-
- protected function rawtextEndTagName() {
- // tab, lf, ff, space -> before attr name
- //
- }
-
- protected function scriptLessThan(){
- }
- protected function scriptEndTagOpen() {
- }
- protected function scriptEndTagName() {
- }
- protected function scriptEscapeStart() {
- }
- protected function scriptEscapeStartDash() {
- }
- protected function scriptEscaped() {
- }
- protected function scriptEscapedDash() {
- }
- protected function scriptEscapedDashDash() {
- }
- protected function scriptEscapedLessThan() {
- }
- protected function scriptEscapedEndTagOpen() {
- }
- protected function scriptEscapedEndTagName() {
- }
- protected function scriptDoubleEscapeStart() {
- }
- protected function scriptDoubleEscaped() {
- }
- protected function scriptDoubleEscapedDash() {
- }
- protected function scriptDoubleEscapedDashDash() {
- }
- protected function scriptDoubleEscapedLessThan() {
- }
- protected function scriptDoubleEscapeEnd() {
- }
- protected function beforeAttributeName() {
- }
- protected function attributeName() {
- }
- protected function afterAttributeName() {
- }
- protected function beforeAttributeValue() {
- }
- protected function attributeValueDoubleQuote() {
- }
- protected function attributeValueSingleQuote() {
- }
- protected function attributeValueUnquoted() {
- }
- protected function characterReferenceInAttributeValue() {
- }
- protected function afterAttributeValueQuoted() {
- }
- protected function selfCloseingStartTag() {
- }
/**
* Consume malformed markup as if it were a comment.
@@ -551,6 +422,190 @@ class Tokenizer {
// Now we need to parse the DOCTYPE.
}
+
+ /**
+  * Handle a CDATA section.
+  *
+  * Expects the scanner to be positioned on the '[' that follows '<!'.
+  * The literal 'CDATA[' must come next; anything else is reported as a
+  * parse error and handed to bogusComment().
+  *
+  * Characters are then accumulated until isCdataClose() reports the
+  * ']]>' terminator, at which point the collected text is sent to the
+  * event handler's cdata() method. The terminator is tested BEFORE the
+  * first character is consumed, so an empty section ('<![CDATA[]]>')
+  * yields an empty string instead of running on to EOF.
+  *
+  * NOTE(review): this method does not itself advance past the closing
+  * ']]>'; confirm that the caller skips the remaining terminator
+  * characters.
+  *
+  * @return boolean
+  *   FALSE if the current character is not '['. If the 'CDATA[' opener is
+  *   malformed, the return value of bogusComment(). TRUE otherwise,
+  *   including when EOF is hit inside the section.
+  */
+ protected function cdataSection() {
+   if ($this->scanner->current() != '[') {
+     return FALSE;
+   }
+   $cdata = '';
+   $this->scanner->next();
+
+   // charsWhile() is given the set of letters in 'CDATA', so the exact
+   // spelling still has to be verified afterwards.
+   $chars = $this->scanner->charsWhile('CDAT');
+   if ($chars != 'CDATA' || $this->scanner->current() != '[') {
+     $this->parseError('Expected [CDATA[, got %s', $chars);
+     return $this->bogusComment('<![' . $chars);
+   }
+
+   $tok = $this->scanner->next();
+   // Check for the terminator before appending, so that a section with no
+   // content is closed immediately rather than swallowing its own ']]>'.
+   while (!$this->isCdataClose()) {
+     if ($tok === FALSE) {
+       $this->parseError('Unexpected EOF inside CDATA.');
+       $this->bogusComment('<![CDATA[' . $cdata);
+       return TRUE;
+     }
+     $cdata .= $tok;
+     $tok = $this->scanner->next();
+   }
+
+   $this->events->cdata($cdata);
+   return TRUE;
+ }
+ /**
+  * Check whether the parser has reached the end of a CDATA section.
+  *
+  * Looks for ']]>' beginning at the scanner's current character. When the
+  * current character is not ']' the scanner is left untouched. Otherwise
+  * the scanner is advanced by one character; if the sequence turns out not
+  * to be a terminator, that one character is unconsumed again.
+  *
+  * @return boolean
+  *   TRUE if the current character and the one after it are both ']' and
+  *   the character peeked after those is '>'; FALSE otherwise.
+  */
+ protected function isCdataClose() {
+   if ($this->scanner->current() != ']') {
+     return FALSE;
+   }
+
+   // Step onto the candidate second ']' and look ahead for the '>'.
+   $second = $this->scanner->next();
+   $closed = ($second == ']' && $this->scanner->peek() == '>');
+
+   if (!$closed) {
+     // Not a terminator: unconsume one char before returning.
+     $this->scanner->unconsume();
+   }
+   return $closed;
+ }
+
+ protected function rcdata() {
+ // Ampersand
+ // <
+ // Null
+ // EOF
+ // Character
+ }
+
+ protected function rawtext() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ protected function scriptData() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ /**
+ * 8.2.4.7
+ */
+ protected function plaintext() {
+ // NULL -> parse error
+ // EOF -> eof
+ // -> Character data
+ }
+
+
+ /**
+ * 8.2.4.11
+ */
+ protected function rcdataLessThan() {
+ // / -> empty the tmp buffer and go to end-tag
+ // ->rcdata
+ }
+
+ /**
+ * 8.2.4.12
+ */
+ protected function rcdataEndTag() {
+ // A-Za-z: append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.13
+ */
+ protected function rcdataEndTagName() {
+ // tab, lf, ff, space -> before attribute or treat as anything
+ // / -> self-closing tag
+ // > -> end tag, back to data
+ // A-Za-z -> append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.14
+ */
+ protected function rawtextLessThan() {
+ // / -> rawtext endtag state
+ // -> rawtext
+ }
+
+ /**
+ * 8.2.4.15
+ */
+ protected function rawtextEndTagOpen() {
+ // A-Za-z -> rawtext
+ // ->rawtext
+ }
+
+ protected function rawtextEndTagName() {
+ // tab, lf, ff, space -> before attr name
+ //
+ }
+
+ protected function scriptLessThan(){
+ }
+ protected function scriptEndTagOpen() {
+ }
+ protected function scriptEndTagName() {
+ }
+ protected function scriptEscapeStart() {
+ }
+ protected function scriptEscapeStartDash() {
+ }
+ protected function scriptEscaped() {
+ }
+ protected function scriptEscapedDash() {
+ }
+ protected function scriptEscapedDashDash() {
+ }
+ protected function scriptEscapedLessThan() {
+ }
+ protected function scriptEscapedEndTagOpen() {
+ }
+ protected function scriptEscapedEndTagName() {
+ }
+ protected function scriptDoubleEscapeStart() {
+ }
+ protected function scriptDoubleEscaped() {
+ }
+ protected function scriptDoubleEscapedDash() {
+ }
+ protected function scriptDoubleEscapedDashDash() {
+ }
+ protected function scriptDoubleEscapedLessThan() {
+ }
+ protected function scriptDoubleEscapeEnd() {
+ }
+ protected function beforeAttributeName() {
+ }
+ protected function attributeName() {
+ }
+ protected function afterAttributeName() {
+ }
+ protected function beforeAttributeValue() {
+ }
+ protected function attributeValueDoubleQuote() {
+ }
+ protected function attributeValueSingleQuote() {
+ }
+ protected function attributeValueUnquoted() {
+ }
+ protected function characterReferenceInAttributeValue() {
+ }
+ protected function afterAttributeValueQuoted() {
+ }
+ protected function selfCloseingStartTag() {
+ }
protected function beforeDoctype() {
}
protected function doctypeName() {
@@ -581,8 +636,6 @@ class Tokenizer {
}
protected function bogusDoctype() {
}
- protected function cdataSection() {
- }
// ================================================================