diff options
author | Matt Butcher <[email protected]> | 2013-04-12 21:33:17 -0500 |
---|---|---|
committer | Matt Butcher <[email protected]> | 2013-04-12 21:33:17 -0500 |
commit | 3d8562c11dd5e7591ea29562c43fb74939836b83 (patch) | |
tree | 38e0e3e6aed4934c97777db9b18ab9bc9b91c9a3 /src/HTML5/Parser/Tokenizer.php | |
parent | ffcfa507b081cf132db5b90c26bfad66d79a4eb4 (diff) |
CDATA handling is complete. DOCTYPE is begun.
Diffstat (limited to 'src/HTML5/Parser/Tokenizer.php')
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 325 |
1 file changed, 189 insertions, 136 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 899e908..d8676c5 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -215,6 +215,9 @@ class Tokenizer { return FALSE; } + // Any buffered text data can go out now. + $this->flushBuffer(); + $this->scanner->next(); return $this->markupDeclaration() || @@ -240,8 +243,10 @@ class Tokenizer { return $this->comment(); } elseif($tok == 'D') { + $this->doctype(); } elseif($tok == '[') { + $this->cdataSection(); } // FINISH @@ -250,38 +255,6 @@ class Tokenizer { return TRUE; } - protected function rcdata() { - // Ampersand - // < - // Null - // EOF - // Character - } - - protected function rawtext() { - // < is a literal - // NULL is an error - // EOF - // Character data - } - - protected function scriptData() { - // < is a literal - // NULL is an error - // EOF - // Character data - } - - /** - * 8.2.4.7 - */ - protected function plaintext() { - // NULL -> parse error - // EOF -> eof - // -> Character data - } - - /** * Consume an end tag. 
* 8.2.4.9 @@ -347,108 +320,6 @@ class Tokenizer { // -> append to tagname } - /** - * 8.2.4.11 - */ - protected function rcdataLessThan() { - // / -> empty the tmp buffer and go to end-tag - // ->rcdata - } - - /** - * 8.2.4.12 - */ - protected function rcdataEndTag() { - // A-Za-z: append to tagname - // -> rcdata state - } - - /** - * 8.2.4.13 - */ - protected function rcdataEndTagName() { - // tab, lf, ff, space -> before attribute or treat as anything - // / -> self-closing tag - // > -> end tag, back to data - // A-Za-z -> append to tagname - // -> rcdata state - } - - /** - * 8.2.4.14 - */ - protected function rawtextLessThan() { - // / -> rawtext endtag state - // -> rawtext - } - - /** - * 8.2.4.15 - */ - protected function rawtextEndTagOpen() { - // A-Za-z -> rawtext - // ->rawtext - } - - protected function rawtextEndTagName() { - // tab, lf, ff, space -> before attr name - // - } - - protected function scriptLessThan(){ - } - protected function scriptEndTagOpen() { - } - protected function scriptEndTagName() { - } - protected function scriptEscapeStart() { - } - protected function scriptEscapeStartDash() { - } - protected function scriptEscaped() { - } - protected function scriptEscapedDash() { - } - protected function scriptEscapedDashDash() { - } - protected function scriptEscapedLessThan() { - } - protected function scriptEscapedEndTagOpen() { - } - protected function scriptEscapedEndTagName() { - } - protected function scriptDoubleEscapeStart() { - } - protected function scriptDoubleEscaped() { - } - protected function scriptDoubleEscapedDash() { - } - protected function scriptDoubleEscapedDashDash() { - } - protected function scriptDoubleEscapedLessThan() { - } - protected function scriptDoubleEscapeEnd() { - } - protected function beforeAttributeName() { - } - protected function attributeName() { - } - protected function afterAttributeName() { - } - protected function beforeAttributeValue() { - } - protected function attributeValueDoubleQuote() { - 
} - protected function attributeValueSingleQuote() { - } - protected function attributeValueUnquoted() { - } - protected function characterReferenceInAttributeValue() { - } - protected function afterAttributeValueQuoted() { - } - protected function selfCloseingStartTag() { - } /** * Consume malformed markup as if it were a comment. @@ -551,6 +422,190 @@ class Tokenizer { // Now we need to parse the DOCTYPE. } + + /** + * Handle a CDATA section. + */ + protected function cdataSection() { + if ($this->scanner->current() != '[') { + return FALSE; + } + $cdata = ''; + $this->scanner->next(); + + $chars = $this->scanner->charsWhile('CDAT'); + if ($chars != 'CDATA' || $this->scanner->current() != '[') { + $this->parseError('Expected [CDATA[, got %s', $chars); + return $this->bogusComment('<![' . $chars); + } + + $tok = $this->scanner->next(); + do { + if ($tok === FALSE) { + $this->parseError('Unexpected EOF inside CDATA.'); + $this->bogusComment('<![CDATA[' . $cdata); + return TRUE; + } + $cdata .= $tok; + $tok = $this->scanner->next(); + } + while (!$this->isCdataClose()); + + $this->events->cdata($cdata); + return TRUE; + + } + /** + * Check whether the parser has reached the end of a CDATA section. + */ + protected function isCdataClose() { + $tok = $this->scanner->current(); + if ($tok != ']') { + return FALSE; + } + $tok = $this->scanner->next(); + if ($tok == ']' && $this->scanner->peek() == '>') { + return TRUE; + } + // Unconsume one char and return. 
+ $this->scanner->unconsume(); + return FALSE; + } + + protected function rcdata() { + // Ampersand + // < + // Null + // EOF + // Character + } + + protected function rawtext() { + // < is a literal + // NULL is an error + // EOF + // Character data + } + + protected function scriptData() { + // < is a literal + // NULL is an error + // EOF + // Character data + } + + /** + * 8.2.4.7 + */ + protected function plaintext() { + // NULL -> parse error + // EOF -> eof + // -> Character data + } + + + /** + * 8.2.4.11 + */ + protected function rcdataLessThan() { + // / -> empty the tmp buffer and go to end-tag + // ->rcdata + } + + /** + * 8.2.4.12 + */ + protected function rcdataEndTag() { + // A-Za-z: append to tagname + // -> rcdata state + } + + /** + * 8.2.4.13 + */ + protected function rcdataEndTagName() { + // tab, lf, ff, space -> before attribute or treat as anything + // / -> self-closing tag + // > -> end tag, back to data + // A-Za-z -> append to tagname + // -> rcdata state + } + + /** + * 8.2.4.14 + */ + protected function rawtextLessThan() { + // / -> rawtext endtag state + // -> rawtext + } + + /** + * 8.2.4.15 + */ + protected function rawtextEndTagOpen() { + // A-Za-z -> rawtext + // ->rawtext + } + + protected function rawtextEndTagName() { + // tab, lf, ff, space -> before attr name + // + } + + protected function scriptLessThan(){ + } + protected function scriptEndTagOpen() { + } + protected function scriptEndTagName() { + } + protected function scriptEscapeStart() { + } + protected function scriptEscapeStartDash() { + } + protected function scriptEscaped() { + } + protected function scriptEscapedDash() { + } + protected function scriptEscapedDashDash() { + } + protected function scriptEscapedLessThan() { + } + protected function scriptEscapedEndTagOpen() { + } + protected function scriptEscapedEndTagName() { + } + protected function scriptDoubleEscapeStart() { + } + protected function scriptDoubleEscaped() { + } + protected function 
scriptDoubleEscapedDash() { + } + protected function scriptDoubleEscapedDashDash() { + } + protected function scriptDoubleEscapedLessThan() { + } + protected function scriptDoubleEscapeEnd() { + } + protected function beforeAttributeName() { + } + protected function attributeName() { + } + protected function afterAttributeName() { + } + protected function beforeAttributeValue() { + } + protected function attributeValueDoubleQuote() { + } + protected function attributeValueSingleQuote() { + } + protected function attributeValueUnquoted() { + } + protected function characterReferenceInAttributeValue() { + } + protected function afterAttributeValueQuoted() { + } + protected function selfCloseingStartTag() { + } protected function beforeDoctype() { } protected function doctypeName() { @@ -581,8 +636,6 @@ class Tokenizer { } protected function bogusDoctype() { } - protected function cdataSection() { - } // ================================================================ |