diff options
-rw-r--r-- | src/HTML5/Parser/EventHandler.php | 2 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 325 | ||||
-rw-r--r-- | test/HTML5/Parser/EventStack.php | 4 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 35 |
4 files changed, 228 insertions, 138 deletions
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php index 8d200b1..76df740 100644 --- a/src/HTML5/Parser/EventHandler.php +++ b/src/HTML5/Parser/EventHandler.php @@ -39,6 +39,6 @@ interface EventHandler { public function parseError($msg, $line, $col); // Do we need... - // public function cdata(); + public function cdata($data); // public function processorInstruction(); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 899e908..d8676c5 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -215,6 +215,9 @@ class Tokenizer { return FALSE; } + // Any buffered text data can go out now. + $this->flushBuffer(); + $this->scanner->next(); return $this->markupDeclaration() || @@ -240,8 +243,10 @@ class Tokenizer { return $this->comment(); } elseif($tok == 'D') { + $this->doctype(); } elseif($tok == '[') { + $this->cdataSection(); } // FINISH @@ -250,38 +255,6 @@ class Tokenizer { return TRUE; } - protected function rcdata() { - // Ampersand - // < - // Null - // EOF - // Character - } - - protected function rawtext() { - // < is a literal - // NULL is an error - // EOF - // Character data - } - - protected function scriptData() { - // < is a literal - // NULL is an error - // EOF - // Character data - } - - /** - * 8.2.4.7 - */ - protected function plaintext() { - // NULL -> parse error - // EOF -> eof - // -> Character data - } - - /** * Consume an end tag. * 8.2.4.9 @@ -347,108 +320,6 @@ class Tokenizer { // -> append to tagname } - /** - * 8.2.4.11 - */ - protected function rcdataLessThan() { - // / -> empty the tmp buffer and go to end-tag - // ->rcdata - } - - /** - * 8.2.4.12 - */ - protected function rcdataEndTag() { - // A-Za-z: append to tagname - // -> rcdata state - } - - /** - * 8.2.4.13 - */ - protected function rcdataEndTagName() { - // tab, lf, ff, space -> before attribute or treat as anything - // / -> self-closing tag - // > -> end tag, back to data - // A-Za-z -> append to tagname - // -> rcdata state - } - - /** - * 8.2.4.14 - */ - protected function rawtextLessThan() { - // / -> rawtext endtag state - // -> rawtext - } - - /** - * 8.2.4.15 - */ - protected function rawtextEndTagOpen() { - // A-Za-z -> rawtext - // ->rawtext - } - - protected function rawtextEndTagName() { - // tab, lf, ff, space -> before attr name - // - } - - protected function scriptLessThan(){ - } - protected function scriptEndTagOpen() { - } - protected function scriptEndTagName() { - } - protected function scriptEscapeStart() { - } - protected function scriptEscapeStartDash() { - } - protected function scriptEscaped() { - } - protected function scriptEscapedDash() { - } - protected function scriptEscapedDashDash() { - } - protected function scriptEscapedLessThan() { - } - protected function scriptEscapedEndTagOpen() { - } - protected function scriptEscapedEndTagName() { - } - protected function scriptDoubleEscapeStart() { - } - protected function scriptDoubleEscaped() { - } - protected function scriptDoubleEscapedDash() { - } - protected function scriptDoubleEscapedDashDash() { - } - protected function scriptDoubleEscapedLessThan() { - } - protected function scriptDoubleEscapeEnd() { - } - protected function beforeAttributeName() { - } - protected function attributeName() { - } - protected function afterAttributeName() { - } - protected function beforeAttributeValue() { - } - protected function attributeValueDoubleQuote() { - } - protected function attributeValueSingleQuote() { - } - protected function attributeValueUnquoted() { - } - protected function characterReferenceInAttributeValue() { - } - protected function afterAttributeValueQuoted() { - } - protected function selfCloseingStartTag() { - } /** * Consume malformed markup as if it were a comment. @@ -551,6 +422,190 @@ class Tokenizer { // Now we need to parse the DOCTYPE. } + + /** + * Handle a CDATA section. + */ + protected function cdataSection() { + if ($this->scanner->current() != '[') { + return FALSE; + } + $cdata = ''; + $this->scanner->next(); + + $chars = $this->scanner->charsWhile('CDAT'); + if ($chars != 'CDATA' || $this->scanner->current() != '[') { + $this->parseError('Expected [CDATA[, got %s', $chars); + return $this->bogusComment('<![' . $chars); + } + + $tok = $this->scanner->next(); + do { + if ($tok === FALSE) { + $this->parseError('Unexpected EOF inside CDATA.'); + $this->bogusComment('<![CDATA[' . $cdata); + return TRUE; + } + $cdata .= $tok; + $tok = $this->scanner->next(); + } + while (!$this->isCdataClose()); + + $this->events->cdata($cdata); + return TRUE; + + } + /** + * Check whether the parser has reached the end of a CDATA section. + */ + protected function isCdataClose() { + $tok = $this->scanner->current(); + if ($tok != ']') { + return FALSE; + } + $tok = $this->scanner->next(); + if ($tok == ']' && $this->scanner->peek() == '>') { + return TRUE; + } + // Unconsume one char and return. + $this->scanner->unconsume(); + return FALSE; + } + + protected function rcdata() { + // Ampersand + // < + // Null + // EOF + // Character + } + + protected function rawtext() { + // < is a literal + // NULL is an error + // EOF + // Character data + } + + protected function scriptData() { + // < is a literal + // NULL is an error + // EOF + // Character data + } + + /** + * 8.2.4.7 + */ + protected function plaintext() { + // NULL -> parse error + // EOF -> eof + // -> Character data + } + + + /** + * 8.2.4.11 + */ + protected function rcdataLessThan() { + // / -> empty the tmp buffer and go to end-tag + // ->rcdata + } + + /** + * 8.2.4.12 + */ + protected function rcdataEndTag() { + // A-Za-z: append to tagname + // -> rcdata state + } + + /** + * 8.2.4.13 + */ + protected function rcdataEndTagName() { + // tab, lf, ff, space -> before attribute or treat as anything + // / -> self-closing tag + // > -> end tag, back to data + // A-Za-z -> append to tagname + // -> rcdata state + } + + /** + * 8.2.4.14 + */ + protected function rawtextLessThan() { + // / -> rawtext endtag state + // -> rawtext + } + + /** + * 8.2.4.15 + */ + protected function rawtextEndTagOpen() { + // A-Za-z -> rawtext + // ->rawtext + } + + protected function rawtextEndTagName() { + // tab, lf, ff, space -> before attr name + // + } + + protected function scriptLessThan(){ + } + protected function scriptEndTagOpen() { + } + protected function scriptEndTagName() { + } + protected function scriptEscapeStart() { + } + protected function scriptEscapeStartDash() { + } + protected function scriptEscaped() { + } + protected function scriptEscapedDash() { + } + protected function scriptEscapedDashDash() { + } + protected function scriptEscapedLessThan() { + } + protected function scriptEscapedEndTagOpen() { + } + protected function scriptEscapedEndTagName() { + } + protected function scriptDoubleEscapeStart() { + } + protected function scriptDoubleEscaped() { + } + protected function scriptDoubleEscapedDash() { + } + protected function scriptDoubleEscapedDashDash() { + } + protected function scriptDoubleEscapedLessThan() { + } + protected function scriptDoubleEscapeEnd() { + } + protected function beforeAttributeName() { + } + protected function attributeName() { + } + protected function afterAttributeName() { + } + protected function beforeAttributeValue() { + } + protected function attributeValueDoubleQuote() { + } + protected function attributeValueSingleQuote() { + } + protected function attributeValueUnquoted() { + } + protected function characterReferenceInAttributeValue() { + } + protected function afterAttributeValueQuoted() { + } + protected function selfCloseingStartTag() { + } protected function beforeDoctype() { } protected function doctypeName() { @@ -581,8 +636,6 @@ class Tokenizer { } protected function bogusDoctype() { } - protected function cdataSection() { - } // ================================================================ diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php index 4d82629..478ae60 100644 --- a/test/HTML5/Parser/EventStack.php +++ b/test/HTML5/Parser/EventStack.php @@ -52,6 +52,10 @@ class EventStack implements EventHandler { $this->store('comment', array($cdata)); } + public function cdata($data) { + $this->store('cdata', func_get_args()); + } + public function text($cdata) { //fprintf(STDOUT, "Received TEXT event with: " . $cdata); $this->store('text', array($cdata)); diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index ead02d0..40259ea 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -90,10 +90,17 @@ class TokenizerTest extends \HTML5\Tests\TestCase { $bogus = array( '</+this is a bogus comment. +>', '<!+this is a bogus comment. !>', + '<!D OCTYPE foo bar>', '<!DOCTYEP foo bar>', + '<![CADATA[ TEST ]]>', + '<![CDATA Hello ]]>', + '<![CDATA[ Hello [[>', + '<!CDATA[[ test ]]>', + '<![CDATA[', + '<![CDATA[hellooooo hello', ); foreach ($bogus as $str) { - $events = $this->parse($str . ' '); + $events = $this->parse($str); $e0 = $events->get(0); $this->assertEquals('error', $e0['name']); $e1 = $events->get(1); @@ -182,6 +189,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase { '<!-->' => '', '<!--Hello' => 'Hello', "<!--\0Hello" => UTF8Utils::FFFD . 'Hello', + '<!--' => '', ); foreach ($fail as $test => $expected) { $events = $this->parse($test); @@ -194,4 +202,29 @@ class TokenizerTest extends \HTML5\Tests\TestCase { } } + + public function testCDATASection() { + $good = array( + '<![CDATA[ This is a test. ]]>' => ' This is a test. ', + '<![CDATA[CDATA]]>' => 'CDATA', + '<![CDATA[ ]] > ]]>' => ' ]] > ', + '<![CDATA[ ]]>' => ' ', + ); + foreach ($good as $test => $expects) { + $events = $this->parse($test); + $e1 = $events->get(0); + $this->assertEquals('cdata', $e1['name'], "CDATA section for " . $test . print_r($events, TRUE)); + $this->assertEquals($expects, $e1['data'][0], "CDATA section for " . $test); + } + } + + public function testText() { + $good = array( + 'a<br>b', + '<a>test</a>', + 'a<![[ test ]]>b', + 'a&b', + ); + $this->markTestIncomplete("Need tag parsing first."); + } } |