summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/HTML5/Parser/EventHandler.php 2
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 325
-rw-r--r-- test/HTML5/Parser/EventStack.php 4
-rw-r--r-- test/HTML5/Parser/TokenizerTest.php 35
4 files changed, 228 insertions, 138 deletions
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php
index 8d200b1..76df740 100644
--- a/src/HTML5/Parser/EventHandler.php
+++ b/src/HTML5/Parser/EventHandler.php
@@ -39,6 +39,6 @@ interface EventHandler {
public function parseError($msg, $line, $col);
// Do we need...
- // public function cdata();
+ public function cdata($data);
// public function processorInstruction();
}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 899e908..d8676c5 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -215,6 +215,9 @@ class Tokenizer {
return FALSE;
}
+ // Any buffered text data can go out now.
+ $this->flushBuffer();
+
$this->scanner->next();
return $this->markupDeclaration() ||
@@ -240,8 +243,10 @@ class Tokenizer {
return $this->comment();
}
elseif($tok == 'D') {
+ $this->doctype();
}
elseif($tok == '[') {
+ $this->cdataSection();
}
// FINISH
@@ -250,38 +255,6 @@ class Tokenizer {
return TRUE;
}
- protected function rcdata() {
- // Ampersand
- // <
- // Null
- // EOF
- // Character
- }
-
- protected function rawtext() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- protected function scriptData() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- /**
- * 8.2.4.7
- */
- protected function plaintext() {
- // NULL -> parse error
- // EOF -> eof
- // -> Character data
- }
-
-
/**
* Consume an end tag.
* 8.2.4.9
@@ -347,108 +320,6 @@ class Tokenizer {
// -> append to tagname
}
- /**
- * 8.2.4.11
- */
- protected function rcdataLessThan() {
- // / -> empty the tmp buffer and go to end-tag
- // ->rcdata
- }
-
- /**
- * 8.2.4.12
- */
- protected function rcdataEndTag() {
- // A-Za-z: append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.13
- */
- protected function rcdataEndTagName() {
- // tab, lf, ff, space -> before attribute or treat as anything
- // / -> self-closing tag
- // > -> end tag, back to data
- // A-Za-z -> append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.14
- */
- protected function rawtextLessThan() {
- // / -> rawtext endtag state
- // -> rawtext
- }
-
- /**
- * 8.2.4.15
- */
- protected function rawtextEndTagOpen() {
- // A-Za-z -> rawtext
- // ->rawtext
- }
-
- protected function rawtextEndTagName() {
- // tab, lf, ff, space -> before attr name
- //
- }
-
- protected function scriptLessThan(){
- }
- protected function scriptEndTagOpen() {
- }
- protected function scriptEndTagName() {
- }
- protected function scriptEscapeStart() {
- }
- protected function scriptEscapeStartDash() {
- }
- protected function scriptEscaped() {
- }
- protected function scriptEscapedDash() {
- }
- protected function scriptEscapedDashDash() {
- }
- protected function scriptEscapedLessThan() {
- }
- protected function scriptEscapedEndTagOpen() {
- }
- protected function scriptEscapedEndTagName() {
- }
- protected function scriptDoubleEscapeStart() {
- }
- protected function scriptDoubleEscaped() {
- }
- protected function scriptDoubleEscapedDash() {
- }
- protected function scriptDoubleEscapedDashDash() {
- }
- protected function scriptDoubleEscapedLessThan() {
- }
- protected function scriptDoubleEscapeEnd() {
- }
- protected function beforeAttributeName() {
- }
- protected function attributeName() {
- }
- protected function afterAttributeName() {
- }
- protected function beforeAttributeValue() {
- }
- protected function attributeValueDoubleQuote() {
- }
- protected function attributeValueSingleQuote() {
- }
- protected function attributeValueUnquoted() {
- }
- protected function characterReferenceInAttributeValue() {
- }
- protected function afterAttributeValueQuoted() {
- }
- protected function selfCloseingStartTag() {
- }
/**
* Consume malformed markup as if it were a comment.
@@ -551,6 +422,190 @@ class Tokenizer {
// Now we need to parse the DOCTYPE.
}
+
+ /**
+  * Handle a CDATA section.
+  *
+  * Called with the scanner on the '[' that follows '<!'. Outcomes:
+  *  - current character is not '[': nothing is consumed, FALSE is returned;
+  *  - the text after '[' is not exactly 'CDATA[': a parse error is raised
+  *    and the input is handed to bogusComment(), whose result is returned;
+  *  - EOF is reached before the terminator: a parse error is raised, what
+  *    was collected goes to bogusComment(), and TRUE is returned;
+  *  - otherwise the collected text is sent to the event handler's cdata()
+  *    and TRUE is returned.
+  *
+  * The terminator is tested BEFORE each character is appended, so an empty
+  * section ('<![CDATA[]]>') or content that starts with ']]>' is closed
+  * immediately instead of having the terminator swallowed as data.
+  *
+  * NOTE(review): on success isCdataClose() only peeks at the final '>', so
+  * the scanner is left on the second ']'. Confirm the caller skips the
+  * rest of the delimiter.
+  *
+  * @return boolean
+  *   FALSE if this is not the start of a CDATA section; otherwise TRUE or
+  *   the result of bogusComment().
+  */
+ protected function cdataSection() {
+   if ($this->scanner->current() != '[') {
+     return FALSE;
+   }
+   $cdata = '';
+   $this->scanner->next();
+
+   // Only the letters C, D, A, T are collected; anything else stops the run.
+   $chars = $this->scanner->charsWhile('CDAT');
+   if ($chars != 'CDATA' || $this->scanner->current() != '[') {
+     $this->parseError('Expected [CDATA[, got %s', $chars);
+     return $this->bogusComment('<![' . $chars);
+   }
+
+   $tok = $this->scanner->next();
+   // isCdataClose() restores the scanner position when it reports FALSE, so
+   // $tok is still the current character inside the loop body.
+   while (!$this->isCdataClose()) {
+     if ($tok === FALSE) {
+       $this->parseError('Unexpected EOF inside CDATA.');
+       $this->bogusComment('<![CDATA[' . $cdata);
+       return TRUE;
+     }
+     $cdata .= $tok;
+     $tok = $this->scanner->next();
+   }
+
+   $this->events->cdata($cdata);
+   return TRUE;
+ }
+ /**
+  * Report whether the scanner is sitting on the ']]>' that ends a CDATA
+  * section.
+  *
+  * Scanner side effects: when TRUE is returned the scanner has advanced
+  * onto the second ']' (the '>' is only peeked at). When FALSE is returned
+  * the scanner is back where it started.
+  *
+  * @return boolean
+  *   TRUE if the current position begins ']]>', FALSE otherwise.
+  */
+ protected function isCdataClose() {
+   // Anything other than ']' cannot begin the terminator.
+   if ($this->scanner->current() != ']') {
+     return FALSE;
+   }
+
+   // Step forward once; a second ']' followed by '>' completes the match.
+   if ($this->scanner->next() == ']' && $this->scanner->peek() == '>') {
+     return TRUE;
+   }
+
+   // False alarm: unconsume the one character taken above.
+   $this->scanner->unconsume();
+   return FALSE;
+ }
+
+ /**
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ * The notes list the inputs this state is expected to distinguish.
+ */
+ protected function rcdata() {
+ // Ampersand
+ // <
+ // Null
+ // EOF
+ // Character
+ }
+
+ /**
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ */
+ protected function rawtext() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ /**
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ * The notes are the same as those in rawtext().
+ */
+ protected function scriptData() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ /**
+ * 8.2.4.7
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ * NOTE(review): the number above is presumably an HTML5 spec section
+ * reference -- confirm against the spec revision being targeted.
+ */
+ protected function plaintext() {
+ // NULL -> parse error
+ // EOF -> eof
+ // -> Character data
+ }
+
+
+ /**
+ * 8.2.4.11
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ * Each note reads "input -> intended next step"; a note with nothing
+ * before the arrow is the fall-through case.
+ */
+ protected function rcdataLessThan() {
+ // / -> empty the tmp buffer and go to end-tag
+ // ->rcdata
+ }
+
+ /**
+ * 8.2.4.12
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ */
+ protected function rcdataEndTag() {
+ // A-Za-z: append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.13
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ */
+ protected function rcdataEndTagName() {
+ // tab, lf, ff, space -> before attribute or treat as anything
+ // / -> self-closing tag
+ // > -> end tag, back to data
+ // A-Za-z -> append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.14
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ */
+ protected function rawtextLessThan() {
+ // / -> rawtext endtag state
+ // -> rawtext
+ }
+
+ /**
+ * 8.2.4.15
+ *
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ */
+ protected function rawtextEndTagOpen() {
+ // A-Za-z -> rawtext
+ // ->rawtext
+ }
+
+ /**
+ * Unimplemented placeholder: the body holds only notes, no statements.
+ * Unlike its neighbours it carries no section number, and the list of
+ * inputs below was left unfinished.
+ */
+ protected function rawtextEndTagName() {
+ // tab, lf, ff, space -> before attr name
+ //
+ }
+
+ // ----------------------------------------------------------------
+ // Every method from here down to selfCloseingStartTag() is an empty
+ // stub: no parameters, no statements, no return value.
+ // NOTE(review): the names look like tokenizer states for script data
+ // and attributes -- confirm the intended mapping before implementing.
+ // ----------------------------------------------------------------
+ protected function scriptLessThan(){
+ }
+ protected function scriptEndTagOpen() {
+ }
+ protected function scriptEndTagName() {
+ }
+ protected function scriptEscapeStart() {
+ }
+ protected function scriptEscapeStartDash() {
+ }
+ protected function scriptEscaped() {
+ }
+ protected function scriptEscapedDash() {
+ }
+ protected function scriptEscapedDashDash() {
+ }
+ protected function scriptEscapedLessThan() {
+ }
+ protected function scriptEscapedEndTagOpen() {
+ }
+ protected function scriptEscapedEndTagName() {
+ }
+ protected function scriptDoubleEscapeStart() {
+ }
+ protected function scriptDoubleEscaped() {
+ }
+ protected function scriptDoubleEscapedDash() {
+ }
+ protected function scriptDoubleEscapedDashDash() {
+ }
+ protected function scriptDoubleEscapedLessThan() {
+ }
+ protected function scriptDoubleEscapeEnd() {
+ }
+ protected function beforeAttributeName() {
+ }
+ protected function attributeName() {
+ }
+ protected function afterAttributeName() {
+ }
+ protected function beforeAttributeValue() {
+ }
+ protected function attributeValueDoubleQuote() {
+ }
+ protected function attributeValueSingleQuote() {
+ }
+ protected function attributeValueUnquoted() {
+ }
+ protected function characterReferenceInAttributeValue() {
+ }
+ protected function afterAttributeValueQuoted() {
+ }
+ // NOTE(review): "Closeing" is misspelled in the method name below; it is
+ // left untouched here because renaming would change the class interface.
+ protected function selfCloseingStartTag() {
+ }
protected function beforeDoctype() {
}
protected function doctypeName() {
@@ -581,8 +636,6 @@ class Tokenizer {
}
protected function bogusDoctype() {
}
- protected function cdataSection() {
- }
// ================================================================
diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php
index 4d82629..478ae60 100644
--- a/test/HTML5/Parser/EventStack.php
+++ b/test/HTML5/Parser/EventStack.php
@@ -52,6 +52,10 @@ class EventStack implements EventHandler {
$this->store('comment', array($cdata));
}
+ /**
+  * Record a 'cdata' event.
+  *
+  * Stores the payload as a one-element array, the same shape the sibling
+  * comment() and text() recorders use, rather than forwarding whatever
+  * extra arguments a caller happened to pass.
+  *
+  * @param mixed $data
+  *   The content reported for the CDATA section.
+  */
+ public function cdata($data) {
+   $this->store('cdata', array($data));
+ }
+
public function text($cdata) {
//fprintf(STDOUT, "Received TEXT event with: " . $cdata);
$this->store('text', array($cdata));
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index ead02d0..40259ea 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -90,10 +90,17 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
$bogus = array(
'</+this is a bogus comment. +>',
'<!+this is a bogus comment. !>',
+ '<!D OCTYPE foo bar>',
'<!DOCTYEP foo bar>',
+ '<![CADATA[ TEST ]]>',
+ '<![CDATA Hello ]]>',
+ '<![CDATA[ Hello [[>',
+ '<!CDATA[[ test ]]>',
+ '<![CDATA[',
+ '<![CDATA[hellooooo hello',
);
foreach ($bogus as $str) {
- $events = $this->parse($str . ' ');
+ $events = $this->parse($str);
$e0 = $events->get(0);
$this->assertEquals('error', $e0['name']);
$e1 = $events->get(1);
@@ -182,6 +189,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
'<!-->' => '',
'<!--Hello' => 'Hello',
"<!--\0Hello" => UTF8Utils::FFFD . 'Hello',
+ '<!--' => '',
);
foreach ($fail as $test => $expected) {
$events = $this->parse($test);
@@ -194,4 +202,29 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
}
}
+
+ /**
+  * Well-formed CDATA sections must yield a 'cdata' event as the first
+  * recorded event, carrying the section's content unchanged.
+  */
+ public function testCDATASection() {
+   // Markup => content expected in the event's first data slot.
+   $cases = array(
+     '<![CDATA[ This is a test. ]]>' => ' This is a test. ',
+     '<![CDATA[CDATA]]>' => 'CDATA',
+     '<![CDATA[ ]] > ]]>' => ' ]] > ',
+     '<![CDATA[ ]]>' => ' ',
+   );
+   foreach ($cases as $markup => $content) {
+     $stack = $this->parse($markup);
+     $first = $stack->get(0);
+     // The full event dump is appended to the message to ease debugging.
+     $this->assertEquals('cdata', $first['name'], "CDATA section for " . $markup . print_r($stack, TRUE));
+     $this->assertEquals($content, $first['data'][0], "CDATA section for " . $markup);
+   }
+ }
+
+ /**
+  * Placeholder for text-event tests. It asserts nothing yet and only
+  * marks itself incomplete.
+  */
+ public function testText() {
+   // Inputs set aside for when this test is written; nothing reads them yet.
+   $pending = array(
+     'a<br>b',
+     '<a>test</a>',
+     'a<![[ test ]]>b',
+     'a&amp;b',
+   );
+   $this->markTestIncomplete("Need tag parsing first.");
+ }
}