summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/HTML5/Parser/EventHandler.php27
-rw-r--r--src/HTML5/Parser/Tokenizer.php7
-rw-r--r--test/HTML5/Parser/EventStack.php9
-rw-r--r--test/HTML5/Parser/TokenizerTest.php57
4 files changed, 80 insertions, 20 deletions
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php
index 8282cb7..ebb30b2 100644
--- a/src/HTML5/Parser/EventHandler.php
+++ b/src/HTML5/Parser/EventHandler.php
@@ -27,7 +27,32 @@ interface EventHandler {
/**
* A start tag.
*
- *
+ * IMPORTANT: The parser watches the return value of this event. If this returns
+ * an integer, the parser will switch TEXTMODE patterns according to the int.
+ *
+ * This is how the Tree Builder can tell the Tokenizer when a certain tag should
+ * cause the parser to go into RAW text mode.
+ *
+ * The HTML5 standard requires that the builder is the one that initiates this
+ * step, and this is the only way short of a circular reference that we can
+ * do that.
+ *
+ * Example: if a startTag event for a `script` tag is fired, and the startTag()
+ * implementation returns Tokenizer::TEXTMODE_RAW, then the tokenizer will
+ * switch into RAW text mode and consume data until it reaches a closing
+ * `script` tag.
+ *
+ * The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the
+ * closing tag is encountered. **This behavior may change.**
+ *
+ * @param string $name
+ * The tag name.
+ * @param array $attributes
+ * An array with all of the tag's attributes.
+ * @param boolean $selfClosing
+ * An indicator of whether or not this tag is self-closing (<foo/>)
+ * @return int|null
+ * One of the Tokenizer::TEXTMODE_* constants, or NULL (no return value) to leave the text mode unchanged.
*/
public function startTag($name, $attributes = array(), $selfClosing = FALSE);
/**
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 4f2f792..02bb328 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -303,11 +303,15 @@ class Tokenizer {
}
while (!$this->isTagEnd($selfClose));
- $this->events->startTag($name, $attributes, $selfClose);
+ $mode = $this->events->startTag($name, $attributes, $selfClose);
// Should we do this? What does this buy that selfClose doesn't?
if ($selfClose) {
$this->events->endTag($name);
}
+ elseif (is_int($mode)) {
+ //fprintf(STDOUT, "Event response says move into mode %d for tag %s", $mode, $name);
+ $this->setTextMode($mode, $name);
+ }
$this->scanner->next();
@@ -816,6 +820,7 @@ class Tokenizer {
}
// If we get here, we hit the EOF.
+ $this->parseError("Unexpected EOF during text read.");
return $buffer;
}
diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php
index f197855..c9ac20e 100644
--- a/test/HTML5/Parser/EventStack.php
+++ b/test/HTML5/Parser/EventStack.php
@@ -4,6 +4,12 @@ namespace HTML5\Parser;
/**
* This testing class gathers events from a parser and builds a stack of events.
* It is useful for checking the output of a tokenizer.
+ *
+ * IMPORTANT:
+ *
+ * The startTag event also kicks the parser into TEXTMODE_RAW when it encounters
+ * script or pre tags. This is to match the behavior required by the HTML5 spec,
+ * which says that the tree builder must tell the tokenizer when to switch states.
*/
class EventStack implements EventHandler {
protected $stack;
@@ -42,6 +48,9 @@ class EventStack implements EventHandler {
public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
$args = func_get_args();
$this->store('startTag', $args);
+ if ($name == 'pre' || $name == 'script') {
+ return Tokenizer::TEXTMODE_RAW;
+ }
}
public function endTag($name) {
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 69f90b9..7ec4c76 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -395,38 +395,59 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
'<pre>hello</pre</pre>' => 'hello</pre',
"<pre>\nhello</pre\n</pre>" => "\nhello</pre\n",
'<pre>&amp;</pre>' => '&amp;',
+ '<pre><!--not a comment--></pre>' => '<!--not a comment-->',
+ '<pre><![CDATA[not a comment]]></pre>' => '<![CDATA[not a comment]]>',
);
foreach ($good as $test => $expects) {
- list($tok, $events) = $this->createTokenizer($test);
-
- $tok->setTextMode(Tokenizer::TEXTMODE_RAW, 'pre');
- $tok->parse();
-
- //fprintf(STDOUT, "Test: %s\n", $test);
- fprintf(STDOUT, "Test: %s %s\n", $test, print_r($events, TRUE));
-
+ $events = $this->parse($test);
$this->assertEventEquals('startTag', 'pre', $events->get(0));
$this->assertEventEquals('text', $expects, $events->get(1));
$this->assertEventEquals('endTag', 'pre', $events->get(2));
}
$bad = array(
- '<pre>&amp;</pre' => '&amp;',
+ '<pre>&amp;</pre' => '&amp;</pre',
+ '<pre>Hello world' => 'Hello world',
);
+ foreach ($bad as $test => $expects) {
+ $events = $this->parse($test);
+ $this->assertEquals(4, $events->depth(), "Counting events for '$test': " . print_r($events, TRUE));
+ $this->assertEventEquals('startTag', 'pre', $events->get(0));
+ $this->assertEventError($events->get(1));
+ $this->assertEventEquals('text', $expects, $events->get(2));
+ }
}
public function testText() {
- $good = array(
- 'a<br>b',
- '<a>test</a>',
- 'a<![[ test ]]>b',
- 'a&amp;b',
- 'a&b',
- 'a& b& c',
- );
- $this->markTestIncomplete("Need tag parsing first.");
+ $events = $this->parse('a<br>b');
+ $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE));
+ $this->assertEventEquals('text', 'a', $events->get(0));
+ $this->assertEventEquals('startTag', 'br', $events->get(1));
+ $this->assertEventEquals('text', 'b', $events->get(2));
+
+ $events = $this->parse('<a>Test</a>');
+ $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE));
+ $this->assertEventEquals('startTag', 'a', $events->get(0));
+ $this->assertEventEquals('text', 'Test', $events->get(1));
+ $this->assertEventEquals('endTag', 'a', $events->get(2));
+
+ $events = $this->parse('a<![CDATA[test]]>b');
+ $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE));
+ $this->assertEventEquals('text', 'a', $events->get(0));
+ $this->assertEventEquals('cdata', 'test', $events->get(1));
+ $this->assertEventEquals('text', 'b', $events->get(2));
+
+ $events = $this->parse('a<!--test-->b');
+ $this->assertEquals(4, $events->depth(), "Events: " . print_r($events, TRUE));
+ $this->assertEventEquals('text', 'a', $events->get(0));
+ $this->assertEventEquals('comment', 'test', $events->get(1));
+ $this->assertEventEquals('text', 'b', $events->get(2));
+
+ $events = $this->parse('a&amp;b');
+ $this->assertEquals(2, $events->depth(), "Events: " . print_r($events, TRUE));
+ $this->assertEventEquals('text', 'a&b', $events->get(0));
}
// ================================================================