From 9351d1c046ed3a6a82db3daa7eaab2336d5b204d Mon Sep 17 00:00:00 2001 From: Technosophos Date: Fri, 19 Apr 2013 17:43:28 -0500 Subject: Full support for rawtext. Unit tests finished. --- src/HTML5/Parser/EventHandler.php | 27 ++++++++++++++++++++++++++- src/HTML5/Parser/Tokenizer.php | 7 ++++++- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php index 8282cb7..ebb30b2 100644 --- a/src/HTML5/Parser/EventHandler.php +++ b/src/HTML5/Parser/EventHandler.php @@ -27,7 +27,32 @@ interface EventHandler { /** * A start tag. * - * + * IMPORTANT: The parser watches the return value of this event. If this returns + * an integer, the parser will switch TEXTMODE patterns according to the int. + * + * This is how the Tree Builder can tell the Tokenizer when a certain tag should + * cause the parser to go into RAW text mode. + * + * The HTML5 standard requires that the builder is the one that initiates this + * step, and this is the only way short of a circular reference that we can + * do that. + * + * Example: if a startTag event for a `script` name is fired, and the startTag() + * implementation returns Tokenizer::TEXTMODE_RAW, then the tokenizer will + * switch into RAW text mode and consume data until it reaches a closing + * `script` tag. + * + * The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the + * closing tag is encountered. **This behavior may change.** + * + * @param string $name + * The tag name. + * @param array $attributes + * An array with all of the tag's attributes. + * @param boolean $selfClosing + * An indicator of whether or not this tag is self-closing (<foo/>) + * @return numeric + * One of the Tokenizer::TEXTMODE_* constants. 
*/ public function startTag($name, $attributes = array(), $selfClosing = FALSE); /** diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 4f2f792..02bb328 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -303,11 +303,15 @@ class Tokenizer { } while (!$this->isTagEnd($selfClose)); - $this->events->startTag($name, $attributes, $selfClose); + $mode = $this->events->startTag($name, $attributes, $selfClose); // Should we do this? What does this buy that selfClose doesn't? if ($selfClose) { $this->events->endTag($name); } + elseif (is_int($mode)) { + //fprintf(STDOUT, "Event response says move into mode %d for tag %s", $mode, $name); + $this->setTextMode($mode, $name); + } $this->scanner->next(); @@ -816,6 +820,7 @@ class Tokenizer { } // If we get here, we hit the EOF. + $this->parseError("Unexpected EOF during text read."); return $buffer; } -- cgit v1.2.3