diff options
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 87 |
1 files changed, 60 insertions, 27 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 1d786c5..d4bf539 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -31,6 +31,9 @@ class Tokenizer { */ protected $text = ''; + // When this goes to false, the parser stops. + protected $carryOn = TRUE; + /** * Create a new tokenizer. * @@ -53,9 +56,12 @@ class Tokenizer { * Main entry point. */ public function parse() { - while ($this->consumeData()) { - $this->scanner->next(); + $p = 0; + do { + $p = $this->scanner->position(); + $this->consumeData(); } + while ($this->carryOn); } /** @@ -93,22 +99,43 @@ class Tokenizer { * HTML5 8.2.4.1 */ protected function consumeData() { - // Character Ref - $this->characterReference(); + $this->characterReference() || + $this->tagOpen() || + $this->eof() || + $this->characterData(); - // TagOpen + return $this->carryOn; + } + + /** + * This buffers the current token as character data. + */ + protected function characterData() { + $tok = $this->scanner->current(); + + // This should never happen... + if ($tok === FALSE) { + return FALSE; + } // Null + if ($tok == "\00") { + $this->parseError("Received NULL character."); + } + $this->buffer($tok); + $this->scanner->next(); + return TRUE; + } - // EOF + protected function eof() { if ($this->scanner->current() === FALSE) { - // Flush any trailing text, and then throw an EOF. + //fprintf(STDOUT, "EOF"); $this->flushText(); $this->events->eof(); - return FALSE; + $this->carryOn = FALSE; + return TRUE; } - // Character - return TRUE; + return FALSE; } /** @@ -122,9 +149,12 @@ class Tokenizer { */ protected function characterReference($inAttribute = FALSE) { + // If it fails this, it's definitely not an entity. if ($this->scanner->current() != '&') { - return; + return FALSE; } + + // Next char after &. $tok = $this->scanner->next(); $entity = ''; $start = $this->scanner->position(); @@ -140,8 +170,7 @@ class Tokenizer { case '<': // Don't consume; just return. Spec says return nothing, but I // think we have to append '&' to the string. - $this->buffer('&'); - return; + return FALSE; case '#': // Consume and read a number $tok = $this->scanner->next(); @@ -155,7 +184,7 @@ class Tokenizer { if (empty($hex)) { //throw new ParseError("Expected &#xHEX;, got &#x" . $tok); $this->parseError("Expected &#xHEX;, got &#x" . $tok); - return; + return FALSE; } $entity = CharacterReference::lookupHex($hex); } @@ -166,7 +195,7 @@ class Tokenizer { if (empty($numeric)) { //throw ParseError("Expected &#DIGITS;, got $#" . $tok); $this->parseError("Expected &#DIGITS;, got $#" . $tok); - return; + return FALSE; } $entity = CharacterReference::lookupDecimal($numeric); } @@ -187,7 +216,8 @@ class Tokenizer { // We have an entity. We're done here. if ($tok == ';') { $this->buffer($entity); - return; + $this->scanner->next(); + return TRUE; } // If in an attribute, then failing to match ; means unconsume the @@ -195,7 +225,7 @@ class Tokenizer { if ($inAttribute) { $this->scanner->unconsume($this->scanner->position() - $start); $this->buffer('&'); - return; + return FALSE; } //throw new ParseError("Expected &ENTITY;, got &ENTITY (no trailing ;) " . $tok); @@ -203,6 +233,19 @@ class Tokenizer { } + /** + * 8.2.4.8 + */ + protected function tagOpen() { + // ! -> markup declaration + // / -> end tagopen + // a-zA-Z -> tagname + // ? -> parse error + // -> Anything else is a parse error + //fprintf(STDOUT, '+'); + return FALSE; + } + protected function rcdata() { // Ampersand // < @@ -234,16 +277,6 @@ class Tokenizer { // -> Character data } - /** - * 8.2.4.8 - */ - protected function tagOpen() { - // ! -> markup declaration - // / -> end tagopen - // a-zA-Z -> tagname - // ? -> parse error - // -> Anything else is a parse error - } /** * 8.2.4.9 |