1 files changed, 60 insertions, 27 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 1d786c5..d4bf539 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -31,6 +31,9 @@ class Tokenizer {
    */
   protected $text = '';
 
+  // When this goes to false, the parser stops.
+  protected $carryOn = TRUE;
+
   /**
    * Create a new tokenizer.
    *
@@ -53,9 +56,12 @@ class Tokenizer {
    * Main entry point.
    */
   public function parse() {
-    while ($this->consumeData()) {
-      $this->scanner->next();
+    $p = 0;
+    do {
+      $p = $this->scanner->position();
+      $this->consumeData();
     }
+    while ($this->carryOn);
   }
 
   /**
@@ -93,22 +99,43 @@ class Tokenizer {
    * HTML5 8.2.4.1
    */
   protected function consumeData() {
-
     // Character Ref
-    $this->characterReference();
+    $this->characterReference() ||
+      $this->tagOpen() ||
+      $this->eof() ||
+      $this->characterData();
 
-    // TagOpen
+    return $this->carryOn;
+  }
+
+  /**
+   * This buffers the current token as character data.
+   */
+  protected function characterData() {
+    $tok = $this->scanner->current();
+
+    // This should never happen...
+    if ($tok === FALSE) {
+      return FALSE;
+    }
     // Null
+    if ($tok == "\00") {
+      $this->parseError("Received NULL character.");
+    }
+    $this->buffer($tok);
+    $this->scanner->next();
+    return TRUE;
+  }
 
-    // EOF
+  protected function eof() {
     if ($this->scanner->current() === FALSE) {
-      // Flush any trailing text, and then throw an EOF.
+      //fprintf(STDOUT, "EOF");
       $this->flushText();
       $this->events->eof();
-      return FALSE;
+      $this->carryOn = FALSE;
+      return TRUE;
     }
-    // Character
-    return TRUE;
+    return FALSE;
   }
 
   /**
@@ -122,9 +149,12 @@ class Tokenizer {
    */
   protected function characterReference($inAttribute = FALSE) {
 
+    // If it fails this, it's definitely not an entity.
     if ($this->scanner->current() != '&') {
-      return;
+      return FALSE;
     }
+
+    // Next char after &.
     $tok = $this->scanner->next();
     $entity = '';
     $start = $this->scanner->position();
@@ -140,8 +170,7 @@ class Tokenizer {
     case '<':
       // Don't consume; just return. Spec says return nothing, but I 
       // think we have to append '&' to the string.
-      $this->buffer('&');
-      return;
+      return FALSE;
     case '#':
       // Consume and read a number
       $tok = $this->scanner->next();
@@ -155,7 +184,7 @@ class Tokenizer {
         if (empty($hex)) {
           //throw new ParseError("Expected &#xHEX;, got &#x" . $tok);
           $this->parseError("Expected &#xHEX;, got &#x" . $tok);
-          return;
+          return FALSE;
         }
         $entity = CharacterReference::lookupHex($hex);
       }
@@ -166,7 +195,7 @@ class Tokenizer {
         if (empty($numeric)) {
           //throw ParseError("Expected &#DIGITS;, got $#" . $tok);
           $this->parseError("Expected &#DIGITS;, got $#" . $tok);
-          return;
+          return FALSE;
         }
         $entity = CharacterReference::lookupDecimal($numeric);
       }
@@ -187,7 +216,8 @@ class Tokenizer {
     // We have an entity. We're done here.
     if ($tok == ';') {
       $this->buffer($entity);
-      return;
+      $this->scanner->next();
+      return TRUE;
     }
 
     // If in an attribute, then failing to match ; means unconsume the 
@@ -195,7 +225,7 @@ class Tokenizer {
     if ($inAttribute) {
       $this->scanner->unconsume($this->scanner->position() - $start);
       $this->buffer('&');
-      return;
+      return FALSE;
     }
 
     //throw new ParseError("Expected &ENTITY;, got &ENTITY (no trailing ;) " . $tok);
@@ -203,6 +233,19 @@ class Tokenizer {
 
   }
 
+  /**
+   * 8.2.4.8
+   */
+  protected function tagOpen() {
+    // ! -> markup declaration
+    // / -> end tagopen
+    // a-zA-Z -> tagname
+    // ? -> parse error
+    // -> Anything else is a parse error
+    //fprintf(STDOUT, '+');
+    return FALSE;
+  }
+
   protected function rcdata() {
     // Ampersand
     // <
@@ -234,16 +277,6 @@ class Tokenizer {
     // -> Character data
   }
 
-  /**
-   * 8.2.4.8
-   */
-  protected function tagOpen() {
-    // ! -> markup declaration
-    // / -> end tagopen
-    // a-zA-Z -> tagname
-    // ? -> parse error
-    // -> Anything else is a parse error
-  }
 
   /**
    * 8.2.4.9