CDATA handling is complete. DOCTYPE is begun.

author: Matt Butcher <[email protected]> 2013-04-12 21:33:17 -0500
committer: Matt Butcher <[email protected]> 2013-04-12 21:33:17 -0500
commit: 3d8562c11dd5e7591ea29562c43fb74939836b83 (patch)
tree: 38e0e3e6aed4934c97777db9b18ab9bc9b91c9a3 /src/HTML5/Parser/Tokenizer.php
parent: ffcfa507b081cf132db5b90c26bfad66d79a4eb4 (diff)
1 files changed, 189 insertions, 136 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 899e908..d8676c5 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -215,6 +215,9 @@ class Tokenizer {
       return FALSE;
     }
 
+    // Any buffered text data can go out now.
+    $this->flushBuffer();
+
     $this->scanner->next();
 
     return $this->markupDeclaration() ||
@@ -240,8 +243,10 @@ class Tokenizer {
       return $this->comment();
     }
     elseif($tok == 'D') {
+      $this->doctype();
     }
     elseif($tok == '[') {
+      $this->cdataSection();
     }
 
     // FINISH
@@ -250,38 +255,6 @@ class Tokenizer {
     return TRUE;
   }
 
-  protected function rcdata() {
-    // Ampersand
-    // <
-    // Null
-    // EOF
-    // Character
-  }
-
-  protected function rawtext() {
-    // < is a literal
-    // NULL is an error
-    // EOF
-    // Character data
-  }
-
-  protected function scriptData() {
-    // < is a literal
-    // NULL is an error
-    // EOF
-    // Character data
-  }
-
-  /**
-   * 8.2.4.7
-   */
-  protected function plaintext() {
-    // NULL -> parse error
-    // EOF -> eof
-    // -> Character data
-  }
-
-
   /**
    * Consume an end tag.
    * 8.2.4.9
@@ -347,108 +320,6 @@ class Tokenizer {
     // -> append to tagname
   }
 
-  /**
-   * 8.2.4.11
-   */
-  protected function rcdataLessThan() {
-    // / -> empty the tmp buffer and go to end-tag
-    // ->rcdata
-  }
-
-  /**
-   * 8.2.4.12
-   */
-  protected function rcdataEndTag() {
-    // A-Za-z: append to tagname
-    // -> rcdata state
-  }
-
-  /**
-   * 8.2.4.13
-   */
-  protected function rcdataEndTagName() {
-    // tab, lf, ff, space -> before attribute or treat as anything
-    // / -> self-closing tag
-    // > -> end tag, back to data
-    // A-Za-z -> append to tagname
-    // -> rcdata state
-  }
-
-  /**
-   * 8.2.4.14
-   */
-  protected function rawtextLessThan() {
-    // / -> rawtext endtag state
-    // -> rawtext
-  }
-
-  /**
-   * 8.2.4.15
-   */
-  protected function rawtextEndTagOpen() {
-    // A-Za-z -> rawtext
-    // ->rawtext
-  }
-
-  protected function rawtextEndTagName() {
-    // tab, lf, ff, space -> before attr name
-    //
-  }
-
-  protected function scriptLessThan(){
-  }
-  protected function scriptEndTagOpen() {
-  }
-  protected function scriptEndTagName() {
-  }
-  protected function scriptEscapeStart() {
-  }
-  protected function scriptEscapeStartDash() {
-  }
-  protected function scriptEscaped() {
-  }
-  protected function scriptEscapedDash() {
-  }
-  protected function scriptEscapedDashDash() {
-  }
-  protected function scriptEscapedLessThan() {
-  }
-  protected function scriptEscapedEndTagOpen() {
-  }
-  protected function scriptEscapedEndTagName() {
-  }
-  protected function scriptDoubleEscapeStart() {
-  }
-  protected function scriptDoubleEscaped() {
-  }
-  protected function scriptDoubleEscapedDash() {
-  }
-  protected function scriptDoubleEscapedDashDash() {
-  }
-  protected function scriptDoubleEscapedLessThan() {
-  }
-  protected function scriptDoubleEscapeEnd() {
-  }
-  protected function beforeAttributeName() {
-  }
-  protected function attributeName() {
-  }
-  protected function afterAttributeName() {
-  }
-  protected function beforeAttributeValue() {
-  }
-  protected function attributeValueDoubleQuote() {
-  }
-  protected function attributeValueSingleQuote() {
-  }
-  protected function attributeValueUnquoted() {
-  }
-  protected function characterReferenceInAttributeValue() {
-  }
-  protected function afterAttributeValueQuoted() {
-  }
-  protected function selfCloseingStartTag() {
-  }
 
   /**
    * Consume malformed markup as if it were a comment.
@@ -551,6 +422,190 @@ class Tokenizer {
 
     // Now we need to parse the DOCTYPE.
   }
+
+  /**
+   * Consume a CDATA section: verify the "[CDATA[" opener, gather characters until isCdataClose() succeeds, then pass the text to $this->events->cdata(). Returns FALSE at once if the current character is not '['; a malformed opener returns whatever bogusComment() returns; otherwise TRUE.
+   */
+  protected function cdataSection() {
+    if ($this->scanner->current() != '[') {
+      return FALSE;
+    }
+    $cdata = '';
+    $this->scanner->next();
+    // The letters read here must spell exactly 'CDATA' and be followed by a second '[' (charsWhile('CDAT') presumably returns the run of characters drawn from C/D/A/T -- TODO confirm against Scanner).
+    $chars = $this->scanner->charsWhile('CDAT');
+    if ($chars != 'CDATA' || $this->scanner->current() != '[') {
+      $this->parseError('Expected [CDATA[, got %s', $chars);
+      return $this->bogusComment('<![' . $chars);
+    }
+    // One character is appended per pass. NOTE(review): being a do/while, the first character after "[CDATA[" is appended before any close check, so an empty "<![CDATA[]]>" looks like it would be mis-read -- confirm.
+    $tok = $this->scanner->next();
+    do {
+      if ($tok === FALSE) {
+        $this->parseError('Unexpected EOF inside CDATA.');
+        $this->bogusComment('<![CDATA[' . $cdata);
+        return TRUE;
+      }
+      $cdata .= $tok;
+      $tok = $this->scanner->next();
+    }
+    while (!$this->isCdataClose());
+    // NOTE(review): isCdataClose() only peeks at the final '>', and nothing here advances past the terminator, so the trailing "]>" appears to be left for the caller -- confirm that is intended.
+    $this->events->cdata($cdata);
+    return TRUE;
+
+  }
+  /**
+   * Check whether the parser has reached the end of a CDATA section: TRUE when the current character is ']', the character returned by next() is also ']', and peek() reports '>'. On TRUE the scanner has been advanced by that one next() call; on FALSE after a leading ']' the advance is undone.
+   */
+  protected function isCdataClose() {
+    $tok = $this->scanner->current();
+    if ($tok != ']') {
+      return FALSE;
+    }
+    $tok = $this->scanner->next();
+    if ($tok == ']' && $this->scanner->peek() == '>') {
+      return TRUE;
+    }
+    // Not a complete "]]>": undo the next() above so the caller resumes from the first ']' (assumes unconsume() with no argument steps back exactly one character -- TODO confirm).
+    $this->scanner->unconsume();
+    return FALSE;
+  }
+
+  protected function rcdata() {
+    // Ampersand
+    // <
+    // Null
+    // EOF
+    // Character
+  }
+
+  protected function rawtext() {
+    // < is a literal
+    // NULL is an error
+    // EOF
+    // Character data
+  }
+
+  protected function scriptData() {
+    // < is a literal
+    // NULL is an error
+    // EOF
+    // Character data
+  }
+
+  /**
+   * 8.2.4.7
+   */
+  protected function plaintext() {
+    // NULL -> parse error
+    // EOF -> eof
+    // -> Character data
+  }
+
+
+  /**
+   * 8.2.4.11
+   */
+  protected function rcdataLessThan() {
+    // / -> empty the tmp buffer and go to end-tag
+    // ->rcdata
+  }
+
+  /**
+   * 8.2.4.12
+   */
+  protected function rcdataEndTag() {
+    // A-Za-z: append to tagname
+    // -> rcdata state
+  }
+
+  /**
+   * 8.2.4.13
+   */
+  protected function rcdataEndTagName() {
+    // tab, lf, ff, space -> before attribute or treat as anything
+    // / -> self-closing tag
+    // > -> end tag, back to data
+    // A-Za-z -> append to tagname
+    // -> rcdata state
+  }
+
+  /**
+   * 8.2.4.14
+   */
+  protected function rawtextLessThan() {
+    // / -> rawtext endtag state
+    // -> rawtext
+  }
+
+  /**
+   * 8.2.4.15
+   */
+  protected function rawtextEndTagOpen() {
+    // A-Za-z -> rawtext
+    // ->rawtext
+  }
+
+  protected function rawtextEndTagName() {
+    // tab, lf, ff, space -> before attr name
+    //
+  }
+
+  protected function scriptLessThan(){
+  }
+  protected function scriptEndTagOpen() {
+  }
+  protected function scriptEndTagName() {
+  }
+  protected function scriptEscapeStart() {
+  }
+  protected function scriptEscapeStartDash() {
+  }
+  protected function scriptEscaped() {
+  }
+  protected function scriptEscapedDash() {
+  }
+  protected function scriptEscapedDashDash() {
+  }
+  protected function scriptEscapedLessThan() {
+  }
+  protected function scriptEscapedEndTagOpen() {
+  }
+  protected function scriptEscapedEndTagName() {
+  }
+  protected function scriptDoubleEscapeStart() {
+  }
+  protected function scriptDoubleEscaped() {
+  }
+  protected function scriptDoubleEscapedDash() {
+  }
+  protected function scriptDoubleEscapedDashDash() {
+  }
+  protected function scriptDoubleEscapedLessThan() {
+  }
+  protected function scriptDoubleEscapeEnd() {
+  }
+  protected function beforeAttributeName() {
+  }
+  protected function attributeName() {
+  }
+  protected function afterAttributeName() {
+  }
+  protected function beforeAttributeValue() {
+  }
+  protected function attributeValueDoubleQuote() {
+  }
+  protected function attributeValueSingleQuote() {
+  }
+  protected function attributeValueUnquoted() {
+  }
+  protected function characterReferenceInAttributeValue() {
+  }
+  protected function afterAttributeValueQuoted() {
+  }
+  protected function selfCloseingStartTag() {
+  }
   protected function beforeDoctype() {
   }
   protected function doctypeName() {
@@ -581,8 +636,6 @@ class Tokenizer {
   }
   protected function bogusDoctype() {
   }
-  protected function cdataSection() {
-  }
 
 
   // ================================================================
author	Matt Butcher <[email protected]>	2013-04-12 21:33:17 -0500
committer	Matt Butcher <[email protected]>	2013-04-12 21:33:17 -0500
commit	3d8562c11dd5e7591ea29562c43fb74939836b83 (patch)
tree	38e0e3e6aed4934c97777db9b18ab9bc9b91c9a3 /src/HTML5/Parser/Tokenizer.php
parent	ffcfa507b081cf132db5b90c26bfad66d79a4eb4 (diff)