summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser
diff options
context:
space:
mode:
authorTechnosophos <[email protected]>2013-04-11 17:39:22 -0500
committerTechnosophos <[email protected]>2013-04-11 17:39:22 -0500
commitf0da96a373b912eb649954d7920a96c6284f9455 (patch)
tree2492b865ea4067f1a015c270158f8e0349bd9934 /src/HTML5/Parser
parent4b48113eed8e21ccc6b8b4bbb310cbecdd85af65 (diff)
Working on closing tag and bogus comments.
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r--src/HTML5/Parser/Scanner.php20
-rw-r--r--src/HTML5/Parser/Tokenizer.php79
2 files changed, 82 insertions, 17 deletions
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
index f952704..b58b662 100644
--- a/src/HTML5/Parser/Scanner.php
+++ b/src/HTML5/Parser/Scanner.php
@@ -13,6 +13,9 @@ class Scanner {
protected $is;
+ // Flipping this to TRUE will give minisculely more debugging info.
+ public $debug = FALSE;
+
/**
* Create a new Scanner.
*
@@ -54,6 +57,7 @@ class Scanner {
public function next() {
$this->is->next();
if ($this->is->valid()) {
+ if ($this->debug) fprintf(STDOUT, "> %s\n", $this->is->current());
return $this->is->current();
}
return FALSE;
@@ -137,6 +141,15 @@ class Scanner {
}
/**
+ * Consume whitespace.
+ *
+ * Whitespace in HTML5 is: formfeed, tab, newline, space.
+ */
+ public function whitespace() {
+ return $this->is->charsWhile("\n\t\f ");
+ }
+
+ /**
* Returns the current line that is being consumed.
*
* @return int
@@ -146,6 +159,13 @@ class Scanner {
return $this->is->currentLine();
}
+ public function charsUntil($mask) {
+ return $this->is->charsUntil($mask);
+ }
+ public function charsWhile($mask) {
+ return $this->is->charsWhile($mask);
+ }
+
/**
* Returns the current column of the current line that the tokenizer is at.
*
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index af8773f..0eae949 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -103,7 +103,7 @@ class Tokenizer {
protected function eof() {
if ($this->scanner->current() === FALSE) {
//fprintf(STDOUT, "EOF");
- $this->flushText();
+ $this->flushBuffer();
$this->events->eof();
$this->carryOn = FALSE;
return TRUE;
@@ -157,7 +157,7 @@ class Tokenizer {
$hex = $this->scanner->getHex();
if (empty($hex)) {
//throw new ParseError("Expected &#xHEX;, got &#x" . $tok);
- $this->parseError("Expected &#xHEX;, got &#x" . $tok);
+ $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
return FALSE;
}
$entity = CharacterReference::lookupHex($hex);
@@ -168,7 +168,7 @@ class Tokenizer {
$numeric = $this->scanner->getNumeric();
if (empty($numeric)) {
//throw ParseError("Expected &#DIGITS;, got $#" . $tok);
- $this->parseError("Expected &#DIGITS;, got $#" . $tok);
+ $this->parseError("Expected &#DIGITS;, got $#%s", $tok);
return FALSE;
}
$entity = CharacterReference::lookupDecimal($numeric);
@@ -180,7 +180,7 @@ class Tokenizer {
$cname = $this->scanner->getAsciiAlpha();
$entity = CharacterReference::lookupName($cname);
if ($entity == NULL) {
- $this->parseError("No match in entity table for " . $entity);
+ $this->parseError("No match in entity table for '%s'", $entity);
}
}
@@ -203,7 +203,7 @@ class Tokenizer {
}
//throw new ParseError("Expected &ENTITY;, got &ENTITY (no trailing ;) " . $tok);
- $this->parseError("Expected &ENTITY;, got &ENTITY (no trailing ;) " . $tok);
+ $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
}
@@ -218,7 +218,7 @@ class Tokenizer {
$this->scanner->next();
return $this->markupDeclaration() ||
- $this->endTagOpen() ||
+ $this->endTag() ||
$this->tagName() ||
$this->processingInstruction() ||
// This always returns false.
@@ -269,9 +269,10 @@ class Tokenizer {
/**
+ * Consume an end tag.
* 8.2.4.9
*/
- protected function endTagOpen() {
+ protected function endTag() {
if ($this->scanner->current() != '/') {
return FALSE;
}
@@ -282,20 +283,47 @@ class Tokenizer {
// EOF -> parse error
// -> parse error
if (!ctype_alpha($tok)) {
- $this->parseError("Expected tag name, got " . $tok);
+ $this->parseError("Expected tag name, got '%s'", $tok);
if ($tok == "\0" || $tok === FALSE) {
return FALSE;
}
- return $this->bogusComment();
+ return $this->bogusComment('</');
}
- return $this->tagName();
+ $name = $this->scanner->charsUntil("\n\f \t>");
+ // Trash whitespace.
+ $this->scanner->charsWhile("\n\f \t");
+
+ if ($this->scanner->current() != '>') {
+ $this->parseError("Expected >, got '%s'", $this->scanner->current());
+ // We just trash stuff until we get to the next tag close.
+ $this->scanner->charsUntil('>');
+ }
+
+ $this->events->endTag($name);
+ $this->scanner->next();
+ return TRUE;
+
}
/**
* 8.2.4.10
*/
protected function tagName() {
+ $name = $this->scanner->current();
+ $tok = $this->scanner->next();
+ switch ($tok) {
+ case "\n":
+ case "\t":
+ case "\f":
+ case ' ':
+ return $this->beforeAttribute();
+ case '/':
+ return $this->selfClosingTag();
+ case '>':
+
+
+ }
return FALSE;
// tab, lf, ff, space -> before attr name
// / -> self-closing tag
@@ -411,21 +439,31 @@ class Tokenizer {
/**
* Consume malformed markup as if it were a comment.
* 8.2.4.44
+ *
+ * The spec requires that the ENTIRE tag-like thing be enclosed inside of
+ * the comment. So this will generate comments like:
+ *
+ * &lt;!--&lt/+foo&gt;--&gt;
+ *
+ * @param string $leading
+ * Prepend any leading characters. This essentially
+ * negates the need to backtrack, but it's sort of
+ * a hack.
*/
- protected function bogusComment() {
+ protected function bogusComment($leading = '') {
// TODO: This can be done more efficiently when the
// scanner exposes a readUntil() method.
- $comment = '';
+ $comment = $leading;
$tok = $this->scanner->current();
do {
$comment .= $tok;
$tok = $this->scanner->next();
- fprintf(STDOUT, "> %s\n", $tok);
- } while ($tok !== FALSE || $tok != '>');
+ } while ($tok !== FALSE && $tok != '>');
$this->flushBuffer();
- $this->events->comment($comment);
+ $this->events->comment($comment . $tok);
+ $this->scanner->next();
return TRUE;
}
@@ -504,7 +542,7 @@ class Tokenizer {
* temporary text buffer. (The buffer is used to group as much PCDATA
* as we can instead of emitting lots and lots of TEXT events.)
*/
- protected function flushText() {
+ protected function flushBuffer() {
if (empty($this->text)) {
return;
}
@@ -515,7 +553,7 @@ class Tokenizer {
/**
* Add text to the temporary buffer.
*
- * @see flushText()
+ * @see flushBuffer()
*/
protected function buffer($str) {
$this->text .= $str;
@@ -528,6 +566,13 @@ class Tokenizer {
* characters.
*/
protected function parseError($msg) {
+ $args = func_get_args();
+
+ if (count($args) > 1) {
+ array_shift($args);
+ $msg = vsprintf($msg, $args);
+ }
+
$line = $this->scanner->currentLine();
$col = $this->scanner->columnOffset();
$this->events->parseError($msg, $line, $col);