summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/HTML5/Parser/Tokenizer.php36
-rw-r--r--test/HTML5/Parser/TokenizerTest.php20
2 files changed, 41 insertions, 15 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 04baa10..a79781f 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -326,11 +326,18 @@ class Tokenizer {
$attributes = array();
$selfClose = FALSE;
- do {
- $this->scanner->whitespace();
- $this->attribute($attributes);
+ // Handle attribute parse exceptions here so that we can
+ // react by trying to build a sensible parse tree.
+ try {
+ do {
+ $this->scanner->whitespace();
+ $this->attribute($attributes);
+ }
+ while (!$this->isTagEnd($selfClose));
+ }
+ catch (ParseError $e) {
+ $selfClose = FALSE;
}
- while (!$this->isTagEnd($selfClose));
$mode = $this->events->startTag($name, $attributes, $selfClose);
// Should we do this? What does this buy that selfClose doesn't?
@@ -390,6 +397,14 @@ class Tokenizer {
return FALSE;
}
+ if ($tok == '<') {
+ $this->parseError("Unexepcted '<' inside of attributes list.");
+ // Push the < back onto the stack.
+ $this->scanner->unconsume();
+ // Let the caller figure out how to handle this.
+ throw new ParseError("Start tag inside of attribute.");
+ }
+
$name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
if (strlen($name) == 0) {
@@ -483,6 +498,7 @@ class Tokenizer {
while (strspn($tok, $stoplist) == 0 && $tok !== FALSE) {
if ($tok == '&') {
$val .= $this->decodeCharacterReference(TRUE);
+ $tok = $this->scanner->current();
}
else {
if(strspn($tok, "\"'<=`") > 0) {
@@ -774,7 +790,7 @@ class Tokenizer {
*
* XML processing instructions are supposed to be ignored in HTML5,
* treated as "bogus comments". However, since we're not a user
- * agent, we allow them. We consume until ?> and then issue a
+ * agent, we allow them. We consume until ?> and then issue an
* EventListener::processingInstruction() event.
*/
protected function processingInstruction() {
@@ -819,7 +835,7 @@ class Tokenizer {
// ================================================================
/**
- * Read from the input stream until we get to the desired sequene
+ * Read from the input stream until we get to the desired sequence
* or hit the end of the input stream.
*/
protected function readUntilSequence($sequence) {
@@ -849,11 +865,11 @@ class Tokenizer {
* This will read the stream for the $sequence. If it's
* found, this will return TRUE. If not, return FALSE.
* Since this unconsumes any chars it reads, the caller
- * will still need to read the next sequence, even if
+ * will still need to read the next sequence, even if
* this returns TRUE.
*
* Example: $this->sequenceMatches('</script>') will
- * see if the input stream is at the start of a
+ * see if the input stream is at the start of a
* '</script>' string.
*/
protected function sequenceMatches($sequence) {
@@ -902,7 +918,7 @@ class Tokenizer {
/**
* Emit a parse error.
*
- * A parse error always returns FALSE because it never consumes any
+ * A parse error always returns FALSE because it never consumes any
* characters.
*/
protected function parseError($msg) {
@@ -1008,7 +1024,7 @@ class Tokenizer {
return $entity;
}
- // If in an attribute, then failing to match ; means unconsume the
+ // If in an attribute, then failing to match ; means unconsume the
// entire string. Otherwise, failure to match is an error.
if ($inAttribute) {
$this->scanner->unconsume($this->scanner->position() - $start);
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 9f335b0..a55250f 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -109,7 +109,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
$e1 = $events->get(0);
$this->assertEquals('error', $e1['name']);
- // FIXME: Once the text processor is done, need to verify that the
+ // FIXME: Once the text processor is done, need to verify that the
// tokens are transformed correctly into text.
}
@@ -139,12 +139,12 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
$succeed = array(
'</a>' => 'a',
'</test>' => 'test',
- '</test
+ '</test
>' => 'test',
'</thisIsTheTagThatDoesntEndItJustGoesOnAndOnMyFriend>' =>
'thisisthetagthatdoesntenditjustgoesonandonmyfriend',
// See 8.2.4.10, which requires this and does not say error.
- '</a<b>' => 'a<b',
+ '</a<b>' => 'a<b',
);
$this->isAllGood('endTag', 2, $succeed);
@@ -271,8 +271,8 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
public function testProcessorInstruction() {
$good = array(
'<?hph ?>' => 'hph',
- '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '),
- "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"),
+ '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '),
+ "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"),
);
$this->isAllGood('pi', 2, $good);
}
@@ -379,6 +379,8 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
$reallyBad = array(
'<foo ="bar">' => array('foo', array('=' => NULL, '"bar"' => NULL), FALSE),
'<foo////>' => array('foo', array(), TRUE),
+ // character "&" in unquoted attribute shouldn't cause an infinite loop
+ '<foo bar=index.php?str=1&amp;id=29>' => array('foo', array('bar' => 'index.php?str=1&id=29'), FALSE),
);
foreach ($reallyBad as $test => $expects) {
$events = $this->parse($test);
@@ -387,6 +389,14 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
$this->assertEventError($events->get(1));
//$this->assertEventEquals('startTag', $expects, $events->get(1));
}
+
+ // Regression: Malformed elements should be detected.
+ // '<foo baz="1" <bar></foo>' => array('foo', array('baz' => '1'), FALSE),
+ $events = $this->parse('<foo baz="1" <bar></foo>');
+ $this->assertEventError($events->get(0));
+ $this->assertEventEquals('startTag', array('foo', array('baz' => '1'), FALSE), $events->get(1));
+ $this->assertEventEquals('startTag', array('bar', array(), FALSE), $events->get(2));
+ $this->assertEventEquals('endTag', array('foo'), $events->get(3));
}
public function testRawText() {