diff options
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 36 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 20 |
2 files changed, 41 insertions, 15 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 04baa10..a79781f 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -326,11 +326,18 @@ class Tokenizer { $attributes = array(); $selfClose = FALSE; - do { - $this->scanner->whitespace(); - $this->attribute($attributes); + // Handle attribute parse exceptions here so that we can + // react by trying to build a sensible parse tree. + try { + do { + $this->scanner->whitespace(); + $this->attribute($attributes); + } + while (!$this->isTagEnd($selfClose)); + } + catch (ParseError $e) { + $selfClose = FALSE; } - while (!$this->isTagEnd($selfClose)); $mode = $this->events->startTag($name, $attributes, $selfClose); // Should we do this? What does this buy that selfClose doesn't? @@ -390,6 +397,14 @@ class Tokenizer { return FALSE; } + if ($tok == '<') { + $this->parseError("Unexepcted '<' inside of attributes list."); + // Push the < back onto the stack. + $this->scanner->unconsume(); + // Let the caller figure out how to handle this. + throw new ParseError("Start tag inside of attribute."); + } + $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); if (strlen($name) == 0) { @@ -483,6 +498,7 @@ class Tokenizer { while (strspn($tok, $stoplist) == 0 && $tok !== FALSE) { if ($tok == '&') { $val .= $this->decodeCharacterReference(TRUE); + $tok = $this->scanner->current(); } else { if(strspn($tok, "\"'<=`") > 0) { @@ -774,7 +790,7 @@ class Tokenizer { * * XML processing instructions are supposed to be ignored in HTML5, * treated as "bogus comments". However, since we're not a user - * agent, we allow them. We consume until ?> and then issue a + * agent, we allow them. We consume until ?> and then issue a * EventListener::processingInstruction() event. */ protected function processingInstruction() { @@ -819,7 +835,7 @@ class Tokenizer { // ================================================================ /** - * Read from the input stream until we get to the desired sequene + * Read from the input stream until we get to the desired sequene * or hit the end of the input stream. */ protected function readUntilSequence($sequence) { @@ -849,11 +865,11 @@ class Tokenizer { * This will read the stream for the $sequence. If it's * found, this will return TRUE. If not, return FALSE. * Since this unconsumes any chars it reads, the caller - * will still need to read the next sequence, even if + * will still need to read the next sequence, even if * this returns TRUE. * * Example: $this->sequenceMatches('</script>') will - * see if the input stream is at the start of a + * see if the input stream is at the start of a * '</script>' string. */ protected function sequenceMatches($sequence) { @@ -902,7 +918,7 @@ class Tokenizer { /** * Emit a parse error. * - * A parse error always returns FALSE because it never consumes any + * A parse error always returns FALSE because it never consumes any * characters. */ protected function parseError($msg) { @@ -1008,7 +1024,7 @@ class Tokenizer { return $entity; } - // If in an attribute, then failing to match ; means unconsume the + // If in an attribute, then failing to match ; means unconsume the // entire string. Otherwise, failure to match is an error. if ($inAttribute) { $this->scanner->unconsume($this->scanner->position() - $start); diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index 9f335b0..a55250f 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -109,7 +109,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase { $e1 = $events->get(0); $this->assertEquals('error', $e1['name']); - // FIXME: Once the text processor is done, need to verify that the + // FIXME: Once the text processor is done, need to verify that the // tokens are transformed correctly into text. } @@ -139,12 +139,12 @@ class TokenizerTest extends \HTML5\Tests\TestCase { $succeed = array( '</a>' => 'a', '</test>' => 'test', - '</test + '</test >' => 'test', '</thisIsTheTagThatDoesntEndItJustGoesOnAndOnMyFriend>' => 'thisisthetagthatdoesntenditjustgoesonandonmyfriend', // See 8.2.4.10, which requires this and does not say error. - '</a<b>' => 'a<b', + '</a<b>' => 'a<b', ); $this->isAllGood('endTag', 2, $succeed); @@ -271,8 +271,8 @@ class TokenizerTest extends \HTML5\Tests\TestCase { public function testProcessorInstruction() { $good = array( '<?hph ?>' => 'hph', - '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '), - "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"), + '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '), + "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"), ); $this->isAllGood('pi', 2, $good); } @@ -379,6 +379,8 @@ class TokenizerTest extends \HTML5\Tests\TestCase { $reallyBad = array( '<foo ="bar">' => array('foo', array('=' => NULL, '"bar"' => NULL), FALSE), '<foo////>' => array('foo', array(), TRUE), + // character "&" in unquoted attribute shouldn't cause an infinite loop + '<foo bar=index.php?str=1&id=29>' => array('foo', array('bar' => 'index.php?str=1&id=29'), FALSE), ); foreach ($reallyBad as $test => $expects) { $events = $this->parse($test); @@ -387,6 +389,14 @@ class TokenizerTest extends \HTML5\Tests\TestCase { $this->assertEventError($events->get(1)); //$this->assertEventEquals('startTag', $expects, $events->get(1)); } + + // Regression: Malformed elements should be detected. + // '<foo baz="1" <bar></foo>' => array('foo', array('baz' => '1'), FALSE), + $events = $this->parse('<foo baz="1" <bar></foo>'); + $this->assertEventError($events->get(0)); + $this->assertEventEquals('startTag', array('foo', array('baz' => '1'), FALSE), $events->get(1)); + $this->assertEventEquals('startTag', array('bar', array(), FALSE), $events->get(2)); + $this->assertEventEquals('endTag', array('foo'), $events->get(3)); } public function testRawText() { |