diff options
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 28 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 25 | ||||
-rw-r--r-- | test/HTML5/Parser/TokenizerTest.php | 9 |
3 files changed, 46 insertions, 16 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 13ae3bc..b79c298 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -5,9 +5,9 @@ use HTML5\Elements; /** * Create an HTML5 DOM tree from events. * - * This attempts to create a DOM from events emitted by a parser. This - * attempts (but does not guarantee) to up-convert older HTML documents - * to HTML5. It does this by applying HTML5's rules, but it will not + * This attempts to create a DOM from events emitted by a parser. This + * attempts (but does not guarantee) to up-convert older HTML documents + * to HTML5. It does this by applying HTML5's rules, but it will not * change the architecture of the document itself. * * Many of the error correction and quirks features suggested in the specification @@ -61,7 +61,7 @@ class DOMTreeBuilder implements EventHandler { protected $insertMode = 0; /** - * Quirks mode is enabled by default. Any document that is missing the + * Quirks mode is enabled by default. Any document that is missing the * DT will be considered to be in quirks mode. */ protected $quirks = TRUE; @@ -71,7 +71,7 @@ class DOMTreeBuilder implements EventHandler { public function __construct($isFragment = FALSE) { $impl = new \DOMImplementation(); // XXX: - // Create the doctype. For now, we are always creating HTML5 + // Create the doctype. For now, we are always creating HTML5 // documents, and attempting to up-convert any older DTDs to HTML5. $dt = $impl->createDocumentType('html'); //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); @@ -103,7 +103,7 @@ class DOMTreeBuilder implements EventHandler { /** * Get the DOM fragment for the body. * - * This returns a DOMNodeList because a fragment may have zero or more + * This returns a DOMNodeList because a fragment may have zero or more * DOMNodes at its root. 
* * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context @@ -141,7 +141,7 @@ class DOMTreeBuilder implements EventHandler { } public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) { - // This is used solely for setting quirks mode. Currently we don't + // This is used solely for setting quirks mode. Currently we don't // try to preserve the inbound DT. We convert it to HTML5. $this->quirks = $quirks; @@ -234,7 +234,13 @@ class DOMTreeBuilder implements EventHandler { $aName = Elements::normalizeMathMlAttribute($aName); } - $ele->setAttribute($aName, $aVal); + try { + $ele->setAttribute($aName, $aVal); + } + catch(\DOMException $e) { + $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); + continue; + } // This is necessary on a non-DTD schema, like HTML5. if ($aName == 'id') { @@ -262,7 +268,7 @@ class DOMTreeBuilder implements EventHandler { $this->insertMode = static::IM_IN_BODY; } - // Return the element mask, which the tokenizer can then use to set + // Return the element mask, which the tokenizer can then use to set // various processing rules. return Elements::element($name); } @@ -369,7 +375,7 @@ class DOMTreeBuilder implements EventHandler { return; } - // Important: The processor may modify the current DOM tree however + // Important: The processor may modify the current DOM tree however // it sees fit. if (isset($this->processor)) { $res = $this->processor->process($this->current, $name, $data); @@ -402,7 +408,7 @@ class DOMTreeBuilder implements EventHandler { protected function normalizeTagName($name) { /* Section 2.9 suggests that we should not do this. if (strpos($name, ':') !== FALSE) { - // We know from the grammar that there must be at least one other + // We know from the grammar that there must be at least one other // char besides :, since : is not a legal tag start. 
$parts = explode(':', $name); return array_pop($parts); diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index a79781f..f21d30b 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -414,16 +414,33 @@ class Tokenizer { $name = $this->scanner->current(); $this->scanner->next(); } - if (preg_match('/[\'\"]/', $name)) { - //if (strspn($name, '\'\"')) { + + $isValidAttribute = TRUE; + // Attribute names can contain most Unicode characters for HTML5. + // But method "DOMElement::setAttribute" is throwing exception + // because of its own internal restriction so these have to be filtered. + // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { $this->parseError("Unexpected characters in attribute name: %s", $name); + $isValidAttribute = FALSE; + } + // There is no limitation for 1st character in HTML5. + // But method "DOMElement::setAttribute" is throwing exception for the + // characters below so they have to be filtered. 
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + else if (preg_match("/^[0-9.-]/u", $name)) { + $this->parseError("Unexpected character at the begining of attribute name: %s", $name); + $isValidAttribute = FALSE; } // 8.1.2.3 $this->scanner->whitespace(); $val = $this->attributeValue(); - //return array($name, $val); - $attributes[$name] = $val; + if($isValidAttribute) { + $attributes[$name] = $val; + } return TRUE; } diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index 3d100e7..2a111bc 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -363,11 +363,18 @@ class TokenizerTest extends \HTML5\Tests\TestCase { // This will emit an entity lookup failure for &red. "<foo a='blue&red'>" => array('foo', array('a' => 'blue&red'), FALSE), "<foo a='blue&&&red'>" => array('foo', array('a' => 'blue&&&red'), FALSE), - '<foo b"="baz">' => array('foo', array('b"' => 'baz'), FALSE), '<foo bar=>' => array('foo', array('bar' => NULL), FALSE), '<foo bar="oh' => array('foo', array('bar' => 'oh'), FALSE), '<foo bar=oh">' => array('foo', array('bar' => 'oh"'), FALSE), + // these attributes are ignored because of current implementation + // of method "DOMElement::setAttribute" + // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + '<foo b"="baz">' => array('foo', array(), FALSE), + '<foo 2abc="baz">' => array('foo', array(), FALSE), + '<foo ?="baz">' => array('foo', array(), FALSE), + '<foo foo?bar="baz">' => array('foo', array(), FALSE), + ); foreach ($bad as $test => $expects) { $events = $this->parse($test); |