From 95f3cf8d5735498e5de26cd81babecd076e4d6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=A1o=20Belica?= Date: Wed, 19 Feb 2014 19:53:04 +0100 Subject: Removed trailing whitespace --- src/HTML5/Parser/DOMTreeBuilder.php | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 13ae3bc..cecebaf 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -5,9 +5,9 @@ use HTML5\Elements; /** * Create an HTML5 DOM tree from events. * - * This attempts to create a DOM from events emitted by a parser. This - * attempts (but does not guarantee) to up-convert older HTML documents - * to HTML5. It does this by applying HTML5's rules, but it will not + * This attempts to create a DOM from events emitted by a parser. This + * attempts (but does not guarantee) to up-convert older HTML documents + * to HTML5. It does this by applying HTML5's rules, but it will not * change the architecture of the document itself. * * Many of the error correction and quirks features suggested in the specification @@ -61,7 +61,7 @@ class DOMTreeBuilder implements EventHandler { protected $insertMode = 0; /** - * Quirks mode is enabled by default. Any document that is missing the + * Quirks mode is enabled by default. Any document that is missing the * DT will be considered to be in quirks mode. */ protected $quirks = TRUE; @@ -71,7 +71,7 @@ class DOMTreeBuilder implements EventHandler { public function __construct($isFragment = FALSE) { $impl = new \DOMImplementation(); // XXX: - // Create the doctype. For now, we are always creating HTML5 + // Create the doctype. For now, we are always creating HTML5 // documents, and attempting to up-convert any older DTDs to HTML5. 
$dt = $impl->createDocumentType('html'); //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); @@ -103,7 +103,7 @@ class DOMTreeBuilder implements EventHandler { /** * Get the DOM fragment for the body. * - * This returns a DOMNodeList because a fragment may have zero or more + * This returns a DOMNodeList because a fragment may have zero or more * DOMNodes at its root. * * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context @@ -141,7 +141,7 @@ class DOMTreeBuilder implements EventHandler { } public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) { - // This is used solely for setting quirks mode. Currently we don't + // This is used solely for setting quirks mode. Currently we don't // try to preserve the inbound DT. We convert it to HTML5. $this->quirks = $quirks; @@ -262,7 +262,7 @@ class DOMTreeBuilder implements EventHandler { $this->insertMode = static::IM_IN_BODY; } - // Return the element mask, which the tokenizer can then use to set + // Return the element mask, which the tokenizer can then use to set // various processing rules. return Elements::element($name); } @@ -369,7 +369,7 @@ class DOMTreeBuilder implements EventHandler { return; } - // Important: The processor may modify the current DOM tree however + // Important: The processor may modify the current DOM tree however // it sees fit. if (isset($this->processor)) { $res = $this->processor->process($this->current, $name, $data); @@ -402,7 +402,7 @@ class DOMTreeBuilder implements EventHandler { protected function normalizeTagName($name) { /* Section 2.9 suggests that we should not do this. if (strpos($name, ':') !== FALSE) { - // We know from the grammar that there must be at least one other + // We know from the grammar that there must be at least one other // char besides :, since : is not a legal tag start. 
$parts = explode(':', $name); return array_pop($parts); -- cgit v1.2.3 From 8f95f4ad58b96a7116083c847b247348ade279a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=A1o=20Belica?= Date: Fri, 21 Feb 2014 11:31:44 +0100 Subject: Ignore attributes with illegal chars in name (fixes #23) This is necessary because method "DOMElement::setAttribute" throws exception for wrong names so DOM elements can't contain these attributes. --- src/HTML5/Parser/DOMTreeBuilder.php | 8 +++++++- src/HTML5/Parser/Tokenizer.php | 25 +++++++++++++++++++++---- test/HTML5/Parser/TokenizerTest.php | 9 ++++++++- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index cecebaf..b79c298 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -234,7 +234,13 @@ class DOMTreeBuilder implements EventHandler { $aName = Elements::normalizeMathMlAttribute($aName); } - $ele->setAttribute($aName, $aVal); + try { + $ele->setAttribute($aName, $aVal); + } + catch(\DOMException $e) { + $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); + continue; + } // This is necessary on a non-DTD schema, like HTML5. if ($aName == 'id') { diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index a79781f..f21d30b 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -414,16 +414,33 @@ class Tokenizer { $name = $this->scanner->current(); $this->scanner->next(); } - if (preg_match('/[\'\"]/', $name)) { - //if (strspn($name, '\'\"')) { + + $isValidAttribute = TRUE; + // Attribute names can contain most Unicode characters for HTML5. + // But method "DOMElement::setAttribute" is throwing exception + // because of it's own internal restriction so these have to be filtered. 
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { $this->parseError("Unexpected characters in attribute name: %s", $name); + $isValidAttribute = FALSE; + } + // There is no limitation for 1st character in HTML5. + // But method "DOMElement::setAttribute" is throwing exception for the + // characters below so they have to be filtered. + // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + else if (preg_match("/^[0-9.-]/u", $name)) { + $this->parseError("Unexpected character at the begining of attribute name: %s", $name); + $isValidAttribute = FALSE; } // 8.1.2.3 $this->scanner->whitespace(); $val = $this->attributeValue(); - //return array($name, $val); - $attributes[$name] = $val; + if($isValidAttribute) { + $attributes[$name] = $val; + } return TRUE; } diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index 3d100e7..2a111bc 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -363,11 +363,18 @@ class TokenizerTest extends \HTML5\Tests\TestCase { // This will emit an entity lookup failure for &red. 
"" => array('foo', array('a' => 'blue&red'), FALSE), "" => array('foo', array('a' => 'blue&&&red'), FALSE), - '' => array('foo', array('b"' => 'baz'), FALSE), '' => array('foo', array('bar' => NULL), FALSE), '' => array('foo', array('bar' => 'oh"'), FALSE), + // these attributes are ignored because of current implementation + // of method "DOMElement::setAttribute" + // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + '' => array('foo', array(), FALSE), + '' => array('foo', array(), FALSE), + '' => array('foo', array(), FALSE), + '' => array('foo', array(), FALSE), + ); foreach ($bad as $test => $expects) { $events = $this->parse($test); -- cgit v1.2.3