From 8f95f4ad58b96a7116083c847b247348ade279a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=A1o=20Belica?= Date: Fri, 21 Feb 2014 11:31:44 +0100 Subject: Ignore attributes with illegal chars in name (fixes #23) This is necessary because method "DOMElement::setAttribute" throws exception for wrong names so DOM elements can't contain these attributes. --- src/HTML5/Parser/DOMTreeBuilder.php | 8 +++++++- src/HTML5/Parser/Tokenizer.php | 25 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index cecebaf..b79c298 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -234,7 +234,13 @@ class DOMTreeBuilder implements EventHandler { $aName = Elements::normalizeMathMlAttribute($aName); } - $ele->setAttribute($aName, $aVal); + try { + $ele->setAttribute($aName, $aVal); + } + catch(\DOMException $e) { + $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); + continue; + } // This is necessary on a non-DTD schema, like HTML5. if ($aName == 'id') { diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index a79781f..f21d30b 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -414,16 +414,33 @@ class Tokenizer { $name = $this->scanner->current(); $this->scanner->next(); } - if (preg_match('/[\'\"]/', $name)) { - //if (strspn($name, '\'\"')) { + + $isValidAttribute = TRUE; + // Attribute names can contain most Unicode characters for HTML5. + // But method "DOMElement::setAttribute" is throwing exception + // because of its own internal restriction so these have to be filtered. 
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) { $this->parseError("Unexpected characters in attribute name: %s", $name); + $isValidAttribute = FALSE; + } + // There is no limitation for 1st character in HTML5. + // But method "DOMElement::setAttribute" is throwing exception for the + // characters below so they have to be filtered. + // see issue #23: https://github.com/Masterminds/html5-php/issues/23 + // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name + else if (preg_match("/^[0-9.-]/u", $name)) { + $this->parseError("Unexpected character at the begining of attribute name: %s", $name); + $isValidAttribute = FALSE; } // 8.1.2.3 $this->scanner->whitespace(); $val = $this->attributeValue(); - //return array($name, $val); - $attributes[$name] = $val; + if($isValidAttribute) { + $attributes[$name] = $val; + } return TRUE; } -- cgit v1.2.3