summaryrefslogtreecommitdiff
path: root/src/HTML5
diff options
context:
space:
mode:
authorMišo Belica <[email protected]>2014-02-21 11:31:44 +0100
committerMišo Belica <[email protected]>2014-02-21 11:31:44 +0100
commit8f95f4ad58b96a7116083c847b247348ade279a7 (patch)
tree64fbdd7be8867085ca01a3b506650f0577fbd03f /src/HTML5
parent95f3cf8d5735498e5de26cd81babecd076e4d6bd (diff)
Ignore attributes with illegal chars in name (fixes #23)
This is necessary because the method "DOMElement::setAttribute" throws an exception for invalid names, so DOM elements can't contain these attributes.
Diffstat (limited to 'src/HTML5')
-rw-r--r--src/HTML5/Parser/DOMTreeBuilder.php8
-rw-r--r--src/HTML5/Parser/Tokenizer.php25
2 files changed, 28 insertions, 5 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index cecebaf..b79c298 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -234,7 +234,13 @@ class DOMTreeBuilder implements EventHandler {
$aName = Elements::normalizeMathMlAttribute($aName);
}
- $ele->setAttribute($aName, $aVal);
+ try {
+ $ele->setAttribute($aName, $aVal);
+ }
+ catch(\DOMException $e) {
+ $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
+ continue;
+ }
// This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') {
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index a79781f..f21d30b 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -414,16 +414,33 @@ class Tokenizer {
$name = $this->scanner->current();
$this->scanner->next();
}
- if (preg_match('/[\'\"]/', $name)) {
- //if (strspn($name, '\'\"')) {
+
+ $isValidAttribute = TRUE;
+ // Attribute names can contain most Unicode characters for HTML5.
+ // But method "DOMElement::setAttribute" is throwing exception
+ // because of its own internal restriction so these have to be filtered.
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23
+ // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
+ if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
$this->parseError("Unexpected characters in attribute name: %s", $name);
+ $isValidAttribute = FALSE;
+ }
+ // There is no limitation for 1st character in HTML5.
+ // But method "DOMElement::setAttribute" is throwing exception for the
+ // characters below so they have to be filtered.
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23
+ // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
+ else if (preg_match("/^[0-9.-]/u", $name)) {
+ $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
+ $isValidAttribute = FALSE;
}
// 8.1.2.3
$this->scanner->whitespace();
$val = $this->attributeValue();
- //return array($name, $val);
- $attributes[$name] = $val;
+ if($isValidAttribute) {
+ $attributes[$name] = $val;
+ }
return TRUE;
}