summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Matt Butcher <[email protected]> 2014-02-21 09:42:49 -0700
committer: Matt Butcher <[email protected]> 2014-02-21 09:42:49 -0700
commit: c3ac1b36c0fab25eff589b19b35b55a229ca8ec4 (patch)
tree: 64fbdd7be8867085ca01a3b506650f0577fbd03f
parent: 4b3da0978f5a77d4ac45daf3fb04fbe438701fe6 (diff)
parent: 8f95f4ad58b96a7116083c847b247348ade279a7 (diff)
Merge pull request #29 from miso-belica/fix-wrong-attr-names
Ignore attributes with illegal characters in name
-rw-r--r-- src/HTML5/Parser/DOMTreeBuilder.php 28
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 25
-rw-r--r-- test/HTML5/Parser/TokenizerTest.php 9
3 files changed, 46 insertions, 16 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 13ae3bc..b79c298 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -5,9 +5,9 @@ use HTML5\Elements;
/**
* Create an HTML5 DOM tree from events.
*
- * This attempts to create a DOM from events emitted by a parser. This
- * attempts (but does not guarantee) to up-convert older HTML documents
- * to HTML5. It does this by applying HTML5's rules, but it will not
+ * This attempts to create a DOM from events emitted by a parser. This
+ * attempts (but does not guarantee) to up-convert older HTML documents
+ * to HTML5. It does this by applying HTML5's rules, but it will not
* change the architecture of the document itself.
*
* Many of the error correction and quirks features suggested in the specification
@@ -61,7 +61,7 @@ class DOMTreeBuilder implements EventHandler {
protected $insertMode = 0;
/**
- * Quirks mode is enabled by default. Any document that is missing the
+ * Quirks mode is enabled by default. Any document that is missing the
* DT will be considered to be in quirks mode.
*/
protected $quirks = TRUE;
@@ -71,7 +71,7 @@ class DOMTreeBuilder implements EventHandler {
public function __construct($isFragment = FALSE) {
$impl = new \DOMImplementation();
// XXX:
- // Create the doctype. For now, we are always creating HTML5
+ // Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html');
//$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
@@ -103,7 +103,7 @@ class DOMTreeBuilder implements EventHandler {
/**
* Get the DOM fragment for the body.
*
- * This returns a DOMNodeList because a fragment may have zero or more
+ * This returns a DOMNodeList because a fragment may have zero or more
* DOMNodes at its root.
*
* @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
@@ -141,7 +141,7 @@ class DOMTreeBuilder implements EventHandler {
}
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) {
- // This is used solely for setting quirks mode. Currently we don't
+ // This is used solely for setting quirks mode. Currently we don't
// try to preserve the inbound DT. We convert it to HTML5.
$this->quirks = $quirks;
@@ -234,7 +234,13 @@ class DOMTreeBuilder implements EventHandler {
$aName = Elements::normalizeMathMlAttribute($aName);
}
- $ele->setAttribute($aName, $aVal);
+ try {
+ $ele->setAttribute($aName, $aVal);
+ }
+ catch(\DOMException $e) {
+ $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
+ continue;
+ }
// This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') {
@@ -262,7 +268,7 @@ class DOMTreeBuilder implements EventHandler {
$this->insertMode = static::IM_IN_BODY;
}
- // Return the element mask, which the tokenizer can then use to set
+ // Return the element mask, which the tokenizer can then use to set
// various processing rules.
return Elements::element($name);
}
@@ -369,7 +375,7 @@ class DOMTreeBuilder implements EventHandler {
return;
}
- // Important: The processor may modify the current DOM tree however
+ // Important: The processor may modify the current DOM tree however
// it sees fit.
if (isset($this->processor)) {
$res = $this->processor->process($this->current, $name, $data);
@@ -402,7 +408,7 @@ class DOMTreeBuilder implements EventHandler {
protected function normalizeTagName($name) {
/* Section 2.9 suggests that we should not do this.
if (strpos($name, ':') !== FALSE) {
- // We know from the grammar that there must be at least one other
+ // We know from the grammar that there must be at least one other
// char besides :, since : is not a legal tag start.
$parts = explode(':', $name);
return array_pop($parts);
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index a79781f..f21d30b 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -414,16 +414,33 @@ class Tokenizer {
$name = $this->scanner->current();
$this->scanner->next();
}
- if (preg_match('/[\'\"]/', $name)) {
- //if (strspn($name, '\'\"')) {
+
+ $isValidAttribute = TRUE;
+ // Attribute names can contain most Unicode characters for HTML5.
+ // But method "DOMElement::setAttribute" is throwing exception
+ // because of its own internal restriction so these have to be filtered.
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23
+ // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
+ if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
$this->parseError("Unexpected characters in attribute name: %s", $name);
+ $isValidAttribute = FALSE;
+ }
+ // There is no limitation for 1st character in HTML5.
+ // But method "DOMElement::setAttribute" is throwing exception for the
+ // characters below so they have to be filtered.
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23
+ // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
+ else if (preg_match("/^[0-9.-]/u", $name)) {
+ $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
+ $isValidAttribute = FALSE;
}
// 8.1.2.3
$this->scanner->whitespace();
$val = $this->attributeValue();
- //return array($name, $val);
- $attributes[$name] = $val;
+ if($isValidAttribute) {
+ $attributes[$name] = $val;
+ }
return TRUE;
}
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 3d100e7..2a111bc 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -363,11 +363,18 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
// This will emit an entity lookup failure for &red.
"<foo a='blue&red'>" => array('foo', array('a' => 'blue&red'), FALSE),
"<foo a='blue&&amp;&red'>" => array('foo', array('a' => 'blue&&&red'), FALSE),
- '<foo b"="baz">' => array('foo', array('b"' => 'baz'), FALSE),
'<foo bar=>' => array('foo', array('bar' => NULL), FALSE),
'<foo bar="oh' => array('foo', array('bar' => 'oh'), FALSE),
'<foo bar=oh">' => array('foo', array('bar' => 'oh"'), FALSE),
+ // these attributes are ignored because of current implementation
+ // of method "DOMElement::setAttribute"
+ // see issue #23: https://github.com/Masterminds/html5-php/issues/23
+ '<foo b"="baz">' => array('foo', array(), FALSE),
+ '<foo 2abc="baz">' => array('foo', array(), FALSE),
+ '<foo ?="baz">' => array('foo', array(), FALSE),
+ '<foo foo?bar="baz">' => array('foo', array(), FALSE),
+
);
foreach ($bad as $test => $expects) {
$events = $this->parse($test);