From 2a38f56f3772f943be436c7b411c2ae5fac6cee6 Mon Sep 17 00:00:00 2001
From: Asmir Mustafic
Date: Mon, 28 Aug 2017 14:44:01 +0200
Subject: Fixes https://github.com/Masterminds/html5-php/issues/124

Reference: https://www.w3.org/TR/html52/syntax.html#character-reference-state

If the character reference was consumed as part of an attribute (return
state is either attribute value (double-quoted) state, attribute value
(single-quoted) state or attribute value (unquoted) state), and the last
character matched is not a U+003B SEMICOLON character (;), and the next
input character is either a U+003D EQUALS SIGN character (=) or an
alphanumeric ASCII character, then, for historical reasons, switch to the
character reference end state.

If the last character matched is not a U+003B SEMICOLON character (;),
this is a parse error.
---
 src/HTML5/Parser/Tokenizer.php | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'src/HTML5')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 45774b2..6f0eb47 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -1067,8 +1067,10 @@ class Tokenizer
                 }
                 $entity = CharacterReference::lookupDecimal($numeric);
             }
-        } // String entity.
-        else {
+        } elseif ($tok === '=' && $inAttribute) {
+            return '&';
+        } else { // String entity.
+
             // Attempt to consume a string up to a ';'.
             // [a-zA-Z0-9]+;
             $cname = $this->scanner->getAsciiAlphaNum();
@@ -1078,7 +1080,9 @@ class Tokenizer
             // and continue on as the & is not part of an entity. The & will
             // be converted to &amp; elsewhere.
             if ($entity == null) {
-                $this->parseError("No match in entity table for '%s'", $cname);
+                if (!$inAttribute || strlen($cname) === 0) {
+                    $this->parseError("No match in entity table for '%s'", $cname);
+                }
                 $this->scanner->unconsume($this->scanner->position() - $start);
                 return '&';
             }
-- 
cgit v1.2.3