diff options
author | Asmir Mustafic <[email protected]> | 2018-11-08 09:18:28 +0100 |
---|---|---|
committer | Asmir Mustafic <[email protected]> | 2018-11-08 09:18:28 +0100 |
commit | 5c5634a0be9bba6851ca73fa778d2efd60169bfc (patch) | |
tree | 89497e309879303f539a06e3d462cbe7cfc3a4f4 /src/HTML5/Parser | |
parent | 9494e340adefaefc19439e9074d37b9ec957f4ca (diff) |
move sequenceMatches to the Scanner
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r-- | src/HTML5/Parser/Scanner.php | 24 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 46 |
2 files changed, 38 insertions, 32 deletions
```diff
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
index cb14a56..e81b3a9 100644
--- a/src/HTML5/Parser/Scanner.php
+++ b/src/HTML5/Parser/Scanner.php
@@ -62,6 +62,30 @@ class Scanner
     }
 
     /**
+     * Check if upcomming chars match the given sequence.
+     *
+     * This will read the stream for the $sequence. If it's
+     * found, this will return true. If not, return false.
+     * Since this unconsumes any chars it reads, the caller
+     * will still need to read the next sequence, even if
+     * this returns true.
+     *
+     * Example: $this->scanner->sequenceMatches('</script>') will
+     * see if the input stream is at the start of a
+     * '</script>' string.
+     *
+     * @param string $sequence
+     * @param bool $caseSensitive
+     *
+     * @return bool
+     */
+    public function sequenceMatches($sequence, $caseSensitive = true)
+    {
+        $portion = substr($this->data, $this->char, strlen($sequence));
+        return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0;
+    }
+
+    /**
      * Get the current position.
      *
      * @return int The current intiger byte position.
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index b413b52..7b0b3f3 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -263,7 +263,7 @@ class Tokenizer
         $txt = '';
 
         $caseSensitive = !Elements::isHtml5Element($this->untilTag);
-        while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
+        while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
             if ($tok == '&') {
                 $txt .= $this->decodeCharacterReference();
                 $tok = $this->scanner->current();
@@ -313,12 +313,13 @@ class Tokenizer
      */
     protected function characterReference()
     {
-        $ref = $this->decodeCharacterReference();
-        if ($ref !== false) {
-            $this->buffer($ref);
-            return true;
+        if ($this->scanner->current() !== '&') {
+            return false;
         }
-        return false;
+
+        $ref = $this->decodeCharacterReference();
+        $this->buffer($ref);
+        return true;
     }
 
     /**
@@ -892,7 +893,7 @@ class Tokenizer
             }
             $cdata .= $tok;
             $tok = $this->scanner->next();
-        } while (! $this->sequenceMatches(']]>'));
+        } while (! $this->scanner->sequenceMatches(']]>'));
 
         // Consume ]]>
         $this->scanner->consume(3);
@@ -972,7 +973,7 @@ class Tokenizer
             $buffer .= $this->scanner->charsUntil($first);
 
             // Stop as soon as we hit the stopping condition.
-            if ($this->sequenceMatches($sequence, false)) {
+            if ($this->scanner->sequenceMatches($sequence, false)) {
                 return $buffer;
             }
             $buffer .= $this->scanner->current();
@@ -993,7 +994,7 @@ class Tokenizer
      * will still need to read the next sequence, even if
      * this returns true.
      *
-     * Example: $this->sequenceMatches('</script>') will
+     * Example: $this->scanner->sequenceMatches('</script>') will
      * see if the input stream is at the start of a
      * '</script>' string.
      *
@@ -1004,22 +1005,9 @@ class Tokenizer
      */
     protected function sequenceMatches($sequence, $caseSensitive = true)
     {
-        $len = strlen($sequence);
-        $buffer = '';
-        for ($i = 0; $i < $len; ++ $i) {
-            $tok = $this->scanner->current();
-            $buffer .= $tok;
+        @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED);
 
-            // EOF. Rewind and let the caller handle it.
-            if ($tok === false) {
-                $this->scanner->unconsume($i);
-                return false;
-            }
-            $this->scanner->next();
-        }
-
-        $this->scanner->unconsume($len);
-        return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
+        return $this->scanner->sequenceMatches($sequence, $caseSensitive);
     }
 
     /**
@@ -1079,22 +1067,16 @@ class Tokenizer
     /**
      * Decode a character reference and return the string.
      *
-     * Returns false if the entity could not be found. If $inAttribute is set
-     * to true, a bare & will be returned as-is.
+     * If $inAttribute is set to true, a bare & will be returned as-is.
      *
      * @param bool $inAttribute
      *            Set to true if the text is inside of an attribute value.
      *            false otherwise.
      *
-     * @return bool|string
+     * @return string
      */
     protected function decodeCharacterReference($inAttribute = false)
     {
-        // If it fails this, it's definitely not an entity.
-        if ($this->scanner->current() != '&') {
-            return false;
-        }
-
         // Next char after &.
         $tok = $this->scanner->next();
         $start = $this->scanner->position();
```