From 9494e340adefaefc19439e9074d37b9ec957f4ca Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Thu, 8 Nov 2018 08:56:02 +0100 Subject: improve consume speed --- src/HTML5/Parser/Scanner.php | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php index dc685bb..cb14a56 100644 --- a/src/HTML5/Parser/Scanner.php +++ b/src/HTML5/Parser/Scanner.php @@ -126,9 +126,7 @@ class Scanner */ public function consume($count = 1) { - for ($i = 0; $i < $count; ++ $i) { - $this->next(); - } + $this->char += $count; } /** -- cgit v1.2.3 From 5c5634a0be9bba6851ca73fa778d2efd60169bfc Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Thu, 8 Nov 2018 09:18:28 +0100 Subject: move sequenceMatches to the Scanner --- src/HTML5/Parser/Scanner.php | 24 ++++++++++++++++++++++ src/HTML5/Parser/Tokenizer.php | 46 +++++++++++++----------------------------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php index cb14a56..e81b3a9 100644 --- a/src/HTML5/Parser/Scanner.php +++ b/src/HTML5/Parser/Scanner.php @@ -61,6 +61,30 @@ class Scanner $this->EOF = strlen($data); } + /** + * Check if upcoming chars match the given sequence. + * + * This will read the stream for the $sequence. If it's + * found, this will return true. If not, return false. + * Since this unconsumes any chars it reads, the caller + * will still need to read the next sequence, even if + * this returns true. + * + * Example: $this->scanner->sequenceMatches('') will + * see if the input stream is at the start of a + * '' string. + * + * @param string $sequence + * @param bool $caseSensitive + * + * @return bool + */ + public function sequenceMatches($sequence, $caseSensitive = true) + { + $portion = substr($this->data, $this->char, strlen($sequence)); + return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0; + } + /** * Get the current position. 
* diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index b413b52..7b0b3f3 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -263,7 +263,7 @@ class Tokenizer $txt = ''; $caseSensitive = !Elements::isHtml5Element($this->untilTag); - while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { + while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) { if ($tok == '&') { $txt .= $this->decodeCharacterReference(); $tok = $this->scanner->current(); @@ -313,12 +313,13 @@ class Tokenizer */ protected function characterReference() { - $ref = $this->decodeCharacterReference(); - if ($ref !== false) { - $this->buffer($ref); - return true; + if ($this->scanner->current() !== '&') { + return false; } - return false; + + $ref = $this->decodeCharacterReference(); + $this->buffer($ref); + return true; } /** @@ -892,7 +893,7 @@ class Tokenizer } $cdata .= $tok; $tok = $this->scanner->next(); - } while (! $this->sequenceMatches(']]>')); + } while (! $this->scanner->sequenceMatches(']]>')); // Consume ]]> $this->scanner->consume(3); @@ -972,7 +973,7 @@ class Tokenizer $buffer .= $this->scanner->charsUntil($first); // Stop as soon as we hit the stopping condition. - if ($this->sequenceMatches($sequence, false)) { + if ($this->scanner->sequenceMatches($sequence, false)) { return $buffer; } $buffer .= $this->scanner->current(); @@ -993,7 +994,7 @@ class Tokenizer * will still need to read the next sequence, even if * this returns true. * - * Example: $this->sequenceMatches('') will + * Example: $this->scanner->sequenceMatches('') will * see if the input stream is at the start of a * '' string. 
* @@ -1004,22 +1005,9 @@ class Tokenizer */ protected function sequenceMatches($sequence, $caseSensitive = true) { - $len = strlen($sequence); - $buffer = ''; - for ($i = 0; $i < $len; ++ $i) { - $tok = $this->scanner->current(); - $buffer .= $tok; + @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED); - // EOF. Rewind and let the caller handle it. - if ($tok === false) { - $this->scanner->unconsume($i); - return false; - } - $this->scanner->next(); - } - - $this->scanner->unconsume($len); - return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0; + return $this->scanner->sequenceMatches($sequence, $caseSensitive); } /** @@ -1079,22 +1067,16 @@ class Tokenizer /** * Decode a character reference and return the string. * - * Returns false if the entity could not be found. If $inAttribute is set - * to true, a bare & will be returned as-is. + * If $inAttribute is set to true, a bare & will be returned as-is. * * @param bool $inAttribute * Set to true if the text is inside of an attribute value. * false otherwise. * - * @return bool|string + * @return string */ protected function decodeCharacterReference($inAttribute = false) { - // If it fails this, it's definitely not an entity. - if ($this->scanner->current() != '&') { - return false; - } - // Next char after &. $tok = $this->scanner->next(); $start = $this->scanner->position(); -- cgit v1.2.3