From b3ef91f0a13914d25469af64d01cb7df5964c978 Mon Sep 17 00:00:00 2001 From: Titouan Galopin Date: Sun, 4 Nov 2018 15:54:09 +0100 Subject: Improve Tokenizer performance by inlining text parsing and removing some Scanner::current calls --- src/HTML5/Parser/Tokenizer.php | 84 +++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 21 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 9645f83..e1ca660 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -128,7 +128,31 @@ class Tokenizer $this->characterReference(); $this->tagOpen(); $this->eof(); - $this->characterData(); + + // Inline the parsing of characters as it's the critical performance path + $tok = $this->scanner->current(); + if ($tok !== false) { + switch ($this->textMode) { + case Elements::TEXT_RAW: + $this->rawText($tok); + break; + + case Elements::TEXT_RCDATA: + $this->rcdata($tok); + break; + + default: + if (!strspn($tok, "<&")) { + // NULL character + if ($tok === "\00") { + $this->parseError("Received null character."); + } + + $this->text .= $tok; + $this->scanner->next(); + } + } + } return $this->carryOn; } @@ -148,64 +172,78 @@ class Tokenizer } switch ($this->textMode) { case Elements::TEXT_RAW: - return $this->rawText(); + return $this->rawText($tok); case Elements::TEXT_RCDATA: - return $this->rcdata(); + return $this->rcdata($tok); default: if (strspn($tok, "<&")) { return false; } - return $this->text(); + return $this->text($tok); } } /** * This buffers the current token as character data. + * + * @param string $tok The current token. + * + * @return bool */ - protected function text() + protected function text($tok) { - $tok = $this->scanner->current(); - // This should never happen... if ($tok === false) { return false; } - // Null + + // NULL character if ($tok === "\00") { $this->parseError("Received null character."); } - // fprintf(STDOUT, "Writing '%s'", $tok); + $this->buffer($tok); $this->scanner->next(); + return true; } /** * Read text in RAW mode. + * + * @param string $tok The current token. + * + * @return bool */ - protected function rawText() + protected function rawText($tok) { if (is_null($this->untilTag)) { - return $this->text(); + return $this->text($tok); } + $sequence = 'untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); $this->setTextMode(0); + return $this->endTag(); } /** * Read text in RCDATA mode. + * + * @param string $tok The current token. + * + * @return bool */ - protected function rcdata() + protected function rcdata($tok) { if (is_null($this->untilTag)) { - return $this->text(); + return $this->text($tok); } + $sequence = 'untilTag; $txt = ''; - $tok = $this->scanner->current(); $caseSensitive = !Elements::isHtml5Element($this->untilTag); while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { @@ -223,9 +261,11 @@ class Tokenizer if ($this->scanner->current() !== '>') { $this->parseError("Unclosed RCDATA end tag"); } + $this->scanner->unconsume($len); $this->events->text($txt); $this->setTextMode(0); + return $this->endTag(); } @@ -279,7 +319,7 @@ class Tokenizer $this->scanner->next(); return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() || - /* This always returns false. */ + // This always returns false. $this->parseError("Illegal tag opening") || $this->characterData(); } @@ -343,8 +383,9 @@ class Tokenizer // Trash whitespace. $this->scanner->whitespace(); - if ($this->scanner->current() != '>') { - $this->parseError("Expected >, got '%s'", $this->scanner->current()); + $tok = $this->scanner->current(); + if ($tok != '>') { + $this->parseError("Expected >, got '%s'", $tok); // We just trash stuff until we get to the next tag close. $this->scanner->charsUntil('>'); } @@ -456,10 +497,11 @@ class Tokenizer $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); if (strlen($name) == 0) { - $this->parseError("Expected an attribute name, got %s.", $this->scanner->current()); + $tok = $this->scanner->current(); + $this->parseError("Expected an attribute name, got %s.", $tok); // Really, only '=' can be the char here. Everything else gets absorbed // under one rule or another. - $name = $this->scanner->current(); + $name = $tok; $this->scanner->next(); } @@ -556,7 +598,7 @@ class Tokenizer $tok = $this->scanner->current(); if ($tok == '&') { - $val .= $this->decodeCharacterReference(true, $tok); + $val .= $this->decodeCharacterReference(true); continue; } break; @@ -1032,6 +1074,7 @@ class Tokenizer $line = $this->scanner->currentLine(); $col = $this->scanner->columnOffset(); $this->events->parseError($msg, $line, $col); + return false; } @@ -1049,7 +1092,6 @@ class Tokenizer */ protected function decodeCharacterReference($inAttribute = false) { - // If it fails this, it's definitely not an entity. if ($this->scanner->current() != '&') { return false; -- cgit v1.2.3