From b3ef91f0a13914d25469af64d01cb7df5964c978 Mon Sep 17 00:00:00 2001 From: Titouan Galopin Date: Sun, 4 Nov 2018 15:54:09 +0100 Subject: Improve Tokenizer performance by inlining text parsing and removing some Scanner::current calls --- src/HTML5/Parser/Tokenizer.php | 84 +++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 9645f83..e1ca660 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -128,7 +128,31 @@ class Tokenizer $this->characterReference(); $this->tagOpen(); $this->eof(); - $this->characterData(); + + // Inline the parsing of characters as it's the critical performance path + $tok = $this->scanner->current(); + if ($tok !== false) { + switch ($this->textMode) { + case Elements::TEXT_RAW: + $this->rawText($tok); + break; + + case Elements::TEXT_RCDATA: + $this->rcdata($tok); + break; + + default: + if (!strspn($tok, "<&")) { + // NULL character + if ($tok === "\00") { + $this->parseError("Received null character."); + } + + $this->text .= $tok; + $this->scanner->next(); + } + } + } return $this->carryOn; } @@ -148,64 +172,78 @@ class Tokenizer } switch ($this->textMode) { case Elements::TEXT_RAW: - return $this->rawText(); + return $this->rawText($tok); case Elements::TEXT_RCDATA: - return $this->rcdata(); + return $this->rcdata($tok); default: if (strspn($tok, "<&")) { return false; } - return $this->text(); + return $this->text($tok); } } /** * This buffers the current token as character data. + * + * @param string $tok The current token. + * + * @return bool */ - protected function text() + protected function text($tok) { - $tok = $this->scanner->current(); - // This should never happen... if ($tok === false) { return false; } - // Null + + // NULL character if ($tok === "\00") { $this->parseError("Received null character."); } - // fprintf(STDOUT, "Writing '%s'", $tok); + $this->buffer($tok); $this->scanner->next(); + return true; } /** * Read text in RAW mode. + * + * @param string $tok The current token. + * + * @return bool */ - protected function rawText() + protected function rawText($tok) { if (is_null($this->untilTag)) { - return $this->text(); + return $this->text($tok); } + $sequence = 'untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); $this->setTextMode(0); + return $this->endTag(); } /** * Read text in RCDATA mode. + * + * @param string $tok The current token. + * + * @return bool */ - protected function rcdata() + protected function rcdata($tok) { if (is_null($this->untilTag)) { - return $this->text(); + return $this->text($tok); } + $sequence = 'untilTag; $txt = ''; - $tok = $this->scanner->current(); $caseSensitive = !Elements::isHtml5Element($this->untilTag); while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { @@ -223,9 +261,11 @@ class Tokenizer if ($this->scanner->current() !== '>') { $this->parseError("Unclosed RCDATA end tag"); } + $this->scanner->unconsume($len); $this->events->text($txt); $this->setTextMode(0); + return $this->endTag(); } @@ -279,7 +319,7 @@ class Tokenizer $this->scanner->next(); return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() || - /* This always returns false. */ + // This always returns false. $this->parseError("Illegal tag opening") || $this->characterData(); } @@ -343,8 +383,9 @@ class Tokenizer // Trash whitespace. $this->scanner->whitespace(); - if ($this->scanner->current() != '>') { - $this->parseError("Expected >, got '%s'", $this->scanner->current()); + $tok = $this->scanner->current(); + if ($tok != '>') { + $this->parseError("Expected >, got '%s'", $tok); // We just trash stuff until we get to the next tag close. $this->scanner->charsUntil('>'); } @@ -456,10 +497,11 @@ class Tokenizer $name = strtolower($this->scanner->charsUntil("/>=\n\f\t ")); if (strlen($name) == 0) { - $this->parseError("Expected an attribute name, got %s.", $this->scanner->current()); + $tok = $this->scanner->current(); + $this->parseError("Expected an attribute name, got %s.", $tok); // Really, only '=' can be the char here. Everything else gets absorbed // under one rule or another. - $name = $this->scanner->current(); + $name = $tok; $this->scanner->next(); } @@ -556,7 +598,7 @@ class Tokenizer $tok = $this->scanner->current(); if ($tok == '&') { - $val .= $this->decodeCharacterReference(true, $tok); + $val .= $this->decodeCharacterReference(true); continue; } break; @@ -1032,6 +1074,7 @@ class Tokenizer $line = $this->scanner->currentLine(); $col = $this->scanner->columnOffset(); $this->events->parseError($msg, $line, $col); + return false; } @@ -1049,7 +1092,6 @@ class Tokenizer */ protected function decodeCharacterReference($inAttribute = false) { - // If it fails this, it's definitely not an entity. if ($this->scanner->current() != '&') { return false; -- cgit v1.2.3 From f7a954df2f0647c93b1d3d22c317aa5297ea4b05 Mon Sep 17 00:00:00 2001 From: Titouan Galopin Date: Mon, 5 Nov 2018 01:35:23 +0100 Subject: Inline tag open in Tokenizer to further improve performances --- src/HTML5/Parser/Tokenizer.php | 61 +++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index e1ca660..d08cba4 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -121,16 +121,30 @@ class Tokenizer */ protected function consumeData() { - // Character Ref - /* - * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData(); - */ + // Character reference $this->characterReference(); - $this->tagOpen(); - $this->eof(); - // Inline the parsing of characters as it's the critical performance path + // Parse tag + if ($this->scanner->current() === '<') { + // Any buffered text data can go out now. + $this->flushBuffer(); + + $tok = $this->scanner->next(); + + $this->markupDeclaration($tok) + || $this->endTag() + || $this->processingInstruction() + || $this->tagName() + // This always returns false. + || $this->parseError("Illegal tag opening") + || $this->characterData(); + } + + // Handle end of document $tok = $this->scanner->current(); + $this->eof($tok); + + // Parse character if ($tok !== false) { switch ($this->textMode) { case Elements::TEXT_RAW: @@ -272,15 +286,17 @@ class Tokenizer /** * If the document is read, emit an EOF event. */ - protected function eof() + protected function eof($tok) { - if ($this->scanner->current() === false) { + if ($tok === false) { // fprintf(STDOUT, "EOF"); $this->flushBuffer(); $this->events->eof(); $this->carryOn = false; + return true; } + return false; } @@ -302,33 +318,12 @@ class Tokenizer return false; } - /** - * Emit a tagStart event on encountering a tag. - * - * 8.2.4.8 - */ - protected function tagOpen() - { - if ($this->scanner->current() != '<') { - return false; - } - - // Any buffered text data can go out now. - $this->flushBuffer(); - - $this->scanner->next(); - - return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() || - // This always returns false. - $this->parseError("Illegal tag opening") || $this->characterData(); - } - /** * Look for markup. */ - protected function markupDeclaration() + protected function markupDeclaration($tok) { - if ($this->scanner->current() != '!') { + if ($tok != '!') { return false; } @@ -756,7 +751,7 @@ class Tokenizer // EOF: die. if ($tok === false) { $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); - return $this->eof(); + return $this->eof($tok); } // NULL char: convert. -- cgit v1.2.3 From 7ac198d906b00f5147dd1753521a914eb336b348 Mon Sep 17 00:00:00 2001 From: Titouan Galopin Date: Tue, 6 Nov 2018 10:35:50 +0100 Subject: Remove another current call --- src/HTML5/Parser/Tokenizer.php | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index d08cba4..b413b52 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -124,8 +124,10 @@ class Tokenizer // Character reference $this->characterReference(); + $tok = $this->scanner->current(); + // Parse tag - if ($this->scanner->current() === '<') { + if ($tok === '<') { // Any buffered text data can go out now. $this->flushBuffer(); @@ -138,10 +140,11 @@ class Tokenizer // This always returns false. || $this->parseError("Illegal tag opening") || $this->characterData(); + + $tok = $this->scanner->current(); } // Handle end of document - $tok = $this->scanner->current(); $this->eof($tok); // Parse character -- cgit v1.2.3