From 7b339b5d8c364d62b0b982604f63085c91720702 Mon Sep 17 00:00:00 2001 From: Christophe Coevoet Date: Sat, 24 Nov 2018 15:43:40 +0100 Subject: Optimize the handling of the EOF detection in the main loop The eof() method is a no-op when the token is not false. As the main loop already needs to identify that case anyway, skipping the method call reduces the cost of parsing text tokens. --- src/HTML5/Parser/Tokenizer.php | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index bce9da9..74d86a3 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -144,11 +144,11 @@ class Tokenizer $tok = $this->scanner->current(); } - // Handle end of document - $this->eof($tok); - - // Parse character - if (false !== $tok) { + if (false === $tok) { + // Handle end of document + $this->eof(); + } else { + // Parse character switch ($this->textMode) { case Elements::TEXT_RAW: $this->rawText($tok); @@ -290,18 +290,12 @@ class Tokenizer /** * If the document is read, emit an EOF event. */ - protected function eof($tok) + protected function eof() { - if (false === $tok) { - // fprintf(STDOUT, "EOF"); - $this->flushBuffer(); - $this->events->eof(); - $this->carryOn = false; - - return true; - } - - return false; + // fprintf(STDOUT, "EOF"); + $this->flushBuffer(); + $this->events->eof(); + $this->carryOn = false; } /** @@ -744,8 +738,9 @@ class Tokenizer // EOF: die. if (false === $tok) { $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); + $this->eof(); - return $this->eof($tok); + return true; } // NULL char: convert. 
-- cgit v1.2.3 From 4c337c89096d9acb798f68201d2b124860d1616e Mon Sep 17 00:00:00 2001 From: Christophe Coevoet Date: Sat, 24 Nov 2018 18:35:02 +0100 Subject: Simplify the doctype matching - the doctype() function is only called for a D or d token, so there is no need to check again inside the method - checking that we have the DOCTYPE string can use sequence matching --- src/HTML5/Parser/Tokenizer.php | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 74d86a3..bba6ff2 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -721,12 +721,11 @@ class Tokenizer */ protected function doctype() { - if (strcasecmp($this->scanner->current(), 'D')) { - return false; - } // Check that string is DOCTYPE. - $chars = $this->scanner->charsWhile('DOCTYPEdoctype'); - if (strcasecmp($chars, 'DOCTYPE')) { + if ($this->scanner->sequenceMatches('DOCTYPE', false)) { + $this->scanner->consume(7); + } else { + $chars = $this->scanner->charsWhile('DOCTYPEdoctype'); $this->parseError('Expected DOCTYPE, got %s', $chars); return $this->bogusComment(' Date: Sat, 24 Nov 2018 20:55:34 +0100 Subject: Remove useless condition for the parsing of cdata The caller already ensures that the current token is the right one. 
--- src/HTML5/Parser/Tokenizer.php | 3 --- 1 file changed, 3 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index bba6ff2..9f3d7bd 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -865,9 +865,6 @@ class Tokenizer */ protected function cdataSection() { - if ('[' != $this->scanner->current()) { - return false; - } $cdata = ''; $this->scanner->consume(); -- cgit v1.2.3 From 6cdf4283046325b9bc2671d0648b73a2be1d0946 Mon Sep 17 00:00:00 2001 From: Christophe Coevoet Date: Sat, 24 Nov 2018 23:02:42 +0100 Subject: Optimize the main loop --- src/HTML5/Parser/Tokenizer.php | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 9f3d7bd..c2abb4f 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -133,13 +133,19 @@ class Tokenizer $tok = $this->scanner->next(); - $this->markupDeclaration($tok) - || $this->endTag() - || $this->processingInstruction() - || $this->tagName() - // This always returns false. - || $this->parseError('Illegal tag opening') - || $this->characterData(); + if ('!' === $tok) { + $this->markupDeclaration(); + } elseif ('/' === $tok) { + $this->endTag(); + } elseif ('?' === $tok) { + $this->processingInstruction(); + } elseif (ctype_alpha($tok)) { + $this->tagName(); + } else { + $this->parseError('Illegal tag opening'); + // TODO is this necessary ? + $this->characterData(); + } $tok = $this->scanner->current(); } @@ -301,12 +307,8 @@ class Tokenizer /** * Look for markup. */ - protected function markupDeclaration($tok) + protected function markupDeclaration() { - if ('!' 
!= $tok) { - return false; - } - $tok = $this->scanner->next(); // Comment: @@ -373,11 +375,6 @@ class Tokenizer */ protected function tagName() { - $tok = $this->scanner->current(); - if (!ctype_alpha($tok)) { - return false; - } - // We know this is at least one char. $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'); $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name); @@ -790,7 +787,7 @@ class Tokenizer if (false === $id) { $this->events->doctype($doctypeName, $type, $pub, false); - return false; + return true; } // Premature EOF. -- cgit v1.2.3