From 7b339b5d8c364d62b0b982604f63085c91720702 Mon Sep 17 00:00:00 2001
From: Christophe Coevoet <stof@notk.org>
Date: Sat, 24 Nov 2018 15:43:40 +0100
Subject: Optimize the handling of the EOF detection in the main loop

The eof() method is a no-op when the token is not false. As the main loop
already needs to identify that case anyway, skipping the method call
reduces the cost of parsing text tokens.
---
 src/HTML5/Parser/Tokenizer.php | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'src/HTML5')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index bce9da9..74d86a3 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -144,11 +144,11 @@ class Tokenizer
             $tok = $this->scanner->current();
         }
 
-        // Handle end of document
-        $this->eof($tok);
-
-        // Parse character
-        if (false !== $tok) {
+        if (false === $tok) {
+            // Handle end of document
+            $this->eof();
+        } else {
+            // Parse character
             switch ($this->textMode) {
                 case Elements::TEXT_RAW:
                     $this->rawText($tok);
@@ -290,18 +290,12 @@ class Tokenizer
     /**
      * If the document is read, emit an EOF event.
      */
-    protected function eof($tok)
+    protected function eof()
     {
-        if (false === $tok) {
-            // fprintf(STDOUT, "EOF");
-            $this->flushBuffer();
-            $this->events->eof();
-            $this->carryOn = false;
-
-            return true;
-        }
-
-        return false;
+        // fprintf(STDOUT, "EOF");
+        $this->flushBuffer();
+        $this->events->eof();
+        $this->carryOn = false;
     }
 
     /**
@@ -744,8 +738,9 @@ class Tokenizer
         // EOF: die.
         if (false === $tok) {
             $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
+            $this->eof();
 
-            return $this->eof($tok);
+            return true;
         }
 
         // NULL char: convert.
-- 
cgit v1.2.3


From 4c337c89096d9acb798f68201d2b124860d1616e Mon Sep 17 00:00:00 2001
From: Christophe Coevoet <stof@notk.org>
Date: Sat, 24 Nov 2018 18:35:02 +0100
Subject: Simplify the doctype matching

- the doctype() function is only called for a D or d token, so there is no
  need to check again inside the method
- checking that we have the DOCTYPE string can be done with a sequence match
---
 src/HTML5/Parser/Tokenizer.php | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'src/HTML5')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 74d86a3..bba6ff2 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -721,12 +721,11 @@ class Tokenizer
      */
     protected function doctype()
     {
-        if (strcasecmp($this->scanner->current(), 'D')) {
-            return false;
-        }
         // Check that string is DOCTYPE.
-        $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
-        if (strcasecmp($chars, 'DOCTYPE')) {
+        if ($this->scanner->sequenceMatches('DOCTYPE', false)) {
+            $this->scanner->consume(7);
+        } else {
+            $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
             $this->parseError('Expected DOCTYPE, got %s', $chars);
 
             return $this->bogusComment('<!' . $chars);
-- 
cgit v1.2.3


From 1e58def01d8ef2ee773a989b7f738f232088a674 Mon Sep 17 00:00:00 2001
From: Christophe Coevoet <stof@notk.org>
Date: Sat, 24 Nov 2018 20:55:34 +0100
Subject: Remove useless condition for the parsing of cdata

The caller already ensures that the current token is the right one.
---
 src/HTML5/Parser/Tokenizer.php | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'src/HTML5')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index bba6ff2..9f3d7bd 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -865,9 +865,6 @@ class Tokenizer
      */
     protected function cdataSection()
     {
-        if ('[' != $this->scanner->current()) {
-            return false;
-        }
         $cdata = '';
         $this->scanner->consume();
 
-- 
cgit v1.2.3


From 6cdf4283046325b9bc2671d0648b73a2be1d0946 Mon Sep 17 00:00:00 2001
From: Christophe Coevoet <stof@notk.org>
Date: Sat, 24 Nov 2018 23:02:42 +0100
Subject: Optimize the main loop

---
 src/HTML5/Parser/Tokenizer.php | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'src/HTML5')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 9f3d7bd..c2abb4f 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -133,13 +133,19 @@ class Tokenizer
 
             $tok = $this->scanner->next();
 
-            $this->markupDeclaration($tok)
-                || $this->endTag()
-                || $this->processingInstruction()
-                || $this->tagName()
-                // This always returns false.
-                || $this->parseError('Illegal tag opening')
-                || $this->characterData();
+            if ('!' === $tok) {
+                $this->markupDeclaration();
+            } elseif ('/' === $tok) {
+                $this->endTag();
+            } elseif ('?' === $tok) {
+                $this->processingInstruction();
+            } elseif (ctype_alpha($tok)) {
+                $this->tagName();
+            } else {
+                $this->parseError('Illegal tag opening');
+                // TODO is this necessary ?
+                $this->characterData();
+            }
 
             $tok = $this->scanner->current();
         }
@@ -301,12 +307,8 @@ class Tokenizer
     /**
      * Look for markup.
      */
-    protected function markupDeclaration($tok)
+    protected function markupDeclaration()
     {
-        if ('!' != $tok) {
-            return false;
-        }
-
         $tok = $this->scanner->next();
 
         // Comment:
@@ -373,11 +375,6 @@ class Tokenizer
      */
     protected function tagName()
     {
-        $tok = $this->scanner->current();
-        if (!ctype_alpha($tok)) {
-            return false;
-        }
-
         // We know this is at least one char.
         $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
         $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
@@ -790,7 +787,7 @@ class Tokenizer
             if (false === $id) {
                 $this->events->doctype($doctypeName, $type, $pub, false);
 
-                return false;
+                return true;
             }
 
             // Premature EOF.
-- 
cgit v1.2.3