From b3ef91f0a13914d25469af64d01cb7df5964c978 Mon Sep 17 00:00:00 2001
From: Titouan Galopin <galopintitouan@gmail.com>
Date: Sun, 4 Nov 2018 15:54:09 +0100
Subject: Improve Tokenizer performance by inlining text parsing and removing
 some Scanner::current calls

---
 src/HTML5/Parser/Tokenizer.php | 84 +++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 21 deletions(-)

(limited to 'src')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 9645f83..e1ca660 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -128,7 +128,31 @@ class Tokenizer
         $this->characterReference();
         $this->tagOpen();
         $this->eof();
-        $this->characterData();
+
+        // Inline the parsing of characters as it's the critical performance path
+        $tok = $this->scanner->current();
+        if ($tok !== false) {
+            switch ($this->textMode) {
+                case Elements::TEXT_RAW:
+                    $this->rawText($tok);
+                    break;
+
+                case Elements::TEXT_RCDATA:
+                    $this->rcdata($tok);
+                    break;
+
+                default:
+                    if (!strspn($tok, "<&")) {
+                        // NULL character
+                        if ($tok === "\00") {
+                            $this->parseError("Received null character.");
+                        }
+
+                        $this->text .= $tok;
+                        $this->scanner->next();
+                    }
+            }
+        }
 
         return $this->carryOn;
     }
@@ -148,64 +172,78 @@ class Tokenizer
         }
         switch ($this->textMode) {
             case Elements::TEXT_RAW:
-                return $this->rawText();
+                return $this->rawText($tok);
             case Elements::TEXT_RCDATA:
-                return $this->rcdata();
+                return $this->rcdata($tok);
             default:
                 if (strspn($tok, "<&")) {
                     return false;
                 }
-                return $this->text();
+                return $this->text($tok);
         }
     }
 
     /**
      * This buffers the current token as character data.
+     *
+     * @param string $tok The current token.
+     *
+     * @return bool
      */
-    protected function text()
+    protected function text($tok)
     {
-        $tok = $this->scanner->current();
-
         // This should never happen...
         if ($tok === false) {
             return false;
         }
-        // Null
+
+        // NULL character
         if ($tok === "\00") {
             $this->parseError("Received null character.");
         }
-        // fprintf(STDOUT, "Writing '%s'", $tok);
+
         $this->buffer($tok);
         $this->scanner->next();
+
         return true;
     }
 
     /**
      * Read text in RAW mode.
+     *
+     * @param string $tok The current token.
+     *
+     * @return bool
      */
-    protected function rawText()
+    protected function rawText($tok)
     {
         if (is_null($this->untilTag)) {
-            return $this->text();
+            return $this->text($tok);
         }
+
         $sequence = '</' . $this->untilTag . '>';
         $txt = $this->readUntilSequence($sequence);
         $this->events->text($txt);
         $this->setTextMode(0);
+
         return $this->endTag();
     }
 
     /**
      * Read text in RCDATA mode.
+     *
+     * @param string $tok The current token.
+     *
+     * @return bool
      */
-    protected function rcdata()
+    protected function rcdata($tok)
     {
         if (is_null($this->untilTag)) {
-            return $this->text();
+            return $this->text($tok);
         }
+
         $sequence = '</' . $this->untilTag;
         $txt = '';
-        $tok = $this->scanner->current();
 
         $caseSensitive = !Elements::isHtml5Element($this->untilTag);
         while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
@@ -223,9 +261,11 @@ class Tokenizer
         if ($this->scanner->current() !== '>') {
             $this->parseError("Unclosed RCDATA end tag");
         }
+
         $this->scanner->unconsume($len);
         $this->events->text($txt);
         $this->setTextMode(0);
+
         return $this->endTag();
     }
 
@@ -279,7 +319,7 @@ class Tokenizer
         $this->scanner->next();
 
         return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
-          /*  This always returns false. */
+          // This always returns false.
           $this->parseError("Illegal tag opening") || $this->characterData();
     }
 
@@ -343,8 +383,9 @@ class Tokenizer
         // Trash whitespace.
         $this->scanner->whitespace();
 
-        if ($this->scanner->current() != '>') {
-            $this->parseError("Expected >, got '%s'", $this->scanner->current());
+        $tok = $this->scanner->current();
+        if ($tok != '>') {
+            $this->parseError("Expected >, got '%s'", $tok);
             // We just trash stuff until we get to the next tag close.
             $this->scanner->charsUntil('>');
         }
@@ -456,10 +497,11 @@ class Tokenizer
         $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
 
         if (strlen($name) == 0) {
-            $this->parseError("Expected an attribute name, got %s.", $this->scanner->current());
+            $tok = $this->scanner->current();
+            $this->parseError("Expected an attribute name, got %s.", $tok);
             // Really, only '=' can be the char here. Everything else gets absorbed
             // under one rule or another.
-            $name = $this->scanner->current();
+            $name = $tok;
             $this->scanner->next();
         }
 
@@ -556,7 +598,7 @@ class Tokenizer
 
             $tok = $this->scanner->current();
             if ($tok == '&') {
-                $val .= $this->decodeCharacterReference(true, $tok);
+                $val .= $this->decodeCharacterReference(true);
                 continue;
             }
             break;
@@ -1032,6 +1074,7 @@ class Tokenizer
         $line = $this->scanner->currentLine();
         $col = $this->scanner->columnOffset();
         $this->events->parseError($msg, $line, $col);
+
         return false;
     }
 
@@ -1049,7 +1092,6 @@ class Tokenizer
      */
     protected function decodeCharacterReference($inAttribute = false)
     {
-
         // If it fails this, it's definitely not an entity.
         if ($this->scanner->current() != '&') {
             return false;
-- 
cgit v1.2.3


From f7a954df2f0647c93b1d3d22c317aa5297ea4b05 Mon Sep 17 00:00:00 2001
From: Titouan Galopin <galopintitouan@gmail.com>
Date: Mon, 5 Nov 2018 01:35:23 +0100
Subject: Inline tag open in Tokenizer to further improve performance

---
 src/HTML5/Parser/Tokenizer.php | 61 +++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 33 deletions(-)

(limited to 'src')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index e1ca660..d08cba4 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -121,16 +121,30 @@ class Tokenizer
      */
     protected function consumeData()
     {
-        // Character Ref
-        /*
-         * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData();
-         */
+        // Character reference
         $this->characterReference();
-        $this->tagOpen();
-        $this->eof();
 
-        // Inline the parsing of characters as it's the critical performance path
+        // Parse tag
+        if ($this->scanner->current() === '<') {
+            // Any buffered text data can go out now.
+            $this->flushBuffer();
+
+            $tok = $this->scanner->next();
+
+            $this->markupDeclaration($tok)
+                || $this->endTag()
+                || $this->processingInstruction()
+                || $this->tagName()
+                // This always returns false.
+                || $this->parseError("Illegal tag opening")
+                || $this->characterData();
+        }
+
+        // Handle end of document
         $tok = $this->scanner->current();
+        $this->eof($tok);
+
+        // Parse character
         if ($tok !== false) {
             switch ($this->textMode) {
                 case Elements::TEXT_RAW:
@@ -272,15 +286,17 @@ class Tokenizer
     /**
      * If the document is read, emit an EOF event.
      */
-    protected function eof()
+    protected function eof($tok)
     {
-        if ($this->scanner->current() === false) {
+        if ($tok === false) {
             // fprintf(STDOUT, "EOF");
             $this->flushBuffer();
             $this->events->eof();
             $this->carryOn = false;
+
             return true;
         }
+
         return false;
     }
 
@@ -302,33 +318,12 @@ class Tokenizer
         return false;
     }
 
-    /**
-     * Emit a tagStart event on encountering a tag.
-     *
-     * 8.2.4.8
-     */
-    protected function tagOpen()
-    {
-        if ($this->scanner->current() != '<') {
-            return false;
-        }
-
-        // Any buffered text data can go out now.
-        $this->flushBuffer();
-
-        $this->scanner->next();
-
-        return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
-          // This always returns false.
-          $this->parseError("Illegal tag opening") || $this->characterData();
-    }
-
     /**
      * Look for markup.
      */
-    protected function markupDeclaration()
+    protected function markupDeclaration($tok)
     {
-        if ($this->scanner->current() != '!') {
+        if ($tok != '!') {
             return false;
         }
 
@@ -756,7 +751,7 @@ class Tokenizer
         // EOF: die.
         if ($tok === false) {
             $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
-            return $this->eof();
+            return $this->eof($tok);
         }
 
         // NULL char: convert.
-- 
cgit v1.2.3


From 7ac198d906b00f5147dd1753521a914eb336b348 Mon Sep 17 00:00:00 2001
From: Titouan Galopin <galopintitouan@gmail.com>
Date: Tue, 6 Nov 2018 10:35:50 +0100
Subject: Remove another Scanner::current call

---
 src/HTML5/Parser/Tokenizer.php | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index d08cba4..b413b52 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -124,8 +124,10 @@ class Tokenizer
         // Character reference
         $this->characterReference();
 
+        $tok = $this->scanner->current();
+
         // Parse tag
-        if ($this->scanner->current() === '<') {
+        if ($tok === '<') {
             // Any buffered text data can go out now.
             $this->flushBuffer();
 
@@ -138,10 +140,11 @@ class Tokenizer
                 // This always returns false.
                 || $this->parseError("Illegal tag opening")
                 || $this->characterData();
+
+            $tok = $this->scanner->current();
         }
 
         // Handle end of document
-        $tok = $this->scanner->current();
         $this->eof($tok);
 
         // Parse character
-- 
cgit v1.2.3