Optimize the processing of text between nodes

Instead of processing text one character at a time in the main loop, the tokenizer now consumes it in a batch up to the next special character: `<` and `&`, which have special handling in the main loop, and NUL characters, which must report a parse error.
author: Christophe Coevoet <[email protected]> 2018-11-27 14:16:40 +0100
committer: Christophe Coevoet <[email protected]> 2018-11-27 14:16:40 +0100
commit: ced5b845b048a65f63c0b324a1357926fdd8403f (patch)
tree: 2604002518f4ce2024219a2f10f3d208eea426e0 /src/HTML5/Parser
parent: 182f34ddba2e31e625f255416d85bba38cc6b39b (diff)
1 files changed, 11 insertions, 5 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index a41fcb4..f4e9652 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -163,15 +163,21 @@ class Tokenizer
                     break;
 
                 default:
-                    if ('<' !== $tok && '&' !== $tok) {
-                        // NULL character
-                        if ("\00" === $tok) {
-                            $this->parseError('Received null character.');
-                        }
+                    if ('<' === $tok || '&' === $tok) {
+                        break;
+                    }
+
+                    // NULL character
+                    if ("\00" === $tok) {
+                        $this->parseError('Received null character.');
 
                         $this->text .= $tok;
                         $this->scanner->consume();
+
+                        break;
                     }
+
+                    $this->text .= $this->scanner->charsUntil("<&\0");
             }
         }
author	Christophe Coevoet <[email protected]>	2018-11-27 14:16:40 +0100
committer	Christophe Coevoet <[email protected]>	2018-11-27 14:16:40 +0100
commit	ced5b845b048a65f63c0b324a1357926fdd8403f (patch)
tree	2604002518f4ce2024219a2f10f3d208eea426e0 /src/HTML5/Parser
parent	182f34ddba2e31e625f255416d85bba38cc6b39b (diff)