diff options
author | Asmir Mustafic <[email protected]> | 2017-09-01 15:29:03 +0200 |
---|---|---|
committer | GitHub <[email protected]> | 2017-09-01 15:29:03 +0200 |
commit | b8afbae8cdb626c786a1590b3a83d366933d807d (patch) | |
tree | b2d9b33d86beb00370d3c2d08fa3197318275940 /src | |
parent | e965886a79a560b4b00a4c471e2bdfafea23fdcb (diff) | |
parent | 5dca3fc598bd4ff2b13816e8338a3668da676c25 (diff) |
Merge pull request #135 from Masterminds/tokenizer-performance
Tokenizer performance
Diffstat (limited to 'src')
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 65 |
1 file changed, 36 insertions, 29 deletions
```diff
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 45774b2..95dbf84 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -83,11 +83,8 @@ class Tokenizer
      */
     public function parse()
     {
-        $p = 0;
         do {
-            $p = $this->scanner->position();
             $this->consumeData();
-            // FIXME: Add infinite loop protection.
         } while ($this->carryOn);
     }
 
@@ -145,7 +142,8 @@ class Tokenizer
      */
     protected function characterData()
     {
-        if ($this->scanner->current() === false) {
+        $tok = $this->scanner->current();
+        if ($tok === false) {
             return false;
         }
         switch ($this->textMode) {
@@ -154,7 +152,6 @@ class Tokenizer
             case Elements::TEXT_RCDATA:
                 return $this->rcdata();
             default:
-                $tok = $this->scanner->current();
                 if (strspn($tok, "<&")) {
                     return false;
                 }
@@ -408,24 +405,26 @@ class Tokenizer
         if ($tok == '/') {
             $this->scanner->next();
             $this->scanner->whitespace();
-            if ($this->scanner->current() == '>') {
+            $tok = $this->scanner->current();
+
+            if ($tok == '>') {
                 $selfClose = true;
                 return true;
             }
-            if ($this->scanner->current() === false) {
+            if ($tok === false) {
                 $this->parseError("Unexpected EOF inside of tag.");
                 return true;
             }
             // Basically, we skip the / token and go on.
             // See 8.2.4.43.
-            $this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
+            $this->parseError("Unexpected '%s' inside of a tag.", $tok);
             return false;
         }
 
-        if ($this->scanner->current() == '>') {
+        if ($tok == '>') {
             return true;
         }
-        if ($this->scanner->current() === false) {
+        if ($tok === false) {
             $this->parseError("Unexpected EOF inside of tag.");
             return true;
         }
@@ -541,15 +540,21 @@ class Tokenizer
     {
         $stoplist = "\f" . $quote;
         $val = '';
-        $tok = $this->scanner->current();
-        while (strspn($tok, $stoplist) == 0 && $tok !== false) {
-            if ($tok == '&') {
-                $val .= $this->decodeCharacterReference(true);
-                $tok = $this->scanner->current();
+
+        while (true) {
+            $tokens = $this->scanner->charsUntil($stoplist.'&');
+            if ($tokens !== false) {
+                $val .= $tokens;
             } else {
-                $val .= $tok;
-                $tok = $this->scanner->next();
+                break;
             }
+
+            $tok = $this->scanner->current();
+            if ($tok == '&') {
+                $val .= $this->decodeCharacterReference(true, $tok);
+                continue;
+            }
+            break;
         }
         $this->scanner->next();
         return $val;
@@ -591,18 +596,18 @@ class Tokenizer
      */
     protected function bogusComment($leading = '')
     {
-
-        // TODO: This can be done more efficiently when the
-        // scanner exposes a readUntil() method.
         $comment = $leading;
+        $tokens = $this->scanner->charsUntil('>');
+        if ($tokens !== false) {
+            $comment .= $tokens;
+        }
         $tok = $this->scanner->current();
-        do {
+        if ($tok !== false) {
             $comment .= $tok;
-            $tok = $this->scanner->next();
-        } while ($tok !== false && $tok != '>');
+        }
 
         $this->flushBuffer();
-        $this->events->comment($comment . $tok);
+        $this->events->comment($comment);
         $this->scanner->next();
 
         return true;
@@ -646,15 +651,17 @@ class Tokenizer
      */
     protected function isCommentEnd()
     {
+        $tok = $this->scanner->current();
+
         // EOF
-        if ($this->scanner->current() === false) {
+        if ($tok === false) {
             // Hit the end.
             $this->parseError("Unexpected EOF in a comment.");
             return true;
         }
 
         // If it doesn't start with -, not the end.
-        if ($this->scanner->current() != '-') {
+        if ($tok != '-') {
             return false;
         }
 
@@ -737,7 +744,6 @@ class Tokenizer
 
         $pub = strtoupper($this->scanner->getAsciiAlpha());
         $white = strlen($this->scanner->whitespace());
-        $tok = $this->scanner->current();
 
         // Get ID, and flag it as pub or system.
         if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
@@ -938,10 +944,11 @@ class Tokenizer
         $len = strlen($sequence);
         $buffer = '';
         for ($i = 0; $i < $len; ++ $i) {
-            $buffer .= $this->scanner->current();
+            $tok = $this->scanner->current();
+            $buffer .= $tok;
 
             // EOF. Rewind and let the caller handle it.
-            if ($this->scanner->current() === false) {
+            if ($tok === false) {
                 $this->scanner->unconsume($i);
                 return false;
             }
```