summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser
diff options
context:
space:
mode:
authorTitouan Galopin <[email protected]>2018-11-04 15:54:09 +0100
committerTitouan Galopin <[email protected]>2018-11-05 00:58:52 +0100
commitb3ef91f0a13914d25469af64d01cb7df5964c978 (patch)
treee89519a15f665eb924b82cab5381e72c765be691 /src/HTML5/Parser
parent7453ab08dc4c8f65d5db52fd2c6b8943d59bf95b (diff)
Improve Tokenizer performance by inlining text parsing and removing some Scanner::current calls
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r--src/HTML5/Parser/Tokenizer.php84
1 files changed, 63 insertions, 21 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 9645f83..e1ca660 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -128,7 +128,31 @@ class Tokenizer
$this->characterReference();
$this->tagOpen();
$this->eof();
- $this->characterData();
+
+ // Inline the parsing of characters as it's the critical performance path
+ $tok = $this->scanner->current();
+ if ($tok !== false) {
+ switch ($this->textMode) {
+ case Elements::TEXT_RAW:
+ $this->rawText($tok);
+ break;
+
+ case Elements::TEXT_RCDATA:
+ $this->rcdata($tok);
+ break;
+
+ default:
+ if (!strspn($tok, "<&")) {
+ // NULL character
+ if ($tok === "\00") {
+ $this->parseError("Received null character.");
+ }
+
+ $this->text .= $tok;
+ $this->scanner->next();
+ }
+ }
+ }
return $this->carryOn;
}
@@ -148,64 +172,78 @@ class Tokenizer
}
switch ($this->textMode) {
case Elements::TEXT_RAW:
- return $this->rawText();
+ return $this->rawText($tok);
case Elements::TEXT_RCDATA:
- return $this->rcdata();
+ return $this->rcdata($tok);
default:
if (strspn($tok, "<&")) {
return false;
}
- return $this->text();
+ return $this->text($tok);
}
}
/**
* This buffers the current token as character data.
+ *
+ * @param string $tok The current token.
+ *
+ * @return bool
*/
- protected function text()
+ protected function text($tok)
{
- $tok = $this->scanner->current();
-
// This should never happen...
if ($tok === false) {
return false;
}
- // Null
+
+ // NULL character
if ($tok === "\00") {
$this->parseError("Received null character.");
}
- // fprintf(STDOUT, "Writing '%s'", $tok);
+
$this->buffer($tok);
$this->scanner->next();
+
return true;
}
/**
* Read text in RAW mode.
+ *
+ * @param string $tok The current token.
+ *
+ * @return bool
*/
- protected function rawText()
+ protected function rawText($tok)
{
if (is_null($this->untilTag)) {
- return $this->text();
+ return $this->text($tok);
}
+
$sequence = '</' . $this->untilTag . '>';
$txt = $this->readUntilSequence($sequence);
$this->events->text($txt);
$this->setTextMode(0);
+
return $this->endTag();
}
/**
* Read text in RCDATA mode.
+ *
+ * @param string $tok The current token.
+ *
+ * @return bool
*/
- protected function rcdata()
+ protected function rcdata($tok)
{
if (is_null($this->untilTag)) {
- return $this->text();
+ return $this->text($tok);
}
+
$sequence = '</' . $this->untilTag;
$txt = '';
- $tok = $this->scanner->current();
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
@@ -223,9 +261,11 @@ class Tokenizer
if ($this->scanner->current() !== '>') {
$this->parseError("Unclosed RCDATA end tag");
}
+
$this->scanner->unconsume($len);
$this->events->text($txt);
$this->setTextMode(0);
+
return $this->endTag();
}
@@ -279,7 +319,7 @@ class Tokenizer
$this->scanner->next();
return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
- /* This always returns false. */
+ // This always returns false.
$this->parseError("Illegal tag opening") || $this->characterData();
}
@@ -343,8 +383,9 @@ class Tokenizer
// Trash whitespace.
$this->scanner->whitespace();
- if ($this->scanner->current() != '>') {
- $this->parseError("Expected >, got '%s'", $this->scanner->current());
+ $tok = $this->scanner->current();
+ if ($tok != '>') {
+ $this->parseError("Expected >, got '%s'", $tok);
// We just trash stuff until we get to the next tag close.
$this->scanner->charsUntil('>');
}
@@ -456,10 +497,11 @@ class Tokenizer
$name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
if (strlen($name) == 0) {
- $this->parseError("Expected an attribute name, got %s.", $this->scanner->current());
+ $tok = $this->scanner->current();
+ $this->parseError("Expected an attribute name, got %s.", $tok);
// Really, only '=' can be the char here. Everything else gets absorbed
// under one rule or another.
- $name = $this->scanner->current();
+ $name = $tok;
$this->scanner->next();
}
@@ -556,7 +598,7 @@ class Tokenizer
$tok = $this->scanner->current();
if ($tok == '&') {
- $val .= $this->decodeCharacterReference(true, $tok);
+ $val .= $this->decodeCharacterReference(true);
continue;
}
break;
@@ -1032,6 +1074,7 @@ class Tokenizer
$line = $this->scanner->currentLine();
$col = $this->scanner->columnOffset();
$this->events->parseError($msg, $line, $col);
+
return false;
}
@@ -1049,7 +1092,6 @@ class Tokenizer
*/
protected function decodeCharacterReference($inAttribute = false)
{
-
// If it fails this, it's definitely not an entity.
if ($this->scanner->current() != '&') {
return false;