summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAsmir Mustafic <[email protected]>2018-11-08 09:18:28 +0100
committerAsmir Mustafic <[email protected]>2018-11-08 09:18:28 +0100
commit5c5634a0be9bba6851ca73fa778d2efd60169bfc (patch)
tree89497e309879303f539a06e3d462cbe7cfc3a4f4 /src
parent9494e340adefaefc19439e9074d37b9ec957f4ca (diff)
move sequenceMatches to the Scanner
Diffstat (limited to 'src')
-rw-r--r--src/HTML5/Parser/Scanner.php24
-rw-r--r--src/HTML5/Parser/Tokenizer.php46
2 files changed, 38 insertions, 32 deletions
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
index cb14a56..e81b3a9 100644
--- a/src/HTML5/Parser/Scanner.php
+++ b/src/HTML5/Parser/Scanner.php
@@ -62,6 +62,30 @@ class Scanner
}
/**
+ * Check if upcoming chars match the given sequence.
+ *
+ * This will peek at the stream for the $sequence. If it's
+ * found, this will return true. If not, return false.
+ * Since this does not consume any chars, the caller
+ * will still need to read the next sequence, even if
+ * this returns true.
+ *
+ * Example: $this->scanner->sequenceMatches('</script>') will
+ * see if the input stream is at the start of a
+ * '</script>' string.
+ *
+ * @param string $sequence
+ * @param bool $caseSensitive
+ *
+ * @return bool
+ */
+ public function sequenceMatches($sequence, $caseSensitive = true)
+ {
+ $portion = substr($this->data, $this->char, strlen($sequence));
+ return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0;
+ }
+
+ /**
* Get the current position.
*
* @return int The current integer byte position.
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index b413b52..7b0b3f3 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -263,7 +263,7 @@ class Tokenizer
$txt = '';
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
- while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
+ while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
if ($tok == '&') {
$txt .= $this->decodeCharacterReference();
$tok = $this->scanner->current();
@@ -313,12 +313,13 @@ class Tokenizer
*/
protected function characterReference()
{
- $ref = $this->decodeCharacterReference();
- if ($ref !== false) {
- $this->buffer($ref);
- return true;
+ if ($this->scanner->current() !== '&') {
+ return false;
}
- return false;
+
+ $ref = $this->decodeCharacterReference();
+ $this->buffer($ref);
+ return true;
}
/**
@@ -892,7 +893,7 @@ class Tokenizer
}
$cdata .= $tok;
$tok = $this->scanner->next();
- } while (! $this->sequenceMatches(']]>'));
+ } while (! $this->scanner->sequenceMatches(']]>'));
// Consume ]]>
$this->scanner->consume(3);
@@ -972,7 +973,7 @@ class Tokenizer
$buffer .= $this->scanner->charsUntil($first);
// Stop as soon as we hit the stopping condition.
- if ($this->sequenceMatches($sequence, false)) {
+ if ($this->scanner->sequenceMatches($sequence, false)) {
return $buffer;
}
$buffer .= $this->scanner->current();
@@ -993,7 +994,7 @@ class Tokenizer
* will still need to read the next sequence, even if
* this returns true.
*
- * Example: $this->sequenceMatches('</script>') will
+ * Example: $this->scanner->sequenceMatches('</script>') will
* see if the input stream is at the start of a
* '</script>' string.
*
@@ -1004,22 +1005,9 @@ class Tokenizer
*/
protected function sequenceMatches($sequence, $caseSensitive = true)
{
- $len = strlen($sequence);
- $buffer = '';
- for ($i = 0; $i < $len; ++ $i) {
- $tok = $this->scanner->current();
- $buffer .= $tok;
+ @trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED);
- // EOF. Rewind and let the caller handle it.
- if ($tok === false) {
- $this->scanner->unconsume($i);
- return false;
- }
- $this->scanner->next();
- }
-
- $this->scanner->unconsume($len);
- return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
+ return $this->scanner->sequenceMatches($sequence, $caseSensitive);
}
/**
@@ -1079,22 +1067,16 @@ class Tokenizer
/**
* Decode a character reference and return the string.
*
- * Returns false if the entity could not be found. If $inAttribute is set
- * to true, a bare & will be returned as-is.
+ * If $inAttribute is set to true, a bare & will be returned as-is.
*
* @param bool $inAttribute
* Set to true if the text is inside of an attribute value.
* false otherwise.
*
- * @return bool|string
+ * @return string
*/
protected function decodeCharacterReference($inAttribute = false)
{
- // If it fails this, it's definitely not an entity.
- if ($this->scanner->current() != '&') {
- return false;
- }
-
// Next char after &.
$tok = $this->scanner->next();
$start = $this->scanner->position();