summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/Tokenizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/HTML5/Parser/Tokenizer.php')
-rw-r--r--src/HTML5/Parser/Tokenizer.php332
1 files changed, 181 insertions, 151 deletions
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index ba9de52..cfd0e43 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -1,4 +1,5 @@
<?php
+
namespace Masterminds\HTML5\Parser;
use Masterminds\HTML5\Elements;
@@ -25,7 +26,6 @@ use Masterminds\HTML5\Elements;
*/
class Tokenizer
{
-
protected $scanner;
protected $events;
@@ -56,12 +56,9 @@ class Tokenizer
* it a scanner (input) and an event handler (output), and then calling
* the Tokenizer::parse() method.`
*
- * @param \Masterminds\HTML5\Parser\Scanner $scanner
- * A scanner initialized with an input stream.
- * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
- * An event handler, initialized and ready to receive
- * events.
- * @param string $mode
+ * @param Scanner $scanner A scanner initialized with an input stream
+ * @param EventHandler $eventHandler An event handler, initialized and ready to receive events
+ * @param string $mode
*/
public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
{
@@ -103,11 +100,9 @@ class Tokenizer
* Normally, setting is done by the event handler via a special return code on
* startTag(), but it can also be set manually using this function.
*
- * @param integer $textmode
- * One of Elements::TEXT_*
- * @param string $untilTag
- * The tag that should stop RAW or RCDATA mode. Normal mode does not
- * use this indicator.
+ * @param int $textmode One of Elements::TEXT_*
+ * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not
+ * use this indicator.
*/
public function setTextMode($textmode, $untilTag = null)
{
@@ -117,13 +112,13 @@ class Tokenizer
/**
* Consume a character and make a move.
- * HTML5 8.2.4.1
+ * HTML5 8.2.4.1.
*/
protected function consumeData()
{
$tok = $this->scanner->current();
- if ($tok === '&') {
+ if ('&' === $tok) {
// Character reference
$ref = $this->decodeCharacterReference();
$this->buffer($ref);
@@ -132,7 +127,7 @@ class Tokenizer
}
// Parse tag
- if ($tok === '<') {
+ if ('<' === $tok) {
// Any buffered text data can go out now.
$this->flushBuffer();
@@ -143,7 +138,7 @@ class Tokenizer
|| $this->processingInstruction()
|| $this->tagName()
// This always returns false.
- || $this->parseError("Illegal tag opening")
+ || $this->parseError('Illegal tag opening')
|| $this->characterData();
$tok = $this->scanner->current();
@@ -153,7 +148,7 @@ class Tokenizer
$this->eof($tok);
// Parse character
- if ($tok !== false) {
+ if (false !== $tok) {
switch ($this->textMode) {
case Elements::TEXT_RAW:
$this->rawText($tok);
@@ -164,10 +159,10 @@ class Tokenizer
break;
default:
- if (!strspn($tok, "<&")) {
+ if (!strspn($tok, '<&')) {
// NULL character
- if ($tok === "\00") {
- $this->parseError("Received null character.");
+ if ("\00" === $tok) {
+ $this->parseError('Received null character.');
}
$this->text .= $tok;
@@ -189,7 +184,7 @@ class Tokenizer
protected function characterData()
{
$tok = $this->scanner->current();
- if ($tok === false) {
+ if (false === $tok) {
return false;
}
switch ($this->textMode) {
@@ -198,9 +193,10 @@ class Tokenizer
case Elements::TEXT_RCDATA:
return $this->rcdata($tok);
default:
- if (strspn($tok, "<&")) {
+ if (strspn($tok, '<&')) {
return false;
}
+
return $this->text($tok);
}
}
@@ -208,20 +204,20 @@ class Tokenizer
/**
* This buffers the current token as character data.
*
- * @param string $tok The current token.
+ * @param string $tok the current token
*
* @return bool
*/
protected function text($tok)
{
// This should never happen...
- if ($tok === false) {
+ if (false === $tok) {
return false;
}
// NULL character
- if ($tok === "\00") {
- $this->parseError("Received null character.");
+ if ("\00" === $tok) {
+ $this->parseError('Received null character.');
}
$this->buffer($tok);
@@ -233,7 +229,7 @@ class Tokenizer
/**
* Read text in RAW mode.
*
- * @param string $tok The current token.
+ * @param string $tok the current token
*
* @return bool
*/
@@ -254,7 +250,7 @@ class Tokenizer
/**
* Read text in RCDATA mode.
*
- * @param string $tok The current token.
+ * @param string $tok the current token
*
* @return bool
*/
@@ -268,8 +264,8 @@ class Tokenizer
$txt = '';
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
- while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
- if ($tok == '&') {
+ while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
+ if ('&' == $tok) {
$txt .= $this->decodeCharacterReference();
$tok = $this->scanner->current();
} else {
@@ -280,8 +276,8 @@ class Tokenizer
$len = strlen($sequence);
$this->scanner->consume($len);
$len += $this->scanner->whitespace();
- if ($this->scanner->current() !== '>') {
- $this->parseError("Unclosed RCDATA end tag");
+ if ('>' !== $this->scanner->current()) {
+ $this->parseError('Unclosed RCDATA end tag');
}
$this->scanner->unconsume($len);
@@ -296,7 +292,7 @@ class Tokenizer
*/
protected function eof($tok)
{
- if ($tok === false) {
+ if (false === $tok) {
// fprintf(STDOUT, "EOF");
$this->flushBuffer();
$this->events->eof();
@@ -313,40 +309,37 @@ class Tokenizer
*/
protected function markupDeclaration($tok)
{
- if ($tok != '!') {
+ if ('!' != $tok) {
return false;
}
$tok = $this->scanner->next();
// Comment:
- if ($tok == '-' && $this->scanner->peek() == '-') {
+ if ('-' == $tok && '-' == $this->scanner->peek()) {
$this->scanner->next(); // Consume the other '-'
$this->scanner->next(); // Next char.
return $this->comment();
- }
-
- elseif ($tok == 'D' || $tok == 'd') { // Doctype
+ } elseif ('D' == $tok || 'd' == $tok) { // Doctype
return $this->doctype();
- }
-
- elseif ($tok == '[') { // CDATA section
+ } elseif ('[' == $tok) { // CDATA section
return $this->cdataSection();
}
// FINISH
- $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok);
+ $this->parseError('Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s', $tok);
$this->bogusComment('<!');
+
return true;
}
/**
* Consume an end tag.
- * 8.2.4.9
+ * 8.2.4.9.
*/
protected function endTag()
{
- if ($this->scanner->current() != '/') {
+ if ('/' != $this->scanner->current()) {
return false;
}
$tok = $this->scanner->next();
@@ -355,21 +348,22 @@ class Tokenizer
// > -> parse error
// EOF -> parse error
// -> parse error
- if (! ctype_alpha($tok)) {
+ if (!ctype_alpha($tok)) {
$this->parseError("Expected tag name, got '%s'", $tok);
- if ($tok == "\0" || $tok === false) {
+ if ("\0" == $tok || false === $tok) {
return false;
}
+
return $this->bogusComment('</');
}
$name = $this->scanner->charsUntil("\n\f \t>");
- $name = $this->mode === self::CONFORMANT_XML ? $name: strtolower($name);
+ $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
// Trash whitespace.
$this->scanner->whitespace();
$tok = $this->scanner->current();
- if ($tok != '>') {
+ if ('>' != $tok) {
$this->parseError("Expected >, got '%s'", $tok);
// We just trash stuff until we get to the next tag close.
$this->scanner->charsUntil('>');
@@ -377,23 +371,24 @@ class Tokenizer
$this->events->endTag($name);
$this->scanner->next();
+
return true;
}
/**
* Consume a tag name and body.
- * 8.2.4.10
+ * 8.2.4.10.
*/
protected function tagName()
{
$tok = $this->scanner->current();
- if (! ctype_alpha($tok)) {
+ if (!ctype_alpha($tok)) {
return false;
}
// We know this is at least one char.
- $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
- $name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
+ $name = $this->scanner->charsWhile(':_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');
+ $name = self::CONFORMANT_XML === $this->mode ? $name : strtolower($name);
$attributes = array();
$selfClose = false;
@@ -403,7 +398,7 @@ class Tokenizer
do {
$this->scanner->whitespace();
$this->attribute($attributes);
- } while (! $this->isTagEnd($selfClose));
+ } while (!$this->isTagEnd($selfClose));
} catch (ParseError $e) {
$selfClose = false;
}
@@ -425,30 +420,34 @@ class Tokenizer
protected function isTagEnd(&$selfClose)
{
$tok = $this->scanner->current();
- if ($tok == '/') {
+ if ('/' == $tok) {
$this->scanner->next();
$this->scanner->whitespace();
$tok = $this->scanner->current();
- if ($tok == '>') {
+ if ('>' == $tok) {
$selfClose = true;
+
return true;
}
- if ($tok === false) {
- $this->parseError("Unexpected EOF inside of tag.");
+ if (false === $tok) {
+ $this->parseError('Unexpected EOF inside of tag.');
+
return true;
}
// Basically, we skip the / token and go on.
// See 8.2.4.43.
$this->parseError("Unexpected '%s' inside of a tag.", $tok);
+
return false;
}
- if ($tok == '>') {
+ if ('>' == $tok) {
return true;
}
- if ($tok === false) {
- $this->parseError("Unexpected EOF inside of tag.");
+ if (false === $tok) {
+ $this->parseError('Unexpected EOF inside of tag.');
+
return true;
}
@@ -467,23 +466,23 @@ class Tokenizer
protected function attribute(&$attributes)
{
$tok = $this->scanner->current();
- if ($tok == '/' || $tok == '>' || $tok === false) {
+ if ('/' == $tok || '>' == $tok || false === $tok) {
return false;
}
- if ($tok == '<') {
+ if ('<' == $tok) {
$this->parseError("Unexpected '<' inside of attributes list.");
// Push the < back onto the stack.
$this->scanner->unconsume();
// Let the caller figure out how to handle this.
- throw new ParseError("Start tag inside of attribute.");
+ throw new ParseError('Start tag inside of attribute.');
}
$name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
- if (strlen($name) == 0) {
+ if (0 == strlen($name)) {
$tok = $this->scanner->current();
- $this->parseError("Expected an attribute name, got %s.", $tok);
+ $this->parseError('Expected an attribute name, got %s.', $tok);
// Really, only '=' can be the char here. Everything else gets absorbed
// under one rule or another.
$name = $tok;
@@ -497,18 +496,17 @@ class Tokenizer
// see issue #23: https://github.com/Masterminds/html5-php/issues/23
// and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
- $this->parseError("Unexpected characters in attribute name: %s", $name);
+ $this->parseError('Unexpected characters in attribute name: %s', $name);
$isValidAttribute = false;
} // There is no limitation for 1st character in HTML5.
// But method "DOMElement::setAttribute" is throwing exception for the
// characters below so they have to be filtered.
// see issue #23: https://github.com/Masterminds/html5-php/issues/23
// and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
- else
- if (preg_match("/^[0-9.-]/u", $name)) {
- $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
- $isValidAttribute = false;
- }
+ elseif (preg_match('/^[0-9.-]/u', $name)) {
+ $this->parseError('Unexpected character at the begining of attribute name: %s', $name);
+ $isValidAttribute = false;
+ }
// 8.1.2.3
$this->scanner->whitespace();
@@ -516,6 +514,7 @@ class Tokenizer
if ($isValidAttribute) {
$attributes[$name] = $val;
}
+
return true;
}
@@ -527,7 +526,7 @@ class Tokenizer
*/
protected function attributeValue()
{
- if ($this->scanner->current() != '=') {
+ if ('=' != $this->scanner->current()) {
return null;
}
$this->scanner->next();
@@ -538,21 +537,24 @@ class Tokenizer
switch ($tok) {
case "\n":
case "\f":
- case " ":
+ case ' ':
case "\t":
// Whitespace here indicates an empty value.
return null;
case '"':
case "'":
$this->scanner->next();
+
return $this->quotedAttributeValue($tok);
case '>':
// case '/': // 8.2.4.37 seems to allow foo=/ as a valid attr.
- $this->parseError("Expected attribute value, got tag end.");
+ $this->parseError('Expected attribute value, got tag end.');
+
return null;
case '=':
case '`':
- $this->parseError("Expecting quotes, got %s.", $tok);
+ $this->parseError('Expecting quotes, got %s.', $tok);
+
return $this->unquotedAttributeValue();
default:
return $this->unquotedAttributeValue();
@@ -562,11 +564,11 @@ class Tokenizer
/**
* Get an attribute value string.
*
- * @param string $quote
- * IMPORTANT: This is a series of chars! Any one of which will be considered
- * termination of an attribute's value. E.g. "\"'" will stop at either
- * ' or ".
- * @return string The attribute value.
+ * @param string $quote IMPORTANT: This is a series of chars! Any one of which will be considered
+ * termination of an attribute's value. E.g. "\"'" will stop at either
+ * ' or ".
+ *
+ * @return string the attribute value
*/
protected function quotedAttributeValue($quote)
{
@@ -574,21 +576,22 @@ class Tokenizer
$val = '';
while (true) {
- $tokens = $this->scanner->charsUntil($stoplist.'&');
- if ($tokens !== false) {
+ $tokens = $this->scanner->charsUntil($stoplist . '&');
+ if (false !== $tokens) {
$val .= $tokens;
} else {
break;
}
$tok = $this->scanner->current();
- if ($tok == '&') {
+ if ('&' == $tok) {
$val .= $this->decodeCharacterReference(true);
continue;
}
break;
}
$this->scanner->next();
+
return $val;
}
@@ -597,34 +600,34 @@ class Tokenizer
$stoplist = "\t\n\f >";
$val = '';
$tok = $this->scanner->current();
- while (strspn($tok, $stoplist) == 0 && $tok !== false) {
- if ($tok == '&') {
+ while (0 == strspn($tok, $stoplist) && false !== $tok) {
+ if ('&' == $tok) {
$val .= $this->decodeCharacterReference(true);
$tok = $this->scanner->current();
} else {
if (strspn($tok, "\"'<=`") > 0) {
- $this->parseError("Unexpected chars in unquoted attribute value %s", $tok);
+ $this->parseError('Unexpected chars in unquoted attribute value %s', $tok);
}
$val .= $tok;
$tok = $this->scanner->next();
}
}
+
return $val;
}
/**
* Consume malformed markup as if it were a comment.
- * 8.2.4.44
+ * 8.2.4.44.
*
* The spec requires that the ENTIRE tag-like thing be enclosed inside of
* the comment. So this will generate comments like:
*
* &lt;!--&lt/+foo&gt;--&gt;
*
- * @param string $leading
- * Prepend any leading characters. This essentially
- * negates the need to backtrack, but it's sort of
- * a hack.
+ * @param string $leading Prepend any leading characters. This essentially
+ * negates the need to backtrack, but it's sort of
+ * a hack.
*
* @return bool
*/
@@ -632,11 +635,11 @@ class Tokenizer
{
$comment = $leading;
$tokens = $this->scanner->charsUntil('>');
- if ($tokens !== false) {
+ if (false !== $tokens) {
$comment .= $tokens;
}
$tok = $this->scanner->current();
- if ($tok !== false) {
+ if (false !== $tok) {
$comment .= $tok;
}
@@ -660,25 +663,27 @@ class Tokenizer
$comment = '';
// <!-->. Emit an empty comment because 8.2.4.46 says to.
- if ($tok == '>') {
+ if ('>' == $tok) {
// Parse error. Emit the comment token.
$this->parseError("Expected comment data, got '>'");
$this->events->comment('');
$this->scanner->next();
+
return true;
}
// Replace NULL with the replacement char.
- if ($tok == "\0") {
+ if ("\0" == $tok) {
$tok = UTF8Utils::FFFD;
}
- while (! $this->isCommentEnd()) {
+ while (!$this->isCommentEnd()) {
$comment .= $tok;
$tok = $this->scanner->next();
}
$this->events->comment($comment);
$this->scanner->next();
+
return true;
}
@@ -692,24 +697,26 @@ class Tokenizer
$tok = $this->scanner->current();
// EOF
- if ($tok === false) {
+ if (false === $tok) {
// Hit the end.
- $this->parseError("Unexpected EOF in a comment.");
+ $this->parseError('Unexpected EOF in a comment.');
+
return true;
}
// If it doesn't start with -, not the end.
- if ($tok != '-') {
+ if ('-' != $tok) {
return false;
}
// Advance one, and test for '->'
- if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') {
+ if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) {
$this->scanner->next(); // Consume the last '>'
return true;
}
// Unread '-';
$this->scanner->unconsume(1);
+
return false;
}
@@ -729,9 +736,10 @@ class Tokenizer
return false;
}
// Check that string is DOCTYPE.
- $chars = $this->scanner->charsWhile("DOCTYPEdoctype");
+ $chars = $this->scanner->charsWhile('DOCTYPEdoctype');
if (strcasecmp($chars, 'DOCTYPE')) {
$this->parseError('Expected DOCTYPE, got %s', $chars);
+
return $this->bogusComment('<!' . $chars);
}
@@ -739,14 +747,15 @@ class Tokenizer
$tok = $this->scanner->current();
// EOF: die.
- if ($tok === false) {
+ if (false === $tok) {
$this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
+
return $this->eof($tok);
}
// NULL char: convert.
- if ($tok === "\0") {
- $this->parseError("Unexpected null character in DOCTYPE.");
+ if ("\0" === $tok) {
+ $this->parseError('Unexpected null character in DOCTYPE.');
}
$stop = " \n\f>";
@@ -757,23 +766,26 @@ class Tokenizer
$tok = $this->scanner->current();
// If false, emit a parse error, DOCTYPE, and return.
- if ($tok === false) {
+ if (false === $tok) {
$this->parseError('Unexpected EOF in DOCTYPE declaration.');
$this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);
+
return true;
}
// Short DOCTYPE, like <!DOCTYPE html>
- if ($tok == '>') {
+ if ('>' == $tok) {
// DOCTYPE without a name.
- if (strlen($doctypeName) == 0) {
- $this->parseError("Expected a DOCTYPE name. Got nothing.");
+ if (0 == strlen($doctypeName)) {
+ $this->parseError('Expected a DOCTYPE name. Got nothing.');
$this->events->doctype($doctypeName, 0, null, true);
$this->scanner->next();
+
return true;
}
$this->events->doctype($doctypeName);
$this->scanner->next();
+
return true;
}
$this->scanner->whitespace();
@@ -782,36 +794,40 @@ class Tokenizer
$white = $this->scanner->whitespace();
// Get ID, and flag it as pub or system.
- if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
+ if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) {
// Get the sys ID.
- $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
+ $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
$id = $this->quotedString("\0>");
- if ($id === false) {
+ if (false === $id) {
$this->events->doctype($doctypeName, $type, $pub, false);
+
return false;
}
// Premature EOF.
- if ($this->scanner->current() === false) {
- $this->parseError("Unexpected EOF in DOCTYPE");
+ if (false === $this->scanner->current()) {
+ $this->parseError('Unexpected EOF in DOCTYPE');
$this->events->doctype($doctypeName, $type, $id, true);
+
return true;
}
// Well-formed complete DOCTYPE.
$this->scanner->whitespace();
- if ($this->scanner->current() == '>') {
+ if ('>' == $this->scanner->current()) {
$this->events->doctype($doctypeName, $type, $id, false);
$this->scanner->next();
+
return true;
}
// If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
// Throw away the junk, parse error, quirks mode, return true.
- $this->scanner->charsUntil(">");
- $this->parseError("Malformed DOCTYPE.");
+ $this->scanner->charsUntil('>');
+ $this->parseError('Malformed DOCTYPE.');
$this->events->doctype($doctypeName, $type, $id, true);
$this->scanner->next();
+
return true;
}
@@ -819,35 +835,37 @@ class Tokenizer
// Consume to > and trash.
$this->scanner->charsUntil('>');
- $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
+ $this->parseError('Expected PUBLIC or SYSTEM. Got %s.', $pub);
$this->events->doctype($doctypeName, 0, null, true);
$this->scanner->next();
+
return true;
}
/**
* Utility for reading a quoted string.
*
- * @param string $stopchars
- * Characters (in addition to a close-quote) that should stop the string.
- * E.g. sometimes '>' is higher precedence than '"' or "'".
+ * @param string $stopchars Characters (in addition to a close-quote) that should stop the string.
+ * E.g. sometimes '>' is higher precedence than '"' or "'".
*
* @return mixed String if one is found (quotations omitted)
*/
protected function quotedString($stopchars)
{
$tok = $this->scanner->current();
- if ($tok == '"' || $tok == "'") {
+ if ('"' == $tok || "'" == $tok) {
$this->scanner->next();
$ret = $this->scanner->charsUntil($tok . $stopchars);
if ($this->scanner->current() == $tok) {
$this->scanner->next();
} else {
// Parse error because no close quote.
- $this->parseError("Expected %s, got %s", $tok, $this->scanner->current());
+ $this->parseError('Expected %s, got %s', $tok, $this->scanner->current());
}
+
return $ret;
}
+
return false;
}
@@ -858,39 +876,43 @@ class Tokenizer
*/
protected function cdataSection()
{
- if ($this->scanner->current() != '[') {
+ if ('[' != $this->scanner->current()) {
return false;
}
$cdata = '';
$this->scanner->next();
$chars = $this->scanner->charsWhile('CDAT');
- if ($chars != 'CDATA' || $this->scanner->current() != '[') {
+ if ('CDATA' != $chars || '[' != $this->scanner->current()) {
$this->parseError('Expected [CDATA[, got %s', $chars);
+
return $this->bogusComment('<![' . $chars);
}
$tok = $this->scanner->next();
do {
- if ($tok === false) {
+ if (false === $tok) {
$this->parseError('Unexpected EOF inside CDATA.');
$this->bogusComment('<![CDATA[' . $cdata);
+
return true;
}
$cdata .= $tok;
$tok = $this->scanner->next();
- } while (! $this->scanner->sequenceMatches(']]>'));
+ } while (!$this->scanner->sequenceMatches(']]>'));
// Consume ]]>
$this->scanner->consume(3);
$this->events->cdata($cdata);
+
return true;
}
// ================================================================
// Non-HTML5
// ================================================================
+
/**
* Handle a processing instruction.
*
@@ -903,7 +925,7 @@ class Tokenizer
*/
protected function processingInstruction()
{
- if ($this->scanner->current() != '?') {
+ if ('?' != $this->scanner->current()) {
return false;
}
@@ -912,21 +934,23 @@ class Tokenizer
$white = $this->scanner->whitespace();
// If not a PI, send to bogusComment.
- if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) {
+ if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) {
$this->parseError("Expected processing instruction name, got $tok");
$this->bogusComment('<?' . $tok . $procName);
+
return true;
}
$data = '';
// As long as it's not the case that the next two chars are ? and >.
- while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
+ while (!('?' == $this->scanner->current() && '>' == $this->scanner->peek())) {
$data .= $this->scanner->current();
$tok = $this->scanner->next();
- if ($tok === false) {
- $this->parseError("Unexpected EOF in processing instruction.");
+ if (false === $tok) {
+ $this->parseError('Unexpected EOF in processing instruction.');
$this->events->processingInstruction($procName, $data);
+
return true;
}
}
@@ -934,6 +958,7 @@ class Tokenizer
$this->scanner->next(); // >
$this->scanner->next(); // Next token.
$this->events->processingInstruction($procName, $data);
+
return true;
}
@@ -955,7 +980,7 @@ class Tokenizer
// Optimization for reading larger blocks faster.
$first = substr($sequence, 0, 1);
- while ($this->scanner->current() !== false) {
+ while (false !== $this->scanner->current()) {
$buffer .= $this->scanner->charsUntil($first);
// Stop as soon as we hit the stopping condition.
@@ -967,7 +992,8 @@ class Tokenizer
}
// If we get here, we hit the EOF.
- $this->parseError("Unexpected EOF during text read.");
+ $this->parseError('Unexpected EOF during text read.');
+
return $buffer;
}
@@ -985,7 +1011,7 @@ class Tokenizer
* '</script>' string.
*
* @param string $sequence
- * @param bool $caseSensitive
+ * @param bool $caseSensitive
*
* @return bool
*/
@@ -1005,7 +1031,7 @@ class Tokenizer
*/
protected function flushBuffer()
{
- if ($this->text === '') {
+ if ('' === $this->text) {
return;
}
$this->events->text($this->text);
@@ -1055,9 +1081,8 @@ class Tokenizer
*
* If $inAttribute is set to true, a bare & will be returned as-is.
*
- * @param bool $inAttribute
- * Set to true if the text is inside of an attribute value.
- * false otherwise.
+ * @param bool $inAttribute Set to true if the text is inside of an attribute value.
+ * false otherwise.
*
* @return string
*/
@@ -1067,36 +1092,37 @@ class Tokenizer
$tok = $this->scanner->next();
$start = $this->scanner->position();
- if ($tok == false) {
+ if (false === $tok) {
return '&';
}
// These indicate not an entity. We return just
// the &.
- if (strspn($tok, static::WHITE . "&<") == 1) {
+ if (1 === strspn($tok, static::WHITE . '&<')) {
// $this->scanner->next();
return '&';
}
// Numeric entity
- if ($tok == '#') {
+ if ('#' === $tok) {
$tok = $this->scanner->next();
// Hexidecimal encoding.
// X[0-9a-fA-F]+;
// x[0-9a-fA-F]+;
- if ($tok == 'x' || $tok == 'X') {
+ if ('x' === $tok || 'X' === $tok) {
$tok = $this->scanner->next(); // Consume x
// Convert from hex code to char.
$hex = $this->scanner->getHex();
if (empty($hex)) {
- $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
+ $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
// We unconsume because we don't know what parser rules might
// be in effect for the remaining chars. For example. '&#>'
// might result in a specific parsing rule inside of tag
// contexts, while not inside of pcdata context.
$this->scanner->unconsume(2);
+
return '&';
}
$entity = CharacterReference::lookupHex($hex);
@@ -1105,17 +1131,17 @@ class Tokenizer
else {
// Convert from decimal to char.
$numeric = $this->scanner->getNumeric();
- if ($numeric === false) {
- $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
+ if (false === $numeric) {
+ $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
$this->scanner->unconsume(2);
+
return '&';
}
$entity = CharacterReference::lookupDecimal($numeric);
}
- } elseif ($tok === '=' && $inAttribute) {
+ } elseif ('=' === $tok && $inAttribute) {
return '&';
} else { // String entity.
-
// Attempt to consume a string up to a ';'.
// [a-zA-Z0-9]+;
$cname = $this->scanner->getAsciiAlphaNum();
@@ -1124,11 +1150,12 @@ class Tokenizer
// When no entity is found provide the name of the unmatched string
// and continue on as the & is not part of an entity. The & will
// be converted to &amp; elsewhere.
- if ($entity == null) {
- if (!$inAttribute || strlen($cname) === 0) {
+ if (null === $entity) {
+ if (!$inAttribute || '' === $cname) {
$this->parseError("No match in entity table for '%s'", $cname);
}
$this->scanner->unconsume($this->scanner->position() - $start);
+
return '&';
}
}
@@ -1137,8 +1164,9 @@ class Tokenizer
$tok = $this->scanner->current();
// We have an entity. We're done here.
- if ($tok == ';') {
+ if (';' === $tok) {
$this->scanner->next();
+
return $entity;
}
@@ -1146,10 +1174,12 @@ class Tokenizer
// entire string. Otherwise, failure to match is an error.
if ($inAttribute) {
$this->scanner->unconsume($this->scanner->position() - $start);
+
return '&';
}
- $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
+ $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);
+
return '&' . $entity;
}
}