summary refs log tree commit diff
path: root/src/HTML5
diff options
context:
space:
mode:
author Matt Butcher <[email protected]> 2013-04-08 18:35:22 -0500
committer Matt Butcher <[email protected]> 2013-04-08 18:35:22 -0500
commit 54f505584051982f1ea738d4a20cd0893d1de97f (patch)
tree 552706bced28cdafe2acd396313bd92bf74812d2 /src/HTML5
parent f0aa87d02e8885825878781cd044c04c25e381ea (diff)
Reduced 4 spaces to 2 spaces.
Diffstat (limited to 'src/HTML5')
-rw-r--r-- src/HTML5/Tokenizer.php 4668
1 files changed, 2334 insertions, 2334 deletions
diff --git a/src/HTML5/Tokenizer.php b/src/HTML5/Tokenizer.php
index e27b16a..4a89b8e 100644
--- a/src/HTML5/Tokenizer.php
+++ b/src/HTML5/Tokenizer.php
@@ -35,2389 +35,2389 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// all flags are in hyphenated form
class Tokenizer {
+ /**
+ * Points to an InputStream object.
+ */
+ protected $stream;
+
+ /**
+ * Tree builder that the tokenizer emits token to.
+ */
+ private $tree;
+
+ /**
+ * Current content model we are parsing as.
+ */
+ protected $content_model;
+
+ /**
+ * Current token that is being built, but not yet emitted. Also
+ * is the last token emitted, if applicable.
+ */
+ protected $token;
+
+ // These are constants describing the content model
+ const PCDATA = 0;
+ const RCDATA = 1;
+ const CDATA = 2;
+ const PLAINTEXT = 3;
+
+ // These are constants describing tokens
+ // XXX should probably be moved somewhere else, probably the
+ // HTML5 class.
+ const DOCTYPE = 0;
+ const STARTTAG = 1;
+ const ENDTAG = 2;
+ const COMMENT = 3;
+ const CHARACTER = 4;
+ const SPACECHARACTER = 5;
+ const EOF = 6;
+ const PARSEERROR = 7;
+
+ // These are constants representing bunches of characters.
+ const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+ const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+ const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
+ const DIGIT = '0123456789';
+ const HEX = '0123456789ABCDEFabcdef';
+ const WHITESPACE = "\t\n\x0c ";
+
+ /**
+ * @param $data Data to parse
+ */
+ public function __construct($data, $builder = null) {
+ $this->stream = new InputStream($data);
+ if (!$builder) $this->tree = new TreeBuilder;
+ else $this->tree = $builder;
+ $this->content_model = self::PCDATA;
+ }
+
+ public function parseFragment($context = null) {
+ $this->tree->setupContext($context);
+ if ($this->tree->content_model) {
+ $this->content_model = $this->tree->content_model;
+ $this->tree->content_model = null;
+ }
+ $this->parse();
+ }
+
+ // XXX maybe convert this into an iterator? regardless, this function
+ // and the save function should go into a Parser facade of some sort
+ /**
+ * Performs the actual parsing of the document.
+ */
+ public function parse() {
+ // Current state
+ $state = 'data';
+ // This is used to avoid having to have look-behind in the data state.
+ $lastFourChars = '';
/**
- * Points to an InputStream object.
+ * Escape flag as specified by the HTML5 specification: "used to
+ * control the behavior of the tokeniser. It is either true or
+ * false, and initially must be set to the false state."
*/
- protected $stream;
+ $escape = false;
+ //echo "\n\n";
+ while($state !== null) {
+
+ /*echo $state . ' ';
+ switch ($this->content_model) {
+ case self::PCDATA: echo 'PCDATA'; break;
+ case self::RCDATA: echo 'RCDATA'; break;
+ case self::CDATA: echo 'CDATA'; break;
+ case self::PLAINTEXT: echo 'PLAINTEXT'; break;
+ }
+ if ($escape) echo " escape";
+ echo "\n";*/
+
+ switch($state) {
+ case 'data':
+
+ /* Consume the next input character */
+ $char = $this->stream->char();
+ $lastFourChars .= $char;
+ if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
+
+ // see below for meaning
+ $hyp_cond =
+ !$escape &&
+ (
+ $this->content_model === self::RCDATA ||
+ $this->content_model === self::CDATA
+ );
+ $amp_cond =
+ !$escape &&
+ (
+ $this->content_model === self::PCDATA ||
+ $this->content_model === self::RCDATA
+ );
+ $lt_cond =
+ $this->content_model === self::PCDATA ||
+ (
+ (
+ $this->content_model === self::RCDATA ||
+ $this->content_model === self::CDATA
+ ) &&
+ !$escape
+ );
+ $gt_cond =
+ $escape &&
+ (
+ $this->content_model === self::RCDATA ||
+ $this->content_model === self::CDATA
+ );
+
+ if($char === '&' && $amp_cond) {
+ /* U+0026 AMPERSAND (&)
+ When the content model flag is set to one of the PCDATA or RCDATA
+ states and the escape flag is false: switch to the
+ character reference data state. Otherwise: treat it as per
+ the "anything else" entry below. */
+ $state = 'character reference data';
+
+ } elseif(
+ $char === '-' &&
+ $hyp_cond &&
+ $lastFourChars === '<!--'
+ ) {
+ /*
+ U+002D HYPHEN-MINUS (-)
+ If the content model flag is set to either the RCDATA state or
+ the CDATA state, and the escape flag is false, and there are at
+ least three characters before this one in the input stream, and the
+ last four characters in the input stream, including this one, are
+ U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
+ and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
+ $escape = true;
+
+ /* In any case, emit the input character as a character token. Stay
+ in the data state. */
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '-'
+ ));
+ // We do the "any case" part as part of "anything else".
+
+ /* U+003C LESS-THAN SIGN (<) */
+ } elseif($char === '<' && $lt_cond) {
+ /* When the content model flag is set to the PCDATA state: switch
+ to the tag open state.
+
+ When the content model flag is set to either the RCDATA state or
+ the CDATA state and the escape flag is false: switch to the tag
+ open state.
+
+ Otherwise: treat it as per the "anything else" entry below. */
+ $state = 'tag open';
+
+ /* U+003E GREATER-THAN SIGN (>) */
+ } elseif(
+ $char === '>' &&
+ $gt_cond &&
+ substr($lastFourChars, 1) === '-->'
+ ) {
+ /* If the content model flag is set to either the RCDATA state or
+ the CDATA state, and the escape flag is true, and the last three
+ characters in the input stream including this one are U+002D
+ HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
+ set the escape flag to false. */
+ $escape = false;
+
+ /* In any case, emit the input character as a character token.
+ Stay in the data state. */
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '>'
+ ));
+ // We do the "any case" part as part of "anything else".
+
+ } elseif($char === false) {
+ /* EOF
+ Emit an end-of-file token. */
+ $state = null;
+ $this->tree->emitToken(array(
+ 'type' => self::EOF
+ ));
+
+ } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ // Directly after emitting a token you switch back to the "data
+ // state". At that point spaceCharacters are important so they are
+ // emitted separately.
+ $chars = $this->stream->charsWhile(self::WHITESPACE);
+ $this->emitToken(array(
+ 'type' => self::SPACECHARACTER,
+ 'data' => $char . $chars
+ ));
+ $lastFourChars .= $chars;
+ if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
+
+ } else {
+ /* Anything else
+ THIS IS AN OPTIMIZATION: Get as many character that
+ otherwise would also be treated as a character token and emit it
+ as a single character token. Stay in the data state. */
+
+ $mask = '';
+ if ($hyp_cond) $mask .= '-';
+ if ($amp_cond) $mask .= '&';
+ if ($lt_cond) $mask .= '<';
+ if ($gt_cond) $mask .= '>';
+
+ if ($mask === '') {
+ $chars = $this->stream->remainingChars();
+ } else {
+ $chars = $this->stream->charsUntil($mask);
+ }
- /**
- * Tree builder that the tokenizer emits token to.
- */
- private $tree;
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => $char . $chars
+ ));
- /**
- * Current content model we are parsing as.
- */
- protected $content_model;
+ $lastFourChars .= $chars;
+ if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
+
+ $state = 'data';
+ }
+ break;
+
+ case 'character reference data':
+ /* (This cannot happen if the content model flag
+ is set to the CDATA state.) */
+
+ /* Attempt to consume a character reference, with no
+ additional allowed character. */
+ $entity = $this->consumeCharacterReference();
+
+ /* If nothing is returned, emit a U+0026 AMPERSAND
+ character token. Otherwise, emit the character token that
+ was returned. */
+ // This is all done when consuming the character reference.
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => $entity
+ ));
+
+ /* Finally, switch to the data state. */
+ $state = 'data';
+ break;
+
+ case 'tag open':
+ $char = $this->stream->char();
+
+ switch($this->content_model) {
+ case self::RCDATA:
+ case self::CDATA:
+ /* Consume the next input character. If it is a
+ U+002F SOLIDUS (/) character, switch to the close
+ tag open state. Otherwise, emit a U+003C LESS-THAN
+ SIGN character token and reconsume the current input
+ character in the data state. */
+ // We consumed above.
+
+ if($char === '/') {
+ $state = 'close tag open';
+
+ } else {
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '<'
+ ));
- /**
- * Current token that is being built, but not yet emitted. Also
- * is the last token emitted, if applicable.
- */
- protected $token;
-
- // These are constants describing the content model
- const PCDATA = 0;
- const RCDATA = 1;
- const CDATA = 2;
- const PLAINTEXT = 3;
-
- // These are constants describing tokens
- // XXX should probably be moved somewhere else, probably the
- // HTML5 class.
- const DOCTYPE = 0;
- const STARTTAG = 1;
- const ENDTAG = 2;
- const COMMENT = 3;
- const CHARACTER = 4;
- const SPACECHARACTER = 5;
- const EOF = 6;
- const PARSEERROR = 7;
-
- // These are constants representing bunches of characters.
- const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
- const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
- const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
- const DIGIT = '0123456789';
- const HEX = '0123456789ABCDEFabcdef';
- const WHITESPACE = "\t\n\x0c ";
+ $this->stream->unget();
- /**
- * @param $data Data to parse
- */
- public function __construct($data, $builder = null) {
- $this->stream = new InputStream($data);
- if (!$builder) $this->tree = new TreeBuilder;
- else $this->tree = $builder;
- $this->content_model = self::PCDATA;
- }
+ $state = 'data';
+ }
+ break;
+
+ case self::PCDATA:
+ /* If the content model flag is set to the PCDATA state
+ Consume the next input character: */
+ // We consumed above.
+
+ if($char === '!') {
+ /* U+0021 EXCLAMATION MARK (!)
+ Switch to the markup declaration open state. */
+ $state = 'markup declaration open';
+
+ } elseif($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the close tag open state. */
+ $state = 'close tag open';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
+ Create a new start tag token, set its tag name to the lowercase
+ version of the input character (add 0x0020 to the character's code
+ point), then switch to the tag name state. (Don't emit the token
+ yet; further details will be filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => strtolower($char),
+ 'type' => self::STARTTAG,
+ 'attr' => array()
+ );
+
+ $state = 'tag name';
+
+ } elseif('a' <= $char && $char <= 'z') {
+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+ Create a new start tag token, set its tag name to the input
+ character, then switch to the tag name state. (Don't emit
+ the token yet; further details will be filled in before it
+ is emitted.) */
+ $this->token = array(
+ 'name' => $char,
+ 'type' => self::STARTTAG,
+ 'attr' => array()
+ );
+
+ $state = 'tag name';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit a U+003C LESS-THAN SIGN character token and a
+ U+003E GREATER-THAN SIGN character token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-tag-name-but-got-right-bracket'
+ ));
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '<>'
+ ));
- public function parseFragment($context = null) {
- $this->tree->setupContext($context);
- if ($this->tree->content_model) {
- $this->content_model = $this->tree->content_model;
- $this->tree->content_model = null;
- }
- $this->parse();
- }
+ $state = 'data';
- // XXX maybe convert this into an iterator? regardless, this function
- // and the save function should go into a Parser facade of some sort
- /**
- * Performs the actual parsing of the document.
- */
- public function parse() {
- // Current state
- $state = 'data';
- // This is used to avoid having to have look-behind in the data state.
- $lastFourChars = '';
- /**
- * Escape flag as specified by the HTML5 specification: "used to
- * control the behavior of the tokeniser. It is either true or
- * false, and initially must be set to the false state."
- */
- $escape = false;
- //echo "\n\n";
- while($state !== null) {
-
- /*echo $state . ' ';
- switch ($this->content_model) {
- case self::PCDATA: echo 'PCDATA'; break;
- case self::RCDATA: echo 'RCDATA'; break;
- case self::CDATA: echo 'CDATA'; break;
- case self::PLAINTEXT: echo 'PLAINTEXT'; break;
+ } elseif($char === '?') {
+ /* U+003F QUESTION MARK (?)
+ Parse error. Switch to the bogus comment state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-tag-name-but-got-question-mark'
+ ));
+ $this->token = array(
+ 'data' => '?',
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
+
+ } else {
+ /* Anything else
+ Parse error. Emit a U+003C LESS-THAN SIGN character token and
+ reconsume the current input character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-tag-name'
+ ));
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '<'
+ ));
+
+ $state = 'data';
+ $this->stream->unget();
+ }
+ break;
+ }
+ break;
+
+ case 'close tag open':
+ if (
+ $this->content_model === self::RCDATA ||
+ $this->content_model === self::CDATA
+ ) {
+ /* If the content model flag is set to the RCDATA or CDATA
+ states... */
+ $name = strtolower($this->stream->charsWhile(self::ALPHA));
+ $following = $this->stream->char();
+ $this->stream->unget();
+ if (
+ !$this->token ||
+ $this->token['name'] !== $name ||
+ $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
+ ) {
+ /* if no start tag token has ever been emitted by this instance
+ of the tokenizer (fragment case), or, if the next few
+ characters do not match the tag name of the last start tag
+ token emitted (compared in an ASCII case-insensitive manner),
+ or if they do but they are not immediately followed by one of
+ the following characters:
+
+ * U+0009 CHARACTER TABULATION
+ * U+000A LINE FEED (LF)
+ * U+000C FORM FEED (FF)
+ * U+0020 SPACE
+ * U+003E GREATER-THAN SIGN (>)
+ * U+002F SOLIDUS (/)
+ * EOF
+
+ ...then emit a U+003C LESS-THAN SIGN character token, a
+ U+002F SOLIDUS character token, and switch to the data
+ state to process the next input character. */
+ // XXX: Probably ought to replace in_array with $following === x ||...
+
+ // We also need to emit $name now we've consumed that, as we
+ // know it'll just be emitted as a character token.
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '</' . $name
+ ));
+
+ $state = 'data';
+ } else {
+ // This matches what would happen if we actually did the
+ // otherwise below (but we can't because we've consumed too
+ // much).
+
+ // Start the end tag token with the name we already have.
+ $this->token = array(
+ 'name' => $name,
+ 'type' => self::ENDTAG
+ );
+
+ // Change to tag name state.
+ $state = 'tag name';
}
- if ($escape) echo " escape";
- echo "\n";*/
-
- switch($state) {
- case 'data':
-
- /* Consume the next input character */
- $char = $this->stream->char();
- $lastFourChars .= $char;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
-
- // see below for meaning
- $hyp_cond =
- !$escape &&
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- );
- $amp_cond =
- !$escape &&
- (
- $this->content_model === self::PCDATA ||
- $this->content_model === self::RCDATA
- );
- $lt_cond =
- $this->content_model === self::PCDATA ||
- (
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- ) &&
- !$escape
- );
- $gt_cond =
- $escape &&
- (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- );
-
- if($char === '&' && $amp_cond) {
- /* U+0026 AMPERSAND (&)
- When the content model flag is set to one of the PCDATA or RCDATA
- states and the escape flag is false: switch to the
- character reference data state. Otherwise: treat it as per
- the "anything else" entry below. */
- $state = 'character reference data';
-
- } elseif(
- $char === '-' &&
- $hyp_cond &&
- $lastFourChars === '<!--'
- ) {
- /*
- U+002D HYPHEN-MINUS (-)
- If the content model flag is set to either the RCDATA state or
- the CDATA state, and the escape flag is false, and there are at
- least three characters before this one in the input stream, and the
- last four characters in the input stream, including this one, are
- U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
- and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
- $escape = true;
-
- /* In any case, emit the input character as a character token. Stay
- in the data state. */
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '-'
- ));
- // We do the "any case" part as part of "anything else".
-
- /* U+003C LESS-THAN SIGN (<) */
- } elseif($char === '<' && $lt_cond) {
- /* When the content model flag is set to the PCDATA state: switch
- to the tag open state.
-
- When the content model flag is set to either the RCDATA state or
- the CDATA state and the escape flag is false: switch to the tag
- open state.
-
- Otherwise: treat it as per the "anything else" entry below. */
- $state = 'tag open';
-
- /* U+003E GREATER-THAN SIGN (>) */
- } elseif(
- $char === '>' &&
- $gt_cond &&
- substr($lastFourChars, 1) === '-->'
- ) {
- /* If the content model flag is set to either the RCDATA state or
- the CDATA state, and the escape flag is true, and the last three
- characters in the input stream including this one are U+002D
- HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
- set the escape flag to false. */
- $escape = false;
-
- /* In any case, emit the input character as a character token.
- Stay in the data state. */
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '>'
- ));
- // We do the "any case" part as part of "anything else".
-
- } elseif($char === false) {
- /* EOF
- Emit an end-of-file token. */
- $state = null;
- $this->tree->emitToken(array(
- 'type' => self::EOF
- ));
-
- } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- // Directly after emitting a token you switch back to the "data
- // state". At that point spaceCharacters are important so they are
- // emitted separately.
- $chars = $this->stream->charsWhile(self::WHITESPACE);
- $this->emitToken(array(
- 'type' => self::SPACECHARACTER,
- 'data' => $char . $chars
- ));
- $lastFourChars .= $chars;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
-
- } else {
- /* Anything else
- THIS IS AN OPTIMIZATION: Get as many character that
- otherwise would also be treated as a character token and emit it
- as a single character token. Stay in the data state. */
-
- $mask = '';
- if ($hyp_cond) $mask .= '-';
- if ($amp_cond) $mask .= '&';
- if ($lt_cond) $mask .= '<';
- if ($gt_cond) $mask .= '>';
-
- if ($mask === '') {
- $chars = $this->stream->remainingChars();
- } else {
- $chars = $this->stream->charsUntil($mask);
- }
-
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => $char . $chars
- ));
-
- $lastFourChars .= $chars;
- if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
-
- $state = 'data';
- }
- break;
-
- case 'character reference data':
- /* (This cannot happen if the content model flag
- is set to the CDATA state.) */
-
- /* Attempt to consume a character reference, with no
- additional allowed character. */
- $entity = $this->consumeCharacterReference();
-
- /* If nothing is returned, emit a U+0026 AMPERSAND
- character token. Otherwise, emit the character token that
- was returned. */
- // This is all done when consuming the character reference.
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => $entity
- ));
-
- /* Finally, switch to the data state. */
- $state = 'data';
- break;
-
- case 'tag open':
- $char = $this->stream->char();
-
- switch($this->content_model) {
- case self::RCDATA:
- case self::CDATA:
- /* Consume the next input character. If it is a
- U+002F SOLIDUS (/) character, switch to the close
- tag open state. Otherwise, emit a U+003C LESS-THAN
- SIGN character token and reconsume the current input
- character in the data state. */
- // We consumed above.
-
- if($char === '/') {
- $state = 'close tag open';
-
- } else {
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<'
- ));
-
- $this->stream->unget();
-
- $state = 'data';
- }
- break;
-
- case self::PCDATA:
- /* If the content model flag is set to the PCDATA state
- Consume the next input character: */
- // We consumed above.
-
- if($char === '!') {
- /* U+0021 EXCLAMATION MARK (!)
- Switch to the markup declaration open state. */
- $state = 'markup declaration open';
-
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the close tag open state. */
- $state = 'close tag open';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
- Create a new start tag token, set its tag name to the lowercase
- version of the input character (add 0x0020 to the character's code
- point), then switch to the tag name state. (Don't emit the token
- yet; further details will be filled in before it is emitted.) */
- $this->token = array(
- 'name' => strtolower($char),
- 'type' => self::STARTTAG,
- 'attr' => array()
- );
-
- $state = 'tag name';
-
- } elseif('a' <= $char && $char <= 'z') {
- /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
- Create a new start tag token, set its tag name to the input
- character, then switch to the tag name state. (Don't emit
- the token yet; further details will be filled in before it
- is emitted.) */
- $this->token = array(
- 'name' => $char,
- 'type' => self::STARTTAG,
- 'attr' => array()
- );
-
- $state = 'tag name';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit a U+003C LESS-THAN SIGN character token and a
- U+003E GREATER-THAN SIGN character token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name-but-got-right-bracket'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<>'
- ));
-
- $state = 'data';
-
- } elseif($char === '?') {
- /* U+003F QUESTION MARK (?)
- Parse error. Switch to the bogus comment state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name-but-got-question-mark'
- ));
- $this->token = array(
- 'data' => '?',
- 'type' => self::COMMENT
- );
- $state = 'bogus comment';
-
- } else {
- /* Anything else
- Parse error. Emit a U+003C LESS-THAN SIGN character token and
- reconsume the current input character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-tag-name'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '<'
- ));
-
- $state = 'data';
- $this->stream->unget();
- }
- break;
- }
- break;
-
- case 'close tag open':
- if (
- $this->content_model === self::RCDATA ||
- $this->content_model === self::CDATA
- ) {
- /* If the content model flag is set to the RCDATA or CDATA
- states... */
- $name = strtolower($this->stream->charsWhile(self::ALPHA));
- $following = $this->stream->char();
- $this->stream->unget();
- if (
- !$this->token ||
- $this->token['name'] !== $name ||
- $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
- ) {
- /* if no start tag token has ever been emitted by this instance
- of the tokenizer (fragment case), or, if the next few
- characters do not match the tag name of the last start tag
- token emitted (compared in an ASCII case-insensitive manner),
- or if they do but they are not immediately followed by one of
- the following characters:
-
- * U+0009 CHARACTER TABULATION
- * U+000A LINE FEED (LF)
- * U+000C FORM FEED (FF)
- * U+0020 SPACE
- * U+003E GREATER-THAN SIGN (>)
- * U+002F SOLIDUS (/)
- * EOF
-
- ...then emit a U+003C LESS-THAN SIGN character token, a
- U+002F SOLIDUS character token, and switch to the data
- state to process the next input character. */
- // XXX: Probably ought to replace in_array with $following === x ||...
-
- // We also need to emit $name now we've consumed that, as we
- // know it'll just be emitted as a character token.
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '</' . $name
- ));
-
- $state = 'data';
- } else {
- // This matches what would happen if we actually did the
- // otherwise below (but we can't because we've consumed too
- // much).
-
- // Start the end tag token with the name we already have.
- $this->token = array(
- 'name' => $name,
- 'type' => self::ENDTAG
- );
-
- // Change to tag name state.
- $state = 'tag name';
- }
- } elseif ($this->content_model === self::PCDATA) {
- /* Otherwise, if the content model flag is set to the PCDATA
- state [...]: */
- $char = $this->stream->char();
-
- if ('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
- Create a new end tag token, set its tag name to the lowercase version
- of the input character (add 0x0020 to the character's code point), then
- switch to the tag name state. (Don't emit the token yet; further details
- will be filled in before it is emitted.) */
- $this->token = array(
- 'name' => strtolower($char),
- 'type' => self::ENDTAG
- );
-
- $state = 'tag name';
-
- } elseif ('a' <= $char && $char <= 'z') {
- /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
- Create a new end tag token, set its tag name to the
- input character, then switch to the tag name state.
- (Don't emit the token yet; further details will be
- filled in before it is emitted.) */
- $this->token = array(
- 'name' => $char,
- 'type' => self::ENDTAG
- );
-
- $state = 'tag name';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-right-bracket'
- ));
- $state = 'data';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
- SOLIDUS character token. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-eof'
- ));
- $this->emitToken(array(
- 'type' => self::CHARACTER,
- 'data' => '</'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Parse error. Switch to the bogus comment state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-closing-tag-but-got-char'
- ));
- $this->token = array(
- 'data' => $char,
- 'type' => self::COMMENT
- );
- $state = 'bogus comment';
- }
- }
- break;
-
- case 'tag name':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before attribute name state. */
- $state = 'before attribute name';
-
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'self-closing start tag';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Append the lowercase version of the current input
- character (add 0x0020 to the character's code point) to
- the current tag token's tag name. Stay in the tag name state. */
- $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
-
- $this->token['name'] .= strtolower($char . $chars);
- $state = 'tag name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-tag-name'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append the current input character to the current tag token's tag name.
- Stay in the tag name state. */
- $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
-
- $this->token['name'] .= $char . $chars;
- $state = 'tag name';
- }
- break;
-
- case 'before attribute name':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- // this conditional is optimized, check bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before attribute name state. */
- $state = 'before attribute name';
-
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'self-closing start tag';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Start a new attribute in the current tag token. Set that
- attribute's name to the lowercase version of the current
- input character (add 0x0020 to the character's code
- point), and its value to the empty string. Switch to the
- attribute name state.*/
- $this->token['attr'][] = array(
- 'name' => strtolower($char),
- 'value' => ''
- );
-
- $state = 'attribute name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-name-but-got-eof'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003C LESS-THAN SIGN (<)
- U+003D EQUALS SIGN (=)
- Parse error. Treat it as per the "anything else" entry
- below. */
- if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-in-attribute-name'
- ));
- }
-
- /* Anything else
- Start a new attribute in the current tag token. Set that attribute's
- name to the current input character, and its value to the empty string.
- Switch to the attribute name state. */
- $this->token['attr'][] = array(
- 'name' => $char,
- 'value' => ''
- );
-
- $state = 'attribute name';
- }
- break;
-
- case 'attribute name':
- // Consume the next input character:
- $char = $this->stream->char();
-
- // this conditional is optimized, check bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the after attribute name state. */
- $state = 'after attribute name';
-
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'self-closing start tag';
-
- } elseif($char === '=') {
- /* U+003D EQUALS SIGN (=)
- Switch to the before attribute value state. */
- $state = 'before attribute value';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Append the lowercase version of the current input
- character (add 0x0020 to the character's code point) to
- the current attribute's name. Stay in the attribute name
- state. */
- $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
-
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
-
- $state = 'attribute name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-name'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003C LESS-THAN SIGN (<)
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'" || $char === '<') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-in-attribute-name'
- ));
- }
-
- /* Anything else
- Append the current input character to the current attribute's name.
- Stay in the attribute name state. */
- $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
-
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['name'] .= $char . $chars;
-
- $state = 'attribute name';
- }
-
- /* When the user agent leaves the attribute name state
- (and before emitting the tag token, if appropriate), the
- complete attribute's name must be compared to the other
- attributes on the same token; if there is already an
- attribute on the token with the exact same name, then this
- is a parse error and the new attribute must be dropped, along
- with the value that gets associated with it (if any). */
- // this might be implemented in the emitToken method
- break;
-
- case 'after attribute name':
- // Consume the next input character:
- $char = $this->stream->char();
-
- // this is an optimized conditional, check the bottom
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the after attribute name state. */
- $state = 'after attribute name';
-
- } elseif($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'self-closing start tag';
-
- } elseif($char === '=') {
- /* U+003D EQUALS SIGN (=)
- Switch to the before attribute value state. */
- $state = 'before attribute value';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Start a new attribute in the current tag token. Set that
- attribute's name to the lowercase version of the current
- input character (add 0x0020 to the character's code
- point), and its value to the empty string. Switch to the
- attribute name state. */
- $this->token['attr'][] = array(
- 'name' => strtolower($char),
- 'value' => ''
- );
-
- $state = 'attribute name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-end-of-tag-but-got-eof'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003C LESS-THAN SIGN(<)
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'" || $char === "<") {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'invalid-character-after-attribute-name'
- ));
- }
-
- /* Anything else
- Start a new attribute in the current tag token. Set that attribute's
- name to the current input character, and its value to the empty string.
- Switch to the attribute name state. */
- $this->token['attr'][] = array(
- 'name' => $char,
- 'value' => ''
- );
-
- $state = 'attribute name';
- }
- break;
-
- case 'before attribute value':
- // Consume the next input character:
- $char = $this->stream->char();
-
- // this is an optimized conditional
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before attribute value state. */
- $state = 'before attribute value';
-
- } elseif($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the attribute value (double-quoted) state. */
- $state = 'attribute value (double-quoted)';
-
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the attribute value (unquoted) state and reconsume
- this input character. */
- $this->stream->unget();
- $state = 'attribute value (unquoted)';
-
- } elseif($char === '\'') {
- /* U+0027 APOSTROPHE (')
- Switch to the attribute value (single-quoted) state. */
- $state = 'attribute value (single-quoted)';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit the current tag token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-value-but-got-right-bracket'
- ));
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-attribute-value-but-got-eof'
- ));
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* U+003D EQUALS SIGN (=)
- * U+003C LESS-THAN SIGN (<)
- Parse error. Treat it as per the "anything else" entry below. */
- if($char === '=' || $char === '<') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'equals-in-unquoted-attribute-value'
- ));
- }
-
- /* Anything else
- Append the current input character to the current attribute's value.
- Switch to the attribute value (unquoted) state. */
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char;
-
- $state = 'attribute value (unquoted)';
- }
- break;
-
- case 'attribute value (double-quoted)':
- // Consume the next input character:
- $char = $this->stream->char();
-
- if($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the after attribute value (quoted) state. */
- $state = 'after attribute value (quoted)';
-
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the character reference in attribute value
- state, with the additional allowed character
- being U+0022 QUOTATION MARK ("). */
- $this->characterReferenceInAttributeValue('"');
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-double-quote'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (double-quoted) state. */
- $chars = $this->stream->charsUntil('"&');
-
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
-
- $state = 'attribute value (double-quoted)';
- }
- break;
-
- case 'attribute value (single-quoted)':
- // Consume the next input character:
- $char = $this->stream->char();
-
- if($char === "'") {
- /* U+0027 APOSTROPHE (')
- Switch to the after attribute value (quoted) state. */
- $state = 'after attribute value (quoted)';
-
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the character reference in attribute value
- state, with the additional allowed character
- being U+0027 APOSTROPHE ('). */
- $this->characterReferenceInAttributeValue("'");
-
- } elseif($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-single-quote'
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (single-quoted) state. */
- $chars = $this->stream->charsUntil("'&");
-
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
-
- $state = 'attribute value (single-quoted)';
- }
- break;
-
- case 'attribute value (unquoted)':
- // Consume the next input character:
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before attribute name state. */
- $state = 'before attribute name';
-
- } elseif($char === '&') {
- /* U+0026 AMPERSAND (&)
- Switch to the character reference in attribute value state, with the
- additional allowed character being U+003E
- GREATER-THAN SIGN (>). */
- $this->characterReferenceInAttributeValue('>');
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif ($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-attribute-value-no-quotes'
- ));
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* U+0022 QUOTATION MARK (")
- U+0027 APOSTROPHE (')
- U+003C LESS-THAN SIGN (<)
- U+003D EQUALS SIGN (=)
- Parse error. Treat it as per the "anything else"
- entry below. */
- if($char === '"' || $char === "'" || $char === '=' || $char == '<') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-character-in-unquoted-attribute-value'
- ));
- }
-
- /* Anything else
- Append the current input character to the current attribute's value.
- Stay in the attribute value (unquoted) state. */
- $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
-
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char . $chars;
-
- $state = 'attribute value (unquoted)';
- }
- break;
-
- case 'after attribute value (quoted)':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before attribute name state. */
- $state = 'before attribute name';
-
- } elseif ($char === '/') {
- /* U+002F SOLIDUS (/)
- Switch to the self-closing start tag state. */
- $state = 'self-closing start tag';
-
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current tag token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif ($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-EOF-after-attribute-value'
- ));
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Parse error. Reconsume the character in the before attribute
- name state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-character-after-attribute-value'
- ));
- $this->stream->unget();
- $state = 'before attribute name';
- }
- break;
-
- case 'self-closing start tag':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Set the self-closing flag of the current tag token.
- Emit the current tag token. Switch to the data state. */
- // not sure if this is the name we want
- $this->token['self-closing'] = true;
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif ($char === false) {
- /* EOF
- Parse error. Reconsume the EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-eof-after-self-closing'
- ));
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Parse error. Reconsume the character in the before attribute name state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-character-after-self-closing'
- ));
- $this->stream->unget();
- $state = 'before attribute name';
- }
- break;
-
- case 'bogus comment':
- /* (This can only happen if the content model flag is set to the PCDATA state.) */
- /* Consume every character up to the first U+003E GREATER-THAN SIGN
- character (>) or the end of the file (EOF), whichever comes first. Emit
- a comment token whose data is the concatenation of all the characters
- starting from and including the character that caused the state machine
- to switch into the bogus comment state, up to and including the last
- consumed character before the U+003E character, if any, or up to the
- end of the file otherwise. (If the comment was started by the end of
- the file (EOF), the token is empty.) */
- $this->token['data'] .= (string) $this->stream->charsUntil('>');
- $this->stream->char();
-
- $this->emitToken($this->token);
-
- /* Switch to the data state. */
- $state = 'data';
- break;
-
- case 'markup declaration open':
- // Consume for below
- $hyphens = $this->stream->charsWhile('-', 2);
- if ($hyphens === '-') {
- $this->stream->unget();
- }
- if ($hyphens !== '--') {
- $alpha = $this->stream->charsWhile(self::ALPHA, 7);
- }
-
- /* If the next two characters are both U+002D HYPHEN-MINUS (-)
- characters, consume those two characters, create a comment token whose
- data is the empty string, and switch to the comment state. */
- if($hyphens === '--') {
- $state = 'comment start';
- $this->token = array(
- 'data' => '',
- 'type' => self::COMMENT
- );
-
- /* Otherwise if the next seven characters are a case-insensitive match
- for the word "DOCTYPE", then consume those characters and switch to the
- DOCTYPE state. */
- } elseif(strtoupper($alpha) === 'DOCTYPE') {
- $state = 'DOCTYPE';
-
- // XXX not implemented
- /* Otherwise, if the insertion mode is "in foreign content"
- and the current node is not an element in the HTML namespace
- and the next seven characters are an ASCII case-sensitive
- match for the string "[CDATA[" (the five uppercase letters
- "CDATA" with a U+005B LEFT SQUARE BRACKET character before
- and after), then consume those characters and switch to the
- CDATA section state (which is unrelated to the content model
- flag's CDATA state). */
-
- /* Otherwise, it is a parse error. Switch to the bogus comment state.
- The next character that is consumed, if any, is the first character
- that will be in the comment. */
- } else {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-dashes-or-doctype'
- ));
- $this->token = array(
- 'data' => (string) $alpha,
- 'type' => self::COMMENT
- );
- $state = 'bogus comment';
- }
- break;
-
- case 'comment start':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === '-') {
- /* U+002D HYPHEN-MINUS (-)
- Switch to the comment start dash state. */
- $state = 'comment start dash';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit the comment token. Switch to the
- data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'incorrect-comment'
- ));
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Emit the comment token. Reconsume the
- EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the input character to the comment token's
- data. Switch to the comment state. */
- $this->token['data'] .= $char;
- $state = 'comment';
- }
- break;
-
- case 'comment start dash':
- /* Consume the next input character: */
- $char = $this->stream->char();
- if ($char === '-') {
- /* U+002D HYPHEN-MINUS (-)
- Switch to the comment end state */
- $state = 'comment end';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Emit the comment token. Switch to the
- data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'incorrect-comment'
- ));
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* Parse error. Emit the comment token. Reconsume the
- EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- $this->token['data'] .= '-' . $char;
- $state = 'comment';
- }
- break;
-
- case 'comment':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === '-') {
- /* U+002D HYPHEN-MINUS (-)
- Switch to the comment end dash state */
- $state = 'comment end dash';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the comment token. Reconsume the EOF character
- in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append the input character to the comment token's data. Stay in
- the comment state. */
- $chars = $this->stream->charsUntil('-');
-
- $this->token['data'] .= $char . $chars;
- }
- break;
-
- case 'comment end dash':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === '-') {
- /* U+002D HYPHEN-MINUS (-)
- Switch to the comment end state */
- $state = 'comment end';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the comment token. Reconsume the EOF character
- in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment-end-dash'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append a U+002D HYPHEN-MINUS (-) character and the input
- character to the comment token's data. Switch to the comment state. */
- $this->token['data'] .= '-'.$char;
- $state = 'comment';
- }
- break;
-
- case 'comment end':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the comment token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif($char === '-') {
- /* U+002D HYPHEN-MINUS (-)
- Parse error. Append a U+002D HYPHEN-MINUS (-) character
- to the comment token's data. Stay in the comment end
- state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-dash-after-double-dash-in-comment'
- ));
- $this->token['data'] .= '-';
-
- } elseif($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-space-after-double-dash-in-comment'
- ));
- $this->token['data'] .= '--' . $char;
- $state = 'comment end space';
-
- } elseif($char === '!') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-bang-after-double-dash-in-comment'
- ));
- $state = 'comment end bang';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Emit the comment token. Reconsume the
- EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment-double-dash'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Parse error. Append two U+002D HYPHEN-MINUS (-)
- characters and the input character to the comment token's
- data. Switch to the comment state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-comment'
- ));
- $this->token['data'] .= '--'.$char;
- $state = 'comment';
- }
- break;
-
- case 'comment end bang':
- $char = $this->stream->char();
- if ($char === '>') {
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === "-") {
- $this->token['data'] .= '--!';
- $state = 'comment end dash';
- } elseif ($char === false) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-comment-end-bang'
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- $this->token['data'] .= '--!' . $char;
- $state = 'comment';
- }
- break;
-
- case 'comment end space':
- $char = $this->stream->char();
- if ($char === '>') {
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === '-') {
- $state = 'comment end dash';
- } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- $this->token['data'] .= $char;
- } elseif ($char === false) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-eof-in-comment-end-space',
- ));
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- $this->token['data'] .= $char;
- $state = 'comment';
- }
- break;
-
- case 'DOCTYPE':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the before DOCTYPE name state. */
- $state = 'before DOCTYPE name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Create a new DOCTYPE token. Set its
- force-quirks flag to on. Emit the token. Reconsume the
- EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'need-space-after-doctype-but-got-eof'
- ));
- $this->emitToken(array(
- 'name' => '',
- 'type' => self::DOCTYPE,
- 'force-quirks' => true,
- 'error' => true
- ));
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Parse error. Reconsume the current character in the
- before DOCTYPE name state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'need-space-after-doctype'
- ));
- $this->stream->unget();
- $state = 'before DOCTYPE name';
- }
- break;
-
- case 'before DOCTYPE name':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before DOCTYPE name state. */
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Create a new DOCTYPE token. Set its
- force-quirks flag to on. Emit the token. Switch to the
- data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-doctype-name-but-got-right-bracket'
- ));
- $this->emitToken(array(
- 'name' => '',
- 'type' => self::DOCTYPE,
- 'force-quirks' => true,
- 'error' => true
- ));
-
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Create a new DOCTYPE token. Set the token's name to the
- lowercase version of the input character (add 0x0020 to
- the character's code point). Switch to the DOCTYPE name
- state. */
- $this->token = array(
- 'name' => strtolower($char),
- 'type' => self::DOCTYPE,
- 'error' => true
- );
-
- $state = 'DOCTYPE name';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Create a new DOCTYPE token. Set its
- force-quirks flag to on. Emit the token. Reconsume the
- EOF character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-doctype-name-but-got-eof'
- ));
- $this->emitToken(array(
- 'name' => '',
- 'type' => self::DOCTYPE,
- 'force-quirks' => true,
- 'error' => true
- ));
-
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Create a new DOCTYPE token. Set the token's name to the
- current input character. Switch to the DOCTYPE name state. */
- $this->token = array(
- 'name' => $char,
- 'type' => self::DOCTYPE,
- 'error' => true
- );
-
- $state = 'DOCTYPE name';
- }
- break;
-
- case 'DOCTYPE name':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Switch to the after DOCTYPE name state. */
- $state = 'after DOCTYPE name';
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current DOCTYPE token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif('A' <= $char && $char <= 'Z') {
- /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
- Append the lowercase version of the input character
- (add 0x0020 to the character's code point) to the current
- DOCTYPE token's name. Stay in the DOCTYPE name state. */
- $this->token['name'] .= strtolower($char);
-
- } elseif($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype-name'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Append the current input character to the current
- DOCTYPE token's name. Stay in the DOCTYPE name state. */
- $this->token['name'] .= $char;
- }
-
- // XXX this is probably some sort of quirks mode designation,
- // check tree-builder to be sure. In general 'error' needs
- // to be specc'ified, this probably means removing it at the end
- $this->token['error'] = ($this->token['name'] === 'HTML')
- ? false
- : true;
- break;
-
- case 'after DOCTYPE name':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the after DOCTYPE name state. */
-
- } elseif($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current DOCTYPE token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else */
-
- $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
- if ($nextSix === 'PUBLIC') {
- /* If the next six characters are an ASCII
- case-insensitive match for the word "PUBLIC", then
- consume those characters and switch to the before
- DOCTYPE public identifier state. */
- $state = 'before DOCTYPE public identifier';
-
- } elseif ($nextSix === 'SYSTEM') {
- /* Otherwise, if the next six characters are an ASCII
- case-insensitive match for the word "SYSTEM", then
- consume those characters and switch to the before
- DOCTYPE system identifier state. */
- $state = 'before DOCTYPE system identifier';
-
- } else {
- /* Otherwise, this is a parse error. Set the DOCTYPE
- token's force-quirks flag to on. Switch to the bogus
- DOCTYPE state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-space-or-right-bracket-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->token['error'] = true;
- $state = 'bogus DOCTYPE';
- }
- }
- break;
-
- case 'before DOCTYPE public identifier':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before DOCTYPE public identifier state. */
- } elseif ($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Set the DOCTYPE token's public identifier to the empty
- string (not missing), then switch to the DOCTYPE public
- identifier (double-quoted) state. */
- $this->token['public'] = '';
- $state = 'DOCTYPE public identifier (double-quoted)';
- } elseif ($char === "'") {
- /* U+0027 APOSTROPHE (')
- Set the DOCTYPE token's public identifier to the empty
- string (not missing), then switch to the DOCTYPE public
- identifier (single-quoted) state. */
- $this->token['public'] = '';
- $state = 'DOCTYPE public identifier (single-quoted)';
- } elseif ($char === '>') {
- /* Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-end-of-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* Parse error. Set the DOCTYPE token's force-quirks
- flag to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Switch to the bogus DOCTYPE state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $state = 'bogus DOCTYPE';
- }
- break;
-
- case 'DOCTYPE public identifier (double-quoted)':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the after DOCTYPE public identifier state. */
- $state = 'after DOCTYPE public identifier';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-end-of-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current
- DOCTYPE token's public identifier. Stay in the DOCTYPE
- public identifier (double-quoted) state. */
- $this->token['public'] .= $char;
- }
- break;
-
- case 'DOCTYPE public identifier (single-quoted)':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === "'") {
- /* U+0027 APOSTROPHE (')
- Switch to the after DOCTYPE public identifier state. */
- $state = 'after DOCTYPE public identifier';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-end-of-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current
- DOCTYPE token's public identifier. Stay in the DOCTYPE
- public identifier (double-quoted) state. */
- $this->token['public'] .= $char;
- }
- break;
-
- case 'after DOCTYPE public identifier':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the after DOCTYPE public identifier state. */
- } elseif ($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Set the DOCTYPE token's system identifier to the
- empty string (not missing), then switch to the DOCTYPE
- system identifier (double-quoted) state. */
- $this->token['system'] = '';
- $state = 'DOCTYPE system identifier (double-quoted)';
- } elseif ($char === "'") {
- /* U+0027 APOSTROPHE (')
- Set the DOCTYPE token's system identifier to the
- empty string (not missing), then switch to the DOCTYPE
- system identifier (single-quoted) state. */
- $this->token['system'] = '';
- $state = 'DOCTYPE system identifier (single-quoted)';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current DOCTYPE token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* Parse error. Set the DOCTYPE token's force-quirks
- flag to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Switch to the bogus DOCTYPE state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $state = 'bogus DOCTYPE';
- }
- break;
-
- case 'before DOCTYPE system identifier':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the before DOCTYPE system identifier state. */
- } elseif ($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Set the DOCTYPE token's system identifier to the empty
- string (not missing), then switch to the DOCTYPE system
- identifier (double-quoted) state. */
- $this->token['system'] = '';
- $state = 'DOCTYPE system identifier (double-quoted)';
- } elseif ($char === "'") {
- /* U+0027 APOSTROPHE (')
- Set the DOCTYPE token's system identifier to the empty
- string (not missing), then switch to the DOCTYPE system
- identifier (single-quoted) state. */
- $this->token['system'] = '';
- $state = 'DOCTYPE system identifier (single-quoted)';
- } elseif ($char === '>') {
- /* Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* Parse error. Set the DOCTYPE token's force-quirks
- flag to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Switch to the bogus DOCTYPE state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $state = 'bogus DOCTYPE';
- }
- break;
-
- case 'DOCTYPE system identifier (double-quoted)':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === '"') {
- /* U+0022 QUOTATION MARK (")
- Switch to the after DOCTYPE system identifier state. */
- $state = 'after DOCTYPE system identifier';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-end-of-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current
- DOCTYPE token's system identifier. Stay in the DOCTYPE
- system identifier (double-quoted) state. */
- $this->token['system'] .= $char;
- }
- break;
-
- case 'DOCTYPE system identifier (single-quoted)':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === "'") {
- /* U+0027 APOSTROPHE (')
- Switch to the after DOCTYPE system identifier state. */
- $state = 'after DOCTYPE system identifier';
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Switch to the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-end-of-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* EOF
- Parse error. Set the DOCTYPE token's force-quirks flag
- to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Append the current input character to the current
- DOCTYPE token's system identifier. Stay in the DOCTYPE
- system identifier (double-quoted) state. */
- $this->token['system'] .= $char;
- }
- break;
-
- case 'after DOCTYPE system identifier':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
- /* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- Stay in the after DOCTYPE system identifier state. */
- } elseif ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the current DOCTYPE token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
- } elseif ($char === false) {
- /* Parse error. Set the DOCTYPE token's force-quirks
- flag to on. Emit that DOCTYPE token. Reconsume the EOF
- character in the data state. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'eof-in-doctype'
- ));
- $this->token['force-quirks'] = true;
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
- } else {
- /* Anything else
- Parse error. Switch to the bogus DOCTYPE state.
- (This does not set the DOCTYPE token's force-quirks
- flag to on.) */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'unexpected-char-in-doctype'
- ));
- $state = 'bogus DOCTYPE';
- }
- break;
-
- case 'bogus DOCTYPE':
- /* Consume the next input character: */
- $char = $this->stream->char();
-
- if ($char === '>') {
- /* U+003E GREATER-THAN SIGN (>)
- Emit the DOCTYPE token. Switch to the data state. */
- $this->emitToken($this->token);
- $state = 'data';
-
- } elseif($char === false) {
- /* EOF
- Emit the DOCTYPE token. Reconsume the EOF character in
- the data state. */
- $this->emitToken($this->token);
- $this->stream->unget();
- $state = 'data';
-
- } else {
- /* Anything else
- Stay in the bogus DOCTYPE state. */
- }
- break;
-
- // case 'cdataSection':
+ } elseif ($this->content_model === self::PCDATA) {
+ /* Otherwise, if the content model flag is set to the PCDATA
+ state [...]: */
+ $char = $this->stream->char();
+
+ if ('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Create a new end tag token, set its tag name to the lowercase version
+ of the input character (add 0x0020 to the character's code point), then
+ switch to the tag name state. (Don't emit the token yet; further details
+ will be filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => strtolower($char),
+ 'type' => self::ENDTAG
+ );
+
+ $state = 'tag name';
+
+ } elseif ('a' <= $char && $char <= 'z') {
+ /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+ Create a new end tag token, set its tag name to the
+ input character, then switch to the tag name state.
+ (Don't emit the token yet; further details will be
+ filled in before it is emitted.) */
+ $this->token = array(
+ 'name' => $char,
+ 'type' => self::ENDTAG
+ );
+
+ $state = 'tag name';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-closing-tag-but-got-right-bracket'
+ ));
+ $state = 'data';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
+ SOLIDUS character token. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-closing-tag-but-got-eof'
+ ));
+ $this->emitToken(array(
+ 'type' => self::CHARACTER,
+ 'data' => '</'
+ ));
+
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Parse error. Switch to the bogus comment state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-closing-tag-but-got-char'
+ ));
+ $this->token = array(
+ 'data' => $char,
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
}
- }
- }
+ }
+ break;
- /**
- * Returns a serialized representation of the tree.
- */
- public function save() {
- return $this->tree->save();
- }
+ case 'tag name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
- /**
- * Returns the input stream.
- */
- public function stream() {
- return $this->stream;
- }
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Append the lowercase version of the current input
+ character (add 0x0020 to the character's code point) to
+ the current tag token's tag name. Stay in the tag name state. */
+ $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
+
+ $this->token['name'] .= strtolower($char . $chars);
+ $state = 'tag name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-tag-name'
+ ));
- private function consumeCharacterReference($allowed = false, $inattr = false) {
- // This goes quite far against spec, and is far closer to the Python
- // impl., mainly because we don't do the large unconsuming the spec
- // requires.
+ $this->stream->unget();
+ $state = 'data';
- // All consumed characters.
- $chars = $this->stream->char();
+ } else {
+ /* Anything else
+ Append the current input character to the current tag token's tag name.
+ Stay in the tag name state. */
+ $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
- /* This section defines how to consume a character
- reference. This definition is used when parsing character
- references in text and in attributes.
+ $this->token['name'] .= $char . $chars;
+ $state = 'tag name';
+ }
+ break;
- The behavior depends on the identity of the next character
- (the one immediately after the U+0026 AMPERSAND character): */
+ case 'before attribute name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
- if (
- $chars[0] === "\x09" ||
- $chars[0] === "\x0A" ||
- $chars[0] === "\x0C" ||
- $chars[0] === "\x20" ||
- $chars[0] === '<' ||
- $chars[0] === '&' ||
- $chars === false ||
- $chars[0] === $allowed
- ) {
+ // this conditional is optimized, check bottom
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
/* U+0009 CHARACTER TABULATION
- U+000A LINE FEED (LF)
- U+000C FORM FEED (FF)
- U+0020 SPACE
- U+003C LESS-THAN SIGN
- U+0026 AMPERSAND
- EOF
- The additional allowed character, if there is one
- Not a character reference. No characters are consumed,
- and nothing is returned. (This is not an error, either.) */
- // We already consumed, so unconsume.
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Start a new attribute in the current tag token. Set that
+ attribute's name to the lowercase version of the current
+ input character (add 0x0020 to the character's code
+ point), and its value to the empty string. Switch to the
+ attribute name state.*/
+ $this->token['attr'][] = array(
+ 'name' => strtolower($char),
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-attribute-name-but-got-eof'
+ ));
+
$this->stream->unget();
- return '&';
- } elseif ($chars[0] === '#') {
- /* Consume the U+0023 NUMBER SIGN. */
- // Um, yeah, we already did that.
- /* The behavior further depends on the character after
- the U+0023 NUMBER SIGN: */
- $chars .= $this->stream->char();
- if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
- /* U+0078 LATIN SMALL LETTER X
- U+0058 LATIN CAPITAL LETTER X */
- /* Consume the X. */
- // Um, yeah, we already did that.
- /* Follow the steps below, but using the range of
- characters U+0030 DIGIT ZERO through to U+0039 DIGIT
- NINE, U+0061 LATIN SMALL LETTER A through to U+0066
- LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
- A, through to U+0046 LATIN CAPITAL LETTER F (in other
- words, 0123456789, ABCDEF, abcdef). */
- $char_class = self::HEX;
- /* When it comes to interpreting the
- number, interpret it as a hexadecimal number. */
- $hex = true;
- } else {
- /* Anything else */
- // Unconsume because we shouldn't have consumed this.
- $chars = $chars[0];
- $this->stream->unget();
- /* Follow the steps below, but using the range of
- characters U+0030 DIGIT ZERO through to U+0039 DIGIT
- NINE (i.e. just 0123456789). */
- $char_class = self::DIGIT;
- /* When it comes to interpreting the number,
- interpret it as a decimal number. */
- $hex = false;
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ U+003D EQUALS SIGN (=)
+ Parse error. Treat it as per the "anything else" entry
+ below. */
+ if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'invalid-character-in-attribute-name'
+ ));
}
- /* Consume as many characters as match the range of characters given above. */
- $consumed = $this->stream->charsWhile($char_class);
- if ($consumed === '' || $consumed === false) {
- /* If no characters match the range, then don't consume
- any characters (and unconsume the U+0023 NUMBER SIGN
- character and, if appropriate, the X character). This
- is a parse error; nothing is returned. */
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-numeric-entity'
- ));
- return '&' . $chars;
- } else {
- /* Otherwise, if the next character is a U+003B SEMICOLON,
- consume that too. If it isn't, there is a parse error. */
- if ($this->stream->char() !== ';') {
- $this->stream->unget();
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'numeric-entity-without-semicolon'
- ));
- }
-
- /* If one or more characters match the range, then take
- them all and interpret the string of characters as a number
- (either hexadecimal or decimal as appropriate). */
- $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
-
- /* If that number is one of the numbers in the first column
- of the following table, then this is a parse error. Find the
- row with that number in the first column, and return a
- character token for the Unicode character given in the
- second column of that row. */
- $new_codepoint = Data::getRealCodepoint($codepoint);
- if ($new_codepoint) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'illegal-windows-1252-entity'
- ));
- return Data::utf8chr($new_codepoint);
- } else {
- /* Otherwise, if the number is greater than 0x10FFFF, then
- * this is a parse error. Return a U+FFFD REPLACEMENT
- * CHARACTER. */
- if ($codepoint > 0x10FFFF) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'overlong-character-entity' // XXX probably not correct
- ));
- return "\xEF\xBF\xBD";
- }
- /* Otherwise, return a character token for the Unicode
- * character whose code point is that number. If the
- * number is in the range 0x0001 to 0x0008, 0x000E to
- * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
- * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
- * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
- * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
- * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
- * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
- * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
- * or 0x10FFFF, then this is a parse error. */
- // && has higher precedence than ||
- if (
- $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
- $codepoint === 0x000B ||
- $codepoint >= 0x000E && $codepoint <= 0x001F ||
- $codepoint >= 0x007F && $codepoint <= 0x009F ||
- $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
- $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
- ($codepoint & 0xFFFE) === 0xFFFE ||
- $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
- ) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'illegal-codepoint-for-numeric-entity'
- ));
- }
- return Data::utf8chr($codepoint);
- }
- }
+ /* Anything else
+ Start a new attribute in the current tag token. Set that attribute's
+ name to the current input character, and its value to the empty string.
+ Switch to the attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => $char,
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+ }
+ break;
+
+ case 'attribute name':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ // this conditional is optimized, check bottom
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the after attribute name state. */
+ $state = 'after attribute name';
+
+ } elseif($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif($char === '=') {
+ /* U+003D EQUALS SIGN (=)
+ Switch to the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Append the lowercase version of the current input
+ character (add 0x0020 to the character's code point) to
+ the current attribute's name. Stay in the attribute name
+ state. */
+ $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
+
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
+
+ $state = 'attribute name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-attribute-name'
+ ));
- } else {
- /* Anything else */
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ Parse error. Treat it as per the "anything else"
+ entry below. */
+ if($char === '"' || $char === "'" || $char === '<') {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'invalid-character-in-attribute-name'
+ ));
+ }
- /* Consume the maximum number of characters possible,
- with the consumed characters matching one of the
- identifiers in the first column of the named character
- references table (in a case-sensitive manner). */
- // What we actually do here is consume as much as we can while it
- // matches the start of one of the identifiers in the first column.
+ /* Anything else
+ Append the current input character to the current attribute's name.
+ Stay in the attribute name state. */
+ $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
+
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['name'] .= $char . $chars;
+
+ $state = 'attribute name';
+ }
+
+ /* When the user agent leaves the attribute name state
+ (and before emitting the tag token, if appropriate), the
+ complete attribute's name must be compared to the other
+ attributes on the same token; if there is already an
+ attribute on the token with the exact same name, then this
+ is a parse error and the new attribute must be dropped, along
+ with the value that gets associated with it (if any). */
+ // this might be implemented in the emitToken method
+ break;
+
+ case 'after attribute name':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ // this is an optimized conditional, check the bottom
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the after attribute name state. */
+ $state = 'after attribute name';
+
+ } elseif($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif($char === '=') {
+ /* U+003D EQUALS SIGN (=)
+ Switch to the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Start a new attribute in the current tag token. Set that
+ attribute's name to the lowercase version of the current
+ input character (add 0x0020 to the character's code
+ point), and its value to the empty string. Switch to the
+ attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => strtolower($char),
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-end-of-tag-but-got-eof'
+ ));
- $refs = Data::getNamedCharacterReferences();
-
- // Get the longest string which is the start of an identifier
- // ($chars) as well as the longest identifier which matches ($id)
- // and its codepoint ($codepoint).
- $codepoint = false;
- $char = $chars;
- while ($char !== false && isset($refs[$char])) {
- $refs = $refs[$char];
- if (isset($refs['codepoint'])) {
- $id = $chars;
- $codepoint = $refs['codepoint'];
- }
- $chars .= $char = $this->stream->char();
- }
-
- // Unconsume the one character we just took which caused the while
- // statement to fail. This could be anything and could cause state
- // changes (as if it matches the while loop it must be
- // alphanumeric so we can just concat it to whatever we get later).
$this->stream->unget();
- if ($char !== false) {
- $chars = substr($chars, 0, -1);
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN(<)
+ Parse error. Treat it as per the "anything else"
+ entry below. */
+ if($char === '"' || $char === "'" || $char === "<") {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'invalid-character-after-attribute-name'
+ ));
}
- /* If no match can be made, then this is a parse error.
- No characters are consumed, and nothing is returned. */
- if (!$codepoint) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'expected-named-entity'
- ));
- return '&' . $chars;
- }
+ /* Anything else
+ Start a new attribute in the current tag token. Set that attribute's
+ name to the current input character, and its value to the empty string.
+ Switch to the attribute name state. */
+ $this->token['attr'][] = array(
+ 'name' => $char,
+ 'value' => ''
+ );
+
+ $state = 'attribute name';
+ }
+ break;
+
+ case 'before attribute value':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ // this is an optimized conditional
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before attribute value state. */
+ $state = 'before attribute value';
+
+ } elseif($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the attribute value (double-quoted) state. */
+ $state = 'attribute value (double-quoted)';
+
+ } elseif($char === '&') {
+ /* U+0026 AMPERSAND (&)
+ Switch to the attribute value (unquoted) state and reconsume
+ this input character. */
+ $this->stream->unget();
+ $state = 'attribute value (unquoted)';
- /* If the last character matched is not a U+003B SEMICOLON
- (;), there is a parse error. */
- $semicolon = true;
- if (substr($id, -1) !== ';') {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'named-entity-without-semicolon'
- ));
- $semicolon = false;
- }
+ } elseif($char === '\'') {
+ /* U+0027 APOSTROPHE (')
+ Switch to the attribute value (single-quoted) state. */
+ $state = 'attribute value (single-quoted)';
- /* If the character reference is being consumed as part of
- an attribute, and the last character matched is not a
- U+003B SEMICOLON (;), and the next character is in the
- range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
- LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
- or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
- then, for historical reasons, all the characters that were
- matched after the U+0026 AMPERSAND (&) must be unconsumed,
- and nothing is returned. */
- if ($inattr && !$semicolon) {
- // The next character is either the next character in $chars or in the stream.
- if (strlen($chars) > strlen($id)) {
- $next = substr($chars, strlen($id), 1);
- } else {
- $next = $this->stream->char();
- $this->stream->unget();
- }
- if (
- '0' <= $next && $next <= '9' ||
- 'A' <= $next && $next <= 'Z' ||
- 'a' <= $next && $next <= 'z'
- ) {
- return '&' . $chars;
- }
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the current tag token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-attribute-value-but-got-right-bracket'
+ ));
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-attribute-value-but-got-eof'
+ ));
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+003D EQUALS SIGN (=)
+ * U+003C LESS-THAN SIGN (<)
+ Parse error. Treat it as per the "anything else" entry below. */
+ if($char === '=' || $char === '<') {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'equals-in-unquoted-attribute-value'
+ ));
}
- /* Otherwise, return a character token for the character
- corresponding to the character reference name (as given
- by the second column of the named character references table). */
- return Data::utf8chr($codepoint) . substr($chars, strlen($id));
- }
- }
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Switch to the attribute value (unquoted) state. */
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['value'] .= $char;
+
+ $state = 'attribute value (unquoted)';
+ }
+ break;
+
+ case 'attribute value (double-quoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ if($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the after attribute value (quoted) state. */
+ $state = 'after attribute value (quoted)';
+
+ } elseif($char === '&') {
+ /* U+0026 AMPERSAND (&)
+ Switch to the character reference in attribute value
+ state, with the additional allowed character
+ being U+0022 QUOTATION MARK ("). */
+ $this->characterReferenceInAttributeValue('"');
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-attribute-value-double-quote'
+ ));
+
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (double-quoted) state. */
+ $chars = $this->stream->charsUntil('"&');
+
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
+
+ $state = 'attribute value (double-quoted)';
+ }
+ break;
+
+ case 'attribute value (single-quoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
+
+ if($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Switch to the after attribute value (quoted) state. */
+ $state = 'after attribute value (quoted)';
+
+ } elseif($char === '&') {
+ /* U+0026 AMPERSAND (&)
+ Switch to the entity in attribute value state. */
+ $this->characterReferenceInAttributeValue("'");
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-attribute-value-single-quote'
+ ));
- private function characterReferenceInAttributeValue($allowed = false) {
- /* Attempt to consume a character reference. */
- $entity = $this->consumeCharacterReference($allowed, true);
+ $this->stream->unget();
+ $state = 'data';
- /* If nothing is returned, append a U+0026 AMPERSAND
- character to the current attribute's value.
+ } else {
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (single-quoted) state. */
+ $chars = $this->stream->charsUntil("'&");
- Otherwise, append the returned character token to the
- current attribute's value. */
- $char = (!$entity)
- ? '&'
- : $entity;
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
- $last = count($this->token['attr']) - 1;
- $this->token['attr'][$last]['value'] .= $char;
+ $state = 'attribute value (single-quoted)';
+ }
+ break;
- /* Finally, switch back to the attribute value state that you
- were in when were switched into this state. */
- }
+ case 'attribute value (unquoted)':
+ // Consume the next input character:
+ $char = $this->stream->char();
- /**
- * Emits a token, passing it on to the tree builder.
- */
- protected function emitToken($token, $checkStream = true, $dry = false) {
- if ($checkStream) {
- // Emit errors from input stream.
- while ($this->stream->errors) {
- $this->emitToken(array_shift($this->stream->errors), false);
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif($char === '&') {
+ /* U+0026 AMPERSAND (&)
+ Switch to the entity in attribute value state, with the
+ additional allowed character being U+003E
+ GREATER-THAN SIGN (>). */
+ $this->characterReferenceInAttributeValue('>');
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-attribute-value-no-quotes'
+ ));
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* U+0022 QUOTATION MARK (")
+ U+0027 APOSTROPHE (')
+ U+003C LESS-THAN SIGN (<)
+ U+003D EQUALS SIGN (=)
+ Parse error. Treat it as per the "anything else"
+ entry below. */
+ if($char === '"' || $char === "'" || $char === '=' || $char == '<') {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-character-in-unquoted-attribute-value'
+ ));
}
- }
- if($token['type'] === self::ENDTAG && !empty($token['attr'])) {
- for ($i = 0; $i < count($token['attr']); $i++) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'attributes-in-end-tag'
- ));
+
+ /* Anything else
+ Append the current input character to the current attribute's value.
+ Stay in the attribute value (unquoted) state. */
+ $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
+
+ $last = count($this->token['attr']) - 1;
+ $this->token['attr'][$last]['value'] .= $char . $chars;
+
+ $state = 'attribute value (unquoted)';
+ }
+ break;
+
+ case 'after attribute value (quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before attribute name state. */
+ $state = 'before attribute name';
+
+ } elseif ($char === '/') {
+ /* U+002F SOLIDUS (/)
+ Switch to the self-closing start tag state. */
+ $state = 'self-closing start tag';
+
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current tag token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-EOF-after-attribute-value'
+ ));
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Reconsume the character in the before attribute
+ name state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-character-after-attribute-value'
+ ));
+ $this->stream->unget();
+ $state = 'before attribute name';
+ }
+ break;
+
+ case 'self-closing start tag':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Set the self-closing flag of the current tag token.
+ Emit the current tag token. Switch to the data state. */
+ // not sure if this is the name we want
+ $this->token['self-closing'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Reconsume the EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-eof-after-self-closing'
+ ));
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Reconsume the character in the before attribute name state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-character-after-self-closing'
+ ));
+ $this->stream->unget();
+ $state = 'before attribute name';
+ }
+ break;
+
+ case 'bogus comment':
+ /* (This can only happen if the content model flag is set to the PCDATA state.) */
+ /* Consume every character up to the first U+003E GREATER-THAN SIGN
+ character (>) or the end of the file (EOF), whichever comes first. Emit
+ a comment token whose data is the concatenation of all the characters
+ starting from and including the character that caused the state machine
+ to switch into the bogus comment state, up to and including the last
+ consumed character before the U+003E character, if any, or up to the
+ end of the file otherwise. (If the comment was started by the end of
+ the file (EOF), the token is empty.) */
+ $this->token['data'] .= (string) $this->stream->charsUntil('>');
+ $this->stream->char();
+
+ $this->emitToken($this->token);
+
+ /* Switch to the data state. */
+ $state = 'data';
+ break;
+
+ case 'markup declaration open':
+ // Consume for below
+ $hyphens = $this->stream->charsWhile('-', 2);
+ if ($hyphens === '-') {
+ $this->stream->unget();
+ }
+ if ($hyphens !== '--') {
+ $alpha = $this->stream->charsWhile(self::ALPHA, 7);
+ }
+
+ /* If the next two characters are both U+002D HYPHEN-MINUS (-)
+ characters, consume those two characters, create a comment token whose
+ data is the empty string, and switch to the comment state. */
+ if($hyphens === '--') {
+ $state = 'comment start';
+ $this->token = array(
+ 'data' => '',
+ 'type' => self::COMMENT
+ );
+
+ /* Otherwise if the next seven characters are a case-insensitive match
+ for the word "DOCTYPE", then consume those characters and switch to the
+ DOCTYPE state. */
+ } elseif(strtoupper($alpha) === 'DOCTYPE') {
+ $state = 'DOCTYPE';
+
+ // XXX not implemented
+ /* Otherwise, if the insertion mode is "in foreign content"
+ and the current node is not an element in the HTML namespace
+ and the next seven characters are an ASCII case-sensitive
+ match for the string "[CDATA[" (the five uppercase letters
+ "CDATA" with a U+005B LEFT SQUARE BRACKET character before
+ and after), then consume those characters and switch to the
+ CDATA section state (which is unrelated to the content model
+ flag's CDATA state). */
+
+ /* Otherwise, it is a parse error. Switch to the bogus comment state.
+ The next character that is consumed, if any, is the first character
+ that will be in the comment. */
+ } else {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-dashes-or-doctype'
+ ));
+ $this->token = array(
+ 'data' => (string) $alpha,
+ 'type' => self::COMMENT
+ );
+ $state = 'bogus comment';
+ }
+ break;
+
+ case 'comment start':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === '-') {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment start dash state. */
+ $state = 'comment start dash';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the comment token. Switch to the
+ data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'incorrect-comment'
+ ));
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the input character to the comment token's
+ data. Switch to the comment state. */
+ $this->token['data'] .= $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment start dash':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+ if ($char === '-') {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end state */
+ $state = 'comment end';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Emit the comment token. Switch to the
+ data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'incorrect-comment'
+ ));
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ $this->token['data'] .= '-' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === '-') {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end dash state */
+ $state = 'comment end dash';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the EOF character
+ in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the input character to the comment token's data. Stay in
+ the comment state. */
+ $chars = $this->stream->charsUntil('-');
+
+ $this->token['data'] .= $char . $chars;
+ }
+ break;
+
+ case 'comment end dash':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === '-') {
+ /* U+002D HYPHEN-MINUS (-)
+ Switch to the comment end state */
+ $state = 'comment end';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the EOF character
+ in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment-end-dash'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append a U+002D HYPHEN-MINUS (-) character and the input
+ character to the comment token's data. Switch to the comment state. */
+ $this->token['data'] .= '-'.$char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end':
+ /* Comment end state: entered after two consecutive U+002D
+ HYPHEN-MINUS (-) characters inside a comment.
+ Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the comment token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif($char === '-') {
+ /* U+002D HYPHEN-MINUS (-)
+ Parse error. Append a U+002D HYPHEN-MINUS (-) character
+ to the comment token's data. Stay in the comment end
+ state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-dash-after-double-dash-in-comment'
+ ));
+ $this->token['data'] .= '-';
+
+ } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Parse error. Append two U+002D HYPHEN-MINUS (-) characters
+ and the input character to the comment token's data. Switch
+ to the comment end space state.
+ (The form feed test is "\x0c"; "\x0a" is just another
+ spelling of "\n" and would leave FF unhandled here.) */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-space-after-double-dash-in-comment'
+ ));
+ $this->token['data'] .= '--' . $char;
+ $state = 'comment end space';
+
+ } elseif($char === '!') {
+ /* U+0021 EXCLAMATION MARK (!)
+ Parse error. Switch to the comment end bang state. Nothing is
+ appended here; the comment end bang state adds "--!" itself
+ when the comment turns out not to end. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-bang-after-double-dash-in-comment'
+ ));
+ $state = 'comment end bang';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Emit the comment token. Reconsume the
+ EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment-double-dash'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Append two U+002D HYPHEN-MINUS (-)
+ characters and the input character to the comment token's
+ data. Switch to the comment state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-comment'
+ ));
+ $this->token['data'] .= '--'.$char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end bang':
+ $char = $this->stream->char();
+ if ($char === '>') {
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === "-") {
+ $this->token['data'] .= '--!';
+ $state = 'comment end dash';
+ } elseif ($char === false) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-comment-end-bang'
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ $this->token['data'] .= '--!' . $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'comment end space':
+ $char = $this->stream->char();
+ if ($char === '>') {
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === '-') {
+ $state = 'comment end dash';
+ } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ $this->token['data'] .= $char;
+ } elseif ($char === false) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-eof-in-comment-end-space',
+ ));
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ $this->token['data'] .= $char;
+ $state = 'comment';
+ }
+ break;
+
+ case 'DOCTYPE':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the before DOCTYPE name state. */
+ $state = 'before DOCTYPE name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Create a new DOCTYPE token. Set its
+ force-quirks flag to on. Emit the token. Reconsume the
+ EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'need-space-after-doctype-but-got-eof'
+ ));
+ $this->emitToken(array(
+ 'name' => '',
+ 'type' => self::DOCTYPE,
+ 'force-quirks' => true,
+ 'error' => true
+ ));
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Parse error. Reconsume the current character in the
+ before DOCTYPE name state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'need-space-after-doctype'
+ ));
+ $this->stream->unget();
+ $state = 'before DOCTYPE name';
+ }
+ break;
+
+ case 'before DOCTYPE name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before DOCTYPE name state. */
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Create a new DOCTYPE token. Set its
+ force-quirks flag to on. Emit the token. Switch to the
+ data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-doctype-name-but-got-right-bracket'
+ ));
+ $this->emitToken(array(
+ 'name' => '',
+ 'type' => self::DOCTYPE,
+ 'force-quirks' => true,
+ 'error' => true
+ ));
+
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Create a new DOCTYPE token. Set the token's name to the
+ lowercase version of the input character (add 0x0020 to
+ the character's code point). Switch to the DOCTYPE name
+ state. */
+ $this->token = array(
+ 'name' => strtolower($char),
+ 'type' => self::DOCTYPE,
+ 'error' => true
+ );
+
+ $state = 'DOCTYPE name';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Create a new DOCTYPE token. Set its
+ force-quirks flag to on. Emit the token. Reconsume the
+ EOF character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-doctype-name-but-got-eof'
+ ));
+ $this->emitToken(array(
+ 'name' => '',
+ 'type' => self::DOCTYPE,
+ 'force-quirks' => true,
+ 'error' => true
+ ));
+
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Create a new DOCTYPE token. Set the token's name to the
+ current input character. Switch to the DOCTYPE name state. */
+ $this->token = array(
+ 'name' => $char,
+ 'type' => self::DOCTYPE,
+ 'error' => true
+ );
+
+ $state = 'DOCTYPE name';
+ }
+ break;
+
+ case 'DOCTYPE name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Switch to the after DOCTYPE name state. */
+ $state = 'after DOCTYPE name';
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current DOCTYPE token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif('A' <= $char && $char <= 'Z') {
+ /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+ Append the lowercase version of the input character
+ (add 0x0020 to the character's code point) to the current
+ DOCTYPE token's name. Stay in the DOCTYPE name state. */
+ $this->token['name'] .= strtolower($char);
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype-name'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Append the current input character to the current
+ DOCTYPE token's name. Stay in the DOCTYPE name state. */
+ $this->token['name'] .= $char;
+ }
+
+ // XXX this is probably some sort of quirks mode designation,
+ // check tree-builder to be sure. In general 'error' needs
+ // to be specc'ified, this probably means removing it at the end
+ $this->token['error'] = ($this->token['name'] === 'HTML')
+ ? false
+ : true;
+ break;
+
+ case 'after DOCTYPE name':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the after DOCTYPE name state. */
+
+ } elseif($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current DOCTYPE token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else */
+
+ $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
+ if ($nextSix === 'PUBLIC') {
+ /* If the next six characters are an ASCII
+ case-insensitive match for the word "PUBLIC", then
+ consume those characters and switch to the before
+ DOCTYPE public identifier state. */
+ $state = 'before DOCTYPE public identifier';
+
+ } elseif ($nextSix === 'SYSTEM') {
+ /* Otherwise, if the next six characters are an ASCII
+ case-insensitive match for the word "SYSTEM", then
+ consume those characters and switch to the before
+ DOCTYPE system identifier state. */
+ $state = 'before DOCTYPE system identifier';
+
+ } else {
+ /* Otherwise, this is a parse error. Set the DOCTYPE
+ token's force-quirks flag to on. Switch to the bogus
+ DOCTYPE state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-space-or-right-bracket-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->token['error'] = true;
+ $state = 'bogus DOCTYPE';
}
+ }
+ break;
+
+ case 'before DOCTYPE public identifier':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before DOCTYPE public identifier state. */
+ } elseif ($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Set the DOCTYPE token's public identifier to the empty
+ string (not missing), then switch to the DOCTYPE public
+ identifier (double-quoted) state. */
+ $this->token['public'] = '';
+ $state = 'DOCTYPE public identifier (double-quoted)';
+ } elseif ($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Set the DOCTYPE token's public identifier to the empty
+ string (not missing), then switch to the DOCTYPE public
+ identifier (single-quoted) state. */
+ $this->token['public'] = '';
+ $state = 'DOCTYPE public identifier (single-quoted)';
+ } elseif ($char === '>') {
+ /* Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-end-of-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* Parse error. Set the DOCTYPE token's force-quirks
+ flag to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Switch to the bogus DOCTYPE state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $state = 'bogus DOCTYPE';
+ }
+ break;
+
+ case 'DOCTYPE public identifier (double-quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the after DOCTYPE public identifier state. */
+ $state = 'after DOCTYPE public identifier';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-end-of-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the current input character to the current
+ DOCTYPE token's public identifier. Stay in the DOCTYPE
+ public identifier (double-quoted) state. */
+ $this->token['public'] .= $char;
+ }
+ break;
+
+ case 'DOCTYPE public identifier (single-quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Switch to the after DOCTYPE public identifier state. */
+ $state = 'after DOCTYPE public identifier';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-end-of-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the current input character to the current
+ DOCTYPE token's public identifier. Stay in the DOCTYPE
+ public identifier (single-quoted) state. */
+ $this->token['public'] .= $char;
+ }
+ break;
+
+ case 'after DOCTYPE public identifier':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the after DOCTYPE public identifier state. */
+ } elseif ($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Set the DOCTYPE token's system identifier to the
+ empty string (not missing), then switch to the DOCTYPE
+ system identifier (double-quoted) state. */
+ $this->token['system'] = '';
+ $state = 'DOCTYPE system identifier (double-quoted)';
+ } elseif ($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Set the DOCTYPE token's system identifier to the
+ empty string (not missing), then switch to the DOCTYPE
+ system identifier (single-quoted) state. */
+ $this->token['system'] = '';
+ $state = 'DOCTYPE system identifier (single-quoted)';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current DOCTYPE token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* Parse error. Set the DOCTYPE token's force-quirks
+ flag to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Switch to the bogus DOCTYPE state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $state = 'bogus DOCTYPE';
+ }
+ break;
+
+ case 'before DOCTYPE system identifier':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the before DOCTYPE system identifier state. */
+ } elseif ($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Set the DOCTYPE token's system identifier to the empty
+ string (not missing), then switch to the DOCTYPE system
+ identifier (double-quoted) state. */
+ $this->token['system'] = '';
+ $state = 'DOCTYPE system identifier (double-quoted)';
+ } elseif ($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Set the DOCTYPE token's system identifier to the empty
+ string (not missing), then switch to the DOCTYPE system
+ identifier (single-quoted) state. */
+ $this->token['system'] = '';
+ $state = 'DOCTYPE system identifier (single-quoted)';
+ } elseif ($char === '>') {
+ /* Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* Parse error. Set the DOCTYPE token's force-quirks
+ flag to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Switch to the bogus DOCTYPE state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $state = 'bogus DOCTYPE';
+ }
+ break;
+
+ case 'DOCTYPE system identifier (double-quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === '"') {
+ /* U+0022 QUOTATION MARK (")
+ Switch to the after DOCTYPE system identifier state. */
+ $state = 'after DOCTYPE system identifier';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-end-of-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the current input character to the current
+ DOCTYPE token's system identifier. Stay in the DOCTYPE
+ system identifier (double-quoted) state. */
+ $this->token['system'] .= $char;
+ }
+ break;
+
+ case 'DOCTYPE system identifier (single-quoted)':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === "'") {
+ /* U+0027 APOSTROPHE (')
+ Switch to the after DOCTYPE system identifier state. */
+ $state = 'after DOCTYPE system identifier';
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Switch to the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-end-of-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* EOF
+ Parse error. Set the DOCTYPE token's force-quirks flag
+ to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Append the current input character to the current
+ DOCTYPE token's system identifier. Stay in the DOCTYPE
+ system identifier (single-quoted) state. */
+ $this->token['system'] .= $char;
+ }
+ break;
+
+ case 'after DOCTYPE system identifier':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ Stay in the after DOCTYPE system identifier state. */
+ } elseif ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the current DOCTYPE token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+ } elseif ($char === false) {
+ /* Parse error. Set the DOCTYPE token's force-quirks
+ flag to on. Emit that DOCTYPE token. Reconsume the EOF
+ character in the data state. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'eof-in-doctype'
+ ));
+ $this->token['force-quirks'] = true;
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+ } else {
+ /* Anything else
+ Parse error. Switch to the bogus DOCTYPE state.
+ (This does not set the DOCTYPE token's force-quirks
+ flag to on.) */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'unexpected-char-in-doctype'
+ ));
+ $state = 'bogus DOCTYPE';
+ }
+ break;
+
+ case 'bogus DOCTYPE':
+ /* Consume the next input character: */
+ $char = $this->stream->char();
+
+ if ($char === '>') {
+ /* U+003E GREATER-THAN SIGN (>)
+ Emit the DOCTYPE token. Switch to the data state. */
+ $this->emitToken($this->token);
+ $state = 'data';
+
+ } elseif($char === false) {
+ /* EOF
+ Emit the DOCTYPE token. Reconsume the EOF character in
+ the data state. */
+ $this->emitToken($this->token);
+ $this->stream->unget();
+ $state = 'data';
+
+ } else {
+ /* Anything else
+ Stay in the bogus DOCTYPE state. */
+ }
+ break;
+
+ // case 'cdataSection':
+
+ }
+ }
+ }
+
+ /**
+ * Serializes the tree by delegating to the tree builder.
+ *
+ * @return Whatever the tree builder's save() method returns.
+ */
+ public function save() {
+   $serialized = $this->tree->save();
+   return $serialized;
+ }
+
+ /**
+ * Accessor for the input stream this tokenizer reads from.
+ *
+ * @return The value currently held in $this->stream.
+ */
+ public function stream() {
+   $input = $this->stream;
+   return $input;
+ }
+
+ /**
+ * Attempts to consume a character reference from the input stream. The
+ * character read first is the one immediately after the U+0026 AMPERSAND.
+ *
+ * Parse errors encountered on the way are reported through emitToken().
+ *
+ * @param $allowed Additional allowed character: if the first character read
+ * is identical to it, nothing is consumed and '&' is returned. Defaults to
+ * false (no additional allowed character).
+ * @param $inattr True when the reference is consumed as part of an attribute
+ * value; enables the historical rule that a named reference without a
+ * semicolon followed by an ASCII alphanumeric is left undecoded.
+ * @return A string: either the decoded character (as built by
+ * Data::utf8chr(), possibly followed by extra consumed characters), or a
+ * literal '&' followed by the characters that were consumed without
+ * forming a reference.
+ */
+ private function consumeCharacterReference($allowed = false, $inattr = false) {
+ // This goes quite far against spec, and is far closer to the Python
+ // impl., mainly because we don't do the large unconsuming the spec
+ // requires.
+
+ // All consumed characters.
+ $chars = $this->stream->char();
+
+ /* This section defines how to consume a character
+ reference. This definition is used when parsing character
+ references in text and in attributes.
+
+ The behavior depends on the identity of the next character
+ (the one immediately after the U+0026 AMPERSAND character): */
+
+ // The EOF test comes first so that $chars[0] is never evaluated on false
+ // (indexing a boolean raises a notice/warning on PHP 7.4 and later).
+ if (
+ $chars === false ||
+ $chars[0] === "\x09" ||
+ $chars[0] === "\x0A" ||
+ $chars[0] === "\x0C" ||
+ $chars[0] === "\x20" ||
+ $chars[0] === '<' ||
+ $chars[0] === '&' ||
+ $chars[0] === $allowed
+ ) {
+ /* U+0009 CHARACTER TABULATION
+ U+000A LINE FEED (LF)
+ U+000C FORM FEED (FF)
+ U+0020 SPACE
+ U+003C LESS-THAN SIGN
+ U+0026 AMPERSAND
+ EOF
+ The additional allowed character, if there is one
+ Not a character reference. No characters are consumed,
+ and nothing is returned. (This is not an error, either.) */
+ // We already consumed, so unconsume.
+ $this->stream->unget();
+ return '&';
+ } elseif ($chars[0] === '#') {
+ /* Consume the U+0023 NUMBER SIGN. */
+ // Um, yeah, we already did that.
+ /* The behavior further depends on the character after
+ the U+0023 NUMBER SIGN: */
+ $chars .= $this->stream->char();
+ if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
+ /* U+0078 LATIN SMALL LETTER X
+ U+0058 LATIN CAPITAL LETTER X */
+ /* Consume the X. */
+ // Um, yeah, we already did that.
+ /* Follow the steps below, but using the range of
+ characters U+0030 DIGIT ZERO through to U+0039 DIGIT
+ NINE, U+0061 LATIN SMALL LETTER A through to U+0066
+ LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
+ A, through to U+0046 LATIN CAPITAL LETTER F (in other
+ words, 0123456789, ABCDEF, abcdef). */
+ $char_class = self::HEX;
+ /* When it comes to interpreting the
+ number, interpret it as a hexadecimal number. */
+ $hex = true;
+ } else {
+ /* Anything else */
+ // Unconsume because we shouldn't have consumed this.
+ $chars = $chars[0];
+ $this->stream->unget();
+ /* Follow the steps below, but using the range of
+ characters U+0030 DIGIT ZERO through to U+0039 DIGIT
+ NINE (i.e. just 0123456789). */
+ $char_class = self::DIGIT;
+ /* When it comes to interpreting the number,
+ interpret it as a decimal number. */
+ $hex = false;
+ }
+
+ /* Consume as many characters as match the range of characters given above. */
+ $consumed = $this->stream->charsWhile($char_class);
+ if ($consumed === '' || $consumed === false) {
+ /* If no characters match the range, then don't consume
+ any characters (and unconsume the U+0023 NUMBER SIGN
+ character and, if appropriate, the X character). This
+ is a parse error; nothing is returned. */
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-numeric-entity'
+ ));
+ return '&' . $chars;
+ } else {
+ /* Otherwise, if the next character is a U+003B SEMICOLON,
+ consume that too. If it isn't, there is a parse error. */
+ if ($this->stream->char() !== ';') {
+ $this->stream->unget();
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'numeric-entity-without-semicolon'
+ ));
 }
- if($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
+
+ /* If one or more characters match the range, then take
+ them all and interpret the string of characters as a number
+ (either hexadecimal or decimal as appropriate). */
+ $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
+
+ /* If that number is one of the numbers in the first column
+ of the following table, then this is a parse error. Find the
+ row with that number in the first column, and return a
+ character token for the Unicode character given in the
+ second column of that row. */
+ $new_codepoint = Data::getRealCodepoint($codepoint);
+ if ($new_codepoint) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'illegal-windows-1252-entity'
+ ));
+ return Data::utf8chr($new_codepoint);
+ } else {
+ /* Otherwise, if the number is greater than 0x10FFFF, then
+ * this is a parse error. Return a U+FFFD REPLACEMENT
+ * CHARACTER. */
+ if ($codepoint > 0x10FFFF) {
 $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'self-closing-flag-on-end-tag',
+ 'type' => self::PARSEERROR,
+ 'data' => 'overlong-character-entity' // XXX probably not correct
+ ));
+ return "\xEF\xBF\xBD";
+ }
+ /* Otherwise, return a character token for the Unicode
+ * character whose code point is that number. If the
+ * number is in the range 0x0001 to 0x0008, 0x000E to
+ * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
+ * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
+ * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
+ * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
+ * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
+ * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+ * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
+ * or 0x10FFFF, then this is a parse error. */
+ // && has higher precedence than ||
+ if (
+ $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
+ $codepoint === 0x000B ||
+ $codepoint >= 0x000E && $codepoint <= 0x001F ||
+ $codepoint >= 0x007F && $codepoint <= 0x009F ||
+ $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
+ $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
+ ($codepoint & 0xFFFE) === 0xFFFE ||
+ $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
+ ) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'illegal-codepoint-for-numeric-entity'
 ));
+ }
+ return Data::utf8chr($codepoint);
 }
- if($token['type'] === self::STARTTAG) {
- // This could be changed to actually pass the tree-builder a hash
- $hash = array();
- foreach ($token['attr'] as $keypair) {
- if (isset($hash[$keypair['name']])) {
- $this->emitToken(array(
- 'type' => self::PARSEERROR,
- 'data' => 'duplicate-attribute',
- ));
- } else {
- $hash[$keypair['name']] = $keypair['value'];
- }
- }
+ }
+
+ } else {
+ /* Anything else */
+
+ /* Consume the maximum number of characters possible,
+ with the consumed characters matching one of the
+ identifiers in the first column of the named character
+ references table (in a case-sensitive manner). */
+ // What we actually do here is consume as much as we can while it
+ // matches the start of one of the identifiers in the first column.
+
+ $refs = Data::getNamedCharacterReferences();
+
+ // Get the longest string which is the start of an identifier
+ // ($chars) as well as the longest identifier which matches ($id)
+ // and its codepoint ($codepoint).
+ $codepoint = false;
+ $char = $chars;
+ while ($char !== false && isset($refs[$char])) {
+ $refs = $refs[$char];
+ if (isset($refs['codepoint'])) {
+ $id = $chars;
+ $codepoint = $refs['codepoint'];
+ }
+ $chars .= $char = $this->stream->char();
+ }
+
+ // Unconsume the one character we just took which caused the while
+ // statement to fail. This could be anything and could cause state
+ // changes (as if it matches the while loop it must be
+ // alphanumeric so we can just concat it to whatever we get later).
+ $this->stream->unget();
+ if ($char !== false) {
+ $chars = substr($chars, 0, -1);
+ }
+
+ /* If no match can be made, then this is a parse error.
+ No characters are consumed, and nothing is returned. */
+ if (!$codepoint) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'expected-named-entity'
+ ));
+ return '&' . $chars;
+ }
+
+ /* If the last character matched is not a U+003B SEMICOLON
+ (;), there is a parse error. */
+ $semicolon = true;
+ if (substr($id, -1) !== ';') {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'named-entity-without-semicolon'
+ ));
+ $semicolon = false;
+ }
+
+ /* If the character reference is being consumed as part of
+ an attribute, and the last character matched is not a
+ U+003B SEMICOLON (;), and the next character is in the
+ range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
+ LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
+ or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
+ then, for historical reasons, all the characters that were
+ matched after the U+0026 AMPERSAND (&) must be unconsumed,
+ and nothing is returned. */
+ if ($inattr && !$semicolon) {
+ // The next character is either the next character in $chars or in the stream.
+ if (strlen($chars) > strlen($id)) {
+ $next = substr($chars, strlen($id), 1);
+ } else {
+ $next = $this->stream->char();
+ $this->stream->unget();
+ }
+ // EOF (false) must not count as alphanumeric: when a string is
+ // compared with a boolean both sides are converted to booleans, so
+ // without the explicit guard the digit range test passes for false.
+ if (
+ $next !== false && (
+ '0' <= $next && $next <= '9' ||
+ 'A' <= $next && $next <= 'Z' ||
+ 'a' <= $next && $next <= 'z'
+ )
+ ) {
+ return '&' . $chars;
 }
+ }
- if(!$dry) {
- // the current structure of attributes is not a terribly good one
- $this->tree->emitToken($token);
+ /* Otherwise, return a character token for the character
+ corresponding to the character reference name (as given
+ by the second column of the named character references table). */
+ return Data::utf8chr($codepoint) . substr($chars, strlen($id));
+ }
+ }
+
+ /**
+ * Consumes a character reference inside an attribute value and appends the
+ * result to the value of the last attribute of the current token.
+ *
+ * @param $allowed Additional allowed character, passed on unchanged to
+ *   consumeCharacterReference(); false when there is none.
+ */
+ private function characterReferenceInAttributeValue($allowed = false) {
+   /* Attempt to consume a character reference. */
+   $entity = $this->consumeCharacterReference($allowed, true);
+
+   /* If nothing is returned, append a U+0026 AMPERSAND
+   character to the current attribute's value.
+
+   Otherwise, append the returned character token to the
+   current attribute's value. */
+   // Only a missing or empty result means "nothing returned". A plain
+   // truthiness test would also reject the string '0' (e.g. decoded from
+   // &#48;) and wrongly append '&' in its place.
+   $char = ($entity === false || $entity === null || $entity === '')
+     ? '&'
+     : $entity;
+
+   $last = count($this->token['attr']) - 1;
+   $this->token['attr'][$last]['value'] .= $char;
+
+   /* Finally, switch back to the attribute value state that you
+   were in when you were switched into this state. */
+ }
+
+ /**
+ * Emits a token, passing it on to the tree builder.
+ *
+ * Before the token is handed over, parse-error tokens are emitted
+ * (recursively, through this same method) for: errors queued on the input
+ * stream, attributes present on an end tag (one error per attribute), a
+ * self-closing flag on an end tag, and duplicate attribute names on a
+ * start tag. The $hash built for the duplicate check is not used further.
+ *
+ * Afterwards the tokenizer's content model is updated: unless $dry is set,
+ * an integer content_model found on the tree builder is copied to the
+ * tokenizer and reset to null on the tree builder; otherwise an end tag
+ * switches the content model to PCDATA.
+ *
+ * @param $token Token array. Its 'type' key is always read; for a start
+ * tag the 'attr' key is iterated as a list of entries with 'name' and
+ * 'value' keys.
+ * @param $checkStream When true, first emit (and remove) every entry of
+ * $this->stream->errors. Recursive calls made for those entries pass
+ * false.
+ * @param $dry When true, the token is checked but not passed to the tree
+ * builder, and the tree builder's content_model is not consulted.
+ */
+ protected function emitToken($token, $checkStream = true, $dry = false) {
+ if ($checkStream) {
+ // Emit errors from input stream.
+ while ($this->stream->errors) {
+ $this->emitToken(array_shift($this->stream->errors), false);
+ }
+ }
+ if($token['type'] === self::ENDTAG && !empty($token['attr'])) {
+ for ($i = 0; $i < count($token['attr']); $i++) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'attributes-in-end-tag'
+ ));
+ }
+ }
+ if($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'self-closing-flag-on-end-tag',
+ ));
+ }
+ if($token['type'] === self::STARTTAG) {
+ // This could be changed to actually pass the tree-builder a hash
+ $hash = array();
+ foreach ($token['attr'] as $keypair) {
+ if (isset($hash[$keypair['name']])) {
+ $this->emitToken(array(
+ 'type' => self::PARSEERROR,
+ 'data' => 'duplicate-attribute',
+ ));
+ } else {
+ $hash[$keypair['name']] = $keypair['value'];
 }
+ }
+ }
- if(!$dry && is_int($this->tree->content_model)) {
- $this->content_model = $this->tree->content_model;
- $this->tree->content_model = null;
+ if(!$dry) {
+ // the current structure of attributes is not a terribly good one
+ $this->tree->emitToken($token);
+ }
- } elseif($token['type'] === self::ENDTAG) {
- $this->content_model = self::PCDATA;
- }
+ if(!$dry && is_int($this->tree->content_model)) {
+ $this->content_model = $this->tree->content_model;
+ $this->tree->content_model = null;
+
+ } elseif($token['type'] === self::ENDTAG) {
+ $this->content_model = self::PCDATA;
 }
+ }
}