scanner = $scanner; $this->events = $eventHandler; }
// NOTE(review): the line above is the surviving tail of the constructor. The
// class header and constructor signature are missing from this copy of the
// file (text between '<' and '>' appears to have been stripped) and must be
// restored from version control.

/**
 * Main entry point.
 *
 * Calls consumeData() repeatedly until $this->carryOn is cleared, which
 * eof() does once the scanner reports the end of input.
 */
public function parse() {
  // $p is recorded but not read yet; presumably reserved for the loop
  // protection requested by the FIXME below.
  $p = 0;
  do {
    $p = $this->scanner->position();
    $this->consumeData();
    // FIXME: Add infinite loop protection.
  } while ($this->carryOn);
}

/**
 * Consume a character and make a move.
 * HTML5 8.2.4.1
 *
 * The handlers are tried in order and the chain stops at the first one that
 * returns a truthy value.
 *
 * @return boolean
 *   The current value of $this->carryOn.
 */
protected function consumeData() {
  // Character reference first, then tags, then EOF, then plain text.
  $this->characterReference() ||
    $this->tagOpen() ||
    $this->eof() ||
    $this->characterData();

  return $this->carryOn;
}

/**
 * This buffers the current token as character data.
 *
 * @return boolean
 *   TRUE if a character was buffered and consumed, FALSE at end of input.
 */
protected function characterData() {
  $tok = $this->scanner->current();

  // This should never happen...
  if ($tok === FALSE) {
    return FALSE;
  }
  // A NUL character is reported, but still buffered unchanged below.
  if ($tok === "\00") {
    $this->parseError("Received NULL character.");
  }
  $this->buffer($tok);
  $this->scanner->next();
  return TRUE;
}

/**
 * Handle the end of input.
 *
 * When the scanner has no current character, flushes buffered text, emits
 * the eof event and clears $this->carryOn so that parse() stops.
 *
 * @return boolean
 *   TRUE if the end of input was reached, FALSE otherwise.
 */
protected function eof() {
  if ($this->scanner->current() === FALSE) {
    $this->flushBuffer();
    $this->events->eof();
    $this->carryOn = FALSE;
    return TRUE;
  }
  return FALSE;
}

/**
 * Handle character references (aka entities).
 *
 * HTML5 8.2.4.2
 *
 * @param boolean $inAttribute
 *   Set to TRUE if the text is inside of an attribute value.
 *   FALSE otherwise.
 *
 * @return boolean
 *   TRUE if a complete reference (terminated by ';') was buffered, FALSE in
 *   every other case.
 */
protected function characterReference($inAttribute = FALSE) {
  // If it fails this, it's definitely not an entity.
  if ($this->scanner->current() != '&') {
    return FALSE;
  }

  // Consume the '&' and look at the character after it.
  $tok = $this->scanner->next();
  $entity = '';
  $start = $this->scanner->position();

  switch ($tok) {
    // Whitespace, '&' and '<' cannot start a reference. switch compares
    // loosely, so the NULL case also matches the FALSE returned at EOF.
    case NULL:
    case "\t":
    case "\n":
    case "\f":
    case ' ':
    case '&':
    case '<':
      // Don't consume; just return. Spec says return nothing, but I
      // think we have to append '&' to the string.
      $this->buffer('&');
      return FALSE;

    case '#':
      // Numeric reference: consume '#' and read a number.
      $tok = $this->scanner->next();

      // Hexadecimal form, introduced by 'x' or 'X'.
      if ($tok == 'x' || $tok == 'X') {
        $tok = $this->scanner->next(); // Consume x
        $hex = $this->scanner->getHex();
        if (empty($hex)) {
          $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
          return FALSE;
        }
        $entity = CharacterReference::lookupHex($hex);
      }
      // Decimal form.
      else {
        $numeric = $this->scanner->getNumeric();
        if (empty($numeric)) {
          // The message used to read "$#"; the reference syntax is "&#".
          $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
          return FALSE;
        }
        $entity = CharacterReference::lookupDecimal($numeric);
      }
      break;

    default:
      // Named reference: read the name and look it up in the entity table.
      $cname = $this->scanner->getAsciiAlpha();
      $entity = CharacterReference::lookupName($cname);
      if ($entity == NULL) {
        // Report the name that failed to resolve, not the empty lookup result.
        $this->parseError("No match in entity table for '%s'", $cname);
      }
  }

  // The scanner has advanced the cursor for us.
  $tok = $this->scanner->current();

  // We have an entity. We're done here.
  if ($tok == ';') {
    $this->buffer($entity);
    $this->scanner->next();
    return TRUE;
  }

  // If in an attribute, then failing to match ; means unconsume the
  // entire string. Otherwise, failure to match is an error.
  if ($inAttribute) {
    $this->scanner->unconsume($this->scanner->position() - $start);
    $this->buffer('&');
    return FALSE;
  }

  $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
  // Explicit failure result, matching every other failure path above.
  return FALSE;
}

/**
 * 8.2.4.8
 *
 * Handles a '<'. Buffered text is flushed first, then each tag-like
 * construct is tried in order until one returns TRUE.
 *
 * NOTE(review): the remainder of the return expression is on the following
 * source line, which is left untouched.
 */
protected function tagOpen() {
  if ($this->scanner->current() != '<') {
    return FALSE;
  }

  // Any buffered text data can go out now.
  $this->flushBuffer();

  $this->scanner->next();

  return $this->markupDeclaration() ||
    $this->endTag() ||
    $this->processingInstruction() ||
    $this->tagName() ||
    // This always returns false.
$this->parseError("Illegal tag opening") || $this->characterData(); } protected function markupDeclaration() { if ($this->scanner->current() != '!') { return FALSE; } $tok = $this->scanner->next(); // Comment: if ($tok == '-' && $this->scanner->peek() == '-') { $this->scanner->next(); // Consume the other '-' $this->scanner->next(); // Next char. return $this->comment(); } // Doctype elseif($tok == 'D') { return $this->doctype(''); } // CDATA section elseif($tok == '[') { return $this->cdataSection(); } // FINISH $this->parseError("Expected . Emit an empty comment because 8.2.4.46 says to. if ($tok == '>') { // Parse error. Emit the comment token. $this->parseError("Expected comment data, got '>'"); $this->events->comment(''); $this->scanner->next(); return TRUE; } // Replace NULL with the replacement char. if ($tok == "\0") { $tok = UTF8Utils::FFFD; } while (!$this->isCommentEnd()) { $comment .= $tok; $tok = $this->scanner->next(); } $this->events->comment($comment); $this->scanner->next(); return TRUE; } protected function isCommentEnd() { // EOF if($this->scanner->current() === FALSE) { // Hit the end. $this->parseError("Unexpected EOF in a comment."); return TRUE; } // If it doesn't start with -, not the end. if($this->scanner->current() != '-') { return FALSE; } // Advance one, and test for '->' if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { $this->scanner->next(); // Consume the last '>' return TRUE; } // Unread '-'; $this->scanner->unconsume(1); return FALSE; } /** * Parse a DOCTYPE. * * Parse a DOCTYPE declaration. This method has strong bearing on whether or * not Quirksmode is enabled on the event handler. * * @todo This method is a little long. Should probably refactor. 
*/
protected function doctype() {
  if ($this->scanner->current() != 'D') {
    return FALSE;
  }

  // Check that string is DOCTYPE.
  $chars = $this->scanner->charsWhile("DOCTYPE");
  if ($chars != 'DOCTYPE') {
    $this->parseError('Expected DOCTYPE, got %s', $chars);
    // NOTE(review): this call's argument and the statement boundary after it
    // were destroyed by tag stripping. The argument below is a reconstruction
    // and must be confirmed against version control.
    return $this->bogusComment('<!' . $chars);
  }

  $this->scanner->whitespace();
  $tok = $this->scanner->current();

  // EOF: die.
  if ($tok === FALSE) {
    $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', TRUE);
    return $this->eof();
  }

  // NULL char: report it, consume it, and keep a replacement character as
  // the start of the name. (Previously this referenced the class UTF8 rather
  // than UTF8Utils, and the value was overwritten by the read below.)
  $prefix = '';
  if ($tok === "\0") {
    $this->parseError("Unexpected NULL character in DOCTYPE.");
    $prefix = UTF8Utils::FFFD;
    $tok = $this->scanner->next();
  }

  $stop = " \n\f>";
  // Lowercase ASCII, then replace every NUL with FFFD. str_replace() is used
  // because three-argument strtr() maps byte-for-byte and would keep only
  // the first byte of a multi-byte replacement.
  $doctypeName = $prefix . str_replace(
    "\0",
    UTF8Utils::FFFD,
    strtolower($this->scanner->charsUntil($stop))
  );

  $tok = $this->scanner->current();

  // If FALSE, emit a parse error, DOCTYPE, and return.
  if ($tok === FALSE) {
    $this->parseError('Unexpected EOF in DOCTYPE declaration.');
    $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, NULL, TRUE);
    return TRUE;
  }

  // Short DOCTYPE: the declaration closes right after the name.
  if ($tok == '>') {
    // DOCTYPE without a name.
    if (strlen($doctypeName) == 0) {
      $this->parseError("Expected a DOCTYPE name. Got nothing.");
      $this->events->doctype($doctypeName, 0, NULL, TRUE);
      $this->scanner->next();
      return TRUE;
    }
    $this->events->doctype($doctypeName);
    $this->scanner->next();
    return TRUE;
  }
  $this->scanner->whitespace();

  $pub = strtoupper($this->scanner->getAsciiAlpha());
  $white = strlen($this->scanner->whitespace());
  $tok = $this->scanner->current();

  // Get ID, and flag it as pub or system.
  if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
    // Get the sys ID.
    $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
    $id = $this->quotedString("\0>");
    // No quoted identifier follows the keyword.
    if ($id === FALSE) {
      $this->events->doctype($doctypeName, $type, $pub, FALSE);
      return FALSE;
    }

    // Premature EOF.
    if ($this->scanner->current() === FALSE) {
      $this->parseError("Unexpected EOF in DOCTYPE");
      $this->events->doctype($doctypeName, $type, $id, TRUE);
      return TRUE;
    }

    // Well-formed complete DOCTYPE.
    $this->scanner->whitespace();
    if ($this->scanner->current() == '>') {
      $this->events->doctype($doctypeName, $type, $id, FALSE);
      $this->scanner->next();
      return TRUE;
    }

    // If we get here, there is unexpected content after the identifier.
    // Consume up to the closing '>' and discard it.
    // NOTE(review): the start of this statement was destroyed by tag
    // stripping; only the call on the scanner survived.
    $this->scanner->charsUntil(">");
    $this->parseError("Malformed DOCTYPE.");
    $this->events->doctype($doctypeName, $type, $id, TRUE);
    $this->scanner->next();
    return TRUE;
  }

  // Else it's a bogus DOCTYPE.
  // Consume to > and trash.
  $this->scanner->charsUntil('>');

  $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
  $this->events->doctype($doctypeName, 0, NULL, TRUE);
  $this->scanner->next();
  return TRUE;
}

/**
 * Utility for reading a quoted string.
 *
 * @param string $stopchars
 *   Characters (in addition to a close-quote) that should stop the string.
 *   E.g. sometimes '>' is higher precedence than '"' or "'".
 * @return mixed
 *   String if one is found (quotations omitted). FALSE if the current
 *   character is not a single or double quote.
 */
protected function quotedString($stopchars) {
  $tok = $this->scanner->current();

  // The previous test, ($tok == '"' || "'"), was always true because the
  // bare string "'" is truthy, so the FALSE result could never be returned.
  if ($tok !== '"' && $tok !== "'") {
    return FALSE;
  }

  $this->scanner->next();
  $ret = $this->scanner->charsUntil($tok . $stopchars);
  if ($this->scanner->current() == $tok) {
    // Consume the closing quote.
    $this->scanner->next();
  }
  else {
    // Parse error because no close quote.
    $this->parseError("Expected %s, got %s", $tok, $this->scanner->current());
  }
  return $ret;
}

/**
 * Handle a CDATA section.
*/
protected function cdataSection() {
  if ($this->scanner->current() != '[') {
    return FALSE;
  }
  $cdata = '';
  $this->scanner->next();

  $chars = $this->scanner->charsWhile('CDAT');
  if ($chars != 'CDATA' || $this->scanner->current() != '[') {
    $this->parseError('Expected [CDATA[, got %s', $chars);
    // NOTE(review): this argument and the statements up to the loop were
    // destroyed by tag stripping. They are reconstructed from the surviving
    // structure and must be confirmed against version control.
    return $this->bogusComment('<![' . $chars);
  }

  $tok = $this->scanner->next();
  do {
    if ($tok === FALSE) {
      $this->parseError('Unexpected EOF inside CDATA.');
      // NOTE(review): argument and the rest of this loop body reconstructed
      // after tag stripping; confirm against version control.
      $this->bogusComment('<![CDATA[' . $cdata);
      return TRUE;
    }
    $cdata .= $tok;
    $tok = $this->scanner->next();
  } while (!$this->isCdataClose());

  // isCdataClose() stops on the second ']' with '>' as the next character.
  $this->scanner->next(); // consume >
  $this->scanner->next(); // Next char after >
  $this->events->cdata($cdata);
  return TRUE;
}

/**
 * Check whether the parser has reached the end of a CDATA section.
 *
 * @return boolean
 *   TRUE when the scanner sits on the second ']' of the closing sequence
 *   and the next character is '>'. Otherwise FALSE.
 */
protected function isCdataClose() {
  $tok = $this->scanner->current();
  if ($tok != ']') {
    return FALSE;
  }

  $tok = $this->scanner->next();
  if ($tok == ']' && $this->scanner->peek() == '>') {
    return TRUE;
  }

  // Unconsume one char and return. The count is passed explicitly, as
  // isCommentEnd() does.
  $this->scanner->unconsume(1);
  return FALSE;
}

// The methods below are unimplemented placeholders. Each body holds only
// the author's notes on the transitions the state should handle.

protected function rcdata() {
  // Ampersand
  // <
  // Null
  // EOF
  // Character
}

protected function rawtext() {
  // < is a literal
  // NULL is an error
  // EOF
  // Character data
}

protected function scriptData() {
  // < is a literal
  // NULL is an error
  // EOF
  // Character data
}

/**
 * 8.2.4.7
 */
protected function plaintext() {
  // NULL -> parse error
  // EOF -> eof
  // (otherwise) -> Character data
}

/**
 * 8.2.4.11
 */
protected function rcdataLessThan() {
  // / -> empty the tmp buffer and go to end-tag
  // (otherwise) -> rcdata
}

/**
 * 8.2.4.12
 */
protected function rcdataEndTag() {
  // A-Za-z: append to tagname
  // (otherwise) -> rcdata state
}

/**
 * 8.2.4.13
 */
protected function rcdataEndTagName() {
  // tab, lf, ff, space -> before attribute or treat as anything
  // / -> self-closing tag
  // > -> end tag, back to data
  // A-Za-z -> append to tagname
  // (otherwise) -> rcdata state
}

/**
 * 8.2.4.14
 */
protected function rawtextLessThan() {
  // / -> rawtext endtag state
  // (otherwise) -> rawtext
}

/**
 * 8.2.4.15
 */
protected function rawtextEndTagOpen() {
  // A-Za-z -> rawtext
  // (otherwise) -> rawtext
}
protected function rawtextEndTagName() {
  // tab, lf, ff, space -> before attr name
}

// Unimplemented placeholders for the remaining tokenizer states.
protected function scriptLessThan() { }
protected function scriptEndTagOpen() { }
protected function scriptEndTagName() { }
protected function scriptEscapeStart() { }
protected function scriptEscapeStartDash() { }
protected function scriptEscaped() { }
protected function scriptEscapedDash() { }
protected function scriptEscapedDashDash() { }
protected function scriptEscapedLessThan() { }
protected function scriptEscapedEndTagOpen() { }
protected function scriptEscapedEndTagName() { }
protected function scriptDoubleEscapeStart() { }
protected function scriptDoubleEscaped() { }
protected function scriptDoubleEscapedDash() { }
protected function scriptDoubleEscapedDashDash() { }
protected function scriptDoubleEscapedLessThan() { }
protected function scriptDoubleEscapeEnd() { }
protected function beforeAttributeName() { }
protected function attributeName() { }
protected function afterAttributeName() { }
protected function beforeAttributeValue() { }
protected function attributeValueDoubleQuote() { }
protected function attributeValueSingleQuote() { }
protected function attributeValueUnquoted() { }
protected function characterReferenceInAttributeValue() { }
protected function afterAttributeValueQuoted() { }
protected function selfCloseingStartTag() { }
protected function beforeDoctype() { }
protected function doctypeName() { }
protected function afterDoctypeName() { }
protected function doctypePublicKeyword() { }
protected function beforeDoctypePublicId() { }
protected function doctypePublicIdDoubleQuoted() { }
protected function doctypePublicIdSingleQuoted() { }
protected function afterDoctypePublicId() { }
protected function betweenDoctypePublicAndSystem() { }
protected function afterDoctypeSystemKeyword() { }
protected function beforeDoctypeSystemIdentifier() { }
protected function doctypeSystemIdDoubleQuoted() { }
protected function doctypeSystemIdSingleQuoted() { }
protected function afterDoctypeSystemId() { }
protected function bogusDoctype() { }

// ================================================================
// Non-HTML5
// ================================================================
/**
 * Handle a processing instruction.
 *
 * XML processing instructions are supposed to be ignored in HTML5,
 * treated as "bogus comments". However, since we're not a user
 * agent, we allow them. We consume until ?> and then issue a
 * EventListener::processingInstruction() event.
 *
 * @return boolean
 *   FALSE if the current character is not '?'. TRUE once the construct has
 *   been handled, whether as an instruction or as a bogus comment.
 */
protected function processingInstruction() {
  if ($this->scanner->current() != '?') {
    return FALSE;
  }

  $tok = $this->scanner->next();
  $procName = $this->scanner->getAsciiAlpha();
  $white = strlen($this->scanner->whitespace());

  // If not a PI, send to bogusComment. The EOF test is strict so that a
  // literal '0' character is not mistaken for the end of input.
  if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() === FALSE) {
    $this->parseError("Expected processing instruction name, got $tok");
    // NOTE(review): this argument and the statements up to the loop header
    // were destroyed by tag stripping. They are reconstructed from the
    // surviving structure and must be confirmed against version control.
    $this->bogusComment('<?' . $tok . $procName);
    return TRUE;
  }

  $data = '';
  // Read until the two-character terminator: '?' followed by '>'. The old
  // condition (current != '?' && peek != '>') stopped at any '?' and before
  // any '>', which truncated the data and misaligned the consumes below.
  while (!($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
    $data .= $this->scanner->current();

    $tok = $this->scanner->next();
    if ($tok === FALSE) {
      $this->parseError("Unexpected EOF in processing instruction.");
      $this->events->processingInstruction($procName, $data);
      return TRUE;
    }
  }

  $this->scanner->next(); // >
  $this->scanner->next(); // Next token.
  $this->events->processingInstruction($procName, $data);
  return TRUE;
}

// ================================================================
// UTILITY FUNCTIONS
// ================================================================

/**
 * Send a TEXT event with the contents of the text buffer.
 *
 * This emits an EventHandler::text() event with the current contents of the
 * temporary text buffer. (The buffer is used to group as much PCDATA
 * as we can instead of emitting lots and lots of TEXT events.)
 *
 * Nothing is emitted when the buffer holds no characters.
 */
protected function flushBuffer() {
  // Test the length rather than empty(): empty('0') is TRUE, so a buffer
  // holding just "0" would never be emitted.
  if (strlen((string) $this->text) === 0) {
    return;
  }
  $this->events->text($this->text);
  $this->text = '';
}

/**
 * Add text to the temporary buffer.
*
* @see flushBuffer()
*/
protected function buffer($str) {
  $this->text = $this->text . $str;
}

/**
 * Report a parse error to the event handler.
 *
 * Arguments after $msg, if any, are substituted into $msg with vsprintf().
 * The scanner's current line and column offset are passed along with the
 * message.
 *
 * @return boolean
 *   Always FALSE, so the call can sit in a chain of || alternatives without
 *   ending it.
 */
protected function parseError($msg) {
  // Everything after the message itself is a format argument.
  $params = array_slice(func_get_args(), 1);
  if (count($params) > 0) {
    $msg = vsprintf($msg, $params);
  }

  $this->events->parseError(
    $msg,
    $this->scanner->currentLine(),
    $this->scanner->columnOffset()
  );
  return FALSE;
}
}