diff options
author | Matt Butcher <[email protected]> | 2013-04-24 22:54:20 -0500 |
---|---|---|
committer | Matt Butcher <[email protected]> | 2013-04-24 22:54:20 -0500 |
commit | be7f40d38223cf6cf4aa1ce180de9e3bd36672c5 (patch) | |
tree | 15815b0298c52c1f496bb33612922ba8bb780e54 /src/HTML5/Parser/DOMTreeBuilder.php | |
parent | e6e65eccd8d37376a01f5ec134dd3e20e0e9dd49 (diff) |
MAJOR changes to the DOMTreeBuilder.
This now supports most of the tree building. It has limited support for
insertion modes, as well as some syntax correction.
Diffstat (limited to 'src/HTML5/Parser/DOMTreeBuilder.php')
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 179 |
1 file changed, 167 insertions, 12 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index cf22953..5992002 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -11,12 +11,42 @@ use HTML5\Elements; * change the architecture of the document itself. */ class DOMTreeBuilder implements EventHandler { + + + /** + * Defined in 8.2.5. + */ + const IM_INITIAL = 0; + const IM_BEFORE_HTML = 1; + const IM_BEFORE_HEAD = 2; + const IM_IN_HEAD = 3; + const IM_IN_HEAD_NOSCRIPT = 4; + const IM_AFTER_HEAD = 5; + const IM_IN_BODY = 6; + const IM_TEXT = 7; + const IM_IN_TABLE = 8; + const IM_IN_TABLE_TEXT = 9; + const IM_IN_CAPTION = 10; + const IM_IN_COLUMN_GROUP = 11; + const IM_IN_TABLE_BODY = 12; + const IM_IN_ROW = 13; + const IM_IN_CELL = 14; + const IM_IN_SELECT = 15; + const IM_IN_SELECT_IN_TABLE = 16; + const IM_AFTER_BODY = 17; + const IM_IN_FRAMESET = 18; + const IM_AFTER_FRAMESET = 19; + const IM_AFTER_AFTER_BODY = 20; + const IM_AFTER_AFTER_FRAMESET = 21; + protected $stack = array(); protected $current; // Pointer in the tag hierarchy. protected $doc; protected $processor; + protected $insertMode = 0; + /** * Quirks mode is enabled by default. Any document that is missing the * DT will be considered to be in quirks mode. @@ -28,10 +58,12 @@ class DOMTreeBuilder implements EventHandler { // Create the doctype. For now, we are always creating HTML5 // documents, and attempting to up-convert any older DTDs to HTML5. 
$dt = \DOMImplementation::createDocumentType('html'); - $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = \DOMImplementation::createDocument(NULL, NULL, $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement; + // $this->current = $this->doc->documentElement; + $this->current = $this->doc; //->documentElement; } /** @@ -55,20 +87,75 @@ class DOMTreeBuilder implements EventHandler { // This is used solely for setting quirks mode. Currently we don't // try to preserve the inbound DT. We convert it to HTML5. $this->quirks = $quirks; + + if ($this->insertMode > self::IM_INITIAL) { + $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); + return; + } + + $this->insertMode = self::IM_BEFORE_HTML; } public function startTag($name, $attributes = array(), $selfClosing = FALSE) { $lname = $this->normalizeTagName($name); + // Make sure we have an html element. + if (!$this->doc->documentElement && $name !== 'html') { + $this->startTag('html'); + } - // XXX: Since we create the root element, we skip this if it occurs - // inside of the builder. We should probably check to make sure that - // there is only one element so far, and indicate an error if there - // is a structural problem. - if ($lname == 'html') { - return; + // Set quirks mode if we're at IM_INITIAL with no doctype. + if ($this->insertMode == self::IM_INITIAL) { + $this->quirks = TRUE; + $this->parseError("No DOCTYPE specified."); + } + + // SPECIAL TAG HANDLING: + // Spec says do this, and "don't ask." + if ($name == 'image') { + $name = 'img'; + } + elseif ($name == 'optgroup' && $this->current->tagName == 'option') { + $this->current = $this->current->parentNode; + } + // TODO: MathML support + elseif ($name == 'math') { + } + // TODO: SVG support. + elseif ($name == 'svg') { + } + + + // Autoclose p tags where appropriate. 
+ if ($this->insertMode >= self::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) { + $this->autoclose('p'); } + // Set insert mode: + switch ($name) { + case 'html': + $this->insertMode = self::IM_BEFORE_HEAD; + break; + case 'head': + if ($this->insertMode > self::IM_BEFORE_HEAD) { + $this->parseError("Unexpected head tag outside of head context."); + } + else { + $this->isertMode = self::IM_IN_HEAD; + } + break; + case 'body': + $this->insertMode = self::IM_IN_BODY; + break; + case 'noscript': + if ($this->insertMode == self::IM_IN_HEAD) { + $this->insertMode = self::IM_IN_HEAD_NOSCRIPT; + } + break; + + } + + $ele = $this->doc->createElement($lname); foreach ($attributes as $aName => $aVal) { $ele->setAttribute($aName, $aVal); @@ -82,7 +169,9 @@ class DOMTreeBuilder implements EventHandler { $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. - $this->current = $ele; + if (!Elements::isA($name, Elements::UNARY_TAG)) { + $this->current = $ele; + } // Return the element mask, which the tokenizer can then use to set // various processing rules. @@ -91,7 +180,28 @@ class DOMTreeBuilder implements EventHandler { public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName != $lname) { + + // Ignore closing tags for unary elements. + if (Elements::isA($name, Elements::UNARY_TAG)) { + return; + } + + if ($this->insertMode <= self::IM_BEFORE_HTML) { + // 8.2.5.4.2 + if (in_array($name, array('html', 'br', 'head', 'title'))) { + $this->startTag('html'); + $this->endTag($name); + $this->insertMode = self::IM_BEFORE_HEAD; + return; + } + + // Ignore the tag. 
+ $this->parseError("Illegal closing tag at global scope."); + return; + } + + if ($name != $lname) { + fprintf(STDOUT, "Mismatch on %s and %s", $name, $lname); return $this->quirksTreeResolver($lname); } @@ -100,7 +210,25 @@ class DOMTreeBuilder implements EventHandler { if ($lname == 'html') { return; } - $this->current = $this->current->parentNode; + + //$this->current = $this->current->parentNode; + if (!$this->autoclose($name)) { + $this->parseError('Could not find closing tag for ' . $name); + } + + switch ($this->insertMode) { + case "head": + $this->insertMode = self::IM_AFTER_HEAD; + break; + case "body": + $this->insertMode = self::IM_AFTER_BODY; + break; + } + + // 8.2.5.4.7 + if ($name == 'sarcasm') { + $this->text("Take a deep breath."); + } } public function comment($cdata) { @@ -110,6 +238,13 @@ class DOMTreeBuilder implements EventHandler { } public function text($data) { + if ($this->insertMode < self::IM_IN_HEAD) { + $data = trim($data); + if (!empty($data)) { + $this->parseError("Unexpected text. Ignoring: " . $data); + return; + } + } $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } @@ -118,7 +253,7 @@ class DOMTreeBuilder implements EventHandler { // If the $current isn't the $root, do we need to do anything? } - public function parseError($msg, $line, $col) { + public function parseError($msg, $line = 0, $col = 0) { $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); } @@ -156,4 +291,24 @@ class DOMTreeBuilder implements EventHandler { throw new \Exception("Not implemented."); } + + /** + * Automatically climb the tree and close the closest node with the matching $tag. + */ + protected function autoclose($tag) { + $working = $this->current; + do { + if ($working->nodeType != XML_ELEMENT_NODE) { + return FALSE; + } + if ($working->tagName == $tag) { + $this->current = $working->parentNode; + return TRUE; + } + } while ($working = $working->parentNode); + return FALSE; + + } + + } |