summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/DOMTreeBuilder.php
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-24 22:54:20 -0500
committerMatt Butcher <[email protected]>2013-04-24 22:54:20 -0500
commitbe7f40d38223cf6cf4aa1ce180de9e3bd36672c5 (patch)
tree15815b0298c52c1f496bb33612922ba8bb780e54 /src/HTML5/Parser/DOMTreeBuilder.php
parente6e65eccd8d37376a01f5ec134dd3e20e0e9dd49 (diff)
MAJOR changes to the DOMTreeBuilder.
This now supports most of the tree building. It has limited support for insertion modes, as well as some syntax correction.
Diffstat (limited to 'src/HTML5/Parser/DOMTreeBuilder.php')
-rw-r--r--src/HTML5/Parser/DOMTreeBuilder.php179
1 files changed, 167 insertions, 12 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index cf22953..5992002 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -11,12 +11,42 @@ use HTML5\Elements;
* change the architecture of the document itself.
*/
class DOMTreeBuilder implements EventHandler {
+
+
+ /**
+ * Insertion modes for the tree builder.
+ *
+ * Defined in 8.2.5.
+ *
+ * The values are consecutive integers and their order is significant:
+ * this class compares the current insert mode against these constants
+ * with relational operators (<, <=, >, >=), not only with equality, so
+ * renumbering or reordering them changes behavior.
+ */
+ const IM_INITIAL = 0;
+ const IM_BEFORE_HTML = 1;
+ const IM_BEFORE_HEAD = 2;
+ const IM_IN_HEAD = 3;
+ const IM_IN_HEAD_NOSCRIPT = 4;
+ const IM_AFTER_HEAD = 5;
+ const IM_IN_BODY = 6;
+ const IM_TEXT = 7;
+ const IM_IN_TABLE = 8;
+ const IM_IN_TABLE_TEXT = 9;
+ const IM_IN_CAPTION = 10;
+ const IM_IN_COLUMN_GROUP = 11;
+ const IM_IN_TABLE_BODY = 12;
+ const IM_IN_ROW = 13;
+ const IM_IN_CELL = 14;
+ const IM_IN_SELECT = 15;
+ const IM_IN_SELECT_IN_TABLE = 16;
+ const IM_AFTER_BODY = 17;
+ const IM_IN_FRAMESET = 18;
+ const IM_AFTER_FRAMESET = 19;
+ const IM_AFTER_AFTER_BODY = 20;
+ const IM_AFTER_AFTER_FRAMESET = 21;
+
protected $stack = array();
protected $current; // Pointer in the tag hierarchy.
protected $doc;
protected $processor;
+ protected $insertMode = 0;
+
/**
* Quirks mode is enabled by default. Any document that is missing the
* DT will be considered to be in quirks mode.
@@ -28,10 +58,12 @@ class DOMTreeBuilder implements EventHandler {
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = \DOMImplementation::createDocumentType('html');
- $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
+ //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
+ $this->doc = \DOMImplementation::createDocument(NULL, NULL, $dt);
$this->doc->errors = array();
- $this->current = $this->doc->documentElement;
+ // $this->current = $this->doc->documentElement;
+ $this->current = $this->doc; //->documentElement;
}
/**
@@ -55,20 +87,75 @@ class DOMTreeBuilder implements EventHandler {
// This is used solely for setting quirks mode. Currently we don't
// try to preserve the inbound DT. We convert it to HTML5.
$this->quirks = $quirks;
+
+ if ($this->insertMode > self::IM_INITIAL) {
+ $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);
+ return;
+ }
+
+ $this->insertMode = self::IM_BEFORE_HTML;
}
public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
$lname = $this->normalizeTagName($name);
+ // Make sure we have an html element.
+ if (!$this->doc->documentElement && $name !== 'html') {
+ $this->startTag('html');
+ }
- // XXX: Since we create the root element, we skip this if it occurs
- // inside of the builder. We should probably check to make sure that
- // there is only one element so far, and indicate an error if there
- // is a structural problem.
- if ($lname == 'html') {
- return;
+ // Set quirks mode if we're at IM_INITIAL with no doctype.
+ if ($this->insertMode == self::IM_INITIAL) {
+ $this->quirks = TRUE;
+ $this->parseError("No DOCTYPE specified.");
+ }
+
+ // SPECIAL TAG HANDLING:
+ // Spec says do this, and "don't ask."
+ if ($name == 'image') {
+ $name = 'img';
+ }
+ elseif ($name == 'optgroup' && $this->current->tagName == 'option') {
+ $this->current = $this->current->parentNode;
+ }
+ // TODO: MathML support
+ elseif ($name == 'math') {
+ }
+ // TODO: SVG support.
+ elseif ($name == 'svg') {
+ }
+
+
+ // Autoclose p tags where appropriate.
+ if ($this->insertMode >= self::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
+ $this->autoclose('p');
}
+ // Set insert mode:
+ switch ($name) {
+ case 'html':
+ $this->insertMode = self::IM_BEFORE_HEAD;
+ break;
+ case 'head':
+ if ($this->insertMode > self::IM_BEFORE_HEAD) {
+ $this->parseError("Unexpected head tag outside of head context.");
+ }
+ else {
+ $this->isertMode = self::IM_IN_HEAD;
+ }
+ break;
+ case 'body':
+ $this->insertMode = self::IM_IN_BODY;
+ break;
+ case 'noscript':
+ if ($this->insertMode == self::IM_IN_HEAD) {
+ $this->insertMode = self::IM_IN_HEAD_NOSCRIPT;
+ }
+ break;
+
+ }
+
+
$ele = $this->doc->createElement($lname);
foreach ($attributes as $aName => $aVal) {
$ele->setAttribute($aName, $aVal);
@@ -82,7 +169,9 @@ class DOMTreeBuilder implements EventHandler {
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
- $this->current = $ele;
+ if (!Elements::isA($name, Elements::UNARY_TAG)) {
+ $this->current = $ele;
+ }
// Return the element mask, which the tokenizer can then use to set
// various processing rules.
@@ -91,7 +180,28 @@ class DOMTreeBuilder implements EventHandler {
+ /**
+ * Process an end tag.
+ *
+ * End tags for unary elements are ignored. While the insert mode is
+ * IM_INITIAL or IM_BEFORE_HTML, only html, br, head and title end tags
+ * are honored: an html element is started first and the end tag is then
+ * processed again; any other end tag is recorded as a parse error and
+ * dropped. Otherwise the nearest open element with a matching name is
+ * closed via autoclose(), and closing head or body advances the insert
+ * mode to IM_AFTER_HEAD or IM_AFTER_BODY.
+ *
+ * @param string $name
+ *   The tag name as received from the tokenizer.
+ */
public function endTag($name) {
$lname = $this->normalizeTagName($name);
- if ($this->current->tagName != $lname) {
+
+ // Ignore closing tags for unary elements.
+ if (Elements::isA($name, Elements::UNARY_TAG)) {
+ return;
+ }
+
+ if ($this->insertMode <= self::IM_BEFORE_HTML) {
+ // 8.2.5.4.2
+ if (in_array($name, array('html', 'br', 'head', 'title'))) {
+ $this->startTag('html');
+ $this->endTag($name);
+ $this->insertMode = self::IM_BEFORE_HEAD;
+ return;
+ }
+
+ // Ignore the tag.
+ $this->parseError("Illegal closing tag at global scope.");
+ return;
+ }
+
+ // No debug output here: the STDOUT constant only exists under the CLI
+ // SAPI, and a tree builder should not write to standard output.
+ if ($name != $lname) {
return $this->quirksTreeResolver($lname);
}
@@ -100,7 +210,25 @@ class DOMTreeBuilder implements EventHandler {
if ($lname == 'html') {
return;
}
- $this->current = $this->current->parentNode;
+
+ //$this->current = $this->current->parentNode;
+ if (!$this->autoclose($name)) {
+ $this->parseError('Could not find closing tag for ' . $name);
+ }
+
+ // Dispatch on the tag that was just closed. The insert mode is an
+ // integer, so switching on it could never match "head" or "body" and
+ // the mode would never advance past the head or body.
+ switch ($lname) {
+ case "head":
+ $this->insertMode = self::IM_AFTER_HEAD;
+ break;
+ case "body":
+ $this->insertMode = self::IM_AFTER_BODY;
+ break;
+ }
+
+ // 8.2.5.4.7
+ if ($name == 'sarcasm') {
+ $this->text("Take a deep breath.");
+ }
}
public function comment($cdata) {
@@ -110,6 +238,13 @@ class DOMTreeBuilder implements EventHandler {
}
+ /**
+ * Handle character data.
+ *
+ * While the insert mode is earlier than IM_IN_HEAD nothing is added to
+ * the tree: whitespace-only data is dropped silently and any other text
+ * is recorded through parseError(). In every later mode the data is
+ * appended, untrimmed, as a text node under the current node.
+ *
+ * @param string $data
+ *   The character data to insert.
+ */
public function text($data) {
+ if ($this->insertMode < self::IM_IN_HEAD) {
+ // Compare against '' rather than using empty(): empty('0') is TRUE,
+ // which would let a stray "0" through without a parse error.
+ $stripped = trim($data);
+ if ($stripped !== '') {
+ $this->parseError("Unexpected text. Ignoring: " . $stripped);
+ }
+ // Return in both cases so that whitespace-only data does not end up
+ // in the tree as an empty text node.
+ return;
+ }
$node = $this->doc->createTextNode($data);
$this->current->appendChild($node);
}
@@ -118,7 +253,7 @@ class DOMTreeBuilder implements EventHandler {
// If the $current isn't the $root, do we need to do anything?
}
+ /**
+ * Record a parse error on the document being built.
+ *
+ * The entry is formatted as "Line %d, Col %d: %s" and appended to the
+ * errors array stored on the document object.
+ *
+ * @param string $msg
+ *   The error message.
+ * @param int $line
+ *   Line number to report; defaults to 0 when the caller has none.
+ * @param int $col
+ *   Column number to report; defaults to 0 when the caller has none.
+ */
- public function parseError($msg, $line, $col) {
+ public function parseError($msg, $line = 0, $col = 0) {
$this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
}
@@ -156,4 +291,24 @@ class DOMTreeBuilder implements EventHandler {
throw new \Exception("Not implemented.");
}
+
+ /**
+ * Close the nearest open node whose tag name matches $tag.
+ *
+ * The search starts at the current node and moves up one parent at a
+ * time. On a match, the current pointer is moved to the matched node's
+ * parent. The search gives up as soon as it meets a node that is not an
+ * element, or runs out of parents; the current pointer is then left
+ * where it was.
+ *
+ * @param string $tag
+ *   The tag name to look for.
+ *
+ * @return bool
+ *   TRUE if a matching node was found and closed, FALSE otherwise.
+ */
+ protected function autoclose($tag) {
+ $node = $this->current;
+ while (TRUE) {
+ // Only elements can be closed; anything else ends the search.
+ if (XML_ELEMENT_NODE != $node->nodeType) {
+ return FALSE;
+ }
+ if ($tag == $node->tagName) {
+ $this->current = $node->parentNode;
+ return TRUE;
+ }
+ $node = $node->parentNode;
+ if (!$node) {
+ return FALSE;
+ }
+ }
+ }
+
+
}