diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/HTML5/Elements.php | 127 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 179 |
2 files changed, 242 insertions, 64 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php index 4a9afb3..afa3327 100644 --- a/src/HTML5/Elements.php +++ b/src/HTML5/Elements.php @@ -10,11 +10,22 @@ namespace HTML5; */ class Elements { - const KNOWN_ELEMENT = 0x01; - const TEXT_RAW = 0x02; - const TEXT_RCDATA = 0x04; - const OMIT_START = 0x0a; - const OMIT_END = 0x0b; + const KNOWN_ELEMENT = 1; + const TEXT_RAW = 2; + const TEXT_RCDATA = 4; + const UNARY_TAG = 8; + + // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", + // "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", + // "nav", "ol", "p", "section", "summary", "ul" + // "h1", "h2", "h3", "h4", "h5", "h6" + // "pre", "listing" + // "form" + // "plaintext" + const AUTOCLOSE_P = 16; + + const TEXT_PLAINTEXT = 32; + /** * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. @@ -23,18 +34,18 @@ class Elements { public static $elements = array( "a" => 1, "abbr" => 1, - "address" => 1, + "address" => 25, // NORMAL | UNARY_TAG | AUTOCLOSE_P "area" => 1, - "article" => 1, - "aside" => 1, + "article" => 17, // NORMAL | AUTOCLOSE_P + "aside" => 17, // NORMAL | AUTOCLOSE_P, "audio" => 1, "b" => 1, - "base" => 1, + "base" => 9, // | UNARY_TAG "bdi" => 1, "bdo" => 1, - "blockquote" => 1, + "blockquote" => 17, // NORMAL | AUTOCLOSE_P, "body" => 1, - "br" => 1, + "br" => 9, // NORMAL | UNARY_TAG "button" => 1, "canvas" => 1, "caption" => 1, @@ -42,61 +53,61 @@ class Elements { "code" => 1, "col" => 1, "colgroup" => 1, - "command" => 1, + "command" => 9, // NORMAL | UNARY_TAG //"data" => 1, // This is highly experimental and only part of the whatwg spec (not w3c). See https://developer.mozilla.org/en-US/docs/HTML/Element/data "datalist" => 1, "dd" => 1, "del" => 1, - "details" => 1, + "details" => 17, // NORMAL | AUTOCLOSE_P, "dfn" => 1, - "dialog" => 1, - "div" => 1, - "dl" => 1, + "dialog" => 17, // NORMAL | AUTOCLOSE_P, + "div" => 17, // NORMAL | AUTOCLOSE_P, + "dl" => 17, // NORMAL | AUTOCLOSE_P, "dt" => 1, "em" => 1, - "embed" => 1, - "fieldset" => 1, - "figcaption" => 1, - "figure" => 1, - "footer" => 1, - "form" => 1, - "h1" => 1, - "h2" => 1, - "h3" => 1, - "h4" => 1, - "h5" => 1, - "h6" => 1, + "embed" => 9, // NORMAL | UNARY_TAG + "fieldset" => 17, // NORMAL | AUTOCLOSE_P, + "figcaption" => 17, // NORMAL | AUTOCLOSE_P, + "figure" => 17, // NORMAL | AUTOCLOSE_P, + "footer" => 17, // NORMAL | AUTOCLOSE_P, + "form" => 17, // NORMAL | AUTOCLOSE_P, + "h1" => 17, // NORMAL | AUTOCLOSE_P, + "h2" => 17, // NORMAL | AUTOCLOSE_P, + "h3" => 17, // NORMAL | AUTOCLOSE_P, + "h4" => 17, // NORMAL | AUTOCLOSE_P, + "h5" => 17, // NORMAL | AUTOCLOSE_P, + "h6" => 17, // NORMAL | AUTOCLOSE_P, "head" => 1, - "header" => 1, - "hgroup" => 1, - "hr" => 1, + "header" => 17, // NORMAL | AUTOCLOSE_P, + "hgroup" => 17, // NORMAL | AUTOCLOSE_P, + "hr" => 9, // NORMAL | UNARY_TAG "html" => 1, "i" => 1, - "iframe" => 1, - "img" => 1, - "input" => 1, + "iframe" => 3, // NORMAL | TEXT_RAW + "img" => 9, // NORMAL | UNARY_TAG + "input" => 9, // NORMAL | UNARY_TAG "kbd" => 1, "ins" => 1, - "keygen" => 1, + "keygen" => 9, // NORMAL | UNARY_TAG "label" => 1, "legend" => 1, "li" => 1, - "link" => 1, + "link" => 9, // NORMAL | UNARY_TAG "map" => 1, "mark" => 1, - "menu" => 1, - "meta" => 1, + "menu" => 17, // NORMAL | AUTOCLOSE_P, + "meta" => 9, // NORMAL | UNARY_TAG "meter" => 1, - "nav" => 1, - "noscript" => 1, + "nav" => 17, // NORMAL | AUTOCLOSE_P, + "noscript" => 3, // NORMAL | TEXT_RAW "object" => 1, - "ol" => 1, + "ol" => 17, // NORMAL | AUTOCLOSE_P, "optgroup" => 1, "option" => 1, "output" => 1, - "p" => 1, - "param" => 1, - "pre" => 3, // NORMAL | TEXT_RAW + "p" => 17, // NORMAL | AUTOCLOSE_P, + "param" => 9, // NORMAL | UNARY_TAG + "pre" => 19, // NORMAL | TEXT_RAW | AUTOCLOSE_P "progress" => 1, "q" => 1, "rp" => 1, @@ -105,15 +116,15 @@ class Elements { "s" => 1, "samp" => 1, "script" => 3, // NORMAL | TEXT_RAW - "section" => 1, + "section" => 17, // NORMAL | AUTOCLOSE_P, "select" => 1, "small" => 1, - "source" => 1, + "source" => 9, // NORMAL | UNARY_TAG "span" => 1, "strong" => 1, "style" => 1, "sub" => 1, - "summary" => 1, + "summary" => 17, // NORMAL | AUTOCLOSE_P, "sup" => 1, "table" => 1, "tbody" => 1, @@ -123,14 +134,26 @@ class Elements { "th" => 1, "thead" => 1, "time" => 1, - "title" => 1, + "title" => 5, // NORMAL | TEXT_RCDATA "tr" => 1, - "track" => 1, + "track" => 9, // NORMAL | UNARY_TAG "u" => 1, - "ul" => 1, + "ul" => 17, // NORMAL | AUTOCLOSE_P, "var" => 1, "video" => 1, - "wbr" => 1, + "wbr" => 9, // NORMAL | UNARY_TAG + + // Legacy? + 'basefont' => 8, // UNARY_TAG + 'bgsound' => 8, // UNARY_TAG + 'noframes' => 2, // RAW_TEXT + 'center' => 16, 'dir' => 16, 'listing' => 16, // AUTOCLOSE_P + 'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT + 'applet' => 0, + 'marquee' => 0, + 'isindex' => 8, // UNARY_TAG + 'xmp' => 18, // AUTOCLOSE_P | UNARY_TAG + 'noembed' => 2, // RAW_TEXT ); /** @@ -264,7 +287,7 @@ class Elements { "script" => 1, "set" => 1, "stop" => 1, - "style" => 1, + "style" => 3, // NORMAL | RAW_TEXT "svg" => 1, "switch" => 1, "symbol" => 1, diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index cf22953..5992002 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -11,12 +11,42 @@ use HTML5\Elements; * change the architecture of the document itself. */ class DOMTreeBuilder implements EventHandler { + + + /** + * Defined in 8.2.5. + */ + const IM_INITIAL = 0; + const IM_BEFORE_HTML = 1; + const IM_BEFORE_HEAD = 2; + const IM_IN_HEAD = 3; + const IM_IN_HEAD_NOSCRIPT = 4; + const IM_AFTER_HEAD = 5; + const IM_IN_BODY = 6; + const IM_TEXT = 7; + const IM_IN_TABLE = 8; + const IM_IN_TABLE_TEXT = 9; + const IM_IN_CAPTION = 10; + const IM_IN_COLUMN_GROUP = 11; + const IM_IN_TABLE_BODY = 12; + const IM_IN_ROW = 13; + const IM_IN_CELL = 14; + const IM_IN_SELECT = 15; + const IM_IN_SELECT_IN_TABLE = 16; + const IM_AFTER_BODY = 17; + const IM_IN_FRAMESET = 18; + const IM_AFTER_FRAMESET = 19; + const IM_AFTER_AFTER_BODY = 20; + const IM_AFTER_AFTER_FRAMESET = 21; + protected $stack = array(); protected $current; // Pointer in the tag hierarchy. protected $doc; protected $processor; + protected $insertMode = 0; + /** * Quirks mode is enabled by default. Any document that is missing the * DT will be considered to be in quirks mode. @@ -28,10 +58,12 @@ class DOMTreeBuilder implements EventHandler { // Create the doctype. For now, we are always creating HTML5 // documents, and attempting to up-convert any older DTDs to HTML5. $dt = \DOMImplementation::createDocumentType('html'); - $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = \DOMImplementation::createDocument(NULL, NULL, $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement; + // $this->current = $this->doc->documentElement; + $this->current = $this->doc; //->documentElement; } /** @@ -55,20 +87,75 @@ class DOMTreeBuilder implements EventHandler { // This is used solely for setting quirks mode. Currently we don't // try to preserve the inbound DT. We convert it to HTML5. $this->quirks = $quirks; + + if ($this->insertMode > self::IM_INITIAL) { + $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); + return; + } + + $this->insertMode = self::IM_BEFORE_HTML; } public function startTag($name, $attributes = array(), $selfClosing = FALSE) { $lname = $this->normalizeTagName($name); + // Make sure we have an html element. + if (!$this->doc->documentElement && $name !== 'html') { + $this->startTag('html'); + } - // XXX: Since we create the root element, we skip this if it occurs - // inside of the builder. We should probably check to make sure that - // there is only one element so far, and indicate an error if there - // is a structural problem. - if ($lname == 'html') { - return; + // Set quirks mode if we're at IM_INITIAL with no doctype. + if ($this->insertMode == self::IM_INITIAL) { + $this->quirks = TRUE; + $this->parseError("No DOCTYPE specified."); + } + + // SPECIAL TAG HANDLING: + // Spec says do this, and "don't ask." + if ($name == 'image') { + $name = 'img'; + } + elseif ($name == 'optgroup' && $this->current->tagName == 'option') { + $this->current = $this->current->parentNode; + } + // TODO: MathML support + elseif ($name == 'math') { + } + // TODO: SVG support. + elseif ($name == 'svg') { + } + + + // Autoclose p tags where appropriate. + if ($this->insertMode >= self::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) { + $this->autoclose('p'); } + // Set insert mode: + switch ($name) { + case 'html': + $this->insertMode = self::IM_BEFORE_HEAD; + break; + case 'head': + if ($this->insertMode > self::IM_BEFORE_HEAD) { + $this->parseError("Unexpected head tag outside of head context."); + } + else { + $this->isertMode = self::IM_IN_HEAD; + } + break; + case 'body': + $this->insertMode = self::IM_IN_BODY; + break; + case 'noscript': + if ($this->insertMode == self::IM_IN_HEAD) { + $this->insertMode = self::IM_IN_HEAD_NOSCRIPT; + } + break; + + } + + $ele = $this->doc->createElement($lname); foreach ($attributes as $aName => $aVal) { $ele->setAttribute($aName, $aVal); @@ -82,7 +169,9 @@ class DOMTreeBuilder implements EventHandler { $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. - $this->current = $ele; + if (!Elements::isA($name, Elements::UNARY_TAG)) { + $this->current = $ele; + } // Return the element mask, which the tokenizer can then use to set // various processing rules. @@ -91,7 +180,28 @@ class DOMTreeBuilder implements EventHandler { public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName != $lname) { + + // Ignore closing tags for unary elements. + if (Elements::isA($name, Elements::UNARY_TAG)) { + return; + } + + if ($this->insertMode <= self::IM_BEFORE_HTML) { + // 8.2.5.4.2 + if (in_array($name, array('html', 'br', 'head', 'title'))) { + $this->startTag('html'); + $this->endTag($name); + $this->insertMode = self::IM_BEFORE_HEAD; + return; + } + + // Ignore the tag. + $this->parseError("Illegal closing tag at global scope."); + return; + } + + if ($name != $lname) { + fprintf(STDOUT, "Mismatch on %s and %s", $name, $lname); return $this->quirksTreeResolver($lname); } @@ -100,7 +210,25 @@ class DOMTreeBuilder implements EventHandler { if ($lname == 'html') { return; } - $this->current = $this->current->parentNode; + + //$this->current = $this->current->parentNode; + if (!$this->autoclose($name)) { + $this->parseError('Could not find closing tag for ' . $name); + } + + switch ($this->insertMode) { + case "head": + $this->insertMode = self::IM_AFTER_HEAD; + break; + case "body": + $this->insertMode = self::IM_AFTER_BODY; + break; + } + + // 8.2.5.4.7 + if ($name == 'sarcasm') { + $this->text("Take a deep breath."); + } } public function comment($cdata) { @@ -110,6 +238,13 @@ class DOMTreeBuilder implements EventHandler { } public function text($data) { + if ($this->insertMode < self::IM_IN_HEAD) { + $data = trim($data); + if (!empty($data)) { + $this->parseError("Unexpected text. Ignoring: " . $data); + return; + } + } $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } @@ -118,7 +253,7 @@ class DOMTreeBuilder implements EventHandler { // If the $current isn't the $root, do we need to do anything? } - public function parseError($msg, $line, $col) { + public function parseError($msg, $line = 0, $col = 0) { $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); } @@ -156,4 +291,24 @@ class DOMTreeBuilder implements EventHandler { throw new \Exception("Not implemented."); } + + /** + * Automatically climb the tree and close the closest node with the matching $tag. + */ + protected function autoclose($tag) { + $working = $this->current; + do { + if ($working->nodeType != XML_ELEMENT_NODE) { + return FALSE; + } + if ($working->tagName == $tag) { + $this->current = $working->parentNode; + return TRUE; + } + } while ($working = $working->parentNode); + return FALSE; + + } + + } |