summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/HTML5/Elements.php127
-rw-r--r--src/HTML5/Parser/DOMTreeBuilder.php179
2 files changed, 242 insertions, 64 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php
index 4a9afb3..afa3327 100644
--- a/src/HTML5/Elements.php
+++ b/src/HTML5/Elements.php
@@ -10,11 +10,22 @@ namespace HTML5;
*/
class Elements {
- const KNOWN_ELEMENT = 0x01;
- const TEXT_RAW = 0x02;
- const TEXT_RCDATA = 0x04;
- const OMIT_START = 0x0a;
- const OMIT_END = 0x0b;
+ const KNOWN_ELEMENT = 1;
+ const TEXT_RAW = 2;
+ const TEXT_RCDATA = 4;
+ const UNARY_TAG = 8;
+
+ // Elements whose start tag automatically closes an open "p" element:
+ // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl",
+ // "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu",
+ // "nav", "ol", "p", "section", "summary", "ul"
+ // "h1", "h2", "h3", "h4", "h5", "h6"
+ // "pre", "listing"
+ // "form"
+ // "plaintext"
+ const AUTOCLOSE_P = 16;
+
+ const TEXT_PLAINTEXT = 32;
+
/**
* The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
@@ -23,18 +34,18 @@ class Elements {
public static $elements = array(
"a" => 1,
"abbr" => 1,
- "address" => 1,
+ "address" => 25, // NORMAL | UNARY_TAG | AUTOCLOSE_P
"area" => 1,
- "article" => 1,
- "aside" => 1,
+ "article" => 17, // NORMAL | AUTOCLOSE_P
+ "aside" => 17, // NORMAL | AUTOCLOSE_P,
"audio" => 1,
"b" => 1,
- "base" => 1,
+ "base" => 9, // NORMAL | UNARY_TAG
"bdi" => 1,
"bdo" => 1,
- "blockquote" => 1,
+ "blockquote" => 17, // NORMAL | AUTOCLOSE_P,
"body" => 1,
- "br" => 1,
+ "br" => 9, // NORMAL | UNARY_TAG
"button" => 1,
"canvas" => 1,
"caption" => 1,
@@ -42,61 +53,61 @@ class Elements {
"code" => 1,
"col" => 1,
"colgroup" => 1,
- "command" => 1,
+ "command" => 9, // NORMAL | UNARY_TAG
//"data" => 1, // This is highly experimental and only part of the whatwg spec (not w3c). See https://developer.mozilla.org/en-US/docs/HTML/Element/data
"datalist" => 1,
"dd" => 1,
"del" => 1,
- "details" => 1,
+ "details" => 17, // NORMAL | AUTOCLOSE_P,
"dfn" => 1,
- "dialog" => 1,
- "div" => 1,
- "dl" => 1,
+ "dialog" => 17, // NORMAL | AUTOCLOSE_P,
+ "div" => 17, // NORMAL | AUTOCLOSE_P,
+ "dl" => 17, // NORMAL | AUTOCLOSE_P,
"dt" => 1,
"em" => 1,
- "embed" => 1,
- "fieldset" => 1,
- "figcaption" => 1,
- "figure" => 1,
- "footer" => 1,
- "form" => 1,
- "h1" => 1,
- "h2" => 1,
- "h3" => 1,
- "h4" => 1,
- "h5" => 1,
- "h6" => 1,
+ "embed" => 9, // NORMAL | UNARY_TAG
+ "fieldset" => 17, // NORMAL | AUTOCLOSE_P,
+ "figcaption" => 17, // NORMAL | AUTOCLOSE_P,
+ "figure" => 17, // NORMAL | AUTOCLOSE_P,
+ "footer" => 17, // NORMAL | AUTOCLOSE_P,
+ "form" => 17, // NORMAL | AUTOCLOSE_P,
+ "h1" => 17, // NORMAL | AUTOCLOSE_P,
+ "h2" => 17, // NORMAL | AUTOCLOSE_P,
+ "h3" => 17, // NORMAL | AUTOCLOSE_P,
+ "h4" => 17, // NORMAL | AUTOCLOSE_P,
+ "h5" => 17, // NORMAL | AUTOCLOSE_P,
+ "h6" => 17, // NORMAL | AUTOCLOSE_P,
"head" => 1,
- "header" => 1,
- "hgroup" => 1,
- "hr" => 1,
+ "header" => 17, // NORMAL | AUTOCLOSE_P,
+ "hgroup" => 17, // NORMAL | AUTOCLOSE_P,
+ "hr" => 9, // NORMAL | UNARY_TAG
"html" => 1,
"i" => 1,
- "iframe" => 1,
- "img" => 1,
- "input" => 1,
+ "iframe" => 3, // NORMAL | TEXT_RAW
+ "img" => 9, // NORMAL | UNARY_TAG
+ "input" => 9, // NORMAL | UNARY_TAG
"kbd" => 1,
"ins" => 1,
- "keygen" => 1,
+ "keygen" => 9, // NORMAL | UNARY_TAG
"label" => 1,
"legend" => 1,
"li" => 1,
- "link" => 1,
+ "link" => 9, // NORMAL | UNARY_TAG
"map" => 1,
"mark" => 1,
- "menu" => 1,
- "meta" => 1,
+ "menu" => 17, // NORMAL | AUTOCLOSE_P,
+ "meta" => 9, // NORMAL | UNARY_TAG
"meter" => 1,
- "nav" => 1,
- "noscript" => 1,
+ "nav" => 17, // NORMAL | AUTOCLOSE_P,
+ "noscript" => 3, // NORMAL | TEXT_RAW
"object" => 1,
- "ol" => 1,
+ "ol" => 17, // NORMAL | AUTOCLOSE_P,
"optgroup" => 1,
"option" => 1,
"output" => 1,
- "p" => 1,
- "param" => 1,
- "pre" => 3, // NORMAL | TEXT_RAW
+ "p" => 17, // NORMAL | AUTOCLOSE_P,
+ "param" => 9, // NORMAL | UNARY_TAG
+ "pre" => 19, // NORMAL | TEXT_RAW | AUTOCLOSE_P
"progress" => 1,
"q" => 1,
"rp" => 1,
@@ -105,15 +116,15 @@ class Elements {
"s" => 1,
"samp" => 1,
"script" => 3, // NORMAL | TEXT_RAW
- "section" => 1,
+ "section" => 17, // NORMAL | AUTOCLOSE_P,
"select" => 1,
"small" => 1,
- "source" => 1,
+ "source" => 9, // NORMAL | UNARY_TAG
"span" => 1,
"strong" => 1,
"style" => 1,
"sub" => 1,
- "summary" => 1,
+ "summary" => 17, // NORMAL | AUTOCLOSE_P,
"sup" => 1,
"table" => 1,
"tbody" => 1,
@@ -123,14 +134,26 @@ class Elements {
"th" => 1,
"thead" => 1,
"time" => 1,
- "title" => 1,
+ "title" => 5, // NORMAL | TEXT_RCDATA
"tr" => 1,
- "track" => 1,
+ "track" => 9, // NORMAL | UNARY_TAG
"u" => 1,
- "ul" => 1,
+ "ul" => 17, // NORMAL | AUTOCLOSE_P,
"var" => 1,
"video" => 1,
- "wbr" => 1,
+ "wbr" => 9, // NORMAL | UNARY_TAG
+
+ // Legacy?
+ 'basefont' => 8, // UNARY_TAG
+ 'bgsound' => 8, // UNARY_TAG
+ 'noframes' => 2, // TEXT_RAW
+ 'center' => 16, 'dir' => 16, 'listing' => 16, // AUTOCLOSE_P
+ 'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT
+ 'applet' => 0,
+ 'marquee' => 0,
+ 'isindex' => 8, // UNARY_TAG
+ 'xmp' => 18, // AUTOCLOSE_P | TEXT_RAW
+ 'noembed' => 2, // TEXT_RAW
);
/**
@@ -264,7 +287,7 @@ class Elements {
"script" => 1,
"set" => 1,
"stop" => 1,
- "style" => 1,
+ "style" => 3, // NORMAL | TEXT_RAW
"svg" => 1,
"switch" => 1,
"symbol" => 1,
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index cf22953..5992002 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -11,12 +11,42 @@ use HTML5\Elements;
* change the architecture of the document itself.
*/
class DOMTreeBuilder implements EventHandler {
+
+
+ /**
+ * Insertion modes (the IM_* constants below), as defined in section 8.2.5
+ * of the HTML5 parsing specification.
+ */
+ const IM_INITIAL = 0;
+ const IM_BEFORE_HTML = 1;
+ const IM_BEFORE_HEAD = 2;
+ const IM_IN_HEAD = 3;
+ const IM_IN_HEAD_NOSCRIPT = 4;
+ const IM_AFTER_HEAD = 5;
+ const IM_IN_BODY = 6;
+ const IM_TEXT = 7;
+ const IM_IN_TABLE = 8;
+ const IM_IN_TABLE_TEXT = 9;
+ const IM_IN_CAPTION = 10;
+ const IM_IN_COLUMN_GROUP = 11;
+ const IM_IN_TABLE_BODY = 12;
+ const IM_IN_ROW = 13;
+ const IM_IN_CELL = 14;
+ const IM_IN_SELECT = 15;
+ const IM_IN_SELECT_IN_TABLE = 16;
+ const IM_AFTER_BODY = 17;
+ const IM_IN_FRAMESET = 18;
+ const IM_AFTER_FRAMESET = 19;
+ const IM_AFTER_AFTER_BODY = 20;
+ const IM_AFTER_AFTER_FRAMESET = 21;
+
protected $stack = array();
protected $current; // Pointer in the tag hierarchy.
protected $doc;
protected $processor;
+ protected $insertMode = 0;
+
/**
* Quirks mode is enabled by default. Any document that is missing the
* DT will be considered to be in quirks mode.
@@ -28,10 +58,12 @@ class DOMTreeBuilder implements EventHandler {
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = \DOMImplementation::createDocumentType('html');
- $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
+ //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
+ $this->doc = \DOMImplementation::createDocument(NULL, NULL, $dt);
$this->doc->errors = array();
- $this->current = $this->doc->documentElement;
+ // $this->current = $this->doc->documentElement;
+ $this->current = $this->doc; //->documentElement;
}
/**
@@ -55,20 +87,75 @@ class DOMTreeBuilder implements EventHandler {
// This is used solely for setting quirks mode. Currently we don't
// try to preserve the inbound DT. We convert it to HTML5.
$this->quirks = $quirks;
+
+ if ($this->insertMode > self::IM_INITIAL) {
+ $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);
+ return;
+ }
+
+ $this->insertMode = self::IM_BEFORE_HTML;
}
public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
$lname = $this->normalizeTagName($name);
+ // Make sure we have an html element.
+ if (!$this->doc->documentElement && $name !== 'html') {
+ $this->startTag('html');
+ }
- // XXX: Since we create the root element, we skip this if it occurs
- // inside of the builder. We should probably check to make sure that
- // there is only one element so far, and indicate an error if there
- // is a structural problem.
- if ($lname == 'html') {
- return;
+ // Set quirks mode if we're at IM_INITIAL with no doctype.
+ if ($this->insertMode == self::IM_INITIAL) {
+ $this->quirks = TRUE;
+ $this->parseError("No DOCTYPE specified.");
+ }
+
+ // SPECIAL TAG HANDLING:
+ // Spec says do this, and "don't ask."
+ if ($name == 'image') {
+ $name = 'img';
+ }
+ elseif ($name == 'optgroup' && $this->current->tagName == 'option') {
+ $this->current = $this->current->parentNode;
+ }
+ // TODO: MathML support
+ elseif ($name == 'math') {
+ }
+ // TODO: SVG support.
+ elseif ($name == 'svg') {
+ }
+
+
+ // Autoclose p tags where appropriate.
+ if ($this->insertMode >= self::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
+ $this->autoclose('p');
}
+ // Set insert mode:
+ switch ($name) {
+ case 'html':
+ $this->insertMode = self::IM_BEFORE_HEAD;
+ break;
+ case 'head':
+ if ($this->insertMode > self::IM_BEFORE_HEAD) {
+ $this->parseError("Unexpected head tag outside of head context.");
+ }
+ else {
+ $this->isertMode = self::IM_IN_HEAD;
+ }
+ break;
+ case 'body':
+ $this->insertMode = self::IM_IN_BODY;
+ break;
+ case 'noscript':
+ if ($this->insertMode == self::IM_IN_HEAD) {
+ $this->insertMode = self::IM_IN_HEAD_NOSCRIPT;
+ }
+ break;
+
+ }
+
+
$ele = $this->doc->createElement($lname);
foreach ($attributes as $aName => $aVal) {
$ele->setAttribute($aName, $aVal);
@@ -82,7 +169,9 @@ class DOMTreeBuilder implements EventHandler {
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
- $this->current = $ele;
+ if (!Elements::isA($name, Elements::UNARY_TAG)) {
+ $this->current = $ele;
+ }
// Return the element mask, which the tokenizer can then use to set
// various processing rules.
@@ -91,7 +180,28 @@ class DOMTreeBuilder implements EventHandler {
public function endTag($name) {
$lname = $this->normalizeTagName($name);
- if ($this->current->tagName != $lname) {
+
+ // Ignore closing tags for unary elements.
+ if (Elements::isA($name, Elements::UNARY_TAG)) {
+ return;
+ }
+
+ if ($this->insertMode <= self::IM_BEFORE_HTML) {
+ // 8.2.5.4.2
+ if (in_array($name, array('html', 'br', 'head', 'title'))) {
+ $this->startTag('html');
+ $this->endTag($name);
+ $this->insertMode = self::IM_BEFORE_HEAD;
+ return;
+ }
+
+ // Ignore the tag.
+ $this->parseError("Illegal closing tag at global scope.");
+ return;
+ }
+
+ if ($name != $lname) {
+ fprintf(STDOUT, "Mismatch on %s and %s", $name, $lname);
return $this->quirksTreeResolver($lname);
}
@@ -100,7 +210,25 @@ class DOMTreeBuilder implements EventHandler {
if ($lname == 'html') {
return;
}
- $this->current = $this->current->parentNode;
+
+ //$this->current = $this->current->parentNode;
+ if (!$this->autoclose($name)) {
+ $this->parseError('Could not find closing tag for ' . $name);
+ }
+
+ switch ($this->insertMode) {
+ case "head":
+ $this->insertMode = self::IM_AFTER_HEAD;
+ break;
+ case "body":
+ $this->insertMode = self::IM_AFTER_BODY;
+ break;
+ }
+
+ // 8.2.5.4.7
+ if ($name == 'sarcasm') {
+ $this->text("Take a deep breath.");
+ }
}
public function comment($cdata) {
@@ -110,6 +238,13 @@ class DOMTreeBuilder implements EventHandler {
}
+  /**
+   * Handle character data from the tokenizer.
+   *
+   * While the insertion mode is earlier than IM_IN_HEAD, text is never added
+   * to the tree: whitespace-only data is silently dropped, and any other data
+   * is reported through parseError() and ignored. In later modes the data is
+   * appended to the current node as a text node.
+   *
+   * @param string $data
+   *   The character data.
+   */
   public function text($data) {
+    if ($this->insertMode < self::IM_IN_HEAD) {
+      $data = trim($data);
+      // Compare against '' rather than using empty(): empty("0") is TRUE,
+      // which would let the text "0" through without an error.
+      if ($data !== '') {
+        $this->parseError("Unexpected text. Ignoring: " . $data);
+      }
+      // Nothing is inserted this early, including whitespace. The current
+      // node may still be the document itself here, and a DOMDocument
+      // does not accept a text node as a direct child.
+      return;
+    }
     $node = $this->doc->createTextNode($data);
     $this->current->appendChild($node);
   }
@@ -118,7 +253,7 @@ class DOMTreeBuilder implements EventHandler {
// If the $current isn't the $root, do we need to do anything?
}
+ /**
+ * Record a parse error on the document being built.
+ *
+ * The message is formatted as "Line %d, Col %d: %s" and appended to
+ * $this->doc->errors.
+ *
+ * @param string $msg
+ *   Description of the problem.
+ * @param int $line
+ *   Line number to report; defaults to 0 when omitted.
+ * @param int $col
+ *   Column number to report; defaults to 0 when omitted.
+ */
- public function parseError($msg, $line, $col) {
+ public function parseError($msg, $line = 0, $col = 0) {
$this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
}
@@ -156,4 +291,24 @@ class DOMTreeBuilder implements EventHandler {
throw new \Exception("Not implemented.");
}
+
+  /**
+   * Close the nearest open element named $tag, searching upward from the
+   * current node.
+   *
+   * Starting at $this->current, each ancestor is inspected in turn. On the
+   * first element whose tagName equals $tag, $this->current is moved to that
+   * element's parent. The search gives up as soon as it meets a node that is
+   * not an element, or when there are no more parents.
+   *
+   * @param string $tag
+   *   The tag name to look for.
+   *
+   * @return boolean
+   *   TRUE if a matching element was found and closed, FALSE otherwise.
+   */
+  protected function autoclose($tag) {
+    $node = $this->current;
+    while (TRUE) {
+      // A node that is not an element ends the search.
+      if ($node->nodeType != XML_ELEMENT_NODE) {
+        break;
+      }
+      if ($node->tagName == $tag) {
+        // Closing the element means resuming insertion in its parent.
+        $this->current = $node->parentNode;
+        return TRUE;
+      }
+      $node = $node->parentNode;
+      if (!$node) {
+        break;
+      }
+    }
+    return FALSE;
+  }
+
+
}