summary | refs | log | tree | commit | diff
path: root/src/HTML5/Parser
diff options
context:
space:
mode:
author Matt Butcher <[email protected]> 2013-04-24 20:09:02 -0500
committer Matt Butcher <[email protected]> 2013-04-24 20:09:02 -0500
commit 6815b2bd3a08201f6a75f09f7e24b50c5d3aeab2 (patch)
tree fb547ecaeac2add94e6174dab62158791bff8dcb /src/HTML5/Parser
parent 36d1367a7e365e1f3a4d63161999970799257e42 (diff)
parent fe3d7b815756b6f9ec3bad7c9bfe400b6ea11222 (diff)
Merge branch 'master' of github.com:technosophos/HTML5-PHP
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r-- src/HTML5/Parser/DOMTreeBuilder.php | 24
-rw-r--r-- src/HTML5/Parser/Tokenizer.php | 28
2 files changed, 32 insertions, 20 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 305a733..cf22953 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -1,6 +1,7 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
/**
* Create an HTML5 DOM tree from events.
*
@@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler {
$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc->errors = array();
- $this->current = $this->doc->documentElement();
+ $this->current = $this->doc->documentElement;
+ }
+
+ /**
+ * Get the document.
+ */
+ public function document() {
+ return $this->doc;
}
/**
@@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler {
}
$ele = $this->doc->createElement($lname);
+ foreach ($attributes as $aName => $aVal) {
+ $ele->setAttribute($aName, $aVal);
+
+ // This is necessary on a non-DTD schema, like HTML5.
+ if ($aName == 'id') {
+ $ele->setIdAttribute('id', TRUE);
+ }
+ }
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
$this->current = $ele;
+
+ // Return the element mask, which the tokenizer can then use to set
+ // various processing rules.
+ return Elements::element($name);
}
public function endTag($name) {
$lname = $this->normalizeTagName($name);
- if ($this->current->tagName() != $lname) {
+ if ($this->current->tagName != $lname) {
return $this->quirksTreeResolver($lname);
}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index f3e45e1..02e78d9 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -1,6 +1,8 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
+
/**
* The HTML5 tokenizer.
*
@@ -40,17 +42,6 @@ class Tokenizer {
const WHITE="\t\n\f ";
/**
- * Textmodes are used to determine how to scan the text inside of tags.
- *
- * NORMAL: Scan non-elements.
- * RAW: Scan until a specific closing tag.
- * RCDATA: Scan until a specifc close state.
- */
- const TEXTMODE_NORMAL = 0;
- const TEXTMODE_RAW = 1;
- const TEXTMODE_RCDATA = 2;
-
- /**
* Create a new tokenizer.
*
* Typically, parsing a document involves creating a new tokenizer, giving
@@ -105,13 +96,13 @@ class Tokenizer {
* startTag(), but it can also be set manually using this function.
*
* @param integer $textmode
- * One of Tokenizer::TEXTMODE_*
+ * One of Elements::TEXT_*
* @param string $untilTag
* The tag that should stop RAW or RCDATA mode. Normal mode does not
* use this indicator.
*/
public function setTextMode($textmode, $untilTag = NULL) {
- $this->textMode = $textmode;
+ $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA);
$this->untilTag = $untilTag;
}
@@ -140,17 +131,18 @@ class Tokenizer {
/**
* Parse anything that looks like character data.
*
- * Different rules apply based on the current TEXTMODE.
+ * Different rules apply based on the current text mode.
+ *
+ * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
*/
protected function characterData() {
if ($this->scanner->current() === FALSE) {
return FALSE;
}
switch ($this->textMode) {
- case self::TEXTMODE_RAW:
- case self::TEXTMODE_RCDATA:
+ case Elements::TEXT_RAW:
+ case Elements::TEXT_RCDATA:
return $this->rawText();
- case self::TEXTMODE_NORMAL:
default:
$tok = $this->scanner->current();
if (strspn($tok, "<&")) {
@@ -190,7 +182,7 @@ class Tokenizer {
$sequence = '</' . $this->untilTag . '>';
$txt = $this->readUntilSequence($sequence);
$this->events->text($txt);
- $this->setTextMode(self::TEXTMODE_NORMAL);
+ $this->setTextMode(0);
return $this->endTag();
}