diff options
author | Matt Butcher <[email protected]> | 2013-04-24 20:09:02 -0500 |
---|---|---|
committer | Matt Butcher <[email protected]> | 2013-04-24 20:09:02 -0500 |
commit | 6815b2bd3a08201f6a75f09f7e24b50c5d3aeab2 (patch) | |
tree | fb547ecaeac2add94e6174dab62158791bff8dcb /src/HTML5/Parser | |
parent | 36d1367a7e365e1f3a4d63161999970799257e42 (diff) | |
parent | fe3d7b815756b6f9ec3bad7c9bfe400b6ea11222 (diff) |
Merge branch 'master' of github.com:technosophos/HTML5-PHP
Diffstat (limited to 'src/HTML5/Parser')
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 24 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 28 |
2 files changed, 32 insertions, 20 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 305a733..cf22953 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -1,6 +1,7 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; /** * Create an HTML5 DOM tree from events. * @@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler { $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement(); + $this->current = $this->doc->documentElement; + } + + /** + * Get the document. + */ + public function document() { + return $this->doc; } /** @@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler { } $ele = $this->doc->createElement($lname); + foreach ($attributes as $aName => $aVal) { + $ele->setAttribute($aName, $aVal); + + // This is necessary on a non-DTD schema, like HTML5. + if ($aName == 'id') { + $ele->setIdAttribute('id', TRUE); + } + } $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. $this->current = $ele; + + // Return the element mask, which the tokenizer can then use to set + // various processing rules. + return Elements::element($name); } public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName() != $lname) { + if ($this->current->tagName != $lname) { return $this->quirksTreeResolver($lname); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index f3e45e1..02e78d9 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -1,6 +1,8 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; + /** * The HTML5 tokenizer. * @@ -40,17 +42,6 @@ class Tokenizer { const WHITE="\t\n\f "; /** - * Textmodes are used to determine how to scan the text inside of tags. - * - * NORMAL: Scan non-elements. - * RAW: Scan until a specific closing tag. - * RCDATA: Scan until a specifc close state. - */ - const TEXTMODE_NORMAL = 0; - const TEXTMODE_RAW = 1; - const TEXTMODE_RCDATA = 2; - - /** * Create a new tokenizer. * * Typically, parsing a document involves creating a new tokenizer, giving @@ -105,13 +96,13 @@ class Tokenizer { * startTag(), but it can also be set manually using this function. * * @param integer $textmode - * One of Tokenizer::TEXTMODE_* + * One of Elements::TEXT_* * @param string $untilTag * The tag that should stop RAW or RCDATA mode. Normal mode does not * use this indicator. */ public function setTextMode($textmode, $untilTag = NULL) { - $this->textMode = $textmode; + $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); $this->untilTag = $untilTag; } @@ -140,17 +131,18 @@ class Tokenizer { /** * Parse anything that looks like character data. * - * Different rules apply based on the current TEXTMODE. + * Different rules apply based on the current text mode. + * + * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. */ protected function characterData() { if ($this->scanner->current() === FALSE) { return FALSE; } switch ($this->textMode) { - case self::TEXTMODE_RAW: - case self::TEXTMODE_RCDATA: + case Elements::TEXT_RAW: + case Elements::TEXT_RCDATA: return $this->rawText(); - case self::TEXTMODE_NORMAL: default: $tok = $this->scanner->current(); if (strspn($tok, "<&")) { @@ -190,7 +182,7 @@ class Tokenizer { $sequence = '</' . $this->untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); - $this->setTextMode(self::TEXTMODE_NORMAL); + $this->setTextMode(0); return $this->endTag(); } |