From b34490f3a1173ae2700b050868eb0301af49a22a Mon Sep 17 00:00:00 2001 From: Technosophos Date: Wed, 24 Apr 2013 16:34:12 -0500 Subject: Unit tests for DOMTreeBuilder begun. Minor bugs fixed, too. --- src/HTML5/Parser/DOMTreeBuilder.php | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'src/HTML5/Parser') diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index be9fa23..2807790 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -30,7 +30,14 @@ class DOMTreeBuilder implements EventHandler { $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement(); + $this->current = $this->doc->documentElement; + } + + /** + * Get the document. + */ + public function document() { + return $this->doc; } /** @@ -71,7 +78,7 @@ class DOMTreeBuilder implements EventHandler { public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName() != $lname) { + if ($this->current->tagName != $lname) { return $this->quirksTreeResolver($lname); } -- cgit v1.2.3 From 5bad030cd96865fd819ef477716decde07b04593 Mon Sep 17 00:00:00 2001 From: Technosophos Date: Wed, 24 Apr 2013 16:43:55 -0500 Subject: Moved to Elements::TEXT_RAW for Tokenizer. --- src/HTML5/Parser/Tokenizer.php | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'src/HTML5/Parser') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index f3e45e1..c835120 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -1,6 +1,8 @@ textMode = $textmode; + $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); $this->untilTag = $untilTag; } @@ -140,17 +143,18 @@ class Tokenizer { /** * Parse anything that looks like character data. * - * Different rules apply based on the current TEXTMODE. + * Different rules apply based on the current text mode. + * + * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. */ protected function characterData() { if ($this->scanner->current() === FALSE) { return FALSE; } switch ($this->textMode) { - case self::TEXTMODE_RAW: - case self::TEXTMODE_RCDATA: + case Elements::TEXT_RAW: + case Elements::TEXT_RCDATA: return $this->rawText(); - case self::TEXTMODE_NORMAL: default: $tok = $this->scanner->current(); if (strspn($tok, "<&")) { @@ -190,7 +194,7 @@ class Tokenizer { $sequence = 'untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); - $this->setTextMode(self::TEXTMODE_NORMAL); + $this->setTextMode(0); return $this->endTag(); } -- cgit v1.2.3 From 102c57cc46df3b2dfcb435e9e51b8f733f11b741 Mon Sep 17 00:00:00 2001 From: Technosophos Date: Wed, 24 Apr 2013 16:44:29 -0500 Subject: Removed deprecated constants. --- src/HTML5/Parser/Tokenizer.php | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'src/HTML5/Parser') diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index c835120..02e78d9 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -41,18 +41,6 @@ class Tokenizer { const WHITE="\t\n\f "; - /** - * Textmodes are used to determine how to scan the text inside of tags. - * - * NORMAL: Scan non-elements. - * RAW: Scan until a specific closing tag. - * RCDATA: Scan until a specifc close state. - *//* - const TEXTMODE_NORMAL = 0; - const TEXTMODE_RAW = 1; - const TEXTMODE_RCDATA = 2; - */ - /** * Create a new tokenizer. * -- cgit v1.2.3 From fe3d7b815756b6f9ec3bad7c9bfe400b6ea11222 Mon Sep 17 00:00:00 2001 From: Technosophos Date: Wed, 24 Apr 2013 17:33:08 -0500 Subject: Added attribute handling. --- src/HTML5/Parser/DOMTreeBuilder.php | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'src/HTML5/Parser') diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 2807790..29e83c6 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -1,6 +1,7 @@ doc->createElement($lname); + foreach ($attributes as $aName => $aVal) { + $ele->setAttribute($aName, $aVal); + + // This is necessary on a non-DTD schema, like HTML5. + if ($aName == 'id') { + $ele->setIdAttribute('id', TRUE); + } + } $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. $this->current = $ele; + + // Return the element mask, which the tokenizer can then use to set + // various processing rules. + return Elements::element($name); } public function endTag($name) { -- cgit v1.2.3