diff options
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 53 | ||||
-rw-r--r-- | src/HTML5/Parser/TreeBuildingRules.php | 88 | ||||
-rw-r--r-- | test/HTML5/Parser/TreeBuildingRulesTest.php | 65 |
3 files changed, 192 insertions, 14 deletions
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 1e3faad..7d40d40 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -64,6 +64,9 @@ class DOMTreeBuilder implements EventHandler { // $this->current = $this->doc->documentElement; $this->current = $this->doc; //->documentElement; + + // Create a rules engine for tags. + $this->rules = new TreeBuildingRules($this->doc); } /** @@ -123,15 +126,6 @@ class DOMTreeBuilder implements EventHandler { if ($name == 'image') { $name = 'img'; } - elseif ($name == 'optgroup' && $this->current->tagName == 'option') { - $this->current = $this->current->parentNode; - } - // TODO: MathML support - elseif ($name == 'math') { - } - // TODO: SVG support. - elseif ($name == 'svg') { - } // Autoclose p tags where appropriate. @@ -149,7 +143,7 @@ class DOMTreeBuilder implements EventHandler { $this->parseError("Unexpected head tag outside of head context."); } else { - $this->isertMode = self::IM_IN_HEAD; + $this->insertMode = self::IM_IN_HEAD; } break; case 'body': @@ -174,11 +168,18 @@ class DOMTreeBuilder implements EventHandler { } } - $this->current->appendChild($ele); + // Some elements have special processing rules. Handle those separately. + if ($this->rules->hasRules($name)) { + $this->current = $this->rules->evaluate($ele, $this->current); + } + // Otherwise, it's a standard element. + else { + $this->current->appendChild($ele); - // XXX: Need to handle self-closing tags and unary tags. - if (!Elements::isA($name, Elements::VOID_TAG)) { - $this->current = $ele; + // XXX: Need to handle self-closing tags and unary tags. + if (!Elements::isA($name, Elements::VOID_TAG)) { + $this->current = $ele; + } } // Return the element mask, which the tokenizer can then use to set @@ -333,5 +334,29 @@ class DOMTreeBuilder implements EventHandler { } + /** + * Checks if the given tagname is an ancestor of the present candidate. 
+ * + * If $this->current or anything above $this->current matches the given tag + * name, this returns TRUE. + */ + protected function isAncestor($tagname) { + $candidate = $this->current; + while ($candidate->nodeType === XML_ELEMENT_NODE) { + if ($candidate->tagName == $tagname) { + return TRUE; + } + $candidate = $candidate->parentNode; + } + return FALSE; + } + + /** + * Returns TRUE if the immediate parent element is of the given tagname. + */ + protected function isParent($tagname) { + return $this->current->tagName == $tagname; + } + } diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php new file mode 100644 index 0000000..111da1e --- /dev/null +++ b/src/HTML5/Parser/TreeBuildingRules.php @@ -0,0 +1,88 @@ +<?php +namespace HTML5\Parser; + +use HTML5\Elements; + +/** + * Handles special-case rules for the DOM tree builder. + * + * Many tags have special rules that need to be accommodated on an + * individual basis. This class handles those rules. + * + * See section 8.1.2.4 of the spec. + */ +class TreeBuildingRules { + + protected static $tags = array( + 'li' => 1, + 'dd' => 1, + 'dt' => 1, + 'rt' => 1, + 'rp' => 1, + ); + + /** + * Build a new rules engine. + * + * @param \DOMDocument $doc + * The DOM document to use for evaluation and modification. + */ + public function __construct($doc) { + $this->doc = $doc; + } + + /** + * Returns TRUE if the given tagname has special processing rules. + */ + public function hasRules($tagname) { + return isset(self::$tags[$tagname]); + } + + /** + * Evaluate the rule for the current tag name. + * + * This may modify the existing DOM. + * + * @return \DOMElement + * The new Current DOM element. 
+ */ + public function evaluate($new, $current) { + + switch($new->tagName) { + case 'li': + return $this->handleLI($new, $current); + case 'dt': + case 'dd': + return $this->handleDT($new, $current); + case 'rt': + case 'rp': + return $this->handleRT($new, $current); + + } + + return $current; + } + + protected function handleLI($ele, $current) { + return $this->closeIfCurrentMatches($ele, $current, array('li')); + } + + protected function handleDT($ele, $current) { + return $this->closeIfCurrentMatches($ele, $current, array('dt','dd')); + } + protected function handleRT($ele, $current) { + return $this->closeIfCurrentMatches($ele, $current, array('rt','rp')); + } + + protected function closeIfCurrentMatches($ele, $current, $match) { + $tname = $current->tagName; + if (in_array($current->tagName, $match)) { + $current->parentNode->appendChild($ele); + } + else { + $current->appendChild($ele); + } + return $ele; + + } +} diff --git a/test/HTML5/Parser/TreeBuildingRulesTest.php b/test/HTML5/Parser/TreeBuildingRulesTest.php new file mode 100644 index 0000000..fe02893 --- /dev/null +++ b/test/HTML5/Parser/TreeBuildingRulesTest.php @@ -0,0 +1,65 @@ +<?php +/** + * @file + * Test the Tree Builder's special-case rules. + */ +namespace HTML5\Parser; + +use HTML5\Elements; + +require_once __DIR__ . '/../TestCase.php'; + +/** + * These tests are functional, not necessarily unit tests. + */ +class TreeBuildingRulesTest extends \HTML5\Tests\TestCase { + + const HTML_STUB = '<!DOCTYPE html><html><head><title>test</title></head><body>%s</body></html>'; + + /** + * Convenience function for parsing. 
+ */ + protected function parse($string) { + $treeBuilder = new DOMTreeBuilder(); + $input = new StringInputStream($string); + $scanner = new Scanner($input); + $parser = new Tokenizer($scanner, $treeBuilder); + + $parser->parse(); + + return $treeBuilder->document(); + } + + public function testHasRules() { + $doc = new \DOMDocument('1.0'); + $engine = new TreeBuildingRules($doc); + + $this->assertTrue($engine->hasRules('li')); + $this->assertFalse($engine->hasRules('imaginary')); + } + + public function testHandleLI() { + $html = sprintf(self::HTML_STUB, '<ul id="a"><li>test<li>test2</ul><a></a>'); + $doc = $this->parse($html); + + $list = $doc->getElementById('a'); + + $this->assertEquals(2, $list->childNodes->length); + foreach($list->childNodes as $ele) { + $this->assertEquals('li', $ele->tagName); + } + + } + + public function testHandleDT() { + $html = sprintf(self::HTML_STUB, '<dl id="a"><dt>Hello<dd>Hi</dl><a></a>'); + $doc = $this->parse($html); + + $list = $doc->getElementById('a'); + + $this->assertEquals(2, $list->childNodes->length); + $this->assertEquals('dt', $list->firstChild->tagName); + $this->assertEquals('dd', $list->lastChild->tagName); + } + +} |