diff options
-rw-r--r-- | src/HTML5/Elements.php | 54 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 24 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 28 | ||||
-rw-r--r-- | test/HTML5/ElementsTest.php | 19 | ||||
-rw-r--r-- | test/HTML5/Parser/DOMTreeBuilderTest.php | 91 | ||||
-rw-r--r-- | test/HTML5/Parser/EventStack.php | 6 |
6 files changed, 191 insertions, 31 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php index fe55d42..4a9afb3 100644 --- a/src/HTML5/Elements.php +++ b/src/HTML5/Elements.php @@ -10,8 +10,9 @@ namespace HTML5; */ class Elements { - const TEXT_RAW = 0x01; - const TEXT_RCDATA = 0x02; + const KNOWN_ELEMENT = 0x01; + const TEXT_RAW = 0x02; + const TEXT_RCDATA = 0x04; const OMIT_START = 0x0a; const OMIT_END = 0x0b; @@ -95,7 +96,7 @@ class Elements { "output" => 1, "p" => 1, "param" => 1, - "pre" => 1, + "pre" => 3, // NORMAL | TEXT_RAW "progress" => 1, "q" => 1, "rp" => 1, @@ -103,7 +104,7 @@ class Elements { "ruby" => 1, "s" => 1, "samp" => 1, - "script" => 1, + "script" => 3, // NORMAL | TEXT_RAW "section" => 1, "select" => 1, "small" => 1, @@ -117,7 +118,7 @@ class Elements { "table" => 1, "tbody" => 1, "td" => 1, - "textarea" => 1, + "textarea" => 5, // NORMAL | TEXT_RCDATA "tfoot" => 1, "th" => 1, "thead" => 1, @@ -278,6 +279,30 @@ class Elements { ); /** + * Check whether the given element meets the given criterion. + * + * Example: + * + * Elements::isA('script', Elements::TEXT_RAW); // Returns true. + * + * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false. + * + * @param string $name + * The element name. + * @param int $mask + * One of the constants on this class. + * @return boolean + * TRUE if the element matches the mask, FALSE otherwise. + */ + public static function isA($name, $mask) { + if (!self::isElement($name)) { + return FALSE; + } + + return (self::element($name) & $mask) == $mask; + } + + /** * Test if an element is a valid html5 element. * * @param string $name @@ -335,7 +360,24 @@ class Elements { * @return bool * True if valid and false otherwise. */ - public function isElement($name) { + public static function isElement($name) { return self::isHtml5Element($name) || self::isMathMLElement($name) || self::isSvgElement($name); } + + /** + * Get the element mask for the given element name. + */ + public static function element($name) { + if (isset(self::$elements[$name])) { + return self::$elements[$name]; + } + if (isset(self::$svg[$name])) { + return self::$svg[$name]; + } + if (isset(self::$mathml[$name])) { + return self::$mathml[$name]; + } + + return FALSE; + } } diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 305a733..cf22953 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -1,6 +1,7 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; /** * Create an HTML5 DOM tree from events. * @@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler { $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement(); + $this->current = $this->doc->documentElement; + } + + /** + * Get the document. + */ + public function document() { + return $this->doc; } /** @@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler { } $ele = $this->doc->createElement($lname); + foreach ($attributes as $aName => $aVal) { + $ele->setAttribute($aName, $aVal); + + // This is necessary on a non-DTD schema, like HTML5. + if ($aName == 'id') { + $ele->setIdAttribute('id', TRUE); + } + } $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. $this->current = $ele; + + // Return the element mask, which the tokenizer can then use to set + // various processing rules. + return Elements::element($name); } public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName() != $lname) { + if ($this->current->tagName != $lname) { return $this->quirksTreeResolver($lname); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index f3e45e1..02e78d9 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -1,6 +1,8 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; + /** * The HTML5 tokenizer. * @@ -40,17 +42,6 @@ class Tokenizer { const WHITE="\t\n\f "; /** - * Textmodes are used to determine how to scan the text inside of tags. - * - * NORMAL: Scan non-elements. - * RAW: Scan until a specific closing tag. - * RCDATA: Scan until a specifc close state. - */ - const TEXTMODE_NORMAL = 0; - const TEXTMODE_RAW = 1; - const TEXTMODE_RCDATA = 2; - - /** * Create a new tokenizer. * * Typically, parsing a document involves creating a new tokenizer, giving @@ -105,13 +96,13 @@ class Tokenizer { * startTag(), but it can also be set manually using this function. * * @param integer $textmode - * One of Tokenizer::TEXTMODE_* + * One of Elements::TEXT_* * @param string $untilTag * The tag that should stop RAW or RCDATA mode. Normal mode does not * use this indicator. */ public function setTextMode($textmode, $untilTag = NULL) { - $this->textMode = $textmode; + $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); $this->untilTag = $untilTag; } @@ -140,17 +131,18 @@ class Tokenizer { /** * Parse anything that looks like character data. * - * Different rules apply based on the current TEXTMODE. + * Different rules apply based on the current text mode. + * + * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. */ protected function characterData() { if ($this->scanner->current() === FALSE) { return FALSE; } switch ($this->textMode) { - case self::TEXTMODE_RAW: - case self::TEXTMODE_RCDATA: + case Elements::TEXT_RAW: + case Elements::TEXT_RCDATA: return $this->rawText(); - case self::TEXTMODE_NORMAL: default: $tok = $this->scanner->current(); if (strspn($tok, "<&")) { @@ -190,7 +182,7 @@ class Tokenizer { $sequence = '</' . $this->untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); - $this->setTextMode(self::TEXTMODE_NORMAL); + $this->setTextMode(0); return $this->endTag(); } diff --git a/test/HTML5/ElementsTest.php b/test/HTML5/ElementsTest.php index 20161bb..69d0675 100644 --- a/test/HTML5/ElementsTest.php +++ b/test/HTML5/ElementsTest.php @@ -322,4 +322,21 @@ class ElementsTest extends TestCase { } } -}
\ No newline at end of file + public function testElement() { + foreach ($this->html5Elements as $element) { + $this->assertGreaterThan(0, Elements::element($element)); + } + $nonhtml5 = array('foo', 'bar', 'baz'); + foreach ($nonhtml5 as $element) { + $this->assertFalse(Elements::element($element)); + } + } + + public function testIsA() { + $this->assertTrue(Elements::isA('script', Elements::KNOWN_ELEMENT)); + $this->assertFalse(Elements::isA('scriptypoo', Elements::KNOWN_ELEMENT)); + $this->assertTrue(Elements::isA('script', Elements::TEXT_RAW)); + $this->assertFalse(Elements::isA('script', Elements::TEXT_RCDATA)); + } + +} diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 6ffae75..a901238 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -1,11 +1,98 @@ <?php /** * @file - * Test the Scanner. This requires the InputStream tests are all good. + * Test the Tree Builder. */ namespace HTML5\Parser; +use HTML5\Elements; + require_once __DIR__ . '/../TestCase.php'; -class DOMTreeParserTest extends \HTML5\Tests\TestCase { +/** + * These tests are functional, not necessarily unit tests. + */ +class DOMTreeBuilderTest extends \HTML5\Tests\TestCase { + + /** + * Convenience function for parsing. + */ + protected function parse($string) { + $treeBuilder = new DOMTreeBuilder(); + $input = new StringInputStream($string); + $scanner = new Scanner($input); + $parser = new Tokenizer($scanner, $treeBuilder); + + $parser->parse(); + + return $treeBuilder->document(); + } + + public function testDocument() { + $html = "<!DOCTYPE html><html></html>"; + $doc = $this->parse($html); + + $this->assertInstanceOf('\DOMDocument', $doc); + $this->assertEquals('html', $doc->documentElement->tagName); + } + + public function testElements() { + $html = "<!DOCTYPE html><html><head><title></title></head><body></body></html>"; + $doc = $this->parse($html); + $root = $doc->documentElement; + + $this->assertEquals('html', $root->tagName); + $this->assertEquals('html', $root->localName); + $this->assertEquals('html', $root->nodeName); + + $this->assertEquals(2, $root->childNodes->length); + $kids = $root->childNodes; + + $this->assertEquals('head', $kids->item(0)->tagName); + $this->assertEquals('body', $kids->item(1)->tagName); + + $head = $kids->item(0); + $this->assertEquals(1, $head->childNodes->length); + $this->assertEquals('title', $head->childNodes->item(0)->tagName); + } + + public function testAttributes() { + $html = "<!DOCTYPE html> + <html> + <head><title></title></head> + <body id='a' class='b c'></body> + </html>"; + $doc = $this->parse($html); + $root = $doc->documentElement; + + $body = $root->GetElementsByTagName('body')->item(0); + $this->assertEquals('body', $body->tagName); + $this->assertTrue($body->hasAttributes()); + $this->assertEquals('a', $body->getAttribute('id')); + $this->assertEquals('b c', $body->getAttribute('class')); + + $body2 = $doc->getElementById('a'); + $this->assertEquals('body', $body2->tagName); + $this->assertEquals('a', $body2->getAttribute('id')); + } + + public function testComment() { + $this->markTestIncomplete("Incomplete."); + } + + public function testCDATA() { + $this->markTestIncomplete("Incomplete."); + } + + public function testText() { + $this->markTestIncomplete("Incomplete."); + } + + public function testParseErrors() { + $this->markTestIncomplete("Incomplete."); + } + + public function testProcessingInstruction() { + $this->markTestIncomplete("Incomplete."); + } } diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php index c9ac20e..1f56ea9 100644 --- a/test/HTML5/Parser/EventStack.php +++ b/test/HTML5/Parser/EventStack.php @@ -1,13 +1,15 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; + /** * This testing class gathers events from a parser and builds a stack of events. * It is useful for checking the output of a tokenizer. * * IMPORTANT: * - * The startTag event also kicks the parser into TEXTMODE_RAW when it encounters + * The startTag event also kicks the parser into TEXT_RAW when it encounters * script or pre tags. This is to match the behavior required by the HTML5 spec, * which says that the tree builder must tell the tokenizer when to switch states. */ @@ -49,7 +51,7 @@ class EventStack implements EventHandler { $args = func_get_args(); $this->store('startTag', $args); if ($name == 'pre' || $name == 'script') { - return Tokenizer::TEXTMODE_RAW; + return Elements::TEXT_RAW; } } |