summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/HTML5/Elements.php 54
-rw-r--r-- src/HTML5/Parser/DOMTreeBuilder.php 24
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 28
-rw-r--r-- test/HTML5/ElementsTest.php 19
-rw-r--r-- test/HTML5/Parser/DOMTreeBuilderTest.php 91
-rw-r--r-- test/HTML5/Parser/EventStack.php 6
6 files changed, 191 insertions, 31 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php
index fe55d42..4a9afb3 100644
--- a/src/HTML5/Elements.php
+++ b/src/HTML5/Elements.php
@@ -10,8 +10,9 @@ namespace HTML5;
*/
class Elements {
- const TEXT_RAW = 0x01;
- const TEXT_RCDATA = 0x02;
+ const KNOWN_ELEMENT = 0x01;
+ const TEXT_RAW = 0x02;
+ const TEXT_RCDATA = 0x04;
const OMIT_START = 0x0a;
const OMIT_END = 0x0b;
@@ -95,7 +96,7 @@ class Elements {
"output" => 1,
"p" => 1,
"param" => 1,
- "pre" => 1,
+ "pre" => 3, // KNOWN_ELEMENT | TEXT_RAW
"progress" => 1,
"q" => 1,
"rp" => 1,
@@ -103,7 +104,7 @@ class Elements {
"ruby" => 1,
"s" => 1,
"samp" => 1,
- "script" => 1,
+ "script" => 3, // KNOWN_ELEMENT | TEXT_RAW
"section" => 1,
"select" => 1,
"small" => 1,
@@ -117,7 +118,7 @@ class Elements {
"table" => 1,
"tbody" => 1,
"td" => 1,
- "textarea" => 1,
+ "textarea" => 5, // KNOWN_ELEMENT | TEXT_RCDATA
"tfoot" => 1,
"th" => 1,
"thead" => 1,
@@ -278,6 +279,30 @@ class Elements {
);
/**
+ * Check whether the given element has every flag in the given mask set.
+ *
+ * Example:
+ *
+ * Elements::isA('script', Elements::TEXT_RAW); // Returns true.
+ *
+ * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false.
+ *
+ * @param string $name
+ * The element name.
+ * @param int $mask
+ * One of the constants on this class, or several combined with bitwise OR.
+ * @return boolean
+ * TRUE if the element is known and all bits of the mask are set, FALSE otherwise.
+ */
+ public static function isA($name, $mask) {
+ if (!self::isElement($name)) {
+ return FALSE;
+ }
+
+ return (self::element($name) & $mask) == $mask;
+ }
+
+ /**
* Test if an element is a valid html5 element.
*
* @param string $name
@@ -335,7 +360,24 @@ class Elements {
* @return bool
* True if valid and false otherwise.
*/
- public function isElement($name) {
+ public static function isElement($name) {
return self::isHtml5Element($name) || self::isMathMLElement($name) || self::isSvgElement($name);
}
+
+ /**
+ * Get the element mask for the given element name, or FALSE if the name is unknown.
+ */
+ public static function element($name) {
+ if (isset(self::$elements[$name])) {
+ return self::$elements[$name];
+ }
+ if (isset(self::$svg[$name])) {
+ return self::$svg[$name];
+ }
+ if (isset(self::$mathml[$name])) {
+ return self::$mathml[$name];
+ }
+
+ return FALSE;
+ }
}
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 305a733..cf22953 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -1,6 +1,7 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
/**
* Create an HTML5 DOM tree from events.
*
@@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler {
$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc->errors = array();
- $this->current = $this->doc->documentElement();
+ $this->current = $this->doc->documentElement;
+ }
+
+ /**
+ * Get the DOM document that this tree builder is populating.
+ */
+ public function document() {
+ return $this->doc;
}
/**
@@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler {
}
$ele = $this->doc->createElement($lname);
+ foreach ($attributes as $aName => $aVal) {
+ $ele->setAttribute($aName, $aVal);
+
+ // This is necessary on a non-DTD schema, like HTML5.
+ if ($aName == 'id') {
+ $ele->setIdAttribute('id', TRUE);
+ }
+ }
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
$this->current = $ele;
+
+ // Return the element mask, which the tokenizer can then use to set
+ // various processing rules.
+ return Elements::element($name);
}
public function endTag($name) {
$lname = $this->normalizeTagName($name);
- if ($this->current->tagName() != $lname) {
+ if ($this->current->tagName != $lname) {
return $this->quirksTreeResolver($lname);
}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index f3e45e1..02e78d9 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -1,6 +1,8 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
+
/**
* The HTML5 tokenizer.
*
@@ -40,17 +42,6 @@ class Tokenizer {
const WHITE="\t\n\f ";
/**
- * Textmodes are used to determine how to scan the text inside of tags.
- *
- * NORMAL: Scan non-elements.
- * RAW: Scan until a specific closing tag.
- * RCDATA: Scan until a specifc close state.
- */
- const TEXTMODE_NORMAL = 0;
- const TEXTMODE_RAW = 1;
- const TEXTMODE_RCDATA = 2;
-
- /**
* Create a new tokenizer.
*
* Typically, parsing a document involves creating a new tokenizer, giving
@@ -105,13 +96,13 @@ class Tokenizer {
* startTag(), but it can also be set manually using this function.
*
* @param integer $textmode
- * One of Tokenizer::TEXTMODE_*
+ * One of Elements::TEXT_*
* @param string $untilTag
* The tag that should stop RAW or RCDATA mode. Normal mode does not
* use this indicator.
*/
public function setTextMode($textmode, $untilTag = NULL) {
- $this->textMode = $textmode;
+ $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); // Keep only the text-mode bits; any other flags in the mask are dropped.
$this->untilTag = $untilTag;
}
@@ -140,17 +131,18 @@ class Tokenizer {
/**
* Parse anything that looks like character data.
*
- * Different rules apply based on the current TEXTMODE.
+ * Different rules apply based on the current text mode.
+ *
+ * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
*/
protected function characterData() {
if ($this->scanner->current() === FALSE) {
return FALSE;
}
switch ($this->textMode) {
- case self::TEXTMODE_RAW:
- case self::TEXTMODE_RCDATA:
+ case Elements::TEXT_RAW:
+ case Elements::TEXT_RCDATA:
return $this->rawText();
- case self::TEXTMODE_NORMAL:
default:
$tok = $this->scanner->current();
if (strspn($tok, "<&")) {
@@ -190,7 +182,7 @@ class Tokenizer {
$sequence = '</' . $this->untilTag . '>';
$txt = $this->readUntilSequence($sequence);
$this->events->text($txt);
- $this->setTextMode(self::TEXTMODE_NORMAL);
+ $this->setTextMode(0);
return $this->endTag();
}
diff --git a/test/HTML5/ElementsTest.php b/test/HTML5/ElementsTest.php
index 20161bb..69d0675 100644
--- a/test/HTML5/ElementsTest.php
+++ b/test/HTML5/ElementsTest.php
@@ -322,4 +322,21 @@ class ElementsTest extends TestCase {
}
}
-} \ No newline at end of file
+ public function testElement() {
+ foreach ($this->html5Elements as $element) {
+ $this->assertGreaterThan(0, Elements::element($element));
+ }
+ $nonhtml5 = array('foo', 'bar', 'baz');
+ foreach ($nonhtml5 as $element) {
+ $this->assertFalse(Elements::element($element));
+ }
+ }
+
+ public function testIsA() {
+ $this->assertTrue(Elements::isA('script', Elements::KNOWN_ELEMENT));
+ $this->assertFalse(Elements::isA('scriptypoo', Elements::KNOWN_ELEMENT));
+ $this->assertTrue(Elements::isA('script', Elements::TEXT_RAW));
+ $this->assertFalse(Elements::isA('script', Elements::TEXT_RCDATA));
+ }
+
+}
diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php
index 6ffae75..a901238 100644
--- a/test/HTML5/Parser/DOMTreeBuilderTest.php
+++ b/test/HTML5/Parser/DOMTreeBuilderTest.php
@@ -1,11 +1,98 @@
<?php
/**
* @file
- * Test the Scanner. This requires the InputStream tests are all good.
+ * Test the Tree Builder.
*/
namespace HTML5\Parser;
+use HTML5\Elements;
+
require_once __DIR__ . '/../TestCase.php';
-class DOMTreeParserTest extends \HTML5\Tests\TestCase {
+/**
+ * These tests are functional, not necessarily unit tests.
+ */
+class DOMTreeBuilderTest extends \HTML5\Tests\TestCase {
+
+ /**
+ * Convenience function: parse an HTML string and return the tree builder's document.
+ */
+ protected function parse($string) {
+ $treeBuilder = new DOMTreeBuilder();
+ $input = new StringInputStream($string);
+ $scanner = new Scanner($input);
+ $parser = new Tokenizer($scanner, $treeBuilder);
+
+ $parser->parse();
+
+ return $treeBuilder->document();
+ }
+
+ public function testDocument() {
+ $html = "<!DOCTYPE html><html></html>";
+ $doc = $this->parse($html);
+
+ $this->assertInstanceOf('\DOMDocument', $doc);
+ $this->assertEquals('html', $doc->documentElement->tagName);
+ }
+
+ public function testElements() {
+ $html = "<!DOCTYPE html><html><head><title></title></head><body></body></html>";
+ $doc = $this->parse($html);
+ $root = $doc->documentElement;
+
+ $this->assertEquals('html', $root->tagName);
+ $this->assertEquals('html', $root->localName);
+ $this->assertEquals('html', $root->nodeName);
+
+ $this->assertEquals(2, $root->childNodes->length);
+ $kids = $root->childNodes;
+
+ $this->assertEquals('head', $kids->item(0)->tagName);
+ $this->assertEquals('body', $kids->item(1)->tagName);
+
+ $head = $kids->item(0);
+ $this->assertEquals(1, $head->childNodes->length);
+ $this->assertEquals('title', $head->childNodes->item(0)->tagName);
+ }
+
+ public function testAttributes() {
+ $html = "<!DOCTYPE html>
+ <html>
+ <head><title></title></head>
+ <body id='a' class='b c'></body>
+ </html>";
+ $doc = $this->parse($html);
+ $root = $doc->documentElement;
+
+ $body = $root->GetElementsByTagName('body')->item(0);
+ $this->assertEquals('body', $body->tagName);
+ $this->assertTrue($body->hasAttributes());
+ $this->assertEquals('a', $body->getAttribute('id'));
+ $this->assertEquals('b c', $body->getAttribute('class'));
+
+ $body2 = $doc->getElementById('a');
+ $this->assertEquals('body', $body2->tagName);
+ $this->assertEquals('a', $body2->getAttribute('id'));
+ }
+
+ public function testComment() {
+ $this->markTestIncomplete("Incomplete.");
+ }
+
+ public function testCDATA() {
+ $this->markTestIncomplete("Incomplete.");
+ }
+
+ public function testText() {
+ $this->markTestIncomplete("Incomplete.");
+ }
+
+ public function testParseErrors() {
+ $this->markTestIncomplete("Incomplete.");
+ }
+
+ public function testProcessingInstruction() {
+ $this->markTestIncomplete("Incomplete.");
+ }
}
diff --git a/test/HTML5/Parser/EventStack.php b/test/HTML5/Parser/EventStack.php
index c9ac20e..1f56ea9 100644
--- a/test/HTML5/Parser/EventStack.php
+++ b/test/HTML5/Parser/EventStack.php
@@ -1,13 +1,15 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
+
/**
* This testing class gathers events from a parser and builds a stack of events.
* It is useful for checking the output of a tokenizer.
*
* IMPORTANT:
*
- * The startTag event also kicks the parser into TEXTMODE_RAW when it encounters
+ * The startTag event also kicks the parser into TEXT_RAW when it encounters
* script or pre tags. This is to match the behavior required by the HTML5 spec,
* which says that the tree builder must tell the tokenizer when to switch states.
*/
@@ -49,7 +51,7 @@ class EventStack implements EventHandler {
$args = func_get_args();
$this->store('startTag', $args);
if ($name == 'pre' || $name == 'script') {
- return Tokenizer::TEXTMODE_RAW;
+ return Elements::TEXT_RAW;
}
}