diff options
Diffstat (limited to 'src/HTML5')
-rw-r--r-- | src/HTML5/Elements.php | 54 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 24 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 28 |
3 files changed, 80 insertions, 26 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php index fe55d42..4a9afb3 100644 --- a/src/HTML5/Elements.php +++ b/src/HTML5/Elements.php @@ -10,8 +10,9 @@ namespace HTML5; */ class Elements { - const TEXT_RAW = 0x01; - const TEXT_RCDATA = 0x02; + const KNOWN_ELEMENT = 0x01; + const TEXT_RAW = 0x02; + const TEXT_RCDATA = 0x04; const OMIT_START = 0x0a; const OMIT_END = 0x0b; @@ -95,7 +96,7 @@ class Elements { "output" => 1, "p" => 1, "param" => 1, - "pre" => 1, + "pre" => 3, // NORMAL | TEXT_RAW "progress" => 1, "q" => 1, "rp" => 1, @@ -103,7 +104,7 @@ class Elements { "ruby" => 1, "s" => 1, "samp" => 1, - "script" => 1, + "script" => 3, // NORMAL | TEXT_RAW "section" => 1, "select" => 1, "small" => 1, @@ -117,7 +118,7 @@ class Elements { "table" => 1, "tbody" => 1, "td" => 1, - "textarea" => 1, + "textarea" => 5, // NORMAL | TEXT_RCDATA "tfoot" => 1, "th" => 1, "thead" => 1, @@ -278,6 +279,30 @@ class Elements { ); /** + * Check whether the given element meets the given criterion. + * + * Example: + * + * Elements::isA('script', Elements::TEXT_RAW); // Returns true. + * + * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false. + * + * @param string $name + * The element name. + * @param int $mask + * One of the constants on this class. + * @return boolean + * TRUE if the element matches the mask, FALSE otherwise. + */ + public static function isA($name, $mask) { + if (!self::isElement($name)) { + return FALSE; + } + + return (self::element($name) & $mask) == $mask; + } + + /** * Test if an element is a valid html5 element. * * @param string $name @@ -335,7 +360,24 @@ class Elements { * @return bool * True if valid and false otherwise. */ - public function isElement($name) { + public static function isElement($name) { return self::isHtml5Element($name) || self::isMathMLElement($name) || self::isSvgElement($name); } + + /** + * Get the element mask for the given element name. + */ + public static function element($name) { + if (isset(self::$elements[$name])) { + return self::$elements[$name]; + } + if (isset(self::$svg[$name])) { + return self::$svg[$name]; + } + if (isset(self::$mathml[$name])) { + return self::$mathml[$name]; + } + + return FALSE; + } } diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 305a733..cf22953 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -1,6 +1,7 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; /** * Create an HTML5 DOM tree from events. * @@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler { $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); $this->doc->errors = array(); - $this->current = $this->doc->documentElement(); + $this->current = $this->doc->documentElement; + } + + /** + * Get the document. + */ + public function document() { + return $this->doc; } /** @@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler { } $ele = $this->doc->createElement($lname); + foreach ($attributes as $aName => $aVal) { + $ele->setAttribute($aName, $aVal); + + // This is necessary on a non-DTD schema, like HTML5. + if ($aName == 'id') { + $ele->setIdAttribute('id', TRUE); + } + } $this->current->appendChild($ele); // XXX: Need to handle self-closing tags and unary tags. $this->current = $ele; + + // Return the element mask, which the tokenizer can then use to set + // various processing rules. + return Elements::element($name); } public function endTag($name) { $lname = $this->normalizeTagName($name); - if ($this->current->tagName() != $lname) { + if ($this->current->tagName != $lname) { return $this->quirksTreeResolver($lname); } diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index f3e45e1..02e78d9 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -1,6 +1,8 @@ <?php namespace HTML5\Parser; +use HTML5\Elements; + /** * The HTML5 tokenizer. * @@ -40,17 +42,6 @@ class Tokenizer { const WHITE="\t\n\f "; /** - * Textmodes are used to determine how to scan the text inside of tags. - * - * NORMAL: Scan non-elements. - * RAW: Scan until a specific closing tag. - * RCDATA: Scan until a specifc close state. - */ - const TEXTMODE_NORMAL = 0; - const TEXTMODE_RAW = 1; - const TEXTMODE_RCDATA = 2; - - /** * Create a new tokenizer. * * Typically, parsing a document involves creating a new tokenizer, giving @@ -105,13 +96,13 @@ class Tokenizer { * startTag(), but it can also be set manually using this function. * * @param integer $textmode - * One of Tokenizer::TEXTMODE_* + * One of Elements::TEXT_* * @param string $untilTag * The tag that should stop RAW or RCDATA mode. Normal mode does not * use this indicator. */ public function setTextMode($textmode, $untilTag = NULL) { - $this->textMode = $textmode; + $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); $this->untilTag = $untilTag; } @@ -140,17 +131,18 @@ class Tokenizer { /** * Parse anything that looks like character data. * - * Different rules apply based on the current TEXTMODE. + * Different rules apply based on the current text mode. + * + * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. */ protected function characterData() { if ($this->scanner->current() === FALSE) { return FALSE; } switch ($this->textMode) { - case self::TEXTMODE_RAW: - case self::TEXTMODE_RCDATA: + case Elements::TEXT_RAW: + case Elements::TEXT_RCDATA: return $this->rawText(); - case self::TEXTMODE_NORMAL: default: $tok = $this->scanner->current(); if (strspn($tok, "<&")) { @@ -190,7 +182,7 @@ class Tokenizer { $sequence = '</' . $this->untilTag . '>'; $txt = $this->readUntilSequence($sequence); $this->events->text($txt); - $this->setTextMode(self::TEXTMODE_NORMAL); + $this->setTextMode(0); return $this->endTag(); } |