summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Matt Butcher <[email protected]> 2013-04-24 20:09:02 -0500
committer Matt Butcher <[email protected]> 2013-04-24 20:09:02 -0500
commit 6815b2bd3a08201f6a75f09f7e24b50c5d3aeab2 (patch)
tree fb547ecaeac2add94e6174dab62158791bff8dcb /src
parent 36d1367a7e365e1f3a4d63161999970799257e42 (diff)
parent fe3d7b815756b6f9ec3bad7c9bfe400b6ea11222 (diff)
Merge branch 'master' of github.com:technosophos/HTML5-PHP
Diffstat (limited to 'src')
-rw-r--r-- src/HTML5/Elements.php 54
-rw-r--r-- src/HTML5/Parser/DOMTreeBuilder.php 24
-rw-r--r-- src/HTML5/Parser/Tokenizer.php 28
3 files changed, 80 insertions, 26 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php
index fe55d42..4a9afb3 100644
--- a/src/HTML5/Elements.php
+++ b/src/HTML5/Elements.php
@@ -10,8 +10,9 @@ namespace HTML5;
*/
class Elements {
- const TEXT_RAW = 0x01;
- const TEXT_RCDATA = 0x02;
+ const KNOWN_ELEMENT = 0x01;
+ const TEXT_RAW = 0x02;
+ const TEXT_RCDATA = 0x04;
const OMIT_START = 0x0a;
const OMIT_END = 0x0b;
@@ -95,7 +96,7 @@ class Elements {
"output" => 1,
"p" => 1,
"param" => 1,
- "pre" => 1,
+ "pre" => 3, // KNOWN_ELEMENT | TEXT_RAW
"progress" => 1,
"q" => 1,
"rp" => 1,
@@ -103,7 +104,7 @@ class Elements {
"ruby" => 1,
"s" => 1,
"samp" => 1,
- "script" => 1,
+ "script" => 3, // KNOWN_ELEMENT | TEXT_RAW
"section" => 1,
"select" => 1,
"small" => 1,
@@ -117,7 +118,7 @@ class Elements {
"table" => 1,
"tbody" => 1,
"td" => 1,
- "textarea" => 1,
+ "textarea" => 5, // KNOWN_ELEMENT | TEXT_RCDATA
"tfoot" => 1,
"th" => 1,
"thead" => 1,
@@ -278,6 +279,30 @@ class Elements {
);
/**
+ * Check whether the given element meets the given criterion.
+ *
+ * Example:
+ *
+ * Elements::isA('script', Elements::TEXT_RAW); // Returns true.
+ *
+ * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false.
+ *
+ * @param string $name
+ * The element name.
+ * @param int $mask
+ * One of the constants on this class, or several combined with bitwise OR; every bit set in the mask must be set on the element.
+ * @return boolean
+ * TRUE if the element matches the mask, FALSE otherwise.
+ */
+ public static function isA($name, $mask) {
+ if (!self::isElement($name)) {
+ return FALSE;
+ }
+
+ return (self::element($name) & $mask) == $mask;
+ }
+
+ /**
* Test if an element is a valid html5 element.
*
* @param string $name
@@ -335,7 +360,24 @@ class Elements {
* @return bool
* True if valid and false otherwise.
*/
- public function isElement($name) {
+ public static function isElement($name) {
return self::isHtml5Element($name) || self::isMathMLElement($name) || self::isSvgElement($name);
}
+
+ /**
+ * Get the element mask for the given element name, or FALSE if the name is in none of the element, SVG, or MathML tables.
+ */
+ public static function element($name) {
+ if (isset(self::$elements[$name])) {
+ return self::$elements[$name];
+ }
+ if (isset(self::$svg[$name])) {
+ return self::$svg[$name];
+ }
+ if (isset(self::$mathml[$name])) {
+ return self::$mathml[$name];
+ }
+
+ return FALSE;
+ }
}
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 305a733..cf22953 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -1,6 +1,7 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
/**
* Create an HTML5 DOM tree from events.
*
@@ -30,7 +31,14 @@ class DOMTreeBuilder implements EventHandler {
$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc->errors = array();
- $this->current = $this->doc->documentElement();
+ $this->current = $this->doc->documentElement;
+ }
+
+ /**
+ * Get the document.
+ */
+ public function document() {
+ return $this->doc;
}
/**
@@ -62,16 +70,28 @@ class DOMTreeBuilder implements EventHandler {
}
$ele = $this->doc->createElement($lname);
+ foreach ($attributes as $aName => $aVal) {
+ $ele->setAttribute($aName, $aVal);
+
+ // HTML5 has no DTD to declare "id" as an ID-type attribute, so flag it explicitly; otherwise ID lookups such as getElementById() will not find it.
+ if ($aName == 'id') {
+ $ele->setIdAttribute('id', TRUE);
+ }
+ }
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
$this->current = $ele;
+
+ // Return the element mask, which the tokenizer can then use to set
+ // various processing rules.
+ return Elements::element($name);
}
public function endTag($name) {
$lname = $this->normalizeTagName($name);
- if ($this->current->tagName() != $lname) {
+ if ($this->current->tagName != $lname) {
return $this->quirksTreeResolver($lname);
}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index f3e45e1..02e78d9 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -1,6 +1,8 @@
<?php
namespace HTML5\Parser;
+use HTML5\Elements;
+
/**
* The HTML5 tokenizer.
*
@@ -40,17 +42,6 @@ class Tokenizer {
const WHITE="\t\n\f ";
/**
- * Textmodes are used to determine how to scan the text inside of tags.
- *
- * NORMAL: Scan non-elements.
- * RAW: Scan until a specific closing tag.
- * RCDATA: Scan until a specifc close state.
- */
- const TEXTMODE_NORMAL = 0;
- const TEXTMODE_RAW = 1;
- const TEXTMODE_RCDATA = 2;
-
- /**
* Create a new tokenizer.
*
* Typically, parsing a document involves creating a new tokenizer, giving
@@ -105,13 +96,13 @@ class Tokenizer {
* startTag(), but it can also be set manually using this function.
*
* @param integer $textmode
- * One of Tokenizer::TEXTMODE_*
+ * One of Elements::TEXT_*
* @param string $untilTag
* The tag that should stop RAW or RCDATA mode. Normal mode does not
* use this indicator.
*/
public function setTextMode($textmode, $untilTag = NULL) {
- $this->textMode = $textmode;
+ $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA);
$this->untilTag = $untilTag;
}
@@ -140,17 +131,18 @@ class Tokenizer {
/**
* Parse anything that looks like character data.
*
- * Different rules apply based on the current TEXTMODE.
+ * Different rules apply based on the current text mode.
+ *
+ * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
*/
protected function characterData() {
if ($this->scanner->current() === FALSE) {
return FALSE;
}
switch ($this->textMode) {
- case self::TEXTMODE_RAW:
- case self::TEXTMODE_RCDATA:
+ case Elements::TEXT_RAW:
+ case Elements::TEXT_RCDATA:
return $this->rawText();
- case self::TEXTMODE_NORMAL:
default:
$tok = $this->scanner->current();
if (strspn($tok, "<&")) {
@@ -190,7 +182,7 @@ class Tokenizer {
$sequence = '</' . $this->untilTag . '>';
$txt = $this->readUntilSequence($sequence);
$this->events->text($txt);
- $this->setTextMode(self::TEXTMODE_NORMAL);
+ $this->setTextMode(0);
return $this->endTag();
}