1,
    'body' => 1,
    'head' => 1,
    'p' => 1,
    'div' => 1,
    'h1' => 1,
    'h2' => 1,
    'h3' => 1,
    'h4' => 1,
    'h5' => 1,
    'h6' => 1,
    'title' => 1,
    'script' => 1,
    'link' => 1,
    'meta' => 1,
    'section' => 1,
    'article' => 1,
    'table' => 1,
    'tbody' => 1,
    'tr' => 1,
    'th' => 1,
    'td' => 1,
    //'form' => 1,
  );

  // TODO: Refactor this into an element mask.
  /**
   * Literal elements.
   *
   * Text nodes whose parent is one of these elements are written to the
   * output as-is, without entity encoding (see text() and isLiteral()).
   */
  static $literal_elements = array(
    'style' => 1,
    'script' => 1,
    'xmp' => 1,
    'iframe' => 1,
    'noembed' => 1,
    'noframes' => 1,
    'plaintext' => 1,
  );

  /**
   * Unary elements.
   * HTML5 section 8.3:
   * If current node is an
   * area, base, basefont, bgsound, br, col, command, embed, frame, hr, img,
   * input, keygen, link, meta, param, source, track or wbr element, then
   * continue on to the next child node at this point.
   *
   * element() does not write a closing tag for any name listed here.
   */
  static $unary_elements = array(
    'area' => 1,
    'base' => 1,
    'basefont' => 1,
    'bgsound' => 1,
    'br' => 1,
    'col' => 1,
    'command' => 1,
    'embed' => 1,
    'frame' => 1,
    'hr' => 1,
    'img' => 1,
    'input' => 1,
    'keygen' => 1,
    'link' => 1,
    'meta' => 1,
    'param' => 1,
    'source' => 1,
    'track' => 1,
    'wbr' => 1,
  );

  /** Namespaces that should be treated as "local" to HTML5. */
  static $local_ns = array(
    'http://www.w3.org/1999/xhtml' => 'html',
    'http://www.w3.org/1998/Math/MathML' => 'mathml',
    'http://www.w3.org/2000/svg' => 'svg',
  );

  /** The document, node list, or node handed to the constructor. */
  protected $dom;

  /** The writable stream that all output is sent to. */
  protected $out;

  /** Whether newlines are added around block elements. */
  protected $pretty = TRUE;

  /** The doctype declaration written before a full document. */
  const DOCTYPE = '<!DOCTYPE html>';

  /**
   * Create a traverser.
   *
   * @param DOMNode|DOMNodeList $dom
   *   The document or node to traverse.
   * @param resource $out
   *   A stream that allows writing. The traverser will output into this
   *   stream.
   */
  public function __construct($dom, $out) {
    $this->dom = $dom;
    $this->out = $out;
  }

  /**
   * Determine whether output should be formatted.
   *
   * IMPORTANT: Neither option will GUARANTEE that the spacing of the output
   * will exactly match the spacing of an origin document. The HTML5
   * specification does not require any such behavior.
   *
   * Semantically (according to the HTML5 spec's definition), either flag
   * will produce an identical document. (Insignificant whitespace does not
   * impact semantics.)
   *
   * @param boolean $useFormatting
   *   If TRUE (default), output will be formatted. If FALSE, little or no
   *   formatting is done.
   */
  public function formatOutput($useFormatting = TRUE) {
    $this->pretty = $useFormatting;
  }

  /**
   * Tell the traverser to walk the DOM.
   *
   * @return resource $out
   *   Returns the output stream.
   */
  public function walk() {
    // If DOMDocument, start with the DOCTYPE and traverse.
    if ($this->dom instanceof \DOMDocument) {
      $this->doctype();
      $this->document($this->dom);
    }
    // If NodeList, serialize each node in order.
    elseif ($this->dom instanceof \DOMNodeList) {
      $this->children($this->dom);
    }
    // Else assume this is a DOMNode-like datastructure.
    else {
      $this->node($this->dom);
    }

    return $this->out;
  }

  /**
   * Write the doctype declaration followed by a newline.
   */
  protected function doctype() {
    $this->wr(self::DOCTYPE);
    $this->nl();
  }

  /**
   * Write the document's root element followed by a newline.
   *
   * @param DOMDocument $node
   *   The document to serialize.
   */
  protected function document($node) {
    // A document without a root element has nothing to traverse.
    if ($node->documentElement) {
      $this->node($node->documentElement);
    }
    $this->nl();
  }

  /**
   * Dispatch a node to the writer for its node type.
   *
   * Node types without a case below produce no output.
   *
   * @param DOMNode $node
   *   The node to serialize.
   */
  protected function node($node) {
    switch ($node->nodeType) {
      case XML_ELEMENT_NODE:
        $this->element($node);
        break;
      case XML_TEXT_NODE:
        $this->text($node);
        break;
      case XML_CDATA_SECTION_NODE:
        $this->cdata($node);
        break;
      // FIXME: It appears that the parser doesn't do PI's.
      case XML_PI_NODE:
        $this->processorInstruction($node);
        break;
      case XML_COMMENT_NODE:
        $this->comment($node);
        break;
      // Currently we don't support embedding DTDs.
      default:
        break;
    }
  }

  /**
   * Write an element: opening tag, children, and (unless unary) closing tag.
   *
   * @param DOMElement $ele
   *   The element to serialize.
   */
  protected function element($ele) {
    $name = $ele->tagName;
    $block = $this->pretty && $this->isBlock($name);

    // Per spec:
    // If the element has a declared namespace in the HTML, MathML or
    // SVG namespaces, we use the lname instead of the tagName.
    if ($this->isLocalElement($ele)) {
      $name = $ele->localName;
    }

    // TODO: Really need to fix the spacing.
    // Add a newline for a block element.
    if ($block) {
      $this->nl();
    }

    $this->openTag($ele);

    // Handle children.
    if ($ele->hasChildNodes()) {
      $this->children($ele->childNodes);
    }

    // If not unary, add a closing tag.
    if (!$this->isUnary($name)) {
      $this->closeTag($ele);
      if ($block) {
        $this->nl();
      }
    }
  }

  /**
   * Write a text node.
   *
   * Text inside a literal element is written verbatim; all other text is
   * entity-encoded first.
   *
   * @param DOMText $ele
   *   The text node to serialize.
   */
  protected function text($ele) {
    // Use the node's own data rather than wholeText: wholeText also includes
    // the content of logically adjacent text nodes, which are visited (and
    // written) on their own.
    if ($this->isLiteral($ele)) {
      $this->wr($ele->data);
      return;
    }

    // FIXME: This probably needs some flags set.
    $this->wr($this->enc($ele->data));
  }

  /**
   * Write a CDATA section with its content unencoded.
   *
   * @param DOMCdataSection $ele
   *   The CDATA section to serialize.
   */
  protected function cdata($ele) {
    $this->wr('<![CDATA[')->wr($ele->data)->wr(']]>');
  }

  /**
   * Write a comment node.
   *
   * @param DOMComment $ele
   *   The comment to serialize.
   */
  protected function comment($ele) {
    $this->wr('<!--')->wr($ele->data)->wr('-->');
  }

  /**
   * Write a processing instruction as its target and data.
   *
   * @param DOMProcessingInstruction $ele
   *   The processing instruction to serialize.
   */
  protected function processorInstruction($ele) {
    $this->wr('<?')->wr($ele->target)->wr(' ')->wr($ele->data)->wr(' ?>');
  }

  /**
   * Serialize every node of a list, in order.
   *
   * @param DOMNodeList $nl
   *   The nodes to serialize.
   */
  protected function children($nl) {
    foreach ($nl as $node) {
      $this->node($node);
    }
  }

  /**
   * Write an opening tag, including the element's attributes.
   *
   * @param DOMElement $ele
   *   The element whose opening tag is written.
   */
  protected function openTag($ele) {
    // FIXME: Needs support for SVG, MathML, and namespaced XML.
    $this->wr('<')->wr($ele->tagName);
    $this->attrs($ele);
    $this->wr('>');
  }

  /**
   * Write all attributes of an element as name="value" pairs.
   *
   * Each pair is preceded by a single space and the value is entity-encoded.
   *
   * @param DOMElement $ele
   *   The element whose attributes are written.
   * @return $this
   */
  protected function attrs($ele) {
    // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
    if (!$ele->hasAttributes()) {
      return $this;
    }

    // TODO: Currently, this always writes name="value", and does not do
    // value-less attributes.
    foreach ($ele->attributes as $attr) {
      $val = $this->enc($attr->value);

      // XXX: The spec says that we need to ensure that anything in
      // the XML, XMLNS, or XLink NS's should use the canonical
      // prefix. It seems that DOM does this for us already, but there
      // may be exceptions.
      $this->wr(' ')->wr($attr->name)->wr('="')->wr($val)->wr('"');
    }

    return $this;
  }

  /**
   * Write a closing tag.
   *
   * @param DOMElement $ele
   *   The element whose closing tag is written.
   */
  protected function closeTag($ele) {
    // FIXME: Needs support for SVG, MathML, and namespaced XML.
    $this->wr('</')->wr($ele->tagName)->wr('>');
  }

  /**
   * Write text to the output stream.
   *
   * @param string $text
   *   The text to write.
   * @return $this
   */
  protected function wr($text) {
    fwrite($this->out, $text);
    return $this;
  }

  /**
   * Write a platform newline (PHP_EOL) to the output stream.
   *
   * @return $this
   */
  protected function nl() {
    fwrite($this->out, PHP_EOL);
    return $this;
  }

  /**
   * Entity-encode text for output.
   *
   * @param string $text
   *   UTF-8 text to encode.
   * @return string
   *   The encoded text.
   */
  protected function enc($text) {
    // Quotes must always be encoded because attrs() wraps values in double
    // quotes; the HTML5 flags are added on top when the runtime has them.
    $flags = ENT_QUOTES;

    // TODO: Verify on PHP 5.4 that this works as desired.
    if (defined('ENT_HTML5')) {
      $flags |= ENT_HTML5 | ENT_SUBSTITUTE;
    }

    return htmlentities($text, $flags, 'UTF-8');
  }

  /**
   * Is an unary tag.
   */
  protected function isUnary($name) {
    return isset(self::$unary_elements[$name]);
  }

  /**
   * Is block element.
   */
  protected function isBlock($name) {
    return isset(self::$block_elements[$name]);
  }

  /**
   * Is the node a child of a literal element.
   *
   * @param DOMNode $element
   *   The node whose parent is checked.
   * @return boolean
   *   TRUE if the parent is an element listed in $literal_elements.
   */
  protected function isLiteral($element) {
    $parent = $element->parentNode;

    // Only element parents carry a tagName; documents and fragments do not.
    if (!$parent || $parent->nodeType !== XML_ELEMENT_NODE) {
      return FALSE;
    }

    return isset(self::$literal_elements[$parent->tagName]);
  }

  /**
   * Is the element in one of the namespaces listed in $local_ns.
   *
   * @param DOMElement $ele
   *   The element to check.
   * @return boolean
   *   FALSE when the element has no namespace URI or the URI is not listed.
   */
  protected function isLocalElement($ele) {
    $uri = $ele->namespaceURI;
    if (empty($uri)) {
      return FALSE;
    }
    return isset(self::$local_ns[$uri]);
  }
}