diff options
Diffstat (limited to 'src/HTML5')
-rw-r--r-- | src/HTML5/Serializer/OutputRules.php | 203 | ||||
-rw-r--r-- | src/HTML5/Serializer/Serializer.php | 16 | ||||
-rw-r--r-- | src/HTML5/Serializer/Traverser.php | 254 |
3 files changed, 230 insertions, 243 deletions
<?php
// Provenance: src/HTML5/Serializer/OutputRules.php, introduced as a new file
// (mode 100644, blob 5780d61) by the diff this listing was taken from.
namespace HTML5\Serializer;

use \HTML5\Elements;

/**
 * Output rules used when serializing a DOM tree.
 *
 * The traverser decides which node is visited next; this class decides what
 * is written to the output stream for each kind of node. Element children are
 * handed back to the traverser so that it keeps control of the walk.
 */
class OutputRules {

  /**
   * The traverser driving the walk.
   *
   * It must provide node(), children() and isLocalElement(), which are the
   * only methods this class calls on it.
   */
  protected $traverser;

  /**
   * Whether text is fully entity-encoded (TRUE) or only escaped (FALSE).
   */
  protected $encode = FALSE;

  /**
   * The writable stream resource that receives the output.
   */
  protected $out;

  const DOCTYPE = '<!DOCTYPE html>';

  /**
   * Create a new set of output rules.
   *
   * @param mixed $traverser
   *   The traverser that walks the DOM and calls back into these rules.
   * @param resource $output
   *   A stream that allows writing.
   * @param array $options
   *   Key/value options. Only 'encode' is read here; when set, its value
   *   replaces the default (FALSE) for full entity encoding.
   */
  public function __construct($traverser, $output, $options = array()) {
    $this->traverser = $traverser;

    if (isset($options['encode'])) {
      $this->encode = $options['encode'];
    }

    $this->out = $output;
  }

  /**
   * Write a whole document: the doctype, the root element, and a newline.
   *
   * @param \DOMDocument $dom
   *   The document whose documentElement is serialized.
   */
  public function document($dom) {
    $this->doctype();
    $this->traverser->node($dom->documentElement);
    $this->nl();
  }

  /**
   * Write the HTML5 doctype followed by a newline.
   */
  protected function doctype() {
    $this->wr(self::DOCTYPE);
    $this->nl();
  }

  /**
   * Write an element: opening tag, children, and (unless void) closing tag.
   *
   * @param \DOMElement $ele
   *   The element to write.
   */
  public function element($ele) {
    $name = $ele->tagName;

    // Per spec:
    // If the element has a declared namespace in the HTML, MathML or
    // SVG namespaces, we use the lname instead of the tagName.
    // Note that $name is only used for the void-element check below; the
    // tags themselves are still written from tagName (see openTag()).
    if ($this->traverser->isLocalElement($ele)) {
      $name = $ele->localName;
    }

    $this->openTag($ele);

    // Handle children.
    if ($ele->hasChildNodes()) {
      $this->traverser->children($ele->childNodes);
    }

    // If not unary, add a closing tag.
    if (!Elements::isA($name, Elements::VOID_TAG)) {
      $this->closeTag($ele);
    }
  }

  /**
   * Write a text node.
   *
   * Text whose parent element is a raw-text element (Elements::TEXT_RAW) is
   * written verbatim; all other text goes through enc().
   *
   * @param \DOMText $ele
   *   The text node to write.
   */
  public function text($ele) {
    // Only elements have a tagName. A text node may also sit directly under
    // a document fragment or have no parent at all, so check the type first.
    $parent = $ele->parentNode;
    $raw = $parent instanceof \DOMElement
      && Elements::isA($parent->tagName, Elements::TEXT_RAW);

    // Use the node's own data rather than wholeText: wholeText is the
    // concatenation of every logically adjacent text/CDATA node, so each
    // node in a non-normalized run would re-emit the entire run.
    if ($raw) {
      $this->wr($ele->data);
      return;
    }

    // FIXME: This probably needs some flags set.
    $this->wr($this->enc($ele->data));
  }

  /**
   * Write a CDATA section, unescaped, inside its CDATA markers.
   *
   * @param \DOMCdataSection $ele
   *   The CDATA section to write.
   */
  public function cdata($ele) {
    // Same reasoning as text(): data is this node only, wholeText would pull
    // in adjacent text nodes as well.
    $this->wr('<![CDATA[')->wr($ele->data)->wr(']]>');
  }

  /**
   * Write a comment node, unescaped, inside comment markers.
   *
   * @param \DOMComment $ele
   *   The comment to write.
   */
  public function comment($ele) {
    $this->wr('<!--')->wr($ele->data)->wr('-->');
  }

  /**
   * Write a processing instruction as "<?target data ?>".
   *
   * @param \DOMProcessingInstruction $ele
   *   The processing instruction to write.
   */
  public function processorInstruction($ele) {
    $this->wr('<?')->wr($ele->target)->wr(' ')->wr($ele->data)->wr(' ?>');
  }

  /**
   * Write the opening tag.
   *
   * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
   * qualified name (8.3).
   *
   * @param \DOMNode $ele
   *   The element being written.
   */
  protected function openTag($ele) {
    // FIXME: Needs support for SVG, MathML, and namespaced XML.
    $this->wr('<')->wr($ele->tagName);
    $this->attrs($ele);
    $this->wr('>');
  }

  /**
   * Write the attributes of an element as ' name="value"' pairs.
   *
   * Attribute values are passed through enc().
   *
   * @param \DOMNode $ele
   *   The element whose attributes are written.
   *
   * @return HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  protected function attrs($ele) {
    // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
    if (!$ele->hasAttributes()) {
      return $this;
    }

    // TODO: Currently, this always writes name="value", and does not do
    // value-less attributes.
    foreach ($ele->attributes as $node) {
      $val = $this->enc($node->value);

      // XXX: The spec says that we need to ensure that anything in
      // the XML, XMLNS, or XLink NS's should use the canonical
      // prefix. It seems that DOM does this for us already, but there
      // may be exceptions.
      $this->wr(' ')->wr($node->name)->wr('="')->wr($val)->wr('"');
    }

    // Return $this on every path so the method chains consistently.
    return $this;
  }

  /**
   * Write the closing tag.
   *
   * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
   * qualified name (8.3).
   *
   * @param \DOMNode $ele
   *   The element being written.
   */
  protected function closeTag($ele) {
    // FIXME: Needs support for SVG, MathML, and namespaced XML.
    $this->wr('</')->wr($ele->tagName)->wr('>');
  }

  /**
   * Write to the output.
   *
   * @param string $text
   *   The string to put into the output.
   *
   * @return HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  protected function wr($text) {
    fwrite($this->out, $text);
    return $this;
  }

  /**
   * Write a new line character.
   *
   * @return HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  protected function nl() {
    fwrite($this->out, PHP_EOL);
    return $this;
  }

  /**
   * Encode text.
   *
   * True encoding will turn all named character references into their entities.
   * This includes such characters as +.# and many other common ones. By default
   * encoding here will just escape &'<>".
   *
   * Note, PHP 5.4+ has better html5 encoding.
   *
   * @todo Use the Entities class in php 5.3 to have html5 entities.
   *
   * @param string $text
   *   text to encode.
   *
   * @return string
   *   The encoded text.
   */
  protected function enc($text) {
    $flags = ENT_QUOTES;

    // Escape rather than encode all entities.
    if (!$this->encode) {
      return htmlspecialchars($text, $flags, 'UTF-8');
    }

    // If we are in PHP 5.4+ we can use the native html5 entity functionality.
    if (defined('ENT_HTML5')) {
      $flags = ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES;
      // The final FALSE disables double encoding of existing entities.
      $ret = htmlentities($text, $flags, 'UTF-8', FALSE);
    }
    // If a version earlier than 5.4 html5 entities are not entirely handled.
    // This manually handles them.
    else {
      $ret = strtr($text, \HTML5\Serializer\HTML5Entities::$map);
    }
    return $ret;
  }

}
\ No newline at end of file diff --git a/src/HTML5/Serializer/Serializer.php b/src/HTML5/Serializer/Serializer.php index 3f9187f..7053df5 100644 --- a/src/HTML5/Serializer/Serializer.php +++ b/src/HTML5/Serializer/Serializer.php @@ -15,8 +15,7 @@ namespace HTML5\Serializer; */ class Serializer { protected $dom; - protected $pretty = TRUE; - protected $encode = FALSE; + protected $options = array(); /** * Create a serializer. @@ -36,14 +35,7 @@ class Serializer { */ public function __construct($dom, $options = array()) { $this->dom = $dom; - - if (isset($options['format']) && is_bool($options['format'])) { - $this->pretty = $options['format']; - } - - if (isset($options['encode']) && is_bool($options['encode'])) { - $this->encode = $options['encode']; - } + $this->options = $options; } /** @@ -63,9 +55,7 @@ class Serializer { else { $file = fopen($filename, 'w'); } - $trav = new Traverser($this->dom, $file); - $trav->formatOutput($this->pretty); - $trav->encodeOutput($this->encode); + $trav = new Traverser($this->dom, $file, $this->options); $trav->walk(); diff --git a/src/HTML5/Serializer/Traverser.php b/src/HTML5/Serializer/Traverser.php index 68dea82..bd9d1ce 100644 --- a/src/HTML5/Serializer/Traverser.php +++ b/src/HTML5/Serializer/Traverser.php @@ -1,8 +1,6 @@ <?php namespace HTML5\Serializer; -use \HTML5\Elements; - /** * Traverser for walking a DOM tree. * @@ -22,11 +20,10 @@ class Traverser { ); protected $dom; - protected $out; - protected $pretty = TRUE; + protected $options; protected $encode = FALSE; - - const DOCTYPE = '<!DOCTYPE html>'; + protected $rules; + protected $out; /** * Create a traverser. @@ -36,44 +33,19 @@ class Traverser { * @param resource $out * A stream that allows writing. The traverser will output into this * stream. + * @param array $options + * An array or options for the traverser as key/value pairs. These include: + * - encode: A bool to specify if full encding should happen for all named + * charachter references. 
Defaults to FALSE which escapes &'<>". + * - rules: The path to the class handling the output rules. */ - public function __construct($dom, $out) { + public function __construct($dom, $out, $options = array()) { $this->dom = $dom; $this->out = $out; - } + $this->options = $options; - /** - * Determine whether output should be formatted. - * - * IMPORTANT: Neither option will GUARANTEE that the spacing of the output - * will exactly match the spacing of an origin document. The HTML5 specification - * does not require any such behavior. - * - * Semantically (according to the HTML5 spec's definition), either flag - * will produce an identical document. (Insignificant - * whitespace does not impact semantics). - * - * @param boolean $useFormatting - * If TRUE (default) output will be formatted. If FALSE, - * the little or no formatting is done. - */ - public function formatOutput($useFormatting = TRUE) { - $this->pretty = $useFormatting; - } - - /** - * Set whether encoding should encode all html5 entities. - * - * True encoding will turn all named character references into their entities. - * This includes such characters as +.# and many other common ones. By default - * encoding here will just escape &'<>". which is what most users expect. - * - * @param bool $encode - * Whether to encode all html5 entities. Defaults to FALSE where only - * &'<>". are escaped. - */ - public function encodeOutput($encode = FALSE) { - $this->encode = $encode; + $rulesClass = $this->options['rules']; + $this->rules = new $rulesClass($this, $out, $this->options); } /** @@ -85,8 +57,7 @@ class Traverser { public function walk() { // If DOMDocument, start with the DOCTYPE and travers. 
if ($this->dom instanceof \DOMDocument) { - $this->doctype(); - $this->document($this->dom); + $this->rules->document($this->dom); } // If NodeList, loop elseif ($this->dom instanceof \DOMNodeList) { @@ -100,40 +71,30 @@ class Traverser { return $this->out; } - protected function doctype() { - $this->wr(self::DOCTYPE); - $this->nl(); - } - - protected function document($node) { - $this->node($node->documentElement); - $this->nl(); - } - /** * Process a node in the DOM. * * @param mixed $node * A node implementing \DOMNode. */ - protected function node($node) { + public function node($node) { // A listing of types is at http://php.net/manual/en/dom.constants.php switch ($node->nodeType) { case XML_ELEMENT_NODE: - $this->element($node); + $this->rules->element($node); break; case XML_TEXT_NODE: - $this->text($node); + $this->rules->text($node); break; case XML_CDATA_SECTION_NODE: - $this->cdata($node); + $this->rules->cdata($node); break; // FIXME: It appears that the parser doesn't do PI's. case XML_PI_NODE: - $this->processorInstruction($ele); + $this->rules->processorInstruction($ele); break; case XML_COMMENT_NODE: - $this->comment($node); + $this->rules->comment($node); break; // Currently we don't support embedding DTDs. default: @@ -142,186 +103,19 @@ class Traverser { } } - protected function element($ele) { - $name = $ele->tagName; - $block = $this->pretty && Elements::isA($name, Elements::BLOCK_TAG); - - // Per spec: - // If the element has a declared namespace in the HTML, MathML or - // SVG namespaces, we use the lname instead of the tagName. - if ($this->isLocalElement($ele)) { - $name = $ele->localName; - } - - // TODO: Really need to fix the spacing. - // Add a newline for a block element. - if ($block) $this->nl(); - - $this->openTag($ele); - - // Handle children. - if ($ele->hasChildNodes()) { - $this->children($ele->childNodes); - } - - // If not unary, add a closing tag. 
- if (!Elements::isA($name, Elements::VOID_TAG)) { - $this->closeTag($ele); - if ($block) $this->nl(); - } - } - /** - * Write a text node. + * Walk through all the nodes on a node list. * - * @param \DOMText $ele - * The text node to write. + * @param \DOMNodeList $nl + * A list of child elements to walk through. */ - protected function text($ele) { - if (isset($ele->parentNode) && Elements::isA($ele->parentNode->tagName, Elements::TEXT_RAW)) { - $this->wr($ele->wholeText); - return; - } - - // FIXME: This probably needs some flags set. - $this->wr($this->enc($ele->wholeText)); - - } - - protected function cdata($ele) { - $this->wr('<![CDATA[')->wr($ele->wholeText)->wr(']]>'); - } - - protected function comment($ele) { - $this->wr('<!--')->wr($ele->data)->wr('-->'); - } - - protected function processorInstruction($ele) { - $this->wr('<?')->wr($ele->target)->wr(' ')->wr($ele->data)->wr(' ?>'); - } - - protected function children($nl) { + public function children($nl) { foreach ($nl as $node) { $this->node($node); } } /** - * Write the opening tag. - * - * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the - * qualified name (8.3). - * - * @param \DOMNode $ele - * The element being written. - */ - protected function openTag($ele) { - // FIXME: Needs support for SVG, MathML, and namespaced XML. - $this->wr('<')->wr($ele->tagName); - $this->attrs($ele); - $this->wr('>'); - } - - protected function attrs($ele) { - // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements. - if (!$ele->hasAttributes()) { - return $this; - } - - // TODO: Currently, this always writes name="value", and does not do - // value-less attributes. - $map = $ele->attributes; - $len = $map->length; - for ($i = 0; $i < $len; ++$i) { - $node = $map->item($i); - $val = $this->enc($node->value); - - // XXX: The spec says that we need to ensure that anything in - // the XML, XMLNS, or XLink NS's should use the canonical - // prefix. 
It seems that DOM does this for us already, but there - // may be exceptions. - $this->wr(' ')->wr($node->name)->wr('="')->wr($val)->wr('"'); - } - } - - /** - * Write the closing tag. - * - * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the - * qualified name (8.3). - * - * @param \DOMNode $ele - * The element being written. - */ - protected function closeTag($ele) { - // FIXME: Needs support for SVG, MathML, and namespaced XML. - $this->wr('</')->wr($ele->tagName)->wr('>'); - } - - /** - * Write to the output. - * - * @param string $text - * The string to put into the output. - * - * @return HTML5\Serializer\Traverser - * $this so it can be used in chaining. - */ - protected function wr($text) { - fwrite($this->out, $text); - return $this; - } - - /** - * Write a new line character. - * - * @return HTML5\Serializer\Traverser - * $this so it can be used in chaining. - */ - protected function nl() { - fwrite($this->out, PHP_EOL); - return $this; - } - - /** - * Encode text. - * - * True encoding will turn all named character references into their entities. - * This includes such characters as +.# and many other common ones. By default - * encoding here will just escape &'<>". - * - * Note, PHP 5.4+ has better html5 encoding. - * - * @todo Use the Entities class in php 5.3 to have html5 entities. - * - * @param string $text - * text to encode. - * - * @return string - * The encoded text. - */ - protected function enc($text) { - $flags = ENT_QUOTES; - - // Escape rather than encode all entities. - if (!$this->encode) { - return htmlspecialchars($text, $flags, 'UTF-8'); - } - - // If we are in PHP 5.4+ we can use the native html5 entity functionality. - if (defined('ENT_HTML5')) { - $flags = ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES; - $ret = htmlentities($text, $flags, 'UTF-8', FALSE); - } - // If a version earlier than 5.4 html5 entities are not entirely handled. - // This manually handles them. 
- else { - $ret = strtr($text, \HTML5\Serializer\HTML5Entities::$map); - } - return $ret; - } - - /** * Is an element local? * * @param mixed $ele @@ -330,7 +124,7 @@ class Traverser { * @return bool * True if local and false otherwise. */ - protected function isLocalElement($ele) { + public function isLocalElement($ele) { $uri = $ele->namespaceURI; if (empty($uri)) { return FALSE; |