diff options
Diffstat (limited to 'src/HTML5')
-rw-r--r-- | src/HTML5/InstructionProcessor.php | 43 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 117 |
2 files changed, 159 insertions, 1 deletions
<?php
// -----------------------------------------------------------------------------
// File: src/HTML5/InstructionProcessor.php
// -----------------------------------------------------------------------------
/**
 * A handler for processor instructions.
 */
namespace HTML5;

/**
 * Provide a processor to handle embedded instructions.
 *
 * XML defines a mechanism for inserting instructions (like PHP) into a
 * document. These are called "Processor Instructions." The HTML5 parser
 * provides an opportunity to handle these processor instructions during
 * the tree-building phase (before the DOM is constructed), which makes
 * it possible to alter the document as it is being created.
 *
 * One could, for example, use this mechanism to execute well-formed PHP
 * code embedded inside of an HTML5 document.
 */
interface InstructionProcessor {

  /**
   * Process an individual processing instruction.
   *
   * The process() function is responsible for doing the following:
   * - Determining whether $name is an instruction type it can handle.
   * - Determining what to do with the data passed in.
   * - Making any subsequent modifications to the DOM by modifying the
   *   DOMElement or its attached DOM tree.
   *
   * @param \DOMElement $element
   *   The parent element for the current processing instruction.
   * @param string $name
   *   The instruction's name. E.g. `<?php` has the name `php`.
   * @param string $data
   *   All of the data between the opening and closing PI marks.
   * @return \DOMElement
   *   The element that should be considered "Current". This may just be
   *   the element passed in, but if the processor added more elements,
   *   it may choose to reset the current element to one of the elements
   *   it created. (When in doubt, return the element passed in.)
   */
  public function process(\DOMElement $element, $name, $data);
}

// -----------------------------------------------------------------------------
// File: src/HTML5/Parser/DOMTreeBuilder.php
// -----------------------------------------------------------------------------
namespace HTML5\Parser;

/**
 * Create an HTML5 DOM tree from events.
 *
 * This attempts to create a DOM from events emitted by a parser. This
 * attempts (but does not guarantee) to up-convert older HTML documents
 * to HTML5. It does this by applying HTML5's rules, but it will not
 * change the architecture of the document itself.
 */
class DOMTreeBuilder implements EventHandler {

  /**
   * Reserved for tracking open elements.
   *
   * NOTE(review): no method in this class reads or writes this yet.
   */
  protected $stack = array();

  /**
   * Pointer in the tag hierarchy: the node new children are appended to.
   */
  protected $current;

  /**
   * The DOMDocument being built.
   */
  protected $doc;

  /**
   * Optional \HTML5\InstructionProcessor used for processing instructions.
   */
  protected $processor;

  /**
   * Quirks mode is enabled by default. Any document that is missing the
   * DT will be considered to be in quirks mode.
   */
  protected $quirks = TRUE;

  /**
   * Create the target document and point $current at its root element.
   */
  public function __construct() {
    // XXX:
    // Create the doctype. For now, we are always creating HTML5
    // documents, and attempting to up-convert any older DTDs to HTML5.
    //
    // createDocumentType() and createDocument() are instance methods, so
    // they are invoked on a DOMImplementation object rather than statically.
    $impl = new \DOMImplementation();
    $dt = $impl->createDocumentType('html');
    $this->doc = $impl->createDocument(NULL, 'html', $dt);

    // Parse errors are collected on the document itself; see parseError().
    $this->doc->errors = array();

    // documentElement is a property of DOMDocument, not a method.
    $this->current = $this->doc->documentElement;
  }

  /**
   * Provide an instruction processor.
   *
   * This is used for handling Processor Instructions as they are
   * inserted. If omitted, PI's are inserted directly into the DOM tree.
   *
   * @param \HTML5\InstructionProcessor $proc
   *   The processor that processingInstruction() should delegate to.
   */
  public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) {
    $this->processor = $proc;
  }

  /**
   * Handle a doctype event.
   *
   * This is used solely for setting quirks mode. Currently we don't
   * try to preserve the inbound DT. We convert it to HTML5.
   *
   * @param string $name
   *   The doctype name (unused).
   * @param int $idType
   *   The identifier type (unused).
   * @param string $id
   *   The identifier (unused).
   * @param boolean $quirks
   *   Whether the document should be treated as quirks mode.
   */
  public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) {
    $this->quirks = $quirks;
  }

  /**
   * Handle a start tag: create the element and descend into it.
   *
   * NOTE(review): $attributes and $selfClosing are accepted but not used
   * here yet; attributes are not copied onto the new element.
   *
   * @param string $name
   *   The tag name, optionally carrying a namespace-style prefix.
   * @param array $attributes
   *   The tag's attributes (currently ignored).
   * @param boolean $selfClosing
   *   Whether the tag was self-closing (currently ignored).
   */
  public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
    $lname = $this->normalizeTagName($name);

    // XXX: Since we create the root element, we skip this if it occurs
    // inside of the builder. We should probably check to make sure that
    // there is only one element so far, and indicate an error if there
    // is a structural problem.
    if ($lname === 'html') {
      return;
    }

    $ele = $this->doc->createElement($lname);
    $this->current->appendChild($ele);

    // XXX: Need to handle self-closing tags and unary tags.
    $this->current = $ele;
  }

  /**
   * Handle an end tag: move the pointer back up to the parent element.
   *
   * @param string $name
   *   The tag name, optionally carrying a namespace-style prefix.
   *
   * @throws \Exception
   *   Via quirksTreeResolver() when the end tag does not match the
   *   current element.
   */
  public function endTag($name) {
    $lname = $this->normalizeTagName($name);

    // tagName is a property of DOMElement, not a method.
    if ($this->current->tagName !== $lname) {
      return $this->quirksTreeResolver($lname);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname === 'html') {
      return;
    }

    $this->current = $this->current->parentNode;
  }

  /**
   * Append a comment node to the current element.
   *
   * @param string $cdata
   *   The comment text.
   */
  public function comment($cdata) {
    $node = $this->doc->createComment($cdata);
    $this->current->appendChild($node);
  }

  /**
   * Append a text node to the current element.
   *
   * @param string $data
   *   The character data.
   */
  public function text($data) {
    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
  }

  /**
   * Handle the end of the input stream.
   */
  public function eof() {
    // If the $current isn't the $root, do we need to do anything?
  }

  /**
   * Record a parse error on the document's `errors` list.
   *
   * @param string $msg
   *   The error message.
   * @param int $line
   *   The line on which the error occurred.
   * @param int $col
   *   The column at which the error occurred.
   */
  public function parseError($msg, $line, $col) {
    $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
  }

  /**
   * Append a CDATA section to the current element.
   *
   * @param string $data
   *   The contents of the CDATA section.
   */
  public function cdata($data) {
    $node = $this->doc->createCDATASection($data);
    // The node must be attached, or the CDATA content is lost.
    $this->current->appendChild($node);
  }

  /**
   * Handle a processing instruction.
   *
   * When an instruction processor has been set, the instruction is handed
   * to it, and any non-empty result becomes the new current element.
   * Otherwise the instruction is inserted directly into the DOM tree as a
   * plain processing-instruction node.
   *
   * @param string $name
   *   The instruction's name (its target).
   * @param string $data
   *   The instruction's data, or NULL if there is none.
   */
  public function processingInstruction($name, $data = NULL) {
    // Important: The processor may modify the current DOM tree however
    // it sees fit.
    if (isset($this->processor)) {
      $res = $this->processor->process($this->current, $name, $data);
      if (!empty($res)) {
        $this->current = $res;
      }
      return;
    }

    // No processor: this is just a dumb PI node.
    $node = $this->doc->createProcessingInstruction($name, $data === NULL ? '' : $data);
    $this->current->appendChild($node);
  }

  // ==========================================================================
  // UTILITIES
  // ==========================================================================

  /**
   * Strip any colon-delimited prefix from a tag name.
   *
   * @param string $name
   *   The raw tag name.
   * @return string
   *   The portion of $name after the last colon, or $name unchanged when it
   *   contains no colon.
   */
  protected function normalizeTagName($name) {
    if (strpos($name, ':') !== FALSE) {
      // We know from the grammar that there must be at least one other
      // char besides :, since : is not a legal tag start.
      $parts = explode(':', $name);
      return array_pop($parts);
    }

    return $name;
  }

  /**
   * Resolve an end tag that does not match the current element.
   *
   * @param string $name
   *   The normalized name of the unexpected end tag.
   *
   * @throws \Exception
   *   Always; resolution is not implemented.
   */
  protected function quirksTreeResolver($name) {
    throw new \Exception("Not implemented.");
  }
}