summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/DOMTreeBuilder.php
blob: cf22953710cc63115cafd2dc524f810b36f72a66 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
<?php
namespace HTML5\Parser;

use HTML5\Elements;
/**
 * Create an HTML5 DOM tree from events.
 *
 * This attempts to create a DOM from events emitted by a parser. This 
 * attempts (but does not guarantee) to up-convert older HTML documents 
 * to HTML5. It does this by applying HTML5's rules, but it will not 
 * change the architecture of the document itself.
 */
class DOMTreeBuilder implements EventHandler {
  /**
   * Declared for tracking open elements. No method in this class reads or
   * writes it yet.
   */
  protected $stack = array();

  /**
   * The node that newly created children are appended to (the current
   * position in the tag hierarchy).
   */
  protected $current;

  /**
   * The DOMDocument being built.
   */
  protected $doc;

  /**
   * Optional instruction processor, set via setInstructionProcessor().
   */
  protected $processor;

  /**
   * Quirks mode is enabled by default. Any document that is missing the
   * doctype will be considered to be in quirks mode.
   */
  protected $quirks = TRUE;

  /**
   * Create the target document.
   *
   * The document is always created with an HTML5 doctype and an 'html'
   * root element, and the insertion point starts at that root element.
   */
  public function __construct() {
    // XXX:
    // Create the doctype. For now, we are always creating HTML5
    // documents, and attempting to up-convert any older DTDs to HTML5.
    //
    // createDocumentType() and createDocument() are instance methods.
    // Calling them statically is an Error on PHP 8, so go through a real
    // DOMImplementation object.
    $impl = new \DOMImplementation();
    $dt = $impl->createDocumentType('html');
    $this->doc = $impl->createDocument(NULL, 'html', $dt);

    // Parse errors are collected on the document itself; see parseError().
    // NOTE(review): this is a dynamic property, which newer PHP versions
    // deprecate. It is kept because callers may read $doc->errors.
    $this->doc->errors = array();

    $this->current = $this->doc->documentElement;
  }

  /**
   * Get the document.
   *
   * @return \DOMDocument
   *   The document built so far.
   */
  public function document() {
    return $this->doc;
  }

  /**
   * Provide an instruction processor.
   *
   * This is used for handling Processor Instructions as they are
   * inserted. If omitted, PI's are inserted directly into the DOM tree.
   *
   * @param \HTML5\InstructionProcessor $proc
   *   The processor to hand processing instructions to.
   */
  public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) {
    $this->processor = $proc;
  }

  /**
   * Handle a doctype event.
   *
   * Only the quirks flag is recorded. The name, ID type and ID are
   * ignored because the document is always created as HTML5.
   */
  public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) {
    // This is used solely for setting quirks mode. Currently we don't
    // try to preserve the inbound DT. We convert it to HTML5.
    $this->quirks = $quirks;
  }

  /**
   * Handle a start tag: create the element, attach it, and descend into it.
   *
   * @param string $name
   *   The tag name, possibly with a namespace-style prefix.
   * @param array $attributes
   *   Attribute name => value pairs to set on the element.
   * @param boolean $selfClosing
   *   Currently unused; see the XXX note in the body.
   *
   * @return mixed
   *   Whatever Elements::element() returns for $name, or NULL for 'html'.
   */
  public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
    $lname = $this->normalizeTagName($name);

    // XXX: Since we create the root element, we skip this if it occurs
    // inside of the builder. We should probably check to make sure that
    // there is only one element so far, and indicate an error if there
    // is a structural problem.
    if ($lname === 'html') {
      return;
    }

    $ele = $this->doc->createElement($lname);
    foreach ($attributes as $aName => $aVal) {
      $ele->setAttribute($aName, $aVal);

      // This is necessary on a non-DTD schema, like HTML5.
      // Strict comparison: a numeric attribute name arrives as an integer
      // array key, and a loose comparison against 'id' can match it on
      // older PHP versions.
      if ($aName === 'id') {
        $ele->setIdAttribute('id', TRUE);
      }
    }

    $this->current->appendChild($ele);

    // XXX: Need to handle self-closing tags and unary tags. For now every
    // element becomes the new insertion point, regardless of $selfClosing.
    $this->current = $ele;

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
  }

  /**
   * Handle an end tag by moving the insertion point up to the parent.
   *
   * If the tag does not match the current element, the work is delegated
   * to quirksTreeResolver().
   *
   * @param string $name
   *   The tag name, possibly with a namespace-style prefix.
   */
  public function endTag($name) {
    $lname = $this->normalizeTagName($name);
    if ($this->current->tagName !== $lname) {
      return $this->quirksTreeResolver($lname);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname === 'html') {
      return;
    }
    $this->current = $this->current->parentNode;
  }

  /**
   * Append a comment node at the current insertion point.
   */
  public function comment($cdata) {
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $node = $this->doc->createComment($cdata);
    $this->current->appendChild($node);
  }

  /**
   * Append a text node at the current insertion point.
   */
  public function text($data) {
    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
  }

  /**
   * Handle end of input. Currently a no-op.
   */
  public function eof() {
    // If the $current isn't the $root, do we need to do anything?
  }

  /**
   * Record a parse error on the document's 'errors' list.
   */
  public function parseError($msg, $line, $col) {
    $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
  }

  /**
   * Append a CDATA section at the current insertion point.
   */
  public function cdata($data) {
    $node = $this->doc->createCDATASection($data);
    // The node must be attached, otherwise the CDATA content is lost.
    $this->current->appendChild($node);
  }

  /**
   * Handle a processing instruction.
   *
   * If an instruction processor has been set, it is given the current
   * node, the PI name and the PI data. A non-empty return value becomes
   * the new insertion point. Without a processor, the PI is inserted
   * into the DOM tree as a plain processing-instruction node.
   *
   * @param string $name
   *   The PI target.
   * @param string $data
   *   The PI content, if any.
   */
  public function processingInstruction($name, $data = NULL) {
    // Important: The processor may modify the current DOM tree however
    // it sees fit.
    if (isset($this->processor)) {
      $res = $this->processor->process($this->current, $name, $data);
      if (!empty($res)) {
        $this->current = $res;
      }
      return;
    }

    // No processor: insert the PI directly. NULL data is passed as an
    // empty string so that the DOM call always receives a string.
    $node = $this->doc->createProcessingInstruction($name, $data === NULL ? '' : $data);
    $this->current->appendChild($node);
  }

  // ==========================================================================
  // UTILITIES
  // ==========================================================================

  /**
   * Strip any colon-delimited prefix from a tag name.
   *
   * @param string $name
   *   The raw tag name.
   *
   * @return string
   *   The part after the last colon, or $name unchanged if it has none.
   */
  protected function normalizeTagName($name) {
    if (strpos($name, ':') !== FALSE) {
      // We know from the grammar that there must be at least one other
      // char besides :, since : is not a legal tag start.
      $parts = explode(':', $name);
      return array_pop($parts);
    }

    return $name;
  }

  /**
   * Resolve an end tag that does not match the current element.
   *
   * Not implemented yet.
   *
   * @throws \Exception
   *   Always.
   */
  protected function quirksTreeResolver($name) {
    throw new \Exception("Not implemented.");
  }
}