summaryrefslogtreecommitdiff
path: root/src/HTML5/Parser/DOMTreeBuilder.php
blob: be9fa23879f7937271c5af128718f889e2b0e790 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
<?php
namespace HTML5\Parser;

/**
 * Create an HTML5 DOM tree from events.
 *
 * This builder receives parser events (start tags, end tags, text,
 * comments, and so on) and assembles them into a DOMDocument. It
 * attempts (but does not guarantee) to up-convert older HTML documents
 * to HTML5 by applying HTML5's rules, without changing the architecture
 * of the document itself.
 *
 * The document is always created with an HTML5 doctype and an <html>
 * root element. Parse errors reported through parseError() are
 * collected on the document's `errors` property.
 */
class DOMTreeBuilder implements EventHandler {
  /**
   * Reserved for tracking open elements. Not used by any method in this
   * class yet.
   */
  protected $stack = array();

  /**
   * The node that newly created children are appended to (the current
   * position in the tag hierarchy).
   */
  protected $current;

  /**
   * The DOMDocument being built.
   */
  protected $doc;

  /**
   * Optional instruction processor, set via setInstructionProcessor().
   */
  protected $processor;

  /**
   * Quirks mode is enabled by default. Any document that is missing the
   * DT will be considered to be in quirks mode.
   */
  protected $quirks = TRUE;

  /**
   * Create the empty HTML5 document and point $current at its root.
   */
  public function __construct() {
    // XXX:
    // Create the doctype. For now, we are always creating HTML5
    // documents, and attempting to up-convert any older DTDs to HTML5.
    //
    // DOMImplementation's factory methods are instance methods; calling
    // them statically is an Error on PHP 8, so create an instance first.
    $impl = new \DOMImplementation();
    $dt = $impl->createDocumentType('html');
    $this->doc = $impl->createDocument(NULL, 'html', $dt);

    // NOTE(review): this is a dynamic property on DOMDocument. Dynamic
    // properties are deprecated as of PHP 8.2; it is kept because
    // consumers of the document may read $doc->errors.
    $this->doc->errors = array();

    // documentElement is a property of DOMDocument, not a method.
    $this->current = $this->doc->documentElement;
  }

  /**
   * Provide an instruction processor.
   *
   * This is used for handling Processor Instructions as they are
   * inserted. If omitted, PI's are inserted directly into the DOM tree.
   *
   * @param \HTML5\InstructionProcessor $proc
   *   The processor to hand processing instructions to.
   */
  public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) {
    $this->processor = $proc;
  }

  /**
   * Handle a doctype event.
   *
   * This is used solely for setting quirks mode. The inbound doctype is
   * not preserved; the document always carries the HTML5 doctype created
   * in the constructor.
   *
   * @param string $name
   *   The doctype name (unused).
   * @param int $idType
   *   The identifier type (unused).
   * @param string $id
   *   The identifier (unused).
   * @param boolean $quirks
   *   Whether the document should be treated as quirks mode.
   */
  public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) {
    $this->quirks = $quirks;
  }

  /**
   * Handle a start tag: create the element, append it under the current
   * node, and descend into it.
   *
   * @param string $name
   *   The tag name, optionally with a namespace prefix.
   * @param array $attributes
   *   The tag's attributes. NOTE(review): currently not applied to the
   *   created element; the expected array shape is not visible here.
   * @param boolean $selfClosing
   *   Whether the tag was self-closing. Currently not acted upon.
   */
  public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
    $lname = $this->normalizeTagName($name);

    // XXX: Since we create the root element, we skip this if it occurs
    // inside of the builder. We should probably check to make sure that
    // there is only one element so far, and indicate an error if there
    // is a structural problem.
    if ($lname == 'html') {
      return;
    }

    $ele = $this->doc->createElement($lname);
    $this->current->appendChild($ele);

    // XXX: Need to handle self-closing tags and unary tags. Until then
    // every element becomes the new insertion point and must be closed
    // by a matching endTag().
    $this->current = $ele;
  }

  /**
   * Handle an end tag: move the insertion point back to the parent.
   *
   * If the end tag does not match the current element, resolution is
   * delegated to quirksTreeResolver().
   *
   * @param string $name
   *   The tag name, optionally with a namespace prefix.
   */
  public function endTag($name) {
    $lname = $this->normalizeTagName($name);

    // tagName is a property of DOMElement, not a method.
    if ($this->current->tagName !== $lname) {
      return $this->quirksTreeResolver($lname);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname == 'html') {
      return;
    }
    $this->current = $this->current->parentNode;
  }

  /**
   * Append a comment node to the current node.
   *
   * @param string $cdata
   *   The comment's content.
   */
  public function comment($cdata) {
    $node = $this->doc->createComment($cdata);
    $this->current->appendChild($node);
  }

  /**
   * Append a text node to the current node.
   *
   * @param string $data
   *   The text content.
   */
  public function text($data) {
    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
  }

  /**
   * Handle the end of input. Currently a no-op.
   */
  public function eof() {
    // If the $current isn't the $root, do we need to do anything?
  }

  /**
   * Record a parse error on the document's `errors` list.
   *
   * @param string $msg
   *   The error message.
   * @param int $line
   *   The line on which the error occurred.
   * @param int $col
   *   The column at which the error occurred.
   */
  public function parseError($msg, $line, $col) {
    $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
  }

  /**
   * Append a CDATA section to the current node.
   *
   * @param string $data
   *   The CDATA content.
   */
  public function cdata($data) {
    $node = $this->doc->createCDATASection($data);
    // The node must be attached, otherwise the CDATA content is lost.
    $this->current->appendChild($node);
  }

  /**
   * Handle a processing instruction.
   *
   * If an instruction processor has been set, the instruction is handed
   * to it; a non-empty return value becomes the new current node.
   * Otherwise the instruction is inserted into the DOM as a plain
   * processing-instruction node.
   *
   * @param string $name
   *   The instruction target.
   * @param string $data
   *   The instruction data, if any.
   */
  public function processingInstruction($name, $data = NULL) {
    // Important: The processor may modify the current DOM tree however
    // it sees fit.
    if (isset($this->processor)) {
      $res = $this->processor->process($this->current, $name, $data);
      if (!empty($res)) {
        $this->current = $res;
      }
      return;
    }

    // No processor: insert the PI directly, as documented on
    // setInstructionProcessor(). NULL data is passed as an empty string
    // because the DOM API expects a string.
    $node = $this->doc->createProcessingInstruction($name, $data === NULL ? '' : $data);
    $this->current->appendChild($node);
  }

  // ==========================================================================
  // UTILITIES
  // ==========================================================================

  /**
   * Strip any namespace prefix from a tag name.
   *
   * @param string $name
   *   The raw tag name, e.g. "svg:rect" or "div".
   *
   * @return string
   *   The part of the name after the last colon, or the name unchanged
   *   if it contains no colon.
   */
  protected function normalizeTagName($name) {
    if (strpos($name, ':') !== FALSE) {
      // We know from the grammar that there must be at least one other
      // char besides :, since : is not a legal tag start.
      $parts = explode(':', $name);
      return array_pop($parts);
    }

    return $name;
  }

  /**
   * Resolve an end tag that does not match the current element.
   *
   * Not implemented yet; always throws.
   *
   * @param string $name
   *   The normalized name of the unexpected end tag.
   *
   * @throws \Exception
   *   Always.
   */
  protected function quirksTreeResolver($name) {
    throw new \Exception("Not implemented.");
  }
}