summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-10 08:18:45 -0500
committerMatt Butcher <[email protected]>2013-04-10 08:18:45 -0500
commitb5ba917c1e142f1a1e3d4a23e1ef3e1d2dcf50e8 (patch)
tree3d8910ad9433589c19e03c94a7d0c4be9c35c31d /src
parent03f992f08d559f5fdae774cdae4b8dff0ea401ed (diff)
HIGHLY EXPERIMENTAL: Writing a new parser.
Diffstat (limited to 'src')
-rw-r--r--src/HTML5/InputStream.php13
-rw-r--r--src/HTML5/Parser/EventHandler.php20
-rw-r--r--src/HTML5/Parser/Scanner.php61
-rw-r--r--src/HTML5/Parser/Tokenizer.php327
-rw-r--r--src/HTML5/Tokenizer.php5
5 files changed, 423 insertions, 3 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php
index 860f883..1efaff3 100644
--- a/src/HTML5/InputStream.php
+++ b/src/HTML5/InputStream.php
@@ -390,9 +390,20 @@ class InputStream {
/**
* Unconsume one character.
*/
- public function unget() {
+ public function unconsume() {
if ($this->char > 0 && $this->char <= $this->EOF) {
$this->char--;
}
}
+ public function unget() { // Alias of unconsume(), kept under the method's previous name.
+ $this->unconsume();
+ }
+
+ public function peek() { // Non-consuming look at offset char + 1; FALSE when that offset does not exist.
+ return isset($this->data[$this->char + 1]) ? $this->data[$this->char + 1] : FALSE;
+ }
+
+ public function position() { // Current value of the stream's internal character index.
+ return $this->char;
+ }
}
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php
new file mode 100644
index 0000000..f144fc3
--- /dev/null
+++ b/src/HTML5/Parser/EventHandler.php
@@ -0,0 +1,20 @@
+<?php
+namespace HTML5\Parser;
+
+/**
+ * Standard events for HTML5.
+ *
+ * One callback per token type; see HTML5 spec section 8.2.4 (Tokenization).
+ */
+interface EventHandler {
+ public function doctype($name, $publicID, $systemID, $quirks = FALSE);
+ public function startTag($name, $attributes = array(), $selfClosing = FALSE);
+ public function endTag($name);
+ public function comment($cdata);
+ public function character($cdata);
+ public function eof();
+
+ // Open question: do we also need...
+ // public function cdata();
+ // public function processingInstruction();
+}
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
new file mode 100644
index 0000000..1ce8428
--- /dev/null
+++ b/src/HTML5/Parser/Scanner.php
@@ -0,0 +1,61 @@
+<?php
+namespace HTML5\Parser;
+
+/**
+ * The scanner.
+ *
+ * Wraps an input stream and remembers the last character read by next().
+ */
+class Scanner {
+ const CHARS_HEX = 'abcdefABCDEF0123456789';
+ const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
+ const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
+
+ // Last character returned by next(); the get*() scans below do not update it.
+ protected $char;
+ protected $is; // The wrapped input stream.
+
+ public function __construct($input) {
+ $this->is = $input;
+ }
+
+ public function position() {
+ return $this->is->position();
+ }
+
+ public function peek() {
+ return $this->is->peek();
+ }
+
+ public function next() {
+ $this->char = $this->is->char();
+ return $this->char;
+ }
+
+ public function current() {
+ return $this->char;
+ }
+
+ public function unconsume($howMany = 1) {
+ for ($i = 0; $i < $howMany; ++$i) {
+ $this->is->unconsume();
+ }
+ }
+
+ // The get*() helpers scan a run of characters from the given set and return the result.
+ // NOTE(review): assumes the input stream provides charsWhile($chars) -- confirm.
+ public function getHex() {
+ return $this->is->charsWhile(self::CHARS_HEX);
+ }
+ public function getAsciiAlpha() {
+ return $this->is->charsWhile(self::CHARS_ALPHA);
+ }
+ public function getAsciiAlphaNum() {
+ return $this->is->charsWhile(self::CHARS_ALNUM);
+ }
+ public function getNumeric() {
+ return $this->is->charsWhile('0123456789');
+ }
+}
+class ParseError extends \Exception { // Fully qualified: a bare Exception here resolves to HTML5\Parser\Exception.
+}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
new file mode 100644
index 0000000..d201fd8
--- /dev/null
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -0,0 +1,327 @@
+<?php
+namespace HTML5\Parser;
+
+class Tokenizer {
+ protected $scanner;
+ protected $events;
+ protected $tok;
+
+ /**
+ * Buffer for text.
+ */
+ protected $text = '';
+
+ public function __construct($scanner, $eventHandler) {
+ $this->scanner = $scanner;
+ $this->events = $eventHandler;
+ }
+
+ /**
+ * 8.2.4.1 Data state: read one character and hand it to the state handlers.
+ */
+ public function consumeData() {
+ // Store the scanned character in $this->tok; characterReference() tests that property.
+ $this->tok = $this->scanner->next();
+ // Character Ref
+ $this->characterReference();
+
+ // TagOpen
+ // Null
+ // EOF
+ // Character
+ }
+
+ /**
+ * 8.2.4.2: if the current token is '&', consume a character reference into the text buffer.
+ */
+ protected function characterReference($inAttr = FALSE) {
+ if ($this->tok == '&') {
+ $this->tok = $this->scanner->next(); // Step onto the character after '&'.
+ $this->text .= $this->consumeCharacterReference($inAttr); // Plain property append, not a variable-variable.
+ }
+ }
+
+ protected function consumeCharacterReference($inAttribute = FALSE) {
+ // On entry $this->tok is the character after '&', already consumed by the caller.
+ $entity = '';
+ $start = $this->scanner->position();
+ switch ($this->tok) {
+ case NULL:
+ case "\t":
+ case "\n":
+ case "\f":
+ case ' ':
+ case '&':
+ case '<':
+ // Don't consume; just return. Spec says return nothing, but I
+ // think we have to append '&' to the string.
+ return '&';
+ case '#':
+ // Consume and read a number
+ $this->tok = $this->scanner->next();
+ // &#x[0-9a-fA-F]+; or &#X[0-9a-fA-F]+;
+ if ($this->tok == 'x' || $this->tok == 'X') {
+ $hex = $this->scanner->getHex();
+ // current() only echoes the last next(); read the character after the run instead.
+ $this->tok = $this->scanner->next();
+ if ((string) $hex === '') {
+ throw new ParseError("Expected &#xHEX;, got &#x" . $this->tok);
+ }
+ $entity = hexdec($hex);
+ }
+ // &#[0-9]+;
+ else {
+ // next() above already took the first digit; step back so the scan sees it.
+ $this->scanner->unconsume();
+ $entity = $this->scanner->getNumeric();
+ $this->tok = $this->scanner->next();
+ if ((string) $entity === '') {
+ throw new ParseError("Expected &#DIGITS;, got &#" . $this->tok);
+ }
+ }
+ break;
+ default:
+ // Named reference, [a-zA-Z]+; -- step back so the scan includes the first letter.
+ $this->scanner->unconsume();
+ $entity = $this->scanner->getAsciiAlpha();
+ $this->tok = $this->scanner->next();
+ }
+
+ // We have an entity. We're done here.
+ // NOTE(review): this is still the raw name/number, not the decoded character.
+ if ($this->tok == ';') {
+ return $entity;
+ }
+
+ // If in an attribute, then failing to match ; means unconsume the
+ // entire string. Otherwise, failure to match is an error.
+ if ($inAttribute) {
+ $this->scanner->unconsume($this->scanner->position() - $start);
+ return '&';
+ }
+
+ throw new ParseError("Expected &ENTITY;, got &ENTITY (no trailing ;)");
+ }
+
+ protected function rcdata() {
+ // Ampersand
+ // <
+ // Null
+ // EOF
+ // Character
+ }
+
+ protected function rawtext() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ protected function scriptData() {
+ // < is a literal
+ // NULL is an error
+ // EOF
+ // Character data
+ }
+
+ /**
+ * 8.2.4.7
+ */
+ protected function plaintext() {
+ // NULL -> parse error
+ // EOF -> eof
+ // -> Character data
+ }
+
+ /**
+ * 8.2.4.8
+ */
+ protected function tagOpen() {
+ // ! -> markup declaration
+ // / -> end tagopen
+ // a-zA-Z -> tagname
+ // ? -> parse error
+ // -> Anything else is a parse error
+ }
+
+ /**
+ * 8.2.4.9
+ */
+ protected function endTagOpen() {
+ // a-zA-Z -> tagname
+ // > -> parse error
+ // EOF -> parse error
+ // -> parse error
+ }
+
+ /**
+ * 8.2.4.10
+ */
+ protected function tagName() {
+ // tab, lf, ff, space -> before attr name
+ // / -> self-closing tag
+ // > -> current tag is done, data-state
+ // NULL parse error
+ // EOF -> parse error
+ // -> append to tagname
+ }
+
+ /**
+ * 8.2.4.11
+ */
+ protected function rcdataLessThan() {
+ // / -> empty the tmp buffer and go to end-tag
+ // ->rcdata
+ }
+
+ /**
+ * 8.2.4.12
+ */
+ protected function rcdataEndTag() {
+ // A-Za-z: append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.13
+ */
+ protected function rcdataEndTagName() {
+ // tab, lf, ff, space -> before attribute or treat as anything
+ // / -> self-closing tag
+ // > -> end tag, back to data
+ // A-Za-z -> append to tagname
+ // -> rcdata state
+ }
+
+ /**
+ * 8.2.4.14
+ */
+ protected function rawtextLessThan() {
+ // / -> rawtext endtag state
+ // -> rawtext
+ }
+
+ /**
+ * 8.2.4.15
+ */
+ protected function rawtextEndTagOpen() {
+ // A-Za-z -> rawtext
+ // ->rawtext
+ }
+
+ protected function rawtextEndTagName() {
+ // tab, lf, ff, space -> before attr name
+ //
+ }
+
+ protected function scriptLessThan(){
+ }
+ protected function scriptEndTagOpen() {
+ }
+ protected function scriptEndTagName() {
+ }
+ protected function scriptEscapeStart() {
+ }
+ protected function scriptEscapeStartDash() {
+ }
+ protected function scriptEscaped() {
+ }
+ protected function scriptEscapedDash() {
+ }
+ protected function scriptEscapedDashDash() {
+ }
+ protected function scriptEscapedLessThan() {
+ }
+ protected function scriptEscapedEndTagOpen() {
+ }
+ protected function scriptEscapedEndTagName() {
+ }
+ protected function scriptDoubleEscapeStart() {
+ }
+ protected function scriptDoubleEscaped() {
+ }
+ protected function scriptDoubleEscapedDash() {
+ }
+ protected function scriptDoubleEscapedDashDash() {
+ }
+ protected function scriptDoubleEscapedLessThan() {
+ }
+ protected function scriptDoubleEscapeEnd() {
+ }
+ protected function beforeAttributeName() {
+ }
+ protected function attributeName() {
+ }
+ protected function afterAttributeName() {
+ }
+ protected function beforeAttributeValue() {
+ }
+ protected function attributeValueDoubleQuote() {
+ }
+ protected function attributeValueSingleQuote() {
+ }
+ protected function attributeValueUnquoted() {
+ }
+ protected function characterReferenceInAttributeValue() {
+ }
+ protected function afterAttributeValueQuoted() {
+ }
+ protected function selfCloseingStartTag() {
+ }
+ protected function bogusComment() {
+ }
+ protected function markupDeclarationOpen() {
+ }
+ protected function commentStart() {
+ }
+ protected function commentStartDash() {
+ }
+ protected function comment() {
+ }
+ protected function commentEndDash() {
+ }
+ protected function commentEnd() {
+ }
+ protected function commentEndBangState() {
+ }
+ protected function doctype() {
+ }
+ protected function beforeDoctype() {
+ }
+ protected function doctypeName() {
+ }
+ protected function afterDoctypeName() {
+ }
+ protected function doctypePublicKeyword() {
+ }
+ protected function beforeDoctypePublicId() {
+ }
+ protected function doctypePublicIdDoubleQuoted() {
+ }
+ protected function doctypePublicIdSingleQuoted() {
+ }
+ protected function afterDoctypePublicId() {
+ }
+ protected function betweenDoctypePublicAndSystem() {
+ }
+ protected function afterDoctypeSystemKeyword() {
+ }
+ protected function beforeDoctypeSystemIdentifier() {
+ }
+ protected function doctypeSystemIdDoubleQuoted() {
+ }
+ protected function doctypeSystemIdSingleQuoted() {
+ }
+ protected function afterDoctypeSystemId() {
+ }
+ protected function bogusDoctype() {
+ }
+ protected function cdataSection() {
+ }
+
+
+
+
+
+}
diff --git a/src/HTML5/Tokenizer.php b/src/HTML5/Tokenizer.php
index 4a89b8e..d373d39 100644
--- a/src/HTML5/Tokenizer.php
+++ b/src/HTML5/Tokenizer.php
@@ -87,8 +87,9 @@ class Tokenizer {
*/
public function __construct($data, $builder = null) {
$this->stream = new InputStream($data);
- if (!$builder) $this->tree = new TreeBuilder;
- else $this->tree = $builder;
+
+ $this->tree = empty($builder) ? new TreeBuilder() : $builder;
+
$this->content_model = self::PCDATA;
}