diff options
author | Matt Butcher <[email protected]> | 2013-04-10 08:18:45 -0500 |
---|---|---|
committer | Matt Butcher <[email protected]> | 2013-04-10 08:18:45 -0500 |
commit | b5ba917c1e142f1a1e3d4a23e1ef3e1d2dcf50e8 (patch) | |
tree | 3d8910ad9433589c19e03c94a7d0c4be9c35c31d /src | |
parent | 03f992f08d559f5fdae774cdae4b8dff0ea401ed (diff) |
HIGHLY EXPERIMENTAL: Writing a new parser.
Diffstat (limited to 'src')
-rw-r--r-- | src/HTML5/InputStream.php | 13 | ||||
-rw-r--r-- | src/HTML5/Parser/EventHandler.php | 20 | ||||
-rw-r--r-- | src/HTML5/Parser/Scanner.php | 61 | ||||
-rw-r--r-- | src/HTML5/Parser/Tokenizer.php | 327 | ||||
-rw-r--r-- | src/HTML5/Tokenizer.php | 5 |
5 files changed, 423 insertions, 3 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php
index 860f883..1efaff3 100644
--- a/src/HTML5/InputStream.php
+++ b/src/HTML5/InputStream.php
@@ -390,9 +390,20 @@ class InputStream {
   /**
    * Unconsume one character.
    */
-  public function unget() {
+  public function unconsume() {
     if ($this->char > 0 && $this->char <= $this->EOF) {
       $this->char--;
     }
   }
+  public function unget() {
+    $this->unconsume();
+  }
+
+  public function peek() {
+    return $this->data[$this->char + 1];
+  }
+
+  public function position() {
+    return $this->char;
+  }
 }
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php
new file mode 100644
index 0000000..f144fc3
--- /dev/null
+++ b/src/HTML5/Parser/EventHandler.php
@@ -0,0 +1,20 @@
+<?php
+namespace HTML5\Parser;
+
+/**
+ * Standard events for HTML5.
+ *
+ * See HTML5 spec section 8.2.4
+ */
+interface EventHandler {
+  public function doctype($name, $publicID, $systemID, $quirks = FALSE);
+  public function startTag($name, $attributes = array(), $selfClosing = FALSE);
+  public function endTag($name);
+  public function comment($cdata);
+  public function character($cdata);
+  public function eof();
+
+  // Do we need...
+  // public function cdata();
+  // public function processorInstruction();
+}
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
new file mode 100644
index 0000000..1ce8428
--- /dev/null
+++ b/src/HTML5/Parser/Scanner.php
@@ -0,0 +1,61 @@
+<?php
+namespace HTML5\Parser;
+
+/**
+ * The scanner.
+ *
+ * This scans over an input stream, remembering the last character read.
+ */
+class Scanner {
+  const CHARS_HEX = 'abcdefABCDEF0123456789';
+  const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
+  const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
+
+
+  protected $char; // Last character returned by next().
+  protected $is; // The wrapped input stream.
+
+  public function __construct($input) {
+    $this->is = $input;
+  }
+
+  public function position() {
+    return $this->is->position();
+  }
+
+  public function peek() {
+    return $this->is->peek();
+  }
+
+  public function next() {
+    $this->char = $this->is->char();
+    return $this->char;
+  }
+
+  public function current() {
+    return $this->char; // Only updated by next(); stale after a get*() scan.
+  }
+
+  public function unconsume($howMany = 1) {
+    for ($i = 0; $i < $howMany; ++$i) {
+      $this->is->unconsume();
+    }
+  }
+
+  public function getHex() {
+    return $this->is->charsWhile(self::CHARS_HEX); // Assumes the stream provides charsWhile() -- TODO confirm.
+  }
+  public function getAsciiAlpha() {
+    return $this->is->charsWhile(self::CHARS_ALPHA);
+  }
+  public function getAsciiAlphaNum() {
+    return $this->is->charsWhile(self::CHARS_ALNUM);
+  }
+  public function getNumeric() {
+    return $this->is->charsWhile('0123456789');
+  }
+
+
+}
+class ParseError extends \Exception {
+}
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
new file mode 100644
index 0000000..d201fd8
--- /dev/null
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -0,0 +1,327 @@
+<?php
+namespace HTML5\Parser;
+
+class Tokenizer {
+  protected $scanner;
+  protected $events;
+  protected $tok;
+
+  /**
+   * Buffer for text.
+   */
+  protected $text = '';
+
+  public function __construct($scanner, $eventHandler) {
+    $this->scanner = $scanner;
+    $this->events = $eventHandler;
+  }
+
+  /**
+   * 8.2.4.1: read one character into $this->tok and dispatch on it.
+   */
+  public function consumeData() {
+    // Scan a token
+    $this->tok = $this->scanner->next();
+    // Character Ref
+    $this->characterReference();
+
+    // TagOpen
+    // Null
+    // EOF
+    // Character
+  }
+
+  /**
+   * 8.2.4.2: if the current token is '&', append the reference to the text buffer.
+   */
+  protected function characterReference($inAttr = FALSE) {
+    if ($this->tok == '&') {
+      $this->tok = $this->scanner->next();
+      $this->text .= $this->consumeCharacterReference($inAttr);
+    }
+  }
+
+  protected function consumeCharacterReference($inAttribute = FALSE) {
+    $entity = '';
+    $start = $this->scanner->position();
+
+    // $this->tok is the already-read character that followed the '&'.
+    switch ($this->tok) {
+      case NULL:
+      case "\t":
+      case "\n":
+      case "\f":
+      case ' ':
+      case '&':
+      case '<':
+        // Don't consume; just return. Spec says return nothing, but I
+        // think we have to append '&' to the string.
+        return '&';
+      case '#':
+        // Consume and read a number
+        $this->tok = $this->scanner->next();
+        // X[0-9a-fA-F]+;
+        // x[0-9a-fA-F]+;
+        if ($this->tok == 'x' || $this->tok == 'X') {
+          $hex = $this->scanner->getHex();
+          $this->tok = $this->scanner->next(); // Read the terminator; current() would still be 'x'.
+          if ((string) $hex === '') { // Not empty(): "0" is a valid digit run.
+            throw new ParseError("Expected &#xHEX;, got &#x" . $this->tok);
+          }
+          $entity = hexdec($hex);
+        }
+        // [0-9]+;
+        else {
+          $entity = $this->tok . $this->scanner->getNumeric(); // tok is the first digit, already read.
+          $this->tok = $this->scanner->next();
+          if (!ctype_digit($entity)) {
+            throw new ParseError("Expected &#DIGITS;, got &#" . $entity . $this->tok);
+          }
+        }
+        break;
+      default:
+        // Attempt to consume a string up to a ';'.
+        // [a-zA-Z0-9]+; (tok is the first character of the name, already read)
+        $entity = $this->tok . $this->scanner->getAsciiAlphaNum();
+        $this->tok = $this->scanner->next();
+
+    }
+
+    // We have an entity. We're done here. NOTE(review): it is returned undecoded.
+    if ($this->tok == ';') {
+      return $entity;
+    }
+
+    // If in an attribute, then failing to match ; means unconsume the
+    // entire string. Otherwise, failure to match is an error.
+    if ($inAttribute) {
+      $this->scanner->unconsume($this->scanner->position() - $start);
+      return '&';
+    }
+
+    throw new ParseError("Expected &ENTITY;, got &ENTITY (no trailing ;)");
+
+  }
+
+  protected function rcdata() {
+    // Ampersand
+    // <
+    // Null
+    // EOF
+    // Character
+  }
+
+  protected function rawtext() {
+    // < is a literal
+    // NULL is an error
+    // EOF
+    // Character data
+  }
+
+  protected function scriptData() {
+    // < is a literal
+    // NULL is an error
+    // EOF
+    // Character data
+  }
+
+  /**
+   * 8.2.4.7
+   */
+  protected function plaintext() {
+    // NULL -> parse error
+    // EOF -> eof
+    // -> Character data
+  }
+
+  /**
+   * 8.2.4.8
+   */
+  protected function tagOpen() {
+    // ! -> markup declaration
+    // / -> end tagopen
+    // a-zA-Z -> tagname
+    // ? -> parse error
+    // -> Anything else is a parse error
+  }
+
+  /**
+   * 8.2.4.9
+   */
+  protected function endTagOpen() {
+    // a-zA-Z -> tagname
+    // > -> parse error
+    // EOF -> parse error
+    // -> parse error
+  }
+
+  /**
+   * 8.2.4.10
+   */
+  protected function tagName() {
+    // tab, lf, ff, space -> before attr name
+    // / -> self-closing tag
+    // > -> current tag is done, data-state
+    // NULL parse error
+    // EOF -> parse error
+    // -> append to tagname
+  }
+
+  /**
+   * 8.2.4.11
+   */
+  protected function rcdataLessThan() {
+    // / -> empty the tmp buffer and go to end-tag
+    // ->rcdata
+  }
+
+  /**
+   * 8.2.4.12
+   */
+  protected function rcdataEndTag() {
+    // A-Za-z: append to tagname
+    // -> rcdata state
+  }
+
+  /**
+   * 8.2.4.13
+   */
+  protected function rcdataEndTagName() {
+    // tab, lf, ff, space -> before attribute or treat as anything
+    // / -> self-closing tag
+    // > -> end tag, back to data
+    // A-Za-z -> append to tagname
+    // -> rcdata state
+  }
+
+  /**
+   * 8.2.4.14
+   */
+  protected function rawtextLessThan() {
+    // / -> rawtext endtag state
+    // -> rawtext
+  }
+
+  /**
+   * 8.2.4.15
+   */
+  protected function rawtextEndTagOpen() {
+    // A-Za-z -> rawtext
+    // ->rawtext
+  }
+
+  protected function rawtextEndTagName() {
+    // tab, lf, ff, space -> before attr name
+    //
+  }
+
+  protected function scriptLessThan(){
+  }
+  protected function scriptEndTagOpen() {
+  }
+  protected function scriptEndTagName() {
+  }
+  protected function scriptEscapeStart() {
+  }
+  protected function scriptEscapeStartDash() {
+  }
+  protected function scriptEscaped() {
+  }
+  protected function scriptEscapedDash() {
+  }
+  protected function scriptEscapedDashDash() {
+  }
+  protected function scriptEscapedLessThan() {
+  }
+  protected function scriptEscapedEndTagOpen() {
+  }
+  protected function scriptEscapedEndTagName() {
+  }
+  protected function scriptDoubleEscapeStart() {
+  }
+  protected function scriptDoubleEscaped() {
+  }
+  protected function scriptDoubleEscapedDash() {
+  }
+  protected function scriptDoubleEscapedDashDash() {
+  }
+  protected function scriptDoubleEscapedLessThan() {
+  }
+  protected function scriptDoubleEscapeEnd() {
+  }
+  protected function beforeAttributeName() {
+  }
+  protected function attributeName() {
+  }
+  protected function afterAttributeName() {
+  }
+  protected function beforeAttributeValue() {
+  }
+  protected function attributeValueDoubleQuote() {
+  }
+  protected function attributeValueSingleQuote() {
+  }
+  protected function attributeValueUnquoted() {
+  }
+  protected function characterReferenceInAttributeValue() {
+  }
+  protected function afterAttributeValueQuoted() {
+  }
+  protected function selfCloseingStartTag() {
+  }
+  protected function bogusComment() {
+  }
+  protected function markupDeclarationOpen() {
+  }
+  protected function commentStart() {
+  }
+  protected function commentStartDash() {
+  }
+  protected function comment() {
+  }
+  protected function commentEndDash() {
+  }
+  protected function commentEnd() {
+  }
+  protected function commentEndBangState() {
+  }
+  protected function doctype() {
+  }
+  protected function beforeDoctype() {
+  }
+  protected function doctypeName() {
+  }
+  protected function afterDoctypeName() {
+  }
+  protected function doctypePublicKeyword() {
+  }
+  protected function beforeDoctypePublicId() {
+  }
+  protected function doctypePublicIdDoubleQuoted() {
+  }
+  protected function doctypePublicIdSingleQuoted() {
+  }
+  protected function afterDoctypePublicId() {
+  }
+  protected function betweenDoctypePublicAndSystem() {
+  }
+  protected function afterDoctypeSystemKeyword() {
+  }
+  protected function beforeDoctypeSystemIdentifier() {
+  }
+  protected function doctypeSystemIdDoubleQuoted() {
+  }
+  protected function doctypeSystemIdSingleQuoted() {
+  }
+  protected function afterDoctypeSystemId() {
+  }
+  protected function bogusDoctype() {
+  }
+  protected function cdataSection() {
+  }
+
+
+
+
+
+}
diff --git a/src/HTML5/Tokenizer.php b/src/HTML5/Tokenizer.php
index 4a89b8e..d373d39 100644
--- a/src/HTML5/Tokenizer.php
+++ b/src/HTML5/Tokenizer.php
@@ -87,8 +87,9 @@ class Tokenizer {
    */
   public function __construct($data, $builder = null) {
     $this->stream = new InputStream($data);
-    if (!$builder) $this->tree = new TreeBuilder;
-    else $this->tree = $builder;
+
+    $this->tree = empty($builder) ? new TreeBuilder() : $builder;
+
     $this->content_model = self::PCDATA;
   }
 