diff options
-rw-r--r-- | src/HTML5/Elements.php | 13 | ||||
-rw-r--r-- | src/HTML5/Parser/CharacterReference.php | 3 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 5 | ||||
-rw-r--r-- | src/HTML5/Parser/EventHandler.php | 12 | ||||
-rw-r--r-- | src/HTML5/Parser/FileInputStream.php | 11 | ||||
-rw-r--r-- | src/HTML5/Parser/InputStream.php | 6 | ||||
-rw-r--r-- | src/HTML5/Parser/ParseError.php | 2 | ||||
-rw-r--r-- | src/HTML5/Parser/README.md | 2 | ||||
-rw-r--r-- | src/HTML5/Parser/Scanner.php | 6 | ||||
-rw-r--r-- | src/HTML5/Parser/StringInputStream.php | 3 | ||||
-rw-r--r-- | test/HTML5/Parser/DOMTreeBuilderTest.php | 4 |
11 files changed, 63 insertions, 4 deletions
diff --git a/src/HTML5/Elements.php b/src/HTML5/Elements.php index c5ba50b..3c1e57f 100644 --- a/src/HTML5/Elements.php +++ b/src/HTML5/Elements.php @@ -4,22 +4,31 @@ namespace HTML5; /** * Provide general element functions. * - * @todo consider using a bitmask table lookup. There is enought overlap in + * This class provides general information about HTML5 elements, + * including syntactic and semantic issues. Parsers and serializers can + * use this class as a reference point for information about the rules + * of various HTML5 elements. + * + * @todo consider using a bitmask table lookup. There is enough overlap in * naming that this could significantly shrink the size and maybe make it * faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom. */ class Elements { + /** Indicates an element is described in the specification. */ const KNOWN_ELEMENT = 1; // From section 8.1.2: "script", "style" // From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript" // From 8.4 "style", "xmp", "iframe", "noembed", "noframes" + /** Indicates the contained text should be processed as raw text. */ const TEXT_RAW = 2; // From section 8.1.2: "textarea", "title" + /** Indicates the contained text should be processed as RCDATA. */ const TEXT_RCDATA = 4; + /** Indicates the tag cannot have content. */ const VOID_TAG = 8; // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", @@ -29,6 +38,8 @@ class Elements { // "pre", "listing" // "form" // "plaintext" + /** Indicates that if a previous event is for a P tag, that element + * should be considered closed. */ const AUTOCLOSE_P = 16; const TEXT_PLAINTEXT = 32; diff --git a/src/HTML5/Parser/CharacterReference.php b/src/HTML5/Parser/CharacterReference.php index ea6a527..17a9285 100644 --- a/src/HTML5/Parser/CharacterReference.php +++ b/src/HTML5/Parser/CharacterReference.php @@ -5,6 +5,9 @@ use \HTML5\Entities; /** * Manage entity references. 
+ * + * This is a simple resolver for HTML5 character reference entities. + * See \HTML5\Entities for the list of supported entities. */ class CharacterReference { diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index f9d1e7a..f0caeb4 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -9,6 +9,11 @@ use HTML5\Elements; * attempts (but does not guarantee) to up-convert older HTML documents * to HTML5. It does this by applying HTML5's rules, but it will not * change the architecture of the document itself. + * + * Many of the error correction and quirks features suggested in the specification + * are implemented herein; however, not all of them are. Since we do not + * assume a graphical user agent, no presentation-specific logic is conducted + * during tree building. */ class DOMTreeBuilder implements EventHandler { diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php index ebb30b2..4034938 100644 --- a/src/HTML5/Parser/EventHandler.php +++ b/src/HTML5/Parser/EventHandler.php @@ -4,6 +4,18 @@ namespace HTML5\Parser; /** * Standard events for HTML5. * + * This is roughly analogous to a SAX2 or expat-style interface. + * However, it is tuned specifically for HTML5, according to section 8 + * of the HTML5 specification. + * + * An event handler receives parser events. For a concrete + * implementation, see DOMTreeBuilder. + * + * Quirks support in the parser is limited to close-in syntax (malformed + * tags or attributes). Higher order syntax and semantic issues with a + * document (e.g. mismatched tags, illegal nesting, etc.) are the + * responsibility of the event handler implementation. 
+ * * See HTML5 spec section 8.2.4 */ interface EventHandler { diff --git a/src/HTML5/Parser/FileInputStream.php b/src/HTML5/Parser/FileInputStream.php index ae3b4ef..c1bb128 100644 --- a/src/HTML5/Parser/FileInputStream.php +++ b/src/HTML5/Parser/FileInputStream.php @@ -8,6 +8,15 @@ namespace HTML5\Parser; */ class FileInputStream extends StringInputStream implements InputStream { + /* + * So right now we read files into strings and then process the + * string. We chose to do this largely for the sake of expediency of + * development, and also because we could optimize toward processing + * arbitrarily large chunks of the input. But in the future, we'd + * really like to rewrite this class to efficiently handle lower level + * stream reads (and thus efficiently handle large documents). + */ + /** * Load a file input stream. * @@ -23,4 +32,4 @@ class FileInputStream extends StringInputStream implements InputStream { } -}
\ No newline at end of file +} diff --git a/src/HTML5/Parser/InputStream.php b/src/HTML5/Parser/InputStream.php index bb2ca1c..713031d 100644 --- a/src/HTML5/Parser/InputStream.php +++ b/src/HTML5/Parser/InputStream.php @@ -3,6 +3,12 @@ namespace HTML5\Parser; /** * Interface for stream readers. + * + * The parser only reads from streams. Various input sources can write + * an adapter to this InputStream. + * + * Currently provided InputStream implementations include + * FileInputStream and StringInputStream. */ interface InputStream extends \Iterator { diff --git a/src/HTML5/Parser/ParseError.php b/src/HTML5/Parser/ParseError.php index 8fc646e..4d5f117 100644 --- a/src/HTML5/Parser/ParseError.php +++ b/src/HTML5/Parser/ParseError.php @@ -2,7 +2,7 @@ namespace HTML5\Parser; /** - * When the parser has an error. + * Emit when the parser has an error. */ class ParseError extends \Exception { } diff --git a/src/HTML5/Parser/README.md b/src/HTML5/Parser/README.md index 2f5a84a..9f92957 100644 --- a/src/HTML5/Parser/README.md +++ b/src/HTML5/Parser/README.md @@ -12,7 +12,7 @@ of the HTML5 specification, though we do not assume a networking layer. || [ Tree Builder ] // Organizes units into a tree of objects || - [DOM Document] // The final state of the parsed document. + [ DOM Document ] // The final state of the parsed document. ## InputStream diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php index 1ab9b8b..b359f16 100644 --- a/src/HTML5/Parser/Scanner.php +++ b/src/HTML5/Parser/Scanner.php @@ -168,9 +168,15 @@ class Scanner { return $this->is->currentLine(); } + /** + * Read chars until something in the mask is encountered. + */ public function charsUntil($mask) { return $this->is->charsUntil($mask); } + /** + * Read chars as long as the mask matches. 
+ */ public function charsWhile($mask) { return $this->is->charsWhile($mask); } diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php index 9aa0b73..0d2a7f3 100644 --- a/src/HTML5/Parser/StringInputStream.php +++ b/src/HTML5/Parser/StringInputStream.php @@ -295,6 +295,9 @@ class StringInputStream implements InputStream { } } + /** + * Look ahead without moving cursor. + */ public function peek() { if (($this->char + 1) <= $this->EOF) { return $this->data[$this->char + 1]; diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index b13edbc..aec7499 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -27,6 +27,10 @@ class DOMTreeBuilderTest extends \HTML5\Tests\TestCase { return $treeBuilder->document(); } + + /** + * Utility function for parsing a fragment of HTML5. + */ protected function parseFragment($string) { $treeBuilder = new DOMTreeBuilder(TRUE); $input = new StringInputStream($string); |