summaryrefslogtreecommitdiff
path: root/src/HTML5.php
diff options
context:
space:
mode:
authorMatt Farina <[email protected]>2013-05-15 11:24:22 -0400
committerMatt Farina <[email protected]>2013-05-15 11:24:22 -0400
commit0cdf4ae1aaf6aefdc7218dbb49e543e8603035a8 (patch)
treec10b651f07469b06242ac5ac8863250a957e9fd9 /src/HTML5.php
parent5be96c81985e8a40cfcaeff9c5d7ee055b46ac08 (diff)
parent69d1932bac21ded3d10f5df1a94d892efa2c1b89 (diff)
Merge branch 'master' of github.com:technosophos/HTML5-PHP
Diffstat (limited to 'src/HTML5.php')
-rw-r--r--src/HTML5.php86
1 files changed, 84 insertions, 2 deletions
diff --git a/src/HTML5.php b/src/HTML5.php
index 31db307..23948b3 100644
--- a/src/HTML5.php
+++ b/src/HTML5.php
@@ -1,14 +1,82 @@
<?php
+use HTML5\Parser\StringInputStream;
+use HTML5\Parser\FileInputStream;
+use HTML5\Parser\Scanner;
+use HTML5\Parser\Tokenizer;
+use HTML5\Parser\DOMTreeBuilder;
+use HTML5\Serializer\Serializer;
+
/**
* The main HTML5 front end.
*
* This class offers convenience methods for parsing and serializing HTML5.
+ * It is roughly designed to mirror the \DOMDocument class that is
+ * provided with most versions of PHP.
*
* EXPERIMENTAL. This may change or be completely replaced.
*/
-class HTML5 extends \HTML5\Parser {
- // Inherit parse() and parseFragment().
+class HTML5 {
+
+ /**
+ * Load and parse an HTML file.
+ *
+ * This will apply the HTML5 parser, which is tolerant of many
+ * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
+ * 3. Note that in these cases, not all of the old data will be
+ * preserved. For example, XHTML's XML declaration will be removed.
+ *
+ * The rules governing parsing are set out in the HTML 5 spec.
+ *
+ * @param string $file
+ * The path to the file to parse. If this is a resource, it is
+ * assumed to be an open stream whose pointer is set to the first
+ * byte of input.
+ * @param array $options
+ * An array of options.
+ * @return \DOMDocument
+ * A DOM document. This object type is defined by the libxml
+ * library, and should have been included with your version of PHP.
+ */
+ public function load($file, $options = NULL) {
+
+ // Handle the case where file is a resource.
+ if (is_resource($file)) {
+ // FIXME: We need a StreamInputStream class.
+ return $this->loadHTML(stream_get_contents($file));
+ }
+
+ $input = new FileInputStream($file);
+ return $this->parse($input);
+ }
+
+ /**
+ * Parse an HTML string.
+ *
+ * Take a string of HTML 5 (or earlier) and parse it into a
+ * DOMDocument.
+ *
+ * @param string $string The HTML 5 (or earlier) markup to parse.
+ * @param array $options
+ * An array of options.
+ * @return \DOMDocument
+ * A DOM document. DOM is part of libxml, which is included with
+ * almost all distributions of PHP.
+ */
+ public function loadHTML($string, $options = NULL) {
+ $input = new StringInputStream($string);
+ return $this->parse($input);
+ }
+
+ /**
+ * Convenience function to load an HTML file.
+ *
+ * This is here to provide backwards compatibility with the
+ * PHP DOM implementation. It simply calls load().
+ */
+ public function loadHTMLFile($file, $options = NULL) {
+ return $this->load($file, $options);
+ }
/**
* Save a DOM into a given file as HTML5.
@@ -25,4 +93,18 @@ class HTML5 extends \HTML5\Parser {
$serializer = new \HTML5\Serializer\Serializer($dom);
return $serializer->saveHTML();
}
+
+ /**
+ * Parse an input stream.
+ */
+ protected function parse($input) {
+ $events = new DOMTreeBuilder();
+ $scanner = new Scanner($input);
+ $parser = new Tokenizer($scanner, $events);
+
+ $parser->parse();
+
+ return $events->document();
+ }
+
}