diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/HTML5/Data.php | 11 | ||||
-rw-r--r-- | src/HTML5/InputStream.php | 3 | ||||
-rw-r--r-- | src/HTML5/Parser.php | 16 | ||||
-rw-r--r-- | src/HTML5/Tokenizer.php | 17 |
4 files changed, 29 insertions, 18 deletions
diff --git a/src/HTML5/Data.php b/src/HTML5/Data.php index 497345f..a7c865c 100644 --- a/src/HTML5/Data.php +++ b/src/HTML5/Data.php @@ -1,9 +1,14 @@ <?php +namespace HTML5; + // warning: this file is encoded in UTF-8! -class HTML5_Data -{ + +/** + * Character data. + */ +class Data { // at some point this should be moved to a .ser file. Another // possible optimization is to give UTF-8 bytes, not Unicode @@ -61,6 +66,8 @@ class HTML5_Data } public static function getNamedCharacterReferences() { + // Danger Will Robinson: This will prevent the opcode cache from + // caching the entity references. if (!self::$namedCharacterReferences) { self::$namedCharacterReferences = unserialize( file_get_contents(dirname(__FILE__) . '/named-character-references.ser')); diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php index f98b427..d3bd8ac 100644 --- a/src/HTML5/InputStream.php +++ b/src/HTML5/InputStream.php @@ -1,4 +1,5 @@ <?php +namespace HTML5; /* @@ -29,7 +30,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // /* */ indicates verbatim text from the HTML 5 specification // // indicates regular comments -class HTML5_InputStream { +class InputStream { /** * The string data we're parsing. */ diff --git a/src/HTML5/Parser.php b/src/HTML5/Parser.php index 5f9ca56..892ab54 100644 --- a/src/HTML5/Parser.php +++ b/src/HTML5/Parser.php @@ -1,14 +1,16 @@ <?php +namespace HTML5; -require_once dirname(__FILE__) . '/Data.php'; -require_once dirname(__FILE__) . '/InputStream.php'; -require_once dirname(__FILE__) . '/TreeBuilder.php'; -require_once dirname(__FILE__) . '/Tokenizer.php'; +# Use autoloader +#require_once dirname(__FILE__) . '/Data.php'; +#require_once dirname(__FILE__) . '/InputStream.php'; +#require_once dirname(__FILE__) . '/TreeBuilder.php'; +#require_once dirname(__FILE__) . '/Tokenizer.php'; /** * Outwards facing interface for HTML5. */ -class HTML5_Parser +class Parser { /** * Parses a full HTML document. @@ -17,7 +19,7 @@ class HTML5_Parser * @return Parsed HTML as DOMDocument */ static public function parse($text, $builder = null) { - $tokenizer = new HTML5_Tokenizer($text, $builder); + $tokenizer = new Tokenizer($text, $builder); $tokenizer->parse(); return $tokenizer->save(); } @@ -29,7 +31,7 @@ class HTML5_Parser * @return Parsed HTML as DOMDocument */ static public function parseFragment($text, $context = null, $builder = null) { - $tokenizer = new HTML5_Tokenizer($text, $builder); + $tokenizer = new Tokenizer($text, $builder); $tokenizer->parseFragment($context); return $tokenizer->save(); } diff --git a/src/HTML5/Tokenizer.php b/src/HTML5/Tokenizer.php index 0af0716..e27b16a 100644 --- a/src/HTML5/Tokenizer.php +++ b/src/HTML5/Tokenizer.php @@ -1,4 +1,5 @@ <?php +namespace HTML5; /* @@ -33,7 +34,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // all flags are in hyphenated form -class HTML5_Tokenizer { +class Tokenizer { /** * Points to an InputStream object. */ @@ -85,8 +86,8 @@ class HTML5_Tokenizer { * @param $data Data to parse */ public function __construct($data, $builder = null) { - $this->stream = new HTML5_InputStream($data); - if (!$builder) $this->tree = new HTML5_TreeBuilder; + $this->stream = new InputStream($data); + if (!$builder) $this->tree = new TreeBuilder; else $this->tree = $builder; $this->content_model = self::PCDATA; } @@ -2208,13 +2209,13 @@ class HTML5_Tokenizer { row with that number in the first column, and return a character token for the Unicode character given in the second column of that row. */ - $new_codepoint = HTML5_Data::getRealCodepoint($codepoint); + $new_codepoint = Data::getRealCodepoint($codepoint); if ($new_codepoint) { $this->emitToken(array( 'type' => self::PARSEERROR, 'data' => 'illegal-windows-1252-entity' )); - return HTML5_Data::utf8chr($new_codepoint); + return Data::utf8chr($new_codepoint); } else { /* Otherwise, if the number is greater than 0x10FFFF, then * this is a parse error. Return a U+FFFD REPLACEMENT @@ -2253,7 +2254,7 @@ class HTML5_Tokenizer { 'data' => 'illegal-codepoint-for-numeric-entity' )); } - return HTML5_Data::utf8chr($codepoint); + return Data::utf8chr($codepoint); } } @@ -2267,7 +2268,7 @@ class HTML5_Tokenizer { // What we actually do here is consume as much as we can while it // matches the start of one of the identifiers in the first column. - $refs = HTML5_Data::getNamedCharacterReferences(); + $refs = Data::getNamedCharacterReferences(); // Get the longest string which is the start of an identifier // ($chars) as well as the longest identifier which matches ($id) @@ -2342,7 +2343,7 @@ class HTML5_Tokenizer { /* Otherwise, return a character token for the character corresponding to the character reference name (as given by the second column of the named character references table). */ - return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id)); + return Data::utf8chr($codepoint) . substr($chars, strlen($id)); } } |