diff options
author | Matt Butcher <[email protected]> | 2013-04-08 21:00:18 -0500 |
---|---|---|
committer | Matt Butcher <[email protected]> | 2013-04-08 21:00:18 -0500 |
commit | 31a282f8a5d4d7d3f6a53caf1f37b05609dad484 (patch) | |
tree | 3e124c68c8289ba81f0fff6ee5d5b250b90317a5 /src/HTML5/InputStream.php | |
parent | c61b018133b1113e5548c863fd21307ccb80fa35 (diff) |
Refactoring to make constructor readable.
Diffstat (limited to 'src/HTML5/InputStream.php')
-rw-r--r-- | src/HTML5/InputStream.php | 39 |
1 files changed, 22 insertions, 17 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php index ec0b88a..7712c65 100644 --- a/src/HTML5/InputStream.php +++ b/src/HTML5/InputStream.php @@ -52,10 +52,9 @@ class InputStream { public $errors = array(); /** - * @param $data Data to parse + * Convert data from the given encoding to UTF-8. */ - public function __construct($data) { - + protected function convertToUTF8($data, $encoding = 'UTF-8') { /* Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the @@ -72,13 +71,26 @@ class InputStream { // We previously had an mbstring implementation here, but that // implementation is heavily non-conforming, so it's been // omitted. - if (extension_loaded('iconv')) { + if (function_exists('iconv') && $encoding != 'auto') { // non-conforming $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data); - } else { + // MPB: Testing the newer mb_convert_encoding(). + } + elseif (function_exists('mb_convert_encoding')) { + $data = mb_convert_encoding($data, 'UTF-8', $encoding); + + } + else { // we can make a conforming native implementation throw new Exception('Not implemented, please install mbstring or iconv'); } + } + + /** + * @param $data Data to parse + */ + public function __construct($data) { + /* One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. */ @@ -102,19 +114,12 @@ class InputStream { to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage. */ - $data = str_replace( - array( - "\0", - "\r\n", - "\r" - ), - array( - "\xEF\xBF\xBD", - "\n", - "\n" - ), - $data + $crlfTable = array( + "\0" => "\xEF\xBF\xBD", + "\r\n" => "\n", + "\r" => "\n", ); + $data = strtr($data, $crlfTable); /* Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, |