diff options
Diffstat (limited to 'src/HTML5/Parser/UTF8Utils.php')
-rw-r--r-- | src/HTML5/Parser/UTF8Utils.php | 83 |
1 files changed, 82 insertions, 1 deletions
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php index 0902050..0d692d1 100644 --- a/src/HTML5/Parser/UTF8Utils.php +++ b/src/HTML5/Parser/UTF8Utils.php @@ -1,4 +1,30 @@ <?php +/* + * + * Portions based on code from html5lib files with the following copyright: + +Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ namespace HTML5\Parser; /** * UTF-8 Utilities @@ -33,6 +59,61 @@ class UTF8Utils { } /** + * Convert data from the given encoding to UTF-8. + * + * This has not yet been tested with charactersets other than UTF-8. + * It should work with ISO-8859-1/-13 and standard Latin Win charsets. + * + * @param string $data + * The data to convert. + * @param string $encoding + * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php + */ + public static function convertToUTF8($data, $encoding = 'UTF-8') { + /* + * From the HTML5 spec: + Given an encoding, the bytes in the input stream must be + converted to Unicode characters for the tokeniser, as + described by the rules for that encoding, except that the + leading U+FEFF BYTE ORDER MARK character, if any, must not + be stripped by the encoding layer (it is stripped by the rule below). + + Bytes or sequences of bytes in the original byte stream that + could not be converted to Unicode characters must be converted + to U+FFFD REPLACEMENT CHARACTER code points. */ + + if (function_exists('iconv') && $encoding != 'auto') { + // fprintf(STDOUT, "iconv found\n"); + // iconv has the following behaviors: + // - Overlong representations are ignored. + // - Beyond Plane 16 is replaced with a lower char. + // - Incomplete sequences generate a warning. + $data = @iconv($encoding, 'UTF-8//IGNORE', $data); + } + // MPB: Testing the newer mb_convert_encoding(). This might need + // to be removed again. + elseif (function_exists('mb_convert_encoding')) { + fprintf(STDOUT, "MB found\n"); + // mb library has the following behaviors: + // - UTF-16 surrogates result in FALSE. + // - Overlongs and outside Plane 16 result in empty strings. + $data = mb_convert_encoding($data, 'UTF-8', $encoding); + } + else { + // we can make a conforming native implementation + throw new Exception('Not implemented, please install mbstring or iconv'); + } + + /* One leading U+FEFF BYTE ORDER MARK character must be + ignored if any are present. */ + if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + $data = substr($data, 3); + } + + return $data; + } + + /** * Checks for Unicode code points that are not valid in a document. * * @param string $data @@ -84,7 +165,7 @@ class UTF8Utils { $matches ); for ($i = 0; $i < $count; $i++) { - $this[] = 'invalid-codepoint'; + $errors[] = 'invalid-codepoint'; } return $errors; } |