summary | refs | log | tree | commit | diff
path: root/src/HTML5/Parser/UTF8Utils.php
diff options
context:
space:
mode:
Diffstat (limited to 'src/HTML5/Parser/UTF8Utils.php')
-rw-r--r-- src/HTML5/Parser/UTF8Utils.php 83
1 files changed, 82 insertions, 1 deletions
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 0902050..0d692d1 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -1,4 +1,30 @@
<?php
+/*
+ *
+ * Portions based on code from html5lib files with the following copyright:
+
+Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
namespace HTML5\Parser;
/**
* UTF-8 Utilities
@@ -33,6 +59,61 @@ class UTF8Utils {
}
/**
+ * Convert data from the given encoding to UTF-8.
+ *
+ * This has not yet been tested with character sets other than UTF-8.
+ * It should work with ISO-8859-1/-13 and standard Latin Win charsets.
+ *
+ * iconv is preferred when it is available; mbstring is used when iconv is
+ * missing or when $encoding is 'auto' (which is never handed to iconv).
+ * After conversion, one leading UTF-8 byte order mark is removed.
+ *
+ * @param string $data
+ * The data to convert.
+ * @param string $encoding
+ * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
+ *
+ * @return string|false
+ * The converted data without a leading BOM. iconv() signals failure by
+ * returning FALSE, and that value is passed through unchanged.
+ *
+ * @throws Exception
+ * When neither the iconv nor the mbstring extension can be used.
+ */
+ public static function convertToUTF8($data, $encoding = 'UTF-8') {
+   /*
+    * From the HTML5 spec:
+    Given an encoding, the bytes in the input stream must be
+    converted to Unicode characters for the tokeniser, as
+    described by the rules for that encoding, except that the
+    leading U+FEFF BYTE ORDER MARK character, if any, must not
+    be stripped by the encoding layer (it is stripped by the rule below).
+
+    Bytes or sequences of bytes in the original byte stream that
+    could not be converted to Unicode characters must be converted
+    to U+FFFD REPLACEMENT CHARACTER code points. */
+
+   if (function_exists('iconv') && $encoding !== 'auto') {
+     // iconv has the following behaviors:
+     // - Overlong representations are ignored.
+     // - Beyond Plane 16 is replaced with a lower char.
+     // - Incomplete sequences generate a warning.
+     // The warning is suppressed with @; //IGNORE asks iconv to drop
+     // characters it cannot represent instead of aborting.
+     $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
+   }
+   // MPB: Testing the newer mb_convert_encoding(). This might need
+   // to be removed again.
+   elseif (function_exists('mb_convert_encoding')) {
+     // No debug output here: a library must not write to the process's
+     // standard output, and the STDOUT constant only exists under the CLI SAPI.
+     // mb library has the following behaviors:
+     // - UTF-16 surrogates result in FALSE.
+     // - Overlongs and outside Plane 16 result in empty strings.
+     $data = mb_convert_encoding($data, 'UTF-8', $encoding);
+   }
+   else {
+     // we can make a conforming native implementation
+     // NOTE(review): unqualified Exception resolves inside the HTML5\Parser
+     // namespace unless it is imported -- confirm the intended class exists.
+     throw new Exception('Not implemented, please install mbstring or iconv');
+   }
+
+   /* One leading U+FEFF BYTE ORDER MARK character must be
+      ignored if any are present. */
+   if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
+     $data = substr($data, 3);
+   }
+
+   return $data;
+ }
+
+ /**
* Checks for Unicode code points that are not valid in a document.
*
* @param string $data
@@ -84,7 +165,7 @@ class UTF8Utils {
$matches
);
for ($i = 0; $i < $count; $i++) {
- $this[] = 'invalid-codepoint';
+ $errors[] = 'invalid-codepoint';
}
return $errors;
}