summaryrefslogtreecommitdiff
path: root/src/HTML5
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-05-30 09:23:26 -0500
committerMatt Butcher <[email protected]>2013-05-30 09:23:26 -0500
commitb1cbd9b4cd488471651751678cd90575bbb74bc9 (patch)
tree6dcf0afce7206b4372adc1a1ddbc25698b38e24f /src/HTML5
parent0e149588548834bbfee7770fac8455cc404fb8ca (diff)
parentfffeafbfe08e306356acd50cf568ec5904da882c (diff)
Merge branch 'master' of github.com:Masterminds/html5-php
Diffstat (limited to 'src/HTML5')
-rw-r--r--src/HTML5/Parser/UTF8Utils.php31
1 files changed, 21 insertions, 10 deletions
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 022d628..974a670 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -86,7 +86,27 @@ class UTF8Utils {
could not be converted to Unicode characters must be converted
to U+FFFD REPLACEMENT CHARACTER code points. */
- if (function_exists('iconv') && $encoding != 'auto') {
+ // mb_convert_encoding is chosen over iconv because of a bug. The best
+ // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
+ // which contains links to the actual bug reports as well as workaround
+ // details.
+ if (function_exists('mb_convert_encoding')) {
+ // mb library has the following behaviors:
+ // - UTF-16 surrogates result in FALSE.
+ // - Overlongs and outside Plane 16 result in empty strings.
+
+ // Before we run mb_convert_encoding we need to tell it what to do with
+ // characters it does not know. This could be different than the parent
+ // application executing this library so we store the value, change it
+ // to our needs, and then change it back when we are done. This feels
+ // a little excessive and it would be great if there was a better way.
+ $save = ini_get('mbstring.substitute_character');
+ ini_set('mbstring.substitute_character', "none");
+ $data = mb_convert_encoding($data, 'UTF-8', $encoding);
+ ini_set('mbstring.substitute_character', $save);
+ }
+ // @todo Get iconv running in at least some environments if that is possible.
+ elseif (function_exists('iconv') && $encoding != 'auto') {
// fprintf(STDOUT, "iconv found\n");
// iconv has the following behaviors:
// - Overlong representations are ignored.
@@ -94,15 +114,6 @@ class UTF8Utils {
// - Incomplete sequences generate a warning.
$data = @iconv($encoding, 'UTF-8//IGNORE', $data);
}
- // MPB: Testing the newer mb_convert_encoding(). This might need
- // to be removed again.
- elseif (function_exists('mb_convert_encoding')) {
- fprintf(STDOUT, "MB found\n");
- // mb library has the following behaviors:
- // - UTF-16 surrogates result in FALSE.
- // - Overlongs and outside Plane 16 result in empty strings.
- $data = mb_convert_encoding($data, 'UTF-8', $encoding);
- }
else {
// we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');