summaryrefslogtreecommitdiff
path: root/src/HTML5/InputStream.php
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-08 21:00:18 -0500
committerMatt Butcher <[email protected]>2013-04-08 21:00:18 -0500
commit31a282f8a5d4d7d3f6a53caf1f37b05609dad484 (patch)
tree3e124c68c8289ba81f0fff6ee5d5b250b90317a5 /src/HTML5/InputStream.php
parentc61b018133b1113e5548c863fd21307ccb80fa35 (diff)
Refactoring to make constructor readable.
Diffstat (limited to 'src/HTML5/InputStream.php')
-rw-r--r--src/HTML5/InputStream.php39
1 files changed, 22 insertions, 17 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php
index ec0b88a..7712c65 100644
--- a/src/HTML5/InputStream.php
+++ b/src/HTML5/InputStream.php
@@ -52,10 +52,9 @@ class InputStream {
public $errors = array();
/**
- * @param $data Data to parse
+ * Convert data from the given encoding to UTF-8.
*/
- public function __construct($data) {
-
+ protected function convertToUTF8($data, $encoding = 'UTF-8') {
/* Given an encoding, the bytes in the input stream must be
converted to Unicode characters for the tokeniser, as
described by the rules for that encoding, except that the
@@ -72,13 +71,26 @@ class InputStream {
// We previously had an mbstring implementation here, but that
// implementation is heavily non-conforming, so it's been
// omitted.
- if (extension_loaded('iconv')) {
+ if (function_exists('iconv') && $encoding != 'auto') {
// non-conforming
$data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
- } else {
+ // MPB: Testing the newer mb_convert_encoding().
+ }
+ elseif (function_exists('mb_convert_encoding')) {
+ $data = mb_convert_encoding($data, 'UTF-8', $encoding);
+
+ }
+ else {
// we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');
}
+ }
+
+ /**
+ * @param $data Data to parse
+ */
+ public function __construct($data) {
+
/* One leading U+FEFF BYTE ORDER MARK character must be
ignored if any are present. */
@@ -102,19 +114,12 @@ class InputStream {
to LF characters. Thus, newlines in HTML DOMs are represented
by LF characters, and there are never any CR characters in the
input to the tokenization stage. */
- $data = str_replace(
- array(
- "\0",
- "\r\n",
- "\r"
- ),
- array(
- "\xEF\xBF\xBD",
- "\n",
- "\n"
- ),
- $data
+ $crlfTable = array(
+ "\0" => "\xEF\xBF\xBD",
+ "\r\n" => "\n",
+ "\r" => "\n",
);
+ $data = strtr($data, $crlfTable);
/* Any occurrences of any characters in the ranges U+0001 to
U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,