summary | refs | log | tree | commit | diff
path: root/src/HTML5
diff options
context:
space:
mode:
author: Matt Butcher <[email protected]> 2013-04-09 08:35:33 -0500
committer: Matt Butcher <[email protected]> 2013-04-09 08:35:33 -0500
commit: fcdfdc44c59f7f965003b9de34e2a48523b3ae60 (patch)
tree: 5d698bcee4e4949686dfe26dc953dad3630e0a4c /src/HTML5
parent: 23abcc27371a32f2409284f09ab50fdc50011018 (diff)
Continued refactoring of InputStream.
Diffstat (limited to 'src/HTML5')
-rw-r--r-- src/HTML5/InputStream.php 41
1 files changed, 24 insertions, 17 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php
index 9d65d00..75675f8 100644
--- a/src/HTML5/InputStream.php
+++ b/src/HTML5/InputStream.php
@@ -52,6 +52,23 @@ class InputStream {
public $errors = array();
/**
+ * Create a new InputStream wrapper.
+ *
+ * @param $data Data to parse
+ */
+ public function __construct($data, $encoding = 'UTF-8') {
+
+ $data = $this->convertToUTF8($data, $encoding);
+ $this->checkForIllegalCodepoints($data);
+
+ $data = $this->replaceLinefeeds($data);
+
+ $this->data = $data;
+ $this->char = 0;
+ $this->EOF = strlen($data);
+ }
+
+ /**
* Convert data from the given encoding to UTF-8.
*
* This has not yet been tested with charactersets other than UTF-8.
@@ -80,12 +97,18 @@ class InputStream {
// implementation is heavily non-conforming, so it's been
// omitted.
if (function_exists('iconv') && $encoding != 'auto') {
- // non-conforming
+ // iconv has the following behaviors:
+ // - Overlong representations are ignored.
+ // - Beyond Plane 16 is replaced with a lower char.
+ // - Incomplete sequences generate a warning.
$data = @iconv($encoding, 'UTF-8//IGNORE', $data);
}
// MPB: Testing the newer mb_convert_encoding(). This might need
// to be removed again.
elseif (function_exists('mb_convert_encoding')) {
+ // mb library has the following behaviors:
+ // - UTF-16 surrogates result in FALSE.
+ // - Overlongs and outside Plane 16 result in empty strings.
$data = mb_convert_encoding($data, 'UTF-8', $encoding);
}
else {
@@ -177,22 +200,6 @@ class InputStream {
}
}
- /**
- * Create a new InputStream wrapper.
- *
- * @param $data Data to parse
- */
- public function __construct($data, $encoding = 'UTF-8') {
-
- $data = $this->convertToUTF8($data, $encoding);
- $data = $this->replaceLinefeeds($data);
-
- $this->checkForIllegalCodepoints($data);
-
- $this->data = $data;
- $this->char = 0;
- $this->EOF = strlen($data);
- }
/**
* Returns the current line that the tokenizer is at.