summary | refs | log | tree | commit | diff
path: root/src/HTML5/Parser/StringInputStream.php
diff options
context:
space:
mode:
author: Technosophos <[email protected]> 2013-04-11 15:52:45 -0500
committer: Technosophos <[email protected]> 2013-04-11 15:52:45 -0500
commit: 4b48113eed8e21ccc6b8b4bbb310cbecdd85af65 (patch)
tree: 0c2c0072e38c3d9fffba9c80d9f2748b163f70a5 /src/HTML5/Parser/StringInputStream.php
parent: 1678e73e34b167ba0312d1b65797363ea8f62fe2 (diff)
Addressed UTF-8 encoding issues.
Neither iconv nor mbstring seems to be able to convert UTF-16 surrogate code points (U+D800 to U+DFFF) that appear in UTF-8 input into valid UTF-8. As I understand it, this is an extreme edge case. Still, the behavior in both cases is that the surrogates are stripped from the string. We now test for that condition.
Diffstat (limited to 'src/HTML5/Parser/StringInputStream.php')
-rw-r--r-- src/HTML5/Parser/StringInputStream.php 132
1 files changed, 12 insertions, 120 deletions
diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php
index 5f41c21..9aa0b73 100644
--- a/src/HTML5/Parser/StringInputStream.php
+++ b/src/HTML5/Parser/StringInputStream.php
@@ -62,17 +62,18 @@ class StringInputStream implements InputStream {
*
* @param $data Data to parse
*/
- public function __construct($data, $encoding = 'UTF-8') {
+ public function __construct($data, $encoding = 'UTF-8', $debug = '') {
- $data = $this->convertToUTF8($data, $encoding);
+ $data = UTF8Utils::convertToUTF8($data, $encoding);
+ if ($debug) fprintf(STDOUT, $debug, $data, strlen($data));
// There is good reason to question whether it makes sense to
// do this here, since most of these checks are done during
// parsing, and since this check doesn't actually *do* anything.
- $e = UTF8Utils::checkForIllegalCodepoints($data);
- if (!empty($e)) {
- throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
- }
+ $this->errors = UTF8Utils::checkForIllegalCodepoints($data);
+ //if (!empty($e)) {
+ // throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
+ //}
$data = $this->replaceLinefeeds($data);
@@ -82,63 +83,6 @@ class StringInputStream implements InputStream {
}
/**
- * Convert data from the given encoding to UTF-8.
- *
- * This has not yet been tested with charactersets other than UTF-8.
- * It should work with ISO-8859-1/-13 and standard Latin Win charsets.
- *
- * @param string $data
- * The data to convert.
- * @param string $encoding
- * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
- */
- protected function convertToUTF8($data, $encoding = 'UTF-8') {
- /* Given an encoding, the bytes in the input stream must be
- converted to Unicode characters for the tokeniser, as
- described by the rules for that encoding, except that the
- leading U+FEFF BYTE ORDER MARK character, if any, must not
- be stripped by the encoding layer (it is stripped by the rule below).
-
- Bytes or sequences of bytes in the original byte stream that
- could not be converted to Unicode characters must be converted
- to U+FFFD REPLACEMENT CHARACTER code points. */
-
- // XXX currently assuming input data is UTF-8; once we
- // build encoding detection this will no longer be the case
- //
- // We previously had an mbstring implementation here, but that
- // implementation is heavily non-conforming, so it's been
- // omitted.
- if (function_exists('iconv') && $encoding != 'auto') {
- // iconv has the following behaviors:
- // - Overlong representations are ignored.
- // - Beyond Plane 16 is replaced with a lower char.
- // - Incomplete sequences generate a warning.
- $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
- }
- // MPB: Testing the newer mb_convert_encoding(). This might need
- // to be removed again.
- elseif (function_exists('mb_convert_encoding')) {
- // mb library has the following behaviors:
- // - UTF-16 surrogates result in FALSE.
- // - Overlongs and outside Plane 16 result in empty strings.
- $data = mb_convert_encoding($data, 'UTF-8', $encoding);
- }
- else {
- // we can make a conforming native implementation
- throw new Exception('Not implemented, please install mbstring or iconv');
- }
-
- /* One leading U+FEFF BYTE ORDER MARK character must be
- ignored if any are present. */
- if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
- $data = substr($data, 3);
- }
-
- return $data;
- }
-
- /**
* Replace linefeed characters according to the spec.
*/
protected function replaceLinefeeds($data) {
@@ -158,62 +102,6 @@ class StringInputStream implements InputStream {
}
/**
- * Checks for Unicode code points that are not valid in a document.
- *
- * This stores a parse error for each error that is found.
- */
- protected function checkForIllegalCodepoints($data) {
- if (!function_exists('preg_match_all')) {
- throw \Exception('The PCRE library is not loaded or is not available.');
- }
-
- /* All U+0000 NULL characters in the input must be replaced
- by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
- characters is a parse error. */
- for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
- $this->errors[] = array(
- 'type' => Tokenizer::PARSEERROR,
- 'data' => 'null-character'
- );
- }
-
- /* Any occurrences of any characters in the ranges U+0001 to
- U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
- U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
- characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
- U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
- U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
- U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
- U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
- U+10FFFF are parse errors. (These are all control characters
- or permanently undefined Unicode characters.) */
- // Check PCRE is loaded.
- $count = preg_match_all(
- '/(?:
- [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
- |
- \xC2[\x80-\x9F] # U+0080 to U+009F
- |
- \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
- |
- \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
- |
- \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
- |
- [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
- )/x',
- $data,
- $matches
- );
- for ($i = 0; $i < $count; $i++) {
- $this->errors[] = array(
- 'type' => Tokenizer::PARSEERROR,
- 'data' => 'invalid-codepoint'
- );
- }
- }
-
- /**
* Returns the current line that the tokenizer is at.
*/
public function currentLine() {
@@ -319,6 +207,10 @@ class StringInputStream implements InputStream {
* end of the file.
*
* @note This performs bounds checking
+ *
+ * @return string
+ * Returns the remaining text. If called when the InputStream is
+ * already exhausted, it returns an empty string.
*/
public function remainingChars() {
if ($this->char < $this->EOF) {
@@ -326,7 +218,7 @@ class StringInputStream implements InputStream {
$this->char = $this->EOF;
return $data;
}
- return FALSE;
+ return '';//FALSE;
}
/**