summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTechnosophos <[email protected]>2013-04-11 13:18:56 -0500
committerTechnosophos <[email protected]>2013-04-11 13:18:56 -0500
commit1678e73e34b167ba0312d1b65797363ea8f62fe2 (patch)
treec9e9397dce845605e550ca67d10f9e1666257730
parent64eaa3365d74b58de23bb7787844ab3ead4b0b20 (diff)
Moved UTF-8 character check out to UTF8Utils.
-rw-r--r--src/HTML5/Parser/StringInputStream.php9
-rw-r--r--src/HTML5/Parser/UTF8Utils.php57
2 files changed, 65 insertions, 1 deletions
diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php
index edd0399..5f41c21 100644
--- a/src/HTML5/Parser/StringInputStream.php
+++ b/src/HTML5/Parser/StringInputStream.php
@@ -65,7 +65,14 @@ class StringInputStream implements InputStream {
public function __construct($data, $encoding = 'UTF-8') {
$data = $this->convertToUTF8($data, $encoding);
- $this->checkForIllegalCodepoints($data);
+
+ // There is good reason to question whether it makes sense to
+ // do this here, since most of these checks are done during
+ // parsing, and since this check doesn't actually *do* anything.
+ $e = UTF8Utils::checkForIllegalCodepoints($data);
+ if (!empty($e)) {
+ throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
+ }
$data = $this->replaceLinefeeds($data);
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 71fcd10..0902050 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -31,4 +31,61 @@ class UTF8Utils {
return array_sum(array_slice($count, 0, 0x80)) +
array_sum(array_slice($count, 0xC2, 0x33));
}
+
+  /**
+   * Checks for Unicode code points that are not valid in a document.
+   *
+   * @param string $data
+   *   A string to analyze.
+   * @return array
+   *   An array of (string) error messages produced by the scanning.
+   * @throws \Exception if the PCRE functions are not available.
+   */
+  public static function checkForIllegalCodepoints($data) {
+    if (!function_exists('preg_match_all')) {
+      throw new \Exception('The PCRE library is not loaded or is not available.');
+    }
+
+    // Vestigial error handling: one message is collected per offending character.
+    $errors = array();
+
+    /* All U+0000 NULL characters in the input must be replaced
+       by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
+       characters is a parse error. */
+    for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
+      $errors[] = 'null-character';
+    }
+
+    /* Any occurrences of any characters in the ranges U+0001 to
+       U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
+       U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
+       characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
+       U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
+       U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
+       U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
+       U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
+       U+10FFFF are parse errors. (These are all control characters
+       or permanently undefined Unicode characters.) */
+    $count = preg_match_all(
+      '/(?:
+ [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
+ |
+ \xC2[\x80-\x9F] # U+0080 to U+009F
+ |
+ \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
+ |
+ \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
+ |
+ \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
+ |
+ [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
+ )/x',
+      $data,
+      $matches
+    );
+    for ($i = 0; $i < $count; $i++) {
+      $errors[] = 'invalid-codepoint';
+    }
+    return $errors;
+  }
}