From 1678e73e34b167ba0312d1b65797363ea8f62fe2 Mon Sep 17 00:00:00 2001
From: Technosophos <technosophos@gmail.com>
Date: Thu, 11 Apr 2013 13:18:56 -0500
Subject: Moved UTF-8 character check out to UTF8Utils.

---
 src/HTML5/Parser/StringInputStream.php |  9 +++++-
 src/HTML5/Parser/UTF8Utils.php         | 57 ++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php
index edd0399..5f41c21 100644
--- a/src/HTML5/Parser/StringInputStream.php
+++ b/src/HTML5/Parser/StringInputStream.php
@@ -65,7 +65,14 @@ class StringInputStream implements InputStream {
   public function __construct($data, $encoding = 'UTF-8') {
 
     $data = $this->convertToUTF8($data, $encoding);
-    $this->checkForIllegalCodepoints($data);
+
+    // There is good reason to question whether it makes sense to
+    // do this here, since most of these checks are done during
+    // parsing, and since this check doesn't actually *do* anything.
+    $e = UTF8Utils::checkForIllegalCodepoints($data);
+    if (!empty($e)) {
+      throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e));
+    }
 
     $data = $this->replaceLinefeeds($data);
 
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 71fcd10..0902050 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -31,4 +31,61 @@ class UTF8Utils {
     return array_sum(array_slice($count, 0, 0x80)) +
          array_sum(array_slice($count, 0xC2, 0x33));
   }
+
+  /**
+   * Checks for Unicode code points that are not valid in a document.
+   *
+   * @param string $data
+   *   A string to analyze.
+   * @return array
+   *   An array of (string) error messages produced by the scanning.
+   */
+  public static function checkForIllegalCodepoints($data) {
+    if (!function_exists('preg_match_all')) {
+      throw \Exception('The PCRE library is not loaded or is not available.');
+    }
+
+    // Vestigal error handling.
+    $errors = array();
+
+    /* All U+0000 NULL characters in the input must be replaced
+    by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
+    characters is a parse error. */
+    for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
+      $errors[] = 'null-character';
+    }
+
+    /* Any occurrences of any characters in the ranges U+0001 to
+    U+0008, U+000B,  U+000E to U+001F,  U+007F  to U+009F,
+    U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
+    characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
+    U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
+    U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
+    U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
+    U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
+    U+10FFFF are parse errors. (These are all control characters
+    or permanently undefined Unicode characters.) */
+    // Check PCRE is loaded.
+    $count = preg_match_all(
+      '/(?:
+        [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B,  U+000E to U+001F and U+007F
+      |
+        \xC2[\x80-\x9F] # U+0080 to U+009F
+      |
+        \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
+      |
+        \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
+      |
+        \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
+      |
+        [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
+      )/x',
+      $data,
+      $matches
+    );
+    for ($i = 0; $i < $count; $i++) {
+      $this[] =  'invalid-codepoint';
+    }
+    return $errors;
+  }
 }
-- 
cgit v1.2.3