summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMatt Butcher <[email protected]>2013-04-08 21:20:23 -0500
committerMatt Butcher <[email protected]>2013-04-08 21:20:23 -0500
commitd58a72da4aca61fe9941db0431fdcdcb915342d0 (patch)
treeaa2b01f8c07e66ee0fa0f8584eef3fa7ac6aa5f6 /src
parent31a282f8a5d4d7d3f6a53caf1f37b05609dad484 (diff)
Applied Demeter's Law.
Diffstat (limited to 'src')
-rw-r--r--src/HTML5/InputStream.php119
1 files changed, 74 insertions, 45 deletions
diff --git a/src/HTML5/InputStream.php b/src/HTML5/InputStream.php
index 7712c65..9d65d00 100644
--- a/src/HTML5/InputStream.php
+++ b/src/HTML5/InputStream.php
@@ -53,6 +53,14 @@ class InputStream {
/**
* Convert data from the given encoding to UTF-8.
+ *
+ * This has not yet been tested with charactersets other than UTF-8.
+ * It should work with ISO-8859-1/-13 and standard Latin Win charsets.
+ *
+ * @param string $data
+ * The data to convert.
+ * @param string $encoding
+ * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
*/
protected function convertToUTF8($data, $encoding = 'UTF-8') {
/* Given an encoding, the bytes in the input stream must be
@@ -73,24 +81,17 @@ class InputStream {
// omitted.
if (function_exists('iconv') && $encoding != 'auto') {
// non-conforming
- $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
- // MPB: Testing the newer mb_convert_encoding().
+ $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
}
+ // MPB: Testing the newer mb_convert_encoding(). This might need
+ // to be removed again.
elseif (function_exists('mb_convert_encoding')) {
$data = mb_convert_encoding($data, 'UTF-8', $encoding);
-
}
else {
// we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');
}
- }
-
- /**
- * @param $data Data to parse
- */
- public function __construct($data) {
-
/* One leading U+FEFF BYTE ORDER MARK character must be
ignored if any are present. */
@@ -98,15 +99,13 @@ class InputStream {
$data = substr($data, 3);
}
- /* All U+0000 NULL characters in the input must be replaced
- by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
- characters is a parse error. */
- for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
- $this->errors[] = array(
- 'type' => HTML5_Tokenizer::PARSEERROR,
- 'data' => 'null-character'
- );
- }
+ return $data;
+ }
+
+ /**
+ * Replace linefeed characters according to the spec.
+ */
+ protected function replaceLinefeeds($data) {
/* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
(LF) characters are treated specially. Any CR characters
that are followed by LF characters must be removed, and any
@@ -119,7 +118,28 @@ class InputStream {
"\r\n" => "\n",
"\r" => "\n",
);
- $data = strtr($data, $crlfTable);
+ return strtr($data, $crlfTable);
+ }
+
+ /**
+ * Checks for Unicode code points that are not valid in a document.
+ *
+ * This stores a parse error for each error that is found.
+ */
+ protected function checkForIllegalCodepoints($data) {
+ if (!function_exists('preg_match_all')) {
+ throw \Exception('The PCRE library is not loaded or is not available.');
+ }
+
+ /* All U+0000 NULL characters in the input must be replaced
+ by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
+ characters is a parse error. */
+ for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
+ $this->errors[] = array(
+ 'type' => Tokenizer::PARSEERROR,
+ 'data' => 'null-character'
+ );
+ }
/* Any occurrences of any characters in the ranges U+0001 to
U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
@@ -132,33 +152,42 @@ class InputStream {
U+10FFFF are parse errors. (These are all control characters
or permanently undefined Unicode characters.) */
// Check PCRE is loaded.
- if (extension_loaded('pcre')) {
- $count = preg_match_all(
- '/(?:
- [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
- |
- \xC2[\x80-\x9F] # U+0080 to U+009F
- |
- \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
- |
- \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
- |
- \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
- |
- [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
- )/x',
- $data,
- $matches
+ $count = preg_match_all(
+ '/(?:
+ [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
+ |
+ \xC2[\x80-\x9F] # U+0080 to U+009F
+ |
+ \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
+ |
+ \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
+ |
+ \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
+ |
+ [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
+ )/x',
+ $data,
+ $matches
+ );
+ for ($i = 0; $i < $count; $i++) {
+ $this->errors[] = array(
+ 'type' => Tokenizer::PARSEERROR,
+ 'data' => 'invalid-codepoint'
);
- for ($i = 0; $i < $count; $i++) {
- $this->errors[] = array(
- 'type' => HTML5_Tokenizer::PARSEERROR,
- 'data' => 'invalid-codepoint'
- );
- }
- } else {
- // XXX: Need non-PCRE impl, probably using substr_count
}
+ }
+
+ /**
+ * Create a new InputStream wrapper.
+ *
+ * @param $data Data to parse
+ */
+ public function __construct($data, $encoding = 'UTF-8') {
+
+ $data = $this->convertToUTF8($data, $encoding);
+ $data = $this->replaceLinefeeds($data);
+
+ $this->checkForIllegalCodepoints($data);
$this->data = $data;
$this->char = 0;