summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTitouan Galopin <[email protected]>2018-11-03 01:03:34 +0100
committerTitouan Galopin <[email protected]>2018-11-03 01:36:52 +0100
commit321ed9626c091f1f4dcef8223d88ee88a400a241 (patch)
tree32a565fac06feba41230228a806dc22d6e8f17c3
parent7453ab08dc4c8f65d5db52fd2c6b8943d59bf95b (diff)
Improve performance by relying on a native string instead of InputStream
-rw-r--r--README.md5
-rw-r--r--src/HTML5.php35
-rw-r--r--src/HTML5/Parser/FileInputStream.php3
-rw-r--r--src/HTML5/Parser/InputStream.php4
-rw-r--r--src/HTML5/Parser/Scanner.php237
-rw-r--r--src/HTML5/Parser/StringInputStream.php9
-rw-r--r--test/HTML5/Parser/DOMTreeBuilderTest.php11
-rw-r--r--test/HTML5/Parser/FileInputStreamTest.html10
-rw-r--r--test/HTML5/Parser/FileInputStreamTest.php195
-rw-r--r--test/HTML5/Parser/ScannerTest.php40
-rw-r--r--test/HTML5/Parser/StringInputStreamTest.php327
-rw-r--r--test/HTML5/Parser/TokenizerTest.php4
-rw-r--r--test/HTML5/Parser/TreeBuildingRulesTest.php5
-rw-r--r--test/HTML5/Serializer/TraverserTest.php40
14 files changed, 298 insertions, 627 deletions
diff --git a/README.md b/README.md
index 862579f..8d89966 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,8 @@ Here is how you use the high-level `HTML5` library API:
<?php
// Assuming you installed from Composer:
require "vendor/autoload.php";
-use Masterminds\HTML5;
+use Masterminds\HTML5;
// An example HTML document:
$html = <<< 'HERE'
@@ -115,8 +115,6 @@ The following options are supported:
This library provides the following low-level APIs that you can use to
create more customized HTML5 tools:
-- An `InputStream` abstraction that can work with different kinds of
-input source (not just files and strings).
- A SAX-like event-based parser that you can hook into for special kinds
of parsing.
- A flexible error-reporting mechanism that can be tuned to document
@@ -130,7 +128,6 @@ is well-documented.
The parser is designed as follows:
-- The `InputStream` portion handles direct I/O.
- The `Scanner` handles scanning on behalf of the parser.
- The `Tokenizer` requests data off of the scanner, parses it, classifies
it, and sends it to an `EventHandler`. It is a *recursive descent parser.*
diff --git a/src/HTML5.php b/src/HTML5.php
index 6c57553..a3db74c 100644
--- a/src/HTML5.php
+++ b/src/HTML5.php
@@ -55,7 +55,7 @@ class HTML5
*
* The rules governing parsing are set out in the HTML 5 spec.
*
- * @param string $file
+ * @param string|resource $file
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* byte of input.
@@ -68,13 +68,10 @@ class HTML5
{
// Handle the case where file is a resource.
if (is_resource($file)) {
- // FIXME: We need a StreamInputStream class.
- return $this->loadHTML(stream_get_contents($file), $options);
+ return $this->parse(stream_get_contents($file), $options);
}
- $input = new FileInputStream($file);
-
- return $this->parse($input, $options);
+ return $this->parse(file_get_contents($file), $options);
}
/**
@@ -92,9 +89,7 @@ class HTML5
*/
public function loadHTML($string, array $options = array())
{
- $input = new StringInputStream($string);
-
- return $this->parse($input, $options);
+ return $this->parse($string, $options);
}
/**
@@ -121,19 +116,15 @@ class HTML5
/**
* Parse a HTML fragment from a string.
*
- * @param string $string
- * The html5 fragment as a string.
- * @param array $options
- * Configuration options when parsing the HTML
+ * @param string $string The HTML5 fragment as a string.
+ * @param array $options Configuration options when parsing the HTML
*
* @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
* almost all distributions of PHP.
*/
public function loadHTMLFragment($string, array $options = array())
{
- $input = new StringInputStream($string);
-
- return $this->parseFragment($input, $options);
+ return $this->parseFragment($string, $options);
}
/**
@@ -162,12 +153,12 @@ class HTML5
* Lower-level loading function. This requires the input as a string
* instead of a file or resource.
*
- * @param InputStream $input
+ * @param string $input
* @param array $options
*
* @return \DOMDocument
*/
- public function parse(InputStream $input, array $options = array())
+ public function parse($input, array $options = array())
{
$this->errors = array();
$options = array_merge($this->getOptions(), $options);
@@ -187,14 +178,12 @@ class HTML5
* Lower-level loading function. This requires the input as a string
* instead of a file or resource.
*
- * @param InputStream $input
- * The input data to parse in the form of a InputStream instance.
- * @param array $options
- * An array of options
+ * @param string $input The input data to parse in the form of a string.
+ * @param array $options An array of options
*
* @return \DOMDocumentFragment
*/
- public function parseFragment(InputStream $input, array $options = array())
+ public function parseFragment($input, array $options = array())
{
$options = array_merge($this->getOptions(), $options);
$events = new DOMTreeBuilder(true, $options);
diff --git a/src/HTML5/Parser/FileInputStream.php b/src/HTML5/Parser/FileInputStream.php
index fbf006d..76bd17b 100644
--- a/src/HTML5/Parser/FileInputStream.php
+++ b/src/HTML5/Parser/FileInputStream.php
@@ -11,11 +11,10 @@ namespace Masterminds\HTML5\Parser;
* really like to rewrite this class to efficiently handle lower level
* stream reads (and thus efficiently handle large documents).
*
- * @todo A buffered input stream would be useful.
+ * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
*/
class FileInputStream extends StringInputStream implements InputStream
{
-
/**
* Load a file input stream.
*
diff --git a/src/HTML5/Parser/InputStream.php b/src/HTML5/Parser/InputStream.php
index 0bdc803..e4a106a 100644
--- a/src/HTML5/Parser/InputStream.php
+++ b/src/HTML5/Parser/InputStream.php
@@ -1,4 +1,5 @@
<?php
+
namespace Masterminds\HTML5\Parser;
/**
@@ -9,10 +10,11 @@ namespace Masterminds\HTML5\Parser;
*
* Currently provided InputStream implementations include
* FileInputStream and StringInputStream.
+ *
+ * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
*/
interface InputStream extends \Iterator
{
-
/**
* Returns the current line that is being consumed.
*
diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php
index f605c69..dc685bb 100644
--- a/src/HTML5/Parser/Scanner.php
+++ b/src/HTML5/Parser/Scanner.php
@@ -1,34 +1,64 @@
<?php
namespace Masterminds\HTML5\Parser;
+use Masterminds\HTML5\Exception;
+
/**
- * The scanner.
- *
- * This scans over an input stream.
+ * The scanner scans over a given data input to react appropriately to characters.
*/
class Scanner
{
-
const CHARS_HEX = 'abcdefABCDEF01234567890';
-
const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
-
const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
- protected $is;
+ /**
+ * The string data we're parsing.
+ */
+ private $data;
- // Flipping this to true will give minisculely more debugging info.
- public $debug = false;
+ /**
+ * The current integer byte position we are in $data
+ */
+ private $char;
+
+ /**
+ * Length of $data; when $char === $EOF, we are at the end-of-file.
+ */
+ private $EOF;
+
+ /**
+ * Parse errors.
+ */
+ public $errors = array();
/**
* Create a new Scanner.
*
- * @param \Masterminds\HTML5\Parser\InputStream $input
- * An InputStream to be scanned.
+ * @param string $data Data to parse
+ * @param string $encoding The encoding to use for the data.
+ *
+ * @throws Exception If the given data cannot be encoded to UTF-8.
*/
- public function __construct($input)
+ public function __construct($data, $encoding = 'UTF-8')
{
- $this->is = $input;
+ if ($data instanceof InputStream) {
+ @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
+ $data = (string) $data;
+ }
+
+ $data = UTF8Utils::convertToUTF8($data, $encoding);
+
+ // There is good reason to question whether it makes sense to
+ // do this here, since most of these checks are done during
+ // parsing, and since this check doesn't actually *do* anything.
+ $this->errors = UTF8Utils::checkForIllegalCodepoints($data);
+
+ $data = $this->replaceLinefeeds($data);
+
+ $this->data = $data;
+ $this->char = 0;
+ $this->EOF = strlen($data);
}
/**
@@ -38,7 +68,7 @@ class Scanner
*/
public function position()
{
- return $this->is->key();
+ return $this->char;
}
/**
@@ -48,7 +78,11 @@ class Scanner
*/
public function peek()
{
- return $this->is->peek();
+ if (($this->char + 1) <= $this->EOF) {
+ return $this->data[$this->char + 1];
+ }
+
+ return false;
}
/**
@@ -60,11 +94,10 @@ class Scanner
*/
public function next()
{
- $this->is->next();
- if ($this->is->valid()) {
- if ($this->debug)
- fprintf(STDOUT, "> %s\n", $this->is->current());
- return $this->is->current();
+ $this->char++;
+
+ if ($this->char < $this->EOF) {
+ return $this->data[$this->char];
}
return false;
@@ -79,8 +112,8 @@ class Scanner
*/
public function current()
{
- if ($this->is->valid()) {
- return $this->is->current();
+ if ($this->char < $this->EOF) {
+ return $this->data[$this->char];
}
return false;
@@ -88,6 +121,8 @@ class Scanner
/**
* Silently consume N chars.
+ *
+ * @param int $count
*/
public function consume($count = 1)
{
@@ -105,7 +140,9 @@ class Scanner
*/
public function unconsume($howMany = 1)
{
- $this->is->unconsume($howMany);
+ if (($this->char - $howMany) >= 0) {
+ $this->char = $this->char - $howMany;
+ }
}
/**
@@ -118,7 +155,7 @@ class Scanner
*/
public function getHex()
{
- return $this->is->charsWhile(static::CHARS_HEX);
+ return $this->doCharsWhile(static::CHARS_HEX);
}
/**
@@ -131,7 +168,7 @@ class Scanner
*/
public function getAsciiAlpha()
{
- return $this->is->charsWhile(static::CHARS_ALPHA);
+ return $this->doCharsWhile(static::CHARS_ALPHA);
}
/**
@@ -144,7 +181,7 @@ class Scanner
*/
public function getAsciiAlphaNum()
{
- return $this->is->charsWhile(static::CHARS_ALNUM);
+ return $this->doCharsWhile(static::CHARS_ALNUM);
}
/**
@@ -157,7 +194,7 @@ class Scanner
*/
public function getNumeric()
{
- return $this->is->charsWhile('0123456789');
+ return $this->doCharsWhile('0123456789');
}
/**
@@ -167,7 +204,7 @@ class Scanner
*/
public function whitespace()
{
- return $this->is->charsWhile("\n\t\f ");
+ return $this->doCharsWhile("\n\t\f ");
}
/**
@@ -177,23 +214,37 @@ class Scanner
*/
public function currentLine()
{
- return $this->is->currentLine();
+ if (empty($this->EOF) || $this->char == 0) {
+ return 1;
+ }
+
+ // Count the newlines that precede the current byte position (clamped to
+ // the end of the data); the current line number is that count plus one.
+ return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
}
/**
* Read chars until something in the mask is encountered.
+ *
+ * @param string $mask
+ *
+ * @return mixed
*/
public function charsUntil($mask)
{
- return $this->is->charsUntil($mask);
+ return $this->doCharsUntil($mask);
}
/**
* Read chars as long as the mask matches.
+ *
+ * @param string $mask
+ *
+ * @return string|false
*/
public function charsWhile($mask)
{
- return $this->is->charsWhile($mask);
+ return $this->doCharsWhile($mask);
}
/**
@@ -205,7 +256,29 @@ class Scanner
*/
public function columnOffset()
{
- return $this->is->columnOffset();
+ // Short circuit for the first char.
+ if ($this->char == 0) {
+ return 0;
+ }
+
+ // strrpos is weird, and the offset needs to be negative for what we
+ // want (i.e., the last \n before $this->char). This needs to not have
+ // one (to make it point to the next character, the one we want the
+ // position of) added to it because strrpos's behaviour includes the
+ // final offset byte.
+ $backwardFrom = $this->char - 1 - strlen($this->data);
+ $lastLine = strrpos($this->data, "\n", $backwardFrom);
+
+ // However, for here we want the length up until the next byte to be
+ // processed, so add one to the current byte ($this->char).
+ if ($lastLine !== false) {
+ $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
+ } else {
+ // No newline precedes the current position, so we are still on the first line.
+ $findLengthOf = substr($this->data, 0, $this->char);
+ }
+
+ return UTF8Utils::countChars($findLengthOf);
}
/**
@@ -217,6 +290,104 @@ class Scanner
*/
public function remainingChars()
{
- return $this->is->remainingChars();
+ if ($this->char < $this->EOF) {
+ $data = substr($this->data, $this->char);
+ $this->char = $this->EOF;
+
+ return $data;
+ }
+
+ return ''; // false;
+ }
+
+ /**
+ * Replace linefeed characters according to the spec.
+ *
+ * @param $data
+ *
+ * @return string
+ */
+ private function replaceLinefeeds($data)
+ {
+ /*
+ * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
+ * Any CR characters that are followed by LF characters must be removed, and any CR characters not
+ * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
+ * represented by LF characters, and there are never any CR characters in the input to the tokenization
+ * stage.
+ */
+ $crlfTable = array(
+ "\0" => "\xEF\xBF\xBD",
+ "\r\n" => "\n",
+ "\r" => "\n"
+ );
+
+ return strtr($data, $crlfTable);
+ }
+
+ /**
+ * Read to a particular match (or until $max bytes are consumed).
+ *
+ * This operates on byte sequences, not characters.
+ *
+ * Matches as far as possible until we reach a certain set of bytes
+ * and returns the matched substring.
+ *
+ * @param string $bytes
+ * Bytes to match.
+ * @param int $max
+ * Maximum number of bytes to scan.
+ * @return string|false The bytes read before the first byte found in $bytes (possibly an
+ * empty string), or false if the scanner is already at the end of the data.
+ */
+ private function doCharsUntil($bytes, $max = null)
+ {
+ if ($this->char >= $this->EOF) {
+ return false;
+ }
+
+ if ($max === 0 || $max) {
+ $len = strcspn($this->data, $bytes, $this->char, $max);
+ } else {
+ $len = strcspn($this->data, $bytes, $this->char);
+ }
+
+ $string = (string) substr($this->data, $this->char, $len);
+ $this->char += $len;
+
+ return $string;
+ }
+
+ /**
+ * Returns the string so long as $bytes matches.
+ *
+ * Matches as far as possible with a certain set of bytes
+ * and returns the matched substring.
+ *
+ * @param string $bytes
+ * A mask of bytes to match. If ANY byte in this mask matches the
+ * current char, the pointer advances and the char is part of the
+ * substring.
+ * @param int $max
+ * The max number of chars to read.
+ *
+ * @return string|false The matched bytes (possibly empty), or false if already at the end of the data.
+ */
+ private function doCharsWhile($bytes, $max = null)
+ {
+ if ($this->char >= $this->EOF) {
+ return false;
+ }
+
+ if ($max === 0 || $max) {
+ $len = strspn($this->data, $bytes, $this->char, $max);
+ } else {
+ $len = strspn($this->data, $bytes, $this->char);
+ }
+
+ $string = (string) substr($this->data, $this->char, $len);
+ $this->char += $len;
+
+ return $string;
}
}
diff --git a/src/HTML5/Parser/StringInputStream.php b/src/HTML5/Parser/StringInputStream.php
index 0973941..0118468 100644
--- a/src/HTML5/Parser/StringInputStream.php
+++ b/src/HTML5/Parser/StringInputStream.php
@@ -39,9 +39,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// - // indicates regular comments
+/**
+ * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
+ */
class StringInputStream implements InputStream
{
-
/**
* The string data we're parsing.
*/
@@ -88,6 +90,11 @@ class StringInputStream implements InputStream
$this->EOF = strlen($data);
}
+ public function __toString()
+ {
+ return $this->data;
+ }
+
/**
* Replace linefeed characters according to the spec.
*/
diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php
index 68cf612..ef1b3cb 100644
--- a/test/HTML5/Parser/DOMTreeBuilderTest.php
+++ b/test/HTML5/Parser/DOMTreeBuilderTest.php
@@ -5,7 +5,6 @@
*/
namespace Masterminds\HTML5\Tests\Parser;
-use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
@@ -16,14 +15,14 @@ use Masterminds\HTML5\Parser\DOMTreeBuilder;
class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase
{
protected $errors = array();
+
/**
* Convenience function for parsing.
*/
protected function parse($string, array $options = array())
{
$treeBuilder = new DOMTreeBuilder(false, $options);
- $input = new StringInputStream($string);
- $scanner = new Scanner($input);
+ $scanner = new Scanner($string);
$parser = new Tokenizer($scanner, $treeBuilder);
$parser->parse();
@@ -38,8 +37,7 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase
protected function parseFragment($string)
{
$treeBuilder = new DOMTreeBuilder(true);
- $input = new StringInputStream($string);
- $scanner = new Scanner($input);
+ $scanner = new Scanner($string);
$parser = new Tokenizer($scanner, $treeBuilder);
$parser->parse();
@@ -600,8 +598,7 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase
$is = new InstructionProcessorMock();
$treeBuilder->setInstructionProcessor($is);
- $input = new StringInputStream($string);
- $scanner = new Scanner($input);
+ $scanner = new Scanner($string);
$parser = new Tokenizer($scanner, $treeBuilder);
$parser->parse();
diff --git a/test/HTML5/Parser/FileInputStreamTest.html b/test/HTML5/Parser/FileInputStreamTest.html
deleted file mode 100644
index a976e8b..0000000
--- a/test/HTML5/Parser/FileInputStreamTest.html
+++ /dev/null
@@ -1,10 +0,0 @@
-<!doctype html>
-<html lang="en">
- <head>
- <meta charset="utf-8">
- <title>Test</title>
- </head>
- <body>
- <p>This is a test.</p>
- </body>
-</html> \ No newline at end of file
diff --git a/test/HTML5/Parser/FileInputStreamTest.php b/test/HTML5/Parser/FileInputStreamTest.php
deleted file mode 100644
index 71dd828..0000000
--- a/test/HTML5/Parser/FileInputStreamTest.php
+++ /dev/null
@@ -1,195 +0,0 @@
-<?php
-namespace Masterminds\HTML5\Tests\Parser;
-
-use Masterminds\HTML5\Parser\FileInputStream;
-
-class FileInputStreamTest extends \Masterminds\HTML5\Tests\TestCase
-{
-
- public function testConstruct()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertInstanceOf('\Masterminds\HTML5\Parser\FileInputStream', $s);
- }
-
- public function testNext()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $s->next();
- $this->assertEquals('!', $s->current());
- $s->next();
- $this->assertEquals('d', $s->current());
- }
-
- public function testKey()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals(0, $s->key());
-
- $s->next();
- $this->assertEquals(1, $s->key());
- }
-
- public function testPeek()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals('!', $s->peek());
-
- $s->next();
- $this->assertEquals('d', $s->peek());
- }
-
- public function testCurrent()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals('<', $s->current());
-
- $s->next();
- $this->assertEquals('!', $s->current());
-
- $s->next();
- $this->assertEquals('d', $s->current());
- }
-
- public function testColumnOffset()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
- $this->assertEquals(0, $s->columnOffset());
- $s->next();
- $this->assertEquals(1, $s->columnOffset());
- $s->next();
- $this->assertEquals(2, $s->columnOffset());
- $s->next();
- $this->assertEquals(3, $s->columnOffset());
-
- // Make sure we get to the second line
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $this->assertEquals(0, $s->columnOffset());
-
- $s->next();
- $canary = $s->current(); // h
- $this->assertEquals('h', $canary);
- $this->assertEquals(1, $s->columnOffset());
- }
-
- public function testCurrentLine()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals(1, $s->currentLine());
-
- // Make sure we get to the second line
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $this->assertEquals(2, $s->currentLine());
-
- // Make sure we get to the third line
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $s->next();
- $this->assertEquals(3, $s->currentLine());
- }
-
- public function testRemainingChars()
- {
- $text = file_get_contents(__DIR__ . '/FileInputStreamTest.html');
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
- $this->assertEquals($text, $s->remainingChars());
-
- $text = substr(file_get_contents(__DIR__ . '/FileInputStreamTest.html'), 1);
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
- $s->next(); // Pop one.
- $this->assertEquals($text, $s->remainingChars());
- }
-
- public function testCharsUnitl()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals('', $s->charsUntil('<'));
- // Pointer at '<', moves to ' '
- $this->assertEquals('<!doctype', $s->charsUntil(' ', 20));
-
- // Pointer at ' ', moves to '>'
- $this->assertEquals(' html', $s->charsUntil('>'));
-
- // Pointer at '>', moves to '\n'.
- $this->assertEquals('>', $s->charsUntil("\n"));
-
- // Pointer at '\n', move forward then to the next'\n'.
- $s->next();
- $this->assertEquals('<html lang="en">', $s->charsUntil("\n"));
-
- // Ony get one of the spaces.
- $this->assertEquals("\n ", $s->charsUntil('<', 2));
-
- // Get the other space.
- $this->assertEquals(" ", $s->charsUntil('<'));
-
- // This should scan to the end of the file.
- $text = "<head>
- <meta charset=\"utf-8\">
- <title>Test</title>
- </head>
- <body>
- <p>This is a test.</p>
- </body>
-</html>";
- $this->assertEquals($text, $s->charsUntil("\t"));
- }
-
- public function testCharsWhile()
- {
- $s = new FileInputStream(__DIR__ . '/FileInputStreamTest.html');
-
- $this->assertEquals('<!', $s->charsWhile('!<'));
- $this->assertEquals('', $s->charsWhile('>'));
- $this->assertEquals('doctype', $s->charsWhile('odcyept'));
- $this->assertEquals(' htm', $s->charsWhile('html ', 4));
- }
-}
diff --git a/test/HTML5/Parser/ScannerTest.php b/test/HTML5/Parser/ScannerTest.php
index 8fa5110..763eebc 100644
--- a/test/HTML5/Parser/ScannerTest.php
+++ b/test/HTML5/Parser/ScannerTest.php
@@ -10,11 +10,10 @@ use Masterminds\HTML5\Parser\Scanner;
class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
{
-
/**
* A canary test to make sure the basics are setup and working.
*/
- public function testConstruct()
+ public function testConstructDeprecated()
{
$is = new StringInputStream("abc");
$s = new Scanner($is);
@@ -22,7 +21,12 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
$this->assertInstanceOf('\Masterminds\HTML5\Parser\Scanner', $s);
}
- public function testNext()
+ public function testConstruct()
+ {
+ $this->assertInstanceOf('\Masterminds\HTML5\Parser\Scanner', new Scanner('abc'));
+ }
+
+ public function testNextDeprecated()
{
$s = new Scanner(new StringInputStream("abc"));
@@ -30,9 +34,17 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
$this->assertEquals('c', $s->next());
}
+ public function testNext()
+ {
+ $s = new Scanner('abc');
+
+ $this->assertEquals('b', $s->next());
+ $this->assertEquals('c', $s->next());
+ }
+
public function testPosition()
{
- $s = new Scanner(new StringInputStream("abc"));
+ $s = new Scanner('abc');
$this->assertEquals(0, $s->position());
@@ -42,7 +54,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testPeek()
{
- $s = new Scanner(new StringInputStream("abc"));
+ $s = new Scanner('abc');
$this->assertEquals('b', $s->peek());
@@ -52,7 +64,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testCurrent()
{
- $s = new Scanner(new StringInputStream("abc"));
+ $s = new Scanner('abc');
// Before scanning the string begins the current is empty.
$this->assertEquals('a', $s->current());
@@ -67,7 +79,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testUnconsume()
{
- $s = new Scanner(new StringInputStream("abcdefghijklmnopqrst"));
+ $s = new Scanner('abcdefghijklmnopqrst');
// Get initial position.
$s->next();
@@ -87,7 +99,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testGetHex()
{
- $s = new Scanner(new StringInputStream("ab13ck45DE*"));
+ $s = new Scanner("ab13ck45DE*");
$this->assertEquals('ab13c', $s->getHex());
@@ -97,7 +109,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testGetAsciiAlpha()
{
- $s = new Scanner(new StringInputStream("abcdef1%mnop*"));
+ $s = new Scanner('abcdef1%mnop*');
$this->assertEquals('abcdef', $s->getAsciiAlpha());
@@ -109,7 +121,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testGetAsciiAlphaNum()
{
- $s = new Scanner(new StringInputStream("abcdef1ghpo#mn94op"));
+ $s = new Scanner('abcdef1ghpo#mn94op');
$this->assertEquals('abcdef1ghpo', $s->getAsciiAlphaNum());
@@ -120,7 +132,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testGetNumeric()
{
- $s = new Scanner(new StringInputStream("1784a 45 9867 #"));
+ $s = new Scanner('1784a 45 9867 #');
$this->assertEquals('1784', $s->getNumeric());
@@ -132,7 +144,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testCurrentLine()
{
- $s = new Scanner(new StringInputStream("1784a\n45\n9867 #\nThis is a test."));
+ $s = new Scanner("1784a\n45\n9867 #\nThis is a test.");
$this->assertEquals(1, $s->currentLine());
@@ -144,7 +156,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testColumnOffset()
{
- $s = new Scanner(new StringInputStream("1784a a\n45 9867 #\nThis is a test."));
+ $s = new Scanner("1784a a\n45 9867 #\nThis is a test.");
// Move the pointer to the space.
$s->getAsciiAlphaNum();
@@ -163,7 +175,7 @@ class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
public function testRemainingChars()
{
$string = "\n45\n9867 #\nThis is a test.";
- $s = new Scanner(new StringInputStream("1784a\n45\n9867 #\nThis is a test."));
+ $s = new Scanner("1784a\n45\n9867 #\nThis is a test.");
$s->getAsciiAlphaNum();
$this->assertEquals($string, $s->remainingChars());
diff --git a/test/HTML5/Parser/StringInputStreamTest.php b/test/HTML5/Parser/StringInputStreamTest.php
deleted file mode 100644
index f87cc10..0000000
--- a/test/HTML5/Parser/StringInputStreamTest.php
+++ /dev/null
@@ -1,327 +0,0 @@
-<?php
-namespace Masterminds\HTML5\Tests\Parser;
-
-use Masterminds\HTML5\Parser\StringInputStream;
-
-class StringInputStreamTest extends \Masterminds\HTML5\Tests\TestCase
-{
-
- /**
- * A canary test to make sure the basics are setup and working.
- */
- public function testConstruct()
- {
- $s = new StringInputStream("abc");
-
- $this->assertInstanceOf('\Masterminds\HTML5\Parser\StringInputStream', $s);
- }
-
- public function testNext()
- {
- $s = new StringInputStream("abc");
-
- $s->next();
- $this->assertEquals('b', $s->current());
- $s->next();
- $this->assertEquals('c', $s->current());
- }
-
- public function testKey()
- {
- $s = new StringInputStream("abc");
-
- $this->assertEquals(0, $s->key());
-
- $s->next();
- $this->assertEquals(1, $s->key());
- }
-
- public function testPeek()
- {
- $s = new StringInputStream("abc");
-
- $this->assertEquals('b', $s->peek());
-
- $s->next();
- $this->assertEquals('c', $s->peek());
- }
-
- public function testCurrent()
- {
- $s = new StringInputStream("abc");
-
- // Before scanning the string begins the current is empty.
- $this->assertEquals('a', $s->current());
-
- $s->next();
- $this->assertEquals('b', $s->current());
-
- // Test movement through the string.
- $s->next();
- $this->assertEquals('c', $s->current());
- }
-
- public function testColumnOffset()
- {
- $s = new StringInputStream("abc\ndef\n");
- $this->assertEquals(0, $s->columnOffset());
- $s->next();
- $this->assertEquals(1, $s->columnOffset());
- $s->next();
- $this->assertEquals(2, $s->columnOffset());
- $s->next();
- $this->assertEquals(3, $s->columnOffset());
- $s->next(); // LF
- $this->assertEquals(0, $s->columnOffset());
- $s->next();
- $canary = $s->current(); // e
- $this->assertEquals('e', $canary);
- $this->assertEquals(1, $s->columnOffset());
-
- $s = new StringInputStream("abc");
- $this->assertEquals(0, $s->columnOffset());
- $s->next();
- $this->assertEquals(1, $s->columnOffset());
- $s->next();
- $this->assertEquals(2, $s->columnOffset());
- }
-
- public function testCurrentLine()
- {
- $txt = "1\n2\n\n\n\n3";
- $stream = new StringInputStream($txt);
- $this->assertEquals(1, $stream->currentLine());
-
- // Advance over 1 and LF on to line 2 value 2.
- $stream->next();
- $stream->next();
- $canary = $stream->current();
- $this->assertEquals(2, $stream->currentLine());
- $this->assertEquals('2', $canary);
-
- // Advance over 4x LF
- $stream->next();
- $stream->next();
- $stream->next();
- $stream->next();
- $stream->next();
- $this->assertEquals(6, $stream->currentLine());
- $this->assertEquals('3', $stream->current());
-
- // Make sure it doesn't do 7.
- $this->assertEquals(6, $stream->currentLine());
- }
-
- public function testRemainingChars()
- {
- $text = "abcd";
- $s = new StringInputStream($text);
- $this->assertEquals($text, $s->remainingChars());
-
- $text = "abcd";
- $s = new StringInputStream($text);
- $s->next(); // Pop one.
- $this->assertEquals('bcd', $s->remainingChars());
- }
-
- public function testCharsUnitl()
- {
- $text = "abcdefffffffghi";
- $s = new StringInputStream($text);
- $this->assertEquals('', $s->charsUntil('a'));
- // Pointer at 'a', moves 2 to 'c'
- $this->assertEquals('ab', $s->charsUntil('w', 2));
-
- // Pointer at 'c', moves to first 'f'
- $this->assertEquals('cde', $s->charsUntil('fzxv'));
-
- // Only get five 'f's
- $this->assertEquals('fffff', $s->charsUntil('g', 5));
-
- // Get just the last two 'f's
- $this->assertEquals('ff', $s->charsUntil('g'));
-
- // This should scan to the end.
- $this->assertEquals('ghi', $s->charsUntil('w', 9));
- }
-
- public function testCharsWhile()
- {
- $text = "abcdefffffffghi";
- $s = new StringInputStream($text);
-
- $this->assertEquals('ab', $s->charsWhile('ba'));
-
- $this->assertEquals('', $s->charsWhile('a'));
- $this->assertEquals('cde', $s->charsWhile('cdeba'));
- $this->assertEquals('ff', $s->charsWhile('f', 2));
- $this->assertEquals('fffff', $s->charsWhile('f'));
- $this->assertEquals('g', $s->charsWhile('fg'));
- $this->assertEquals('hi', $s->charsWhile('fghi', 99));
- }
-
- public function testBOM()
- {
- // Ignore in-text BOM.
- $stream = new StringInputStream("a\xEF\xBB\xBF");
- $this->assertEquals("a\xEF\xBB\xBF", $stream->remainingChars(), 'A non-leading U+FEFF (BOM/ZWNBSP) should remain');
-
- // Strip leading BOM
- $leading = new StringInputStream("\xEF\xBB\xBFa");
- $this->assertEquals('a', $leading->current(), 'BOM should be stripped');
- }
-
- public function testCarriageReturn()
- {
- // Replace NULL with Unicode replacement.
- $stream = new StringInputStream("\0\0\0");
- $this->assertEquals("\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD", $stream->remainingChars(), 'Null character should be replaced by U+FFFD');
- $this->assertEquals(3, count($stream->errors), 'Null character should set parse error: ' . print_r($stream->errors, true));
-
- // Remove CR when next to LF.
- $stream = new StringInputStream("\r\n");
- $this->assertEquals("\n", $stream->remainingChars(), 'CRLF should be replaced by LF');
-
- // Convert CR to LF when on its own.
- $stream = new StringInputStream("\r");
- $this->assertEquals("\n", $stream->remainingChars(), 'CR should be replaced by LF');
- }
-
- public function invalidParseErrorTestHandler($input, $numErrors, $name)
- {
- $stream = new StringInputStream($input, 'UTF-8');
- $this->assertEquals($input, $stream->remainingChars(), $name . ' (stream content)');
- $this->assertEquals($numErrors, count($stream->errors), $name . ' (number of errors)');
- }
-
- public function testInvalidReplace()
- {
- $invalidTest = array(
-
- // Min/max overlong
- "\xC0\x80a" => 'Overlong representation of U+0000',
- "\xE0\x80\x80a" => 'Overlong representation of U+0000',
- "\xF0\x80\x80\x80a" => 'Overlong representation of U+0000',
- "\xF8\x80\x80\x80\x80a" => 'Overlong representation of U+0000',
- "\xFC\x80\x80\x80\x80\x80a" => 'Overlong representation of U+0000',
- "\xC1\xBFa" => 'Overlong representation of U+007F',
- "\xE0\x9F\xBFa" => 'Overlong representation of U+07FF',
- "\xF0\x8F\xBF\xBFa" => 'Overlong representation of U+FFFF',
-
- "a\xDF" => 'Incomplete two byte sequence (missing final byte)',
- "a\xEF\xBF" => 'Incomplete three byte sequence (missing final byte)',
- "a\xF4\xBF\xBF" => 'Incomplete four byte sequence (missing final byte)',
-
- // Min/max continuation bytes
- "a\x80" => 'Lone 80 continuation byte',
- "a\xBF" => 'Lone BF continuation byte',
-
- // Invalid bytes (these can never occur)
- "a\xFE" => 'Invalid FE byte',
- "a\xFF" => 'Invalid FF byte'
- );
- foreach ($invalidTest as $test => $note) {
- $stream = new StringInputStream($test);
- $this->assertEquals('a', $stream->remainingChars(), $note);
- }
-
- // MPB:
- // It appears that iconv just leaves these alone. Not sure what to
- // do.
- /*
- * $converted = array( "a\xF5\x90\x80\x80" => 'U+110000, off unicode planes.', ); foreach ($converted as $test => $note) { $stream = new StringInputStream($test); $this->assertEquals(2, mb_strlen($stream->remainingChars()), $note); }
- */
- }
-
- public function testInvalidParseError()
- {
- // C0 controls (except U+0000 and U+000D due to different handling)
- $this->invalidParseErrorTestHandler("\x01", 1, 'U+0001 (C0 control)');
- $this->invalidParseErrorTestHandler("\x02", 1, 'U+0002 (C0 control)');
- $this->invalidParseErrorTestHandler("\x03", 1, 'U+0003 (C0 control)');
- $this->invalidParseErrorTestHandler("\x04", 1, 'U+0004 (C0 control)');
- $this->invalidParseErrorTestHandler("\x05", 1, 'U+0005 (C0 control)');
- $this->invalidParseErrorTestHandler("\x06", 1, 'U+0006 (C0 control)');
- $this->invalidParseErrorTestHandler("\x07", 1, 'U+0007 (C0 control)');
- $this->invalidParseErrorTestHandler("\x08", 1, 'U+0008 (C0 control)');
- $this->invalidParseErrorTestHandler("\x09", 0, 'U+0009 (C0 control)');
- $this->invalidParseErrorTestHandler("\x0A", 0, 'U+000A (C0 control)');
- $this->invalidParseErrorTestHandler("\x0B", 1, 'U+000B (C0 control)');
- $this->invalidParseErrorTestHandler("\x0C", 0, 'U+000C (C0 control)');
- $this->invalidParseErrorTestHandler("\x0E", 1, 'U+000E (C0 control)');
- $this->invalidParseErrorTestHandler("\x0F", 1, 'U+000F (C0 control)');
- $this->invalidParseErrorTestHandler("\x10", 1, 'U+0010 (C0 control)');
- $this->invalidParseErrorTestHandler("\x11", 1, 'U+0011 (C0 control)');
- $this->invalidParseErrorTestHandler("\x12", 1, 'U+0012 (C0 control)');
- $this->invalidParseErrorTestHandler("\x13", 1, 'U+0013 (C0 control)');
- $this->invalidParseErrorTestHandler("\x14", 1, 'U+0014 (C0 control)');
- $this->invalidParseErrorTestHandler("\x15", 1, 'U+0015 (C0 control)');
- $this->invalidParseErrorTestHandler("\x16", 1, 'U+0016 (C0 control)');
- $this->invalidParseErrorTestHandler("\x17", 1, 'U+0017 (C0 control)');
- $this->invalidParseErrorTestHandler("\x18", 1, 'U+0018 (C0 control)');
- $this->invalidParseErrorTestHandler("\x19", 1, 'U+0019 (C0 control)');
- $this->invalidParseErrorTestHandler("\x1A", 1, 'U+001A (C0 control)');
- $this->invalidParseErrorTestHandler("\x1B", 1, 'U+001B (C0 control)');
- $this->invalidParseErrorTestHandler("\x1C", 1, 'U+001C (C0 control)');
- $this->invalidParseErrorTestHandler("\x1D", 1, 'U+001D (C0 control)');
- $this->invalidParseErrorTestHandler("\x1E", 1, 'U+001E (C0 control)');
- $this->invalidParseErrorTestHandler("\x1F", 1, 'U+001F (C0 control)');
-
- // DEL (U+007F)
- $this->invalidParseErrorTestHandler("\x7F", 1, 'U+007F');
-
- // C1 Controls
- $this->invalidParseErrorTestHandler("\xC2\x80", 1, 'U+0080 (C1 control)');
- $this->invalidParseErrorTestHandler("\xC2\x9F", 1, 'U+009F (C1 control)');
- $this->invalidParseErrorTestHandler("\xC2\xA0", 0, 'U+00A0 (first codepoint above highest C1 control)');
-
- // Charcters surrounding surrogates
- $this->invalidParseErrorTestHandler("\xED\x9F\xBF", 0, 'U+D7FF (one codepoint below lowest surrogate codepoint)');
- $this->invalidParseErrorTestHandler("\xEF\xBF\xBD", 0, 'U+DE00 (one codepoint above highest surrogate codepoint)');
-
- // Permanent noncharacters
- $this->invalidParseErrorTestHandler("\xEF\xB7\x90", 1, 'U+FDD0 (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xEF\xB7\xAF", 1, 'U+FDEF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xEF\xBF\xBE", 1, 'U+FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xEF\xBF\xBF", 1, 'U+FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\x9F\xBF\xBE", 1, 'U+1FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\x9F\xBF\xBF", 1, 'U+1FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\xAF\xBF\xBE", 1, 'U+2FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\xAF\xBF\xBF", 1, 'U+2FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\xBF\xBF\xBE", 1, 'U+3FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF0\xBF\xBF\xBF", 1, 'U+3FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\x8F\xBF\xBE", 1, 'U+4FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\x8F\xBF\xBF", 1, 'U+4FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\x9F\xBF\xBE", 1, 'U+5FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\x9F\xBF\xBF", 1, 'U+5FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\xAF\xBF\xBE", 1, 'U+6FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\xAF\xBF\xBF", 1, 'U+6FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\xBF\xBF\xBE", 1, 'U+7FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF1\xBF\xBF\xBF", 1, 'U+7FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\x8F\xBF\xBE", 1, 'U+8FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\x8F\xBF\xBF", 1, 'U+8FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\x9F\xBF\xBE", 1, 'U+9FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\x9F\xBF\xBF", 1, 'U+9FFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\xAF\xBF\xBE", 1, 'U+AFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\xAF\xBF\xBF", 1, 'U+AFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\xBF\xBF\xBE", 1, 'U+BFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF2\xBF\xBF\xBF", 1, 'U+BFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\x8F\xBF\xBE", 1, 'U+CFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\x8F\xBF\xBF", 1, 'U+CFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\x9F\xBF\xBE", 1, 'U+DFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\x9F\xBF\xBF", 1, 'U+DFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\xAF\xBF\xBE", 1, 'U+EFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\xAF\xBF\xBF", 1, 'U+EFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\xBF\xBF\xBE", 1, 'U+FFFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF3\xBF\xBF\xBF", 1, 'U+FFFFF (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF4\x8F\xBF\xBE", 1, 'U+10FFFE (permanent noncharacter)');
- $this->invalidParseErrorTestHandler("\xF4\x8F\xBF\xBF", 1, 'U+10FFFF (permanent noncharacter)');
-
- // MPB: These pass on some versions of iconv, and fail on others. Since we aren't in the
- // business of writing tests against iconv, I've just commented these out. Should revisit
- // at a later point.
- /*
- * $this->invalidParseErrorTestHandler("\xED\xA0\x80", 1, 'U+D800 (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xAD\xBF", 1, 'U+DB7F (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xAE\x80", 1, 'U+DB80 (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xAF\xBF", 1, 'U+DBFF (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xB0\x80", 1, 'U+DC00 (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xBE\x80", 1, 'U+DF80 (UTF-16 surrogate character)'); $this->invalidParseErrorTestHandler("\xED\xBF\xBF", 1, 'U+DFFF (UTF-16 surrogate character)'); // Paired UTF-16 surrogates $this->invalidParseErrorTestHandler("\xED\xA0\x80\xED\xB0\x80", 2, 'U+D800 U+DC00 (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xA0\x80\xED\xBF\xBF", 2, 'U+D800 U+DFFF (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAD\xBF\xED\xB0\x80", 2, 'U+DB7F U+DC00 (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAD\xBF\xED\xBF\xBF", 2, 'U+DB7F U+DFFF (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAE\x80\xED\xB0\x80", 2, 'U+DB80 U+DC00 (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAE\x80\xED\xBF\xBF", 2, 'U+DB80 U+DFFF (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAF\xBF\xED\xB0\x80", 2, 'U+DBFF U+DC00 (paired UTF-16 surrogates)'); $this->invalidParseErrorTestHandler("\xED\xAF\xBF\xED\xBF\xBF", 2, 'U+DBFF U+DFFF (paired UTF-16 surrogates)');
- */
- }
-}
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 076dfea..30335e1 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -2,7 +2,6 @@
namespace Masterminds\HTML5\Tests\Parser;
use Masterminds\HTML5\Parser\UTF8Utils;
-use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
@@ -960,8 +959,7 @@ class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
protected function createTokenizer($string, $debug = false)
{
$eventHandler = new EventStack();
- $stream = new StringInputStream($string);
- $scanner = new Scanner($stream);
+ $scanner = new Scanner($string);
$scanner->debug = $debug;
diff --git a/test/HTML5/Parser/TreeBuildingRulesTest.php b/test/HTML5/Parser/TreeBuildingRulesTest.php
index de94d06..1d08cbc 100644
--- a/test/HTML5/Parser/TreeBuildingRulesTest.php
+++ b/test/HTML5/Parser/TreeBuildingRulesTest.php
@@ -8,7 +8,6 @@ namespace Masterminds\HTML5\Tests\Parser;
use Masterminds\HTML5\Parser\TreeBuildingRules;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\Scanner;
-use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
/**
@@ -25,7 +24,7 @@ class TreeBuildingRulesTest extends \Masterminds\HTML5\Tests\TestCase
protected function parse($string)
{
$treeBuilder = new DOMTreeBuilder();
- $scanner = new Scanner(new StringInputStream($string));
+ $scanner = new Scanner($string);
$parser = new Tokenizer($scanner, $treeBuilder);
$parser->parse();
@@ -37,7 +36,7 @@ class TreeBuildingRulesTest extends \Masterminds\HTML5\Tests\TestCase
protected function parseFragment($string)
{
$events = new DOMTreeBuilder(true);
- $scanner = new Scanner(new StringInputStream($string));
+ $scanner = new Scanner($string);
$parser = new Tokenizer($scanner, $events);
$parser->parse();
diff --git a/test/HTML5/Serializer/TraverserTest.php b/test/HTML5/Serializer/TraverserTest.php
index c914633..a156553 100644
--- a/test/HTML5/Serializer/TraverserTest.php
+++ b/test/HTML5/Serializer/TraverserTest.php
@@ -71,7 +71,7 @@ class TraverserTest extends \Masterminds\HTML5\Tests\TestCase
$this->assertInstanceOf('\Masterminds\HTML5\Serializer\Traverser', $t);
}
- public function testFragment()
+ public function testFragmentDeprecated()
{
$html = '<span class="bar">foo</span><span></span><div>bar</div>';
$input = new \Masterminds\HTML5\Parser\StringInputStream($html);
@@ -82,12 +82,27 @@ class TraverserTest extends \Masterminds\HTML5\Tests\TestCase
$stream = fopen('php://temp', 'w');
$r = new OutputRules($stream, $this->html5->getOptions());
$t = new Traverser($dom, $stream, $r, $this->html5->getOptions());
+ $t->walk();
- $out = $t->walk();
$this->assertEquals($html, stream_get_contents($stream, - 1, 0));
}
- public function testProcessorInstruction()
+ public function testFragment()
+ {
+ $html = '<span class="bar">foo</span><span></span><div>bar</div>';
+ $dom = $this->html5->parseFragment($html);
+
+ $this->assertInstanceOf('\DOMDocumentFragment', $dom);
+
+ $stream = fopen('php://temp', 'w');
+ $r = new OutputRules($stream, $this->html5->getOptions());
+ $t = new Traverser($dom, $stream, $r, $this->html5->getOptions());
+ $t->walk();
+
+ $this->assertEquals($html, stream_get_contents($stream, - 1, 0));
+ }
+
+ public function testProcessorInstructionDeprecated()
{
$html = '<?foo bar ?>';
$input = new \Masterminds\HTML5\Parser\StringInputStream($html);
@@ -97,9 +112,26 @@ class TraverserTest extends \Masterminds\HTML5\Tests\TestCase
$stream = fopen('php://temp', 'w');
$r = new OutputRules($stream, $this->html5->getOptions());
+
+ $t = new Traverser($dom, $stream, $r, $this->html5->getOptions());
+ $t->walk();
+
+ $this->assertEquals($html, stream_get_contents($stream, - 1, 0));
+ }
+
+ public function testProcessorInstruction()
+ {
+ $html = '<?foo bar ?>';
+ $dom = $this->html5->parseFragment($html);
+
+ $this->assertInstanceOf('\DOMDocumentFragment', $dom);
+
+ $stream = fopen('php://temp', 'w');
+ $r = new OutputRules($stream, $this->html5->getOptions());
+
$t = new Traverser($dom, $stream, $r, $this->html5->getOptions());
+ $t->walk();
- $out = $t->walk();
$this->assertEquals($html, stream_get_contents($stream, - 1, 0));
}
}