summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTechnosophos <[email protected]>2013-04-19 17:12:54 -0500
committerTechnosophos <[email protected]>2013-04-19 17:12:54 -0500
commit4e5458898e6d9a73d3eae7b3213187407a940ce8 (patch)
tree2e4c6d5f79ca14574b9ac83e6e36ddb9f5f1904a
parentbdbd0e7dabcc7f0567fa5abcb40a5236fc204eb8 (diff)
Added support for raw text.
-rw-r--r--src/HTML5/Parser/EventHandler.php2
-rw-r--r--src/HTML5/Parser/Tokenizer.php488
-rw-r--r--test/HTML5/Parser/TokenizerTest.php29
3 files changed, 252 insertions, 267 deletions
diff --git a/src/HTML5/Parser/EventHandler.php b/src/HTML5/Parser/EventHandler.php
index c28d80b..8282cb7 100644
--- a/src/HTML5/Parser/EventHandler.php
+++ b/src/HTML5/Parser/EventHandler.php
@@ -26,6 +26,8 @@ interface EventHandler {
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE);
/**
* A start tag.
+ *
+ *
*/
public function startTag($name, $attributes = array(), $selfClosing = FALSE);
/**
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 381e9e6..4f2f792 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -34,9 +34,23 @@ class Tokenizer {
// When this goes to false, the parser stops.
protected $carryOn = TRUE;
+ protected $textMode = 0; // TEXTMODE_NORMAL;
+ protected $untilTag = NULL;
+
const WHITE="\t\n\f ";
/**
+ * Textmodes are used to determine how to scan the text inside of tags.
+ *
+ * NORMAL: Scan non-elements.
+ * RAW: Scan until a specific closing tag.
+ * RCDATA: Scan until a specific close state.
+ */
+ const TEXTMODE_NORMAL = 0;
+ const TEXTMODE_RAW = 1;
+ const TEXTMODE_RCDATA = 2;
+
+ /**
* Create a new tokenizer.
*
* Typically, parsing a document involves creating a new tokenizer, giving
@@ -68,24 +82,60 @@ class Tokenizer {
while ($this->carryOn);
}
+  /**
+   * Set the mode used to scan text, and optionally the tag that ends it.
+   *
+   * @param int $textmode
+   *   The text mode to switch to (see the TEXTMODE_* constants).
+   * @param string $untilTag
+   *   Name of the tag whose closing form ('</name>') ends raw text
+   *   scanning. Defaults to NULL, which clears any previous value.
+   */
+  public function setTextMode($textmode, $untilTag = NULL) {
+    $this->untilTag = $untilTag;
+    $this->textMode = $textmode;
+  }
+
/**
* Consume a character and make a move.
* HTML5 8.2.4.1
*/
protected function consumeData() {
// Character Ref
+ /*
$this->characterReference() ||
$this->tagOpen() ||
$this->eof() ||
$this->characterData();
+ */
+
+ $this->characterReference();
+ $this->tagOpen();
+ $this->eof();
+ $this->characterData();
+
return $this->carryOn;
}
/**
- * This buffers the current token as character data.
+ * Parse anything that looks like character data.
+ *
+ * Different rules apply based on the current TEXTMODE.
*/
protected function characterData() {
+    $current = $this->scanner->current();
+    // End of input: there is no character data to handle.
+    if ($current === FALSE) {
+      return FALSE;
+    }
+
+    // Raw and RCDATA modes share one routine. The comparison is loose on
+    // purpose, matching how a switch statement compares its cases.
+    if ($this->textMode == self::TEXTMODE_RAW || $this->textMode == self::TEXTMODE_RCDATA) {
+      return $this->rawText();
+    }
+
+    // Normal mode (and any unrecognized mode): a leading "<" or "&" is not
+    // character data, so decline; anything else is buffered as text.
+    return strspn($current, "<&") ? FALSE : $this->text();
+  }
+
+ /**
+ * This buffers the current token as character data.
+ */
+ protected function text() {
$tok = $this->scanner->current();
// This should never happen...
@@ -102,6 +152,19 @@ class Tokenizer {
return TRUE;
}
+  /**
+   * Read raw text up to the closing tag named by $this->untilTag.
+   *
+   * With no closing tag configured this falls back to text(). Otherwise
+   * everything before '</TAG>' is emitted as one text event, the text mode
+   * is reset to normal, and the result of endTag() is returned.
+   */
+  protected function rawText() {
+    if ($this->untilTag === NULL) {
+      return $this->text();
+    }
+
+    $rawContent = $this->readUntilSequence('</' . $this->untilTag . '>');
+    $this->events->text($rawContent);
+
+    // Raw scanning ends here; this also clears the stored closing tag.
+    $this->setTextMode(self::TEXTMODE_NORMAL);
+    return $this->endTag();
+  }
+
+
+
protected function eof() {
if ($this->scanner->current() === FALSE) {
//fprintf(STDOUT, "EOF");
@@ -130,106 +193,6 @@ class Tokenizer {
return FALSE;
}
- /**
- * Decode a character reference and return the string.
- *
- * Returns FALSE if the entity could not be found. If $inAttribute is set
- * to TRUE, a bare & will be returned as-is.
- *
- * @param boolean $inAttribute
- * Set to TRUE if the text is inside of an attribute value.
- * FALSE otherwise.
- */
- protected function decodeCharacterReference($inAttribute = FALSE) {
-
- // If it fails this, it's definitely not an entity.
- if ($this->scanner->current() != '&') {
- return FALSE;
- }
-
- // Next char after &.
- $tok = $this->scanner->next();
- $entity = '';
- $start = $this->scanner->position();
-
- if ($tok == FALSE) {
- return '&';
- }
-
- // These indicate not an entity. We return just
- // the &.
- if (strspn($tok, self::WHITE . "&<") == 1) {
- //$this->scanner->next();
- return '&';
- }
-
- // Numeric entity
- if ($tok == '#') {
- $tok = $this->scanner->next();
-
- // Hexidecimal encoding.
- // X[0-9a-fA-F]+;
- // x[0-9a-fA-F]+;
- if ($tok == 'x' || $tok == 'X') {
- $tok = $this->scanner->next(); // Consume x
-
- // Convert from hex code to char.
- $hex = $this->scanner->getHex();
- if (empty($hex)) {
- $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
- // We unconsume because we don't know what parser rules might
- // be in effect for the remaining chars. For example. '&#>'
- // might result in a specific parsing rule inside of tag
- // contexts, while not inside of pcdata context.
- $this->scanner->unconsume(2);
- return '&';
- }
- $entity = CharacterReference::lookupHex($hex);
- }
- // Decimal encoding.
- // [0-9]+;
- else {
- // Convert from decimal to char.
- $numeric = $this->scanner->getNumeric();
- if ($numeric === FALSE) {
- $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
- $this->scanner->unconsume(2);
- return '&';
- }
- $entity = CharacterReference::lookupDecimal($numeric);
- }
- }
- // String entity.
- else {
- // Attempt to consume a string up to a ';'.
- // [a-zA-Z0-9]+;
- $cname = $this->scanner->getAsciiAlpha();
- $entity = CharacterReference::lookupName($cname);
- if ($entity == NULL) {
- $this->parseError("No match in entity table for '%s'", $entity);
- }
- }
-
- // The scanner has advanced the cursor for us.
- $tok = $this->scanner->current();
-
- // We have an entity. We're done here.
- if ($tok == ';') {
- $this->scanner->next();
- return $entity;
- }
-
- // If in an attribute, then failing to match ; means unconsume the
- // entire string. Otherwise, failure to match is an error.
- if ($inAttribute) {
- $this->scanner->unconsume($this->scanner->position() - $start);
- return '&';
- }
-
- $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
- return '&' . $entity;
-
- }
/**
* 8.2.4.8
@@ -781,172 +744,6 @@ class Tokenizer {
return FALSE;
}
- protected function rcdata() {
- // Ampersand
- // <
- // Null
- // EOF
- // Character
- }
-
- protected function rawtext() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- protected function scriptData() {
- // < is a literal
- // NULL is an error
- // EOF
- // Character data
- }
-
- /**
- * 8.2.4.7
- */
- protected function plaintext() {
- // NULL -> parse error
- // EOF -> eof
- // -> Character data
- }
-
-
- /**
- * 8.2.4.11
- */
- protected function rcdataLessThan() {
- // / -> empty the tmp buffer and go to end-tag
- // ->rcdata
- }
-
- /**
- * 8.2.4.12
- */
- protected function rcdataEndTag() {
- // A-Za-z: append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.13
- */
- protected function rcdataEndTagName() {
- // tab, lf, ff, space -> before attribute or treat as anything
- // / -> self-closing tag
- // > -> end tag, back to data
- // A-Za-z -> append to tagname
- // -> rcdata state
- }
-
- /**
- * 8.2.4.14
- */
- protected function rawtextLessThan() {
- // / -> rawtext endtag state
- // -> rawtext
- }
-
- /**
- * 8.2.4.15
- */
- protected function rawtextEndTagOpen() {
- // A-Za-z -> rawtext
- // ->rawtext
- }
-
- protected function rawtextEndTagName() {
- // tab, lf, ff, space -> before attr name
- //
- }
-
- protected function scriptLessThan(){
- }
- protected function scriptEndTagOpen() {
- }
- protected function scriptEndTagName() {
- }
- protected function scriptEscapeStart() {
- }
- protected function scriptEscapeStartDash() {
- }
- protected function scriptEscaped() {
- }
- protected function scriptEscapedDash() {
- }
- protected function scriptEscapedDashDash() {
- }
- protected function scriptEscapedLessThan() {
- }
- protected function scriptEscapedEndTagOpen() {
- }
- protected function scriptEscapedEndTagName() {
- }
- protected function scriptDoubleEscapeStart() {
- }
- protected function scriptDoubleEscaped() {
- }
- protected function scriptDoubleEscapedDash() {
- }
- protected function scriptDoubleEscapedDashDash() {
- }
- protected function scriptDoubleEscapedLessThan() {
- }
- protected function scriptDoubleEscapeEnd() {
- }
- protected function beforeAttributeName() {
- }
- protected function attributeName() {
- }
- protected function afterAttributeName() {
- }
- protected function beforeAttributeValue() {
- }
- protected function attributeValueDoubleQuote() {
- }
- protected function attributeValueSingleQuote() {
- }
- protected function attributeValueUnquoted() {
- }
- protected function characterReferenceInAttributeValue() {
- }
- protected function afterAttributeValueQuoted() {
- }
- protected function selfCloseingStartTag() {
- }
- protected function beforeDoctype() {
- }
- protected function doctypeName() {
- }
- protected function afterDoctypeName() {
- }
- protected function doctypePublicKeyword() {
- }
- protected function beforeDoctypePublicId() {
- }
- protected function doctypePublicIdDoubleQuoted() {
- }
- protected function doctypePublicIdSingleQuoted() {
- }
- protected function afterDoctypePublicId() {
- }
- protected function betweenDoctypePublicAndSystem() {
- }
- protected function afterDoctypeSystemKeyword() {
- }
- protected function beforeDoctypeSystemIdentifier() {
- }
- protected function doctypeSystemIdDoubleQuoted() {
- }
- protected function doctypeSystemIdSingleQuoted() {
- }
- protected function afterDoctypeSystemId() {
- }
- protected function bogusDoctype() {
- }
-
-
// ================================================================
// Non-HTML5
// ================================================================
@@ -999,6 +796,62 @@ class Tokenizer {
// ================================================================
/**
+ * Read from the input stream until we get to the desired sequence
+ * or hit the end of the input stream.
+ */
+  protected function readUntilSequence($sequence) {
+    $collected = '';
+
+    // Jumping straight to the sequence's first character lets the scanner
+    // hand back larger runs of text in one call.
+    $lead = substr($sequence, 0, 1);
+
+    for (;;) {
+      // Ran out of input before the sequence showed up.
+      if ($this->scanner->current() === FALSE) {
+        break;
+      }
+
+      $collected .= $this->scanner->charsUntil($lead);
+
+      // The cursor now sits on the sequence: stop without consuming it.
+      if ($this->sequenceMatches($sequence)) {
+        break;
+      }
+
+      // False alarm: keep the current character and move on.
+      $collected .= $this->scanner->current();
+      $this->scanner->next();
+    }
+
+    return $collected;
+  }
+
+ /**
+ * Check if upcoming chars match the given sequence.
+ *
+ * This will read the stream for the $sequence. If it's
+ * found, this will return TRUE. If not, return FALSE.
+ * Since this unconsumes any chars it reads, the caller
+ * will still need to read the next sequence, even if
+ * this returns TRUE.
+ *
+ * Example: $this->sequenceMatches('</script>') will
+ * see if the input stream is at the start of a
+ * '</script>' string.
+ */
+  protected function sequenceMatches($sequence) {
+    $len = strlen($sequence);
+    $buffer = '';
+    for ($i = 0; $i < $len; ++$i) {
+      $char = $this->scanner->current();
+
+      // EOF. Rewind what was read so far and let the caller handle it.
+      if ($char === FALSE) {
+        $this->scanner->unconsume($i);
+        return FALSE;
+      }
+      $buffer .= $char;
+      $this->scanner->next();
+    }
+
+    // Always rewind: this method only peeks, it never consumes.
+    $this->scanner->unconsume($len);
+
+    // Strict comparison: with ==, PHP compares numeric-looking strings by
+    // value, so differently spelled strings could be reported as a match.
+    return $buffer === $sequence;
+
+  }
+
+ /**
* Send a TEXT event with the contents of the text buffer.
*
* This emits an EventHandler::text() event with the current contents of the
@@ -1042,4 +895,105 @@ class Tokenizer {
return FALSE;
}
+  /**
+   * Decode a character reference and return the string.
+   *
+   * Returns FALSE only when the scanner is not positioned on an '&'. A bare
+   * or malformed reference yields a literal '&'. If $inAttribute is set to
+   * TRUE, a reference with no trailing ';' is unconsumed and '&' is
+   * returned as-is.
+   *
+   * @param boolean $inAttribute
+   *   Set to TRUE if the text is inside of an attribute value.
+   *   FALSE otherwise.
+   *
+   * @return mixed
+   *   FALSE if the current character is not '&'; otherwise the decoded
+   *   entity, or a string starting with '&' when decoding did not succeed.
+   */
+  protected function decodeCharacterReference($inAttribute = FALSE) {
+
+    // If it fails this, it's definitely not an entity.
+    if ($this->scanner->current() != '&') {
+      return FALSE;
+    }
+
+    // Next char after &.
+    $tok = $this->scanner->next();
+    $entity = '';
+    $start = $this->scanner->position();
+
+    if ($tok == FALSE) {
+      return '&';
+    }
+
+    // These indicate not an entity. We return just
+    // the &.
+    if (strspn($tok, self::WHITE . "&<") == 1) {
+      //$this->scanner->next();
+      return '&';
+    }
+
+    // Numeric entity
+    if ($tok == '#') {
+      $tok = $this->scanner->next();
+
+      // Hexadecimal encoding.
+      // X[0-9a-fA-F]+;
+      // x[0-9a-fA-F]+;
+      if ($tok == 'x' || $tok == 'X') {
+        $tok = $this->scanner->next(); // Consume x
+
+        // Convert from hex code to char.
+        $hex = $this->scanner->getHex();
+        if (empty($hex)) {
+          $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
+          // We unconsume because we don't know what parser rules might
+          // be in effect for the remaining chars. For example. '&#>'
+          // might result in a specific parsing rule inside of tag
+          // contexts, while not inside of pcdata context.
+          $this->scanner->unconsume(2);
+          return '&';
+        }
+        $entity = CharacterReference::lookupHex($hex);
+      }
+      // Decimal encoding.
+      // [0-9]+;
+      else {
+        // Convert from decimal to char.
+        $numeric = $this->scanner->getNumeric();
+        if ($numeric === FALSE) {
+          $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
+          $this->scanner->unconsume(2);
+          return '&';
+        }
+        $entity = CharacterReference::lookupDecimal($numeric);
+      }
+    }
+    // String entity.
+    else {
+      // Attempt to consume a string up to a ';'.
+      // [a-zA-Z0-9]+;
+      $cname = $this->scanner->getAsciiAlpha();
+      $entity = CharacterReference::lookupName($cname);
+      if ($entity == NULL) {
+        // Report the name that was looked up; $entity is empty here.
+        $this->parseError("No match in entity table for '%s'", $cname);
+      }
+    }
+
+    // The scanner has advanced the cursor for us.
+    $tok = $this->scanner->current();
+
+    // We have an entity. We're done here.
+    if ($tok == ';') {
+      $this->scanner->next();
+      return $entity;
+    }
+
+    // If in an attribute, then failing to match ; means unconsume the
+    // entire string. Otherwise, failure to match is an error.
+    if ($inAttribute) {
+      $this->scanner->unconsume($this->scanner->position() - $start);
+      return '&';
+    }
+
+    $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
+    return '&' . $entity;
+
+  }
+
}
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index aff819e..69f90b9 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -387,6 +387,35 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
}
}
+  /**
+   * Raw text mode: everything up to the closing </pre> is one text event.
+   */
+  public function testRawText() {
+    // Input markup => text expected between the opening and closing tag.
+    $good = array(
+      '<pre>abcd efg hijk lmnop</pre> ' => 'abcd efg hijk lmnop',
+      '<pre><not/><the/><tag></pre>' => '<not/><the/><tag>',
+      '<pre><<<<<<<<</pre>' => '<<<<<<<<',
+      '<pre>hello</pre</pre>' => 'hello</pre',
+      "<pre>\nhello</pre\n</pre>" => "\nhello</pre\n",
+      '<pre>&amp;</pre>' => '&amp;',
+    );
+    foreach ($good as $test => $expects) {
+      list($tok, $events) = $this->createTokenizer($test);
+
+      $tok->setTextMode(Tokenizer::TEXTMODE_RAW, 'pre');
+      $tok->parse();
+
+      $this->assertEventEquals('startTag', 'pre', $events->get(0));
+      $this->assertEventEquals('text', $expects, $events->get(1));
+      $this->assertEventEquals('endTag', 'pre', $events->get(2));
+    }
+
+    // TODO: add a failure case for an unterminated closing tag, e.g.
+    // '<pre>&amp;</pre' (text '&amp;').
+  }
+
public function testText() {
$good = array(
'a<br>b',