From 65a11b73a06fcf3ce21db2298d9c460bcd286b66 Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Wed, 7 Aug 2013 11:57:33 -0400 Subject: #11: Updating the text handling for parsing when in the "before head" insertion mode. Now passing the ignored whitespace characters through to the DOM and giving a parse error on other strings. Since this DOM is not used to render for display, and it may be turned back into HTML, it is useful to preserve these characters. --- src/HTML5/Parser/DOMTreeBuilder.php | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'src/HTML5') diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index f0caeb4..b0e2e11 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -323,11 +323,17 @@ class DOMTreeBuilder implements EventHandler { public function text($data) { // XXX: Hmmm.... should we really be this strict? if ($this->insertMode < self::IM_IN_HEAD) { - $data = trim($data); - if (!empty($data)) { + // Per '8.2.5.4.3 The "before head" insertion mode' we are supposed to + // ignore " \t\n\r\f" characters and throw a parse error for other strings. + // In this case we are throwing a parse error for other strings while + // passing " \t\n\r\f" through to the DOM. Since this parser is not creating + // a DOM that will be used for rendering a display and the DOM may be + // turned back into html these characters are passed along to the DOM. + $dataTmp = trim($data, " \t\n\r\f"); + if (!empty($dataTmp)) { //fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError("Unexpected text. Ignoring: " . $data); - return; + $this->parseError("Unexpected text. Ignoring: " . $dataTmp); + $data = str_replace($dataTmp, '', $data); } } //fprintf(STDOUT, "Appending text %s.", $data); -- cgit v1.2.3