Updated the text escaping and added some separation between escaping and converting named character references.

author: Matt Farina <[email protected]> 2014-02-11 21:22:40 -0500
committer: Matt Farina <[email protected]> 2014-02-11 21:22:40 -0500
commit: 446d404b42cb25f82e8e75145c70d3fbd1cfecb9 (patch)
tree: 63119e6e1da4dc807a72657a00225d398e518cb3
parent: e21281f7c1d289d9237a34a906f9fec8195640d1 (diff)
2 files changed, 72 insertions, 16 deletions
diff --git a/src/HTML5/Serializer/OutputRules.php b/src/HTML5/Serializer/OutputRules.php
index 4bb6e19..3af1cde 100644
--- a/src/HTML5/Serializer/OutputRules.php
+++ b/src/HTML5/Serializer/OutputRules.php
@@ -230,6 +230,19 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
   /**
    * Encode text.
    *
+   * When encode is set to FALSE, the default value, the text passed in is
+   * escaped per section 8.3 of the html5 spec. For details on how text is
+   * escaped see the escape() method.
+   *
+   * When encode is set to TRUE, the text is converted to named character
+   * references where appropriate. Section 8.1.4 Character references of the
+   * html5 spec refers to using named character references. This is useful for
+   * characters that can't otherwise legally be used in the text.
+   *
+   * The named character references are listed in section 8.5.
+   *
+   * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references
+   * 
    * True encoding will turn all named character references into their entities.
    * This includes such characters as +.# and many other common ones. By default
    * encoding here will just escape &'<>".
@@ -248,15 +261,13 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
    */
   protected function enc($text, $attribute = FALSE) {
 
-    $quotes = $attribute ? ENT_COMPAT : 0;
-    // Escape rather than encode all entities.
-    if (!$this->encode && $attribute) {
-        return strtr($text, array('"'=>'&quot;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;'));
-    } elseif (!$this->encode) {
-      return htmlspecialchars($text, $quotes, 'UTF-8');
+    // Escape the text rather than convert to named character references.
+    if (!$this->encode) {
+      return $this->escape($text, $attribute);
     }
 
-    // If we are in PHP 5.4+ we can use the native html5 entity functionality.
+    // If we are in PHP 5.4+ we can use the native html5 entity functionality to
+    // convert the named character references.
     if (defined('ENT_HTML5')) {
       return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', FALSE);
     }
@@ -267,4 +278,37 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
     }
   }
 
+  /**
+   * Escape text.
+   *
+   * According to the html5 spec section 8.3 Serializing HTML fragments, text
+   * within tags that are not style, script, xmp, iframe, noembed, and noframes
+   * needs to be properly escaped.
+   *
+   * The & should be converted to &amp;, non-breaking space Unicode characters
+   * converted to &nbsp;, when in attribute mode the " should be converted to
+   * &quot;, and when not in attribute mode the < and > should be converted to
+   * &lt; and &gt;.
+   *
+   * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
+   *
+   * @param string $text
+   *   text to escape.
+   * @param boolean $attribute
+   *   True if we are escaping an attribute, false otherwise.
+   */
+  protected function escape($text, $attribute = FALSE) {
+
+    // Not using htmlspecialchars because, while it does escaping, it doesn't
+    // match the requirements of section 8.3. For example, it doesn't handle
+    // non-breaking spaces.
+    if ($attribute) {
+      $replace = array('"'=>'&quot;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;');
+    }
+    else {
+      $replace = array('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;');
+    }
+
+    return strtr($text, $replace);
+  }
 }
diff --git a/test/HTML5/Serializer/OutputRulesTest.php b/test/HTML5/Serializer/OutputRulesTest.php
index d04ebef..8a6d0d2 100644
--- a/test/HTML5/Serializer/OutputRulesTest.php
+++ b/test/HTML5/Serializer/OutputRulesTest.php
@@ -238,19 +238,19 @@ class OutputRulesTest extends \HTML5\Tests\TestCase {
 
   function getEncData(){
   	return array(
-  	  array(false, '&\'<>"', '&amp;\'&lt;&gt;"', '&amp;&apos;&lt;&gt;&quot;'),
-  	  array(false, 'This + is. a < test', 'This + is. a &lt; test', 'This &plus; is&period; a &lt; test'),
-  	  array(false, '.+#', '.+#', '&period;&plus;&num;'),
-
-  	  array(true, '.+#\'', '.+#\'', '&period;&plus;&num;&apos;'),
-  	  array(true, '&".<', '&amp;&quot;.<', '&amp;&quot;&period;&lt;'),
-  	  array(true, '&\'<>"', '&amp;\'<>&quot;', '&amp;&apos;&lt;&gt;&quot;'),
-  	  array(true, "\xc2\xa0\"'", '&nbsp;&quot;\'', '&nbsp;&quot;&apos;'),
+  	  array(FALSE, '&\'<>"', '&amp;\'&lt;&gt;"', '&amp;&apos;&lt;&gt;&quot;'),
+  	  array(FALSE, 'This + is. a < test', 'This + is. a &lt; test', 'This &plus; is&period; a &lt; test'),
+  	  array(FALSE, '.+#', '.+#', '&period;&plus;&num;'),
+
+  	  array(TRUE, '.+#\'', '.+#\'', '&period;&plus;&num;&apos;'),
+  	  array(TRUE, '&".<', '&amp;&quot;.<', '&amp;&quot;&period;&lt;'),
+  	  array(TRUE, '&\'<>"', '&amp;\'<>&quot;', '&amp;&apos;&lt;&gt;&quot;'),
+  	  array(TRUE, "\xc2\xa0\"'", '&nbsp;&quot;\'', '&nbsp;&quot;&apos;'),
     );
   }
 
   /**
-   * Test basic escaping of text.
+   * Test basic encoding of text.
    * @dataProvider getEncData
    */
   function testEnc($isAttribute, $test, $expected, $expectedEncoded) {
@@ -265,6 +265,18 @@ class OutputRulesTest extends \HTML5\Tests\TestCase {
     $this->assertEquals($expectedEncoded, $m->invoke($o, $test, $isAttribute));
   }
 
+  /**
+   * Test basic escaping of text.
+   * @dataProvider getEncData
+   */
+  function testEscape($isAttribute, $test, $expected, $expectedEncoded) {
+
+    list($o, $s) = $this->getOutputRules();
+    $m = $this->getProtectedMethod('escape');
+
+    $this->assertEquals($expected, $m->invoke($o, $test, $isAttribute));
+  }
+
   function testAttrs() {
     $dom = \HTML5::loadHTML('<!doctype html>
     <html lang="en">
author	Matt Farina <[email protected]>	2014-02-11 21:22:40 -0500
committer	Matt Farina <[email protected]>	2014-02-11 21:22:40 -0500
commit	446d404b42cb25f82e8e75145c70d3fbd1cfecb9 (patch)
tree	63119e6e1da4dc807a72657a00225d398e518cb3
parent	e21281f7c1d289d9237a34a906f9fec8195640d1 (diff)