Merge branch 'master' of github.com:Masterminds/html5-php

author: Matt Butcher <[email protected]> 2013-05-30 09:23:26 -0500
committer: Matt Butcher <[email protected]> 2013-05-30 09:23:26 -0500
commit: b1cbd9b4cd488471651751678cd90575bbb74bc9 (patch)
tree: 6dcf0afce7206b4372adc1a1ddbc25698b38e24f
parent: 0e149588548834bbfee7770fac8455cc404fb8ca (diff)
parent: fffeafbfe08e306356acd50cf568ec5904da882c (diff)
7 files changed, 89 insertions, 16 deletions
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..35b3cae
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,11 @@
+language: php
+
+php: 
+  - 5.3
+  - 5.4
+
+before_script:
+  - curl -s http://getcomposer.org/installer | php
+  - php composer.phar install
+
+script: phpunit test/HTML5
+\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
index 5799468..6ecbf3e 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -3,7 +3,7 @@
 Copyright (c) 2013 The Authors of HTML5-PHP
 
 Matt Butcher - [email protected]
-Matt Farina - 
+Matt Farina - [email protected]
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in 
diff --git a/README.md b/README.md
index d39612f..caebfbc 100644
--- a/README.md
+++ b/README.md
@@ -88,15 +88,20 @@ events and builds a document tree (`DOMDocument`) based on the events.
 The serializer takes a data structure (the `DOMDocument`) and transforms
 it into a character representation -- an HTML5 document.
 
-The serializer is broken into two parts:
+The serializer is broken into three parts:
 
+- The `OutputRules` contain the rules to turn DOM elements into strings. The
+rules used are configurable with the `OutputRules` being the default. An option
+can be set by default or at call time to use a different ruleset that implements
+`RulesInterface`.
 - The `Traverser`, which is a special-purpose tree walker. It visits
-each node and transforms it into a string.
+each node in the tree and uses the `OutputRules` to transform the node
+into a string.
 - The `Serializer` manages the `Traverser` and stores the resultant data
 in the correct place.
 
 The serializer (`save()`, `saveHTML()`) follows the 
-[section 8.9 of the HTML 5.0 spec] (http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
+[section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
 So tags are serialized according to these rules:
 
 - A tag with children: &lt;foo&gt;CHILDREN&lt;/foo&gt;
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 022d628..974a670 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -86,7 +86,27 @@ class UTF8Utils {
     could not be converted to Unicode characters must be converted
     to U+FFFD REPLACEMENT CHARACTER code points. */
 
-    if (function_exists('iconv') && $encoding != 'auto') {
+    // mb_convert_encoding is chosen over iconv because of a bug. The best
+    // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
+    // which contains links to the actual bug reports as well as workaround
+    // details.
+    if (function_exists('mb_convert_encoding')) {
+      // mb library has the following behaviors:
+      // - UTF-16 surrogates result in FALSE.
+      // - Overlongs and outside Plane 16 result in empty strings.
+      
+      // Before we run mb_convert_encoding we need to tell it what to do with
+      // characters it does not know. This could be different than the parent
+      // application executing this library so we store the value, change it
+      // to our needs, and then change it back when we are done. This feels
+      // a little excessive and it would be great if there was a better way.
+      $save = ini_get('mbstring.substitute_character');
+      ini_set('mbstring.substitute_character', "none");
+      $data = mb_convert_encoding($data, 'UTF-8', $encoding);
+      ini_set('mbstring.substitute_character', $save);
+    }
+    // @todo Get iconv running in at least some environments if that is possible.
+    elseif (function_exists('iconv') && $encoding != 'auto') {
       // fprintf(STDOUT, "iconv found\n");
       // iconv has the following behaviors:
       // - Overlong representations are ignored.
@@ -94,15 +114,6 @@ class UTF8Utils {
       // - Incomplete sequences generate a warning.
       $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
     }
-    // MPB: Testing the newer mb_convert_encoding(). This might need
-    // to be removed again.
-    elseif (function_exists('mb_convert_encoding')) {
-      fprintf(STDOUT, "MB found\n");
-      // mb library has the following behaviors:
-      // - UTF-16 surrogates result in FALSE.
-      // - Overlongs and outside Plane 16 result in empty strings.
-      $data = mb_convert_encoding($data, 'UTF-8', $encoding);
-    }
     else {
       // we can make a conforming native implementation
       throw new Exception('Not implemented, please install mbstring or iconv');
diff --git a/test/HTML5/Html5Test.html b/test/HTML5/Html5Test.html
new file mode 100644
index 0000000..cc30e6b
--- /dev/null
+++ b/test/HTML5/Html5Test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Test</title>
+  </head>
+  <body>
+    <p>This is a test.</p>
+  </body>
+</html>
+\ No newline at end of file
diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php
new file mode 100644
index 0000000..2d6e005
--- /dev/null
+++ b/test/HTML5/Html5Test.php
@@ -0,0 +1,36 @@
+<?php
+namespace HTML5\Tests;
+
+require_once 'TestCase.php';
+
+class Html5Test extends TestCase {
+
+  public function testLoad() {
+    $dom = \HTML5::load(__DIR__ . '/Html5Test.html');
+    $this->assertInstanceOf('\DOMDocument', $dom);
+    $this->assertEmpty($dom->errors);
+  }
+
+  public function testLoadHTML() {
+    $contents = file_get_contents(__DIR__ . '/Html5Test.html');
+    $dom = \HTML5::loadHTML($contents);
+    $this->assertInstanceOf('\DOMDocument', $dom);
+    $this->assertEmpty($dom->errors);
+  }
+
+  // This test reads a document into a DOM, turns the DOM into a document,
+  // then tries to read that document again. This makes sure we are reading
+  // and generating a document that works at a high level.
+  public function testItWorks() {
+    $dom = \HTML5::load(__DIR__ . '/Html5Test.html');
+    $this->assertInstanceOf('\DOMDocument', $dom);
+    $this->assertEmpty($dom->errors);
+
+    $saved = \HTML5::saveHTML($dom);
+
+    $dom2 = \HTML5::loadHTML($saved);
+    $this->assertInstanceOf('\DOMDocument', $dom2);
+    $this->assertEmpty($dom2->errors);
+  }
+
+}
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 0692bc3..edc427c 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -1,6 +1,6 @@
 <?php
 namespace HTML5\Parser;
-require __DIR__ . '/../TestCase.php';
+require_once __DIR__ . '/../TestCase.php';
 require 'EventStack.php';
 
 class TokenizerTest extends \HTML5\Tests\TestCase {
@@ -18,7 +18,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
   public function assertEventEquals($type, $expects, $event) {
     $this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, TRUE));
     if (is_array($expects)) {
-      $this->assertEquals($expects, $event['data'], "Event $type should equal $expects: " . print_r($event, TRUE));
+      $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, TRUE) . ": " . print_r($event, TRUE));
     }
     else {
       $this->assertEquals($expects, $event['data'][0], "Event $type should equal $expects: " . print_r($event, TRUE));
author	Matt Butcher <[email protected]>	2013-05-30 09:23:26 -0500
committer	Matt Butcher <[email protected]>	2013-05-30 09:23:26 -0500
commit	b1cbd9b4cd488471651751678cd90575bbb74bc9 (patch)
tree	6dcf0afce7206b4372adc1a1ddbc25698b38e24f
parent	0e149588548834bbfee7770fac8455cc404fb8ca (diff)
parent	fffeafbfe08e306356acd50cf568ec5904da882c (diff)