summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matt Butcher <[email protected]> 2013-05-30 09:23:26 -0500
committer Matt Butcher <[email protected]> 2013-05-30 09:23:26 -0500
commit b1cbd9b4cd488471651751678cd90575bbb74bc9 (patch)
tree 6dcf0afce7206b4372adc1a1ddbc25698b38e24f
parent 0e149588548834bbfee7770fac8455cc404fb8ca (diff)
parent fffeafbfe08e306356acd50cf568ec5904da882c (diff)
Merge branch 'master' of github.com:Masterminds/html5-php
-rw-r--r-- .travis.yml 11
-rw-r--r-- LICENSE.txt 2
-rw-r--r-- README.md 11
-rw-r--r-- src/HTML5/Parser/UTF8Utils.php 31
-rw-r--r-- test/HTML5/Html5Test.html 10
-rw-r--r-- test/HTML5/Html5Test.php 36
-rw-r--r-- test/HTML5/Parser/TokenizerTest.php 4
7 files changed, 89 insertions, 16 deletions
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..35b3cae
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,11 @@
+language: php
+
+php:
+ - 5.3
+ - 5.4
+
+before_script:
+ - curl -s http://getcomposer.org/installer | php
+ - php composer.phar install
+
+script: phpunit test/HTML5
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
index 5799468..6ecbf3e 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -3,7 +3,7 @@
Copyright (c) 2013 The Authors of HTML5-PHP
Matt Butcher - [email protected]
-Matt Farina -
+Matt Farina - [email protected]
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
diff --git a/README.md b/README.md
index d39612f..caebfbc 100644
--- a/README.md
+++ b/README.md
@@ -88,15 +88,20 @@ events and builds a document tree (`DOMDocument`) based on the events.
The serializer takes a data structure (the `DOMDocument`) and transforms
it into a character representation -- an HTML5 document.
-The serializer is broken into two parts:
+The serializer is broken into three parts:
+- The `OutputRules` contain the rules to turn DOM elements into strings. The
+rules used are configurable with the `OutputRules` being the default. An option
+can be set by default or at call time to use a different ruleset that implements
+`RulesInterface`.
- The `Traverser`, which is a special-purpose tree walker. It visits
-each node and transforms it into a string.
+each node in the tree and uses the `OutputRules` to transform the node
+into a string.
- The `Serializer` manages the `Traverser` and stores the resultant data
in the correct place.
The serializer (`save()`, `saveHTML()`) follows the
-[section 8.9 of the HTML 5.0 spec] (http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
+[section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
So tags are serialized according to these rules:
- A tag with children: &lt;foo&gt;CHILDREN&lt;/foo&gt;
diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php
index 022d628..974a670 100644
--- a/src/HTML5/Parser/UTF8Utils.php
+++ b/src/HTML5/Parser/UTF8Utils.php
@@ -86,7 +86,27 @@ class UTF8Utils {
could not be converted to Unicode characters must be converted
to U+FFFD REPLACEMENT CHARACTER code points. */
- if (function_exists('iconv') && $encoding != 'auto') {
+ // mb_convert_encoding is chosen over iconv because of a bug. The best
+ // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
+ // which contains links to the actual bug reports as well as workaround
+ // details.
+ if (function_exists('mb_convert_encoding')) {
+ // mb library has the following behaviors:
+ // - UTF-16 surrogates result in FALSE.
+ // - Overlongs and outside Plane 16 result in empty strings.
+
+ // Before we run mb_convert_encoding we need to tell it what to do with
+ // characters it does not know. This could be different than the parent
+ // application executing this library so we store the value, change it
+ // to our needs, and then change it back when we are done. This feels
+ // a little excessive and it would be great if there was a better way.
+ $save = ini_get('mbstring.substitute_character');
+ ini_set('mbstring.substitute_character', "none");
+ $data = mb_convert_encoding($data, 'UTF-8', $encoding);
+ ini_set('mbstring.substitute_character', $save);
+ }
+ // @todo Get iconv running in at least some environments if that is possible.
+ elseif (function_exists('iconv') && $encoding != 'auto') {
// fprintf(STDOUT, "iconv found\n");
// iconv has the following behaviors:
// - Overlong representations are ignored.
@@ -94,15 +114,6 @@ class UTF8Utils {
// - Incomplete sequences generate a warning.
$data = @iconv($encoding, 'UTF-8//IGNORE', $data);
}
- // MPB: Testing the newer mb_convert_encoding(). This might need
- // to be removed again.
- elseif (function_exists('mb_convert_encoding')) {
- fprintf(STDOUT, "MB found\n");
- // mb library has the following behaviors:
- // - UTF-16 surrogates result in FALSE.
- // - Overlongs and outside Plane 16 result in empty strings.
- $data = mb_convert_encoding($data, 'UTF-8', $encoding);
- }
else {
// we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv');
diff --git a/test/HTML5/Html5Test.html b/test/HTML5/Html5Test.html
new file mode 100644
index 0000000..cc30e6b
--- /dev/null
+++ b/test/HTML5/Html5Test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>Test</title>
+ </head>
+ <body>
+ <p>This is a test.</p>
+ </body>
+</html>
\ No newline at end of file
diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php
new file mode 100644
index 0000000..2d6e005
--- /dev/null
+++ b/test/HTML5/Html5Test.php
@@ -0,0 +1,36 @@
+<?php
+namespace HTML5\Tests;
+
+require_once 'TestCase.php';
+
+class Html5Test extends TestCase {
+
+ public function testLoad() {
+ $dom = \HTML5::load(__DIR__ . '/Html5Test.html');
+ $this->assertInstanceOf('\DOMDocument', $dom);
+ $this->assertEmpty($dom->errors);
+ }
+
+ public function testLoadHTML() {
+ $contents = file_get_contents(__DIR__ . '/Html5Test.html');
+ $dom = \HTML5::loadHTML($contents);
+ $this->assertInstanceOf('\DOMDocument', $dom);
+ $this->assertEmpty($dom->errors);
+ }
+
+ // This test reads a document into a DOM, turns the DOM into a document,
+ // then tries to read that document again. This makes sure we are reading
+ // and generating a document that works at a high level.
+ public function testItWorks() {
+ $dom = \HTML5::load(__DIR__ . '/Html5Test.html');
+ $this->assertInstanceOf('\DOMDocument', $dom);
+ $this->assertEmpty($dom->errors);
+
+ $saved = \HTML5::saveHTML($dom);
+
+ $dom2 = \HTML5::loadHTML($saved);
+ $this->assertInstanceOf('\DOMDocument', $dom2);
+ $this->assertEmpty($dom2->errors);
+ }
+
+}
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index 0692bc3..edc427c 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -1,6 +1,6 @@
<?php
namespace HTML5\Parser;
-require __DIR__ . '/../TestCase.php';
+require_once __DIR__ . '/../TestCase.php';
require 'EventStack.php';
class TokenizerTest extends \HTML5\Tests\TestCase {
@@ -18,7 +18,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase {
public function assertEventEquals($type, $expects, $event) {
$this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, TRUE));
if (is_array($expects)) {
- $this->assertEquals($expects, $event['data'], "Event $type should equal $expects: " . print_r($event, TRUE));
+ $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, TRUE) . ": " . print_r($event, TRUE));
}
else {
$this->assertEquals($expects, $event['data'][0], "Event $type should equal $expects: " . print_r($event, TRUE));