From 65811e8e391a3e3cd17c68a9b39872d6c657a078 Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Tue, 28 May 2013 21:27:33 -0400 Subject: Fixed failed test suite runs. --- test/HTML5/Parser/TokenizerTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index 10f9f9a..e752135 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -1,6 +1,6 @@ Date: Tue, 28 May 2013 21:30:53 -0400 Subject: Adding travis-ci config file. --- .travis.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..35b3cae --- /dev/null +++ b/.travis.yml @@ -0,0 +1,11 @@ +language: php + +php: + - 5.3 + - 5.4 + +before_script: + - curl -s http://getcomposer.org/installer | php + - php composer.phar install + +script: phpunit test/HTML5 \ No newline at end of file -- cgit v1.2.3 From b9745f737cd045e70c0d253a86f9a9e9e0ef4234 Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Wed, 29 May 2013 11:56:18 -0400 Subject: Fixed test failures in PHP 5.4. --- test/HTML5/Parser/TokenizerTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index e752135..33cfa98 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -18,7 +18,7 @@ class TokenizerTest extends \HTML5\Tests\TestCase { public function assertEventEquals($type, $expects, $event) { $this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, TRUE)); if (is_array($expects)) { - $this->assertEquals($expects, $event['data'], "Event $type should equal $expects: " . print_r($event, TRUE)); + $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, TRUE) . ": " . 
print_r($event, TRUE)); } else { $this->assertEquals($expects, $event['data'][0], "Event $type should equal $expects: " . print_r($event, TRUE)); -- cgit v1.2.3 From b6da0ae136d962d49a9f9bfc37ef232adc7460ba Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Wed, 29 May 2013 17:13:24 -0400 Subject: Moved mbstring encoding to be used before iconv. Tests passing on PHP 5.4. --- src/HTML5/Parser/UTF8Utils.php | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/HTML5/Parser/UTF8Utils.php b/src/HTML5/Parser/UTF8Utils.php index 022d628..974a670 100644 --- a/src/HTML5/Parser/UTF8Utils.php +++ b/src/HTML5/Parser/UTF8Utils.php @@ -86,7 +86,27 @@ class UTF8Utils { could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points. */ - if (function_exists('iconv') && $encoding != 'auto') { + // mb_convert_encoding is chosen over iconv because of a bug. The best + // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 + // which contains links to the actual bug reports as well as workaround + // details. + if (function_exists('mb_convert_encoding')) { + // mb library has the following behaviors: + // - UTF-16 surrogates result in FALSE. + // - Overlongs and outside Plane 16 result in empty strings. + + // Before we run mb_convert_encoding we need to tell it what to do with + // characters it does not know. This could be different than the parent + // application executing this library so we store the value, change it + // to our needs, and then change it back when we are done. This feels + // a little excessive and it would be great if there was a better way. + $save = ini_get('mbstring.substitute_character'); + ini_set('mbstring.substitute_character', "none"); + $data = mb_convert_encoding($data, 'UTF-8', $encoding); + ini_set('mbstring.substitute_character', $save); + } + // @todo Get iconv running in at least some environments if that is possible. 
+ elseif (function_exists('iconv') && $encoding != 'auto') { // fprintf(STDOUT, "iconv found\n"); // iconv has the following behaviors: // - Overlong representations are ignored. @@ -94,15 +114,6 @@ class UTF8Utils { // - Incomplete sequences generate a warning. $data = @iconv($encoding, 'UTF-8//IGNORE', $data); } - // MPB: Testing the newer mb_convert_encoding(). This might need - // to be removed again. - elseif (function_exists('mb_convert_encoding')) { - fprintf(STDOUT, "MB found\n"); - // mb library has the following behaviors: - // - UTF-16 surrogates result in FALSE. - // - Overlongs and outside Plane 16 result in empty strings. - $data = mb_convert_encoding($data, 'UTF-8', $encoding); - } else { // we can make a conforming native implementation throw new Exception('Not implemented, please install mbstring or iconv'); -- cgit v1.2.3 From c023bad19a0ad1bf418a5842ffb3e8aac9045a55 Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Thu, 30 May 2013 09:15:05 -0400 Subject: Added email address to license. --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index 5799468..6ecbf3e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -3,7 +3,7 @@ Copyright (c) 2013 The Authors of HTML5-PHP Matt Butcher - technosophos@gmail.com -Matt Farina - +Matt Farina - matt@mattfarina.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in -- cgit v1.2.3 From e109d03d1b8755cb783274c94c8b8926f2d7b8a1 Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Thu, 30 May 2013 09:26:38 -0400 Subject: Added details about the outputrules to the readme.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d39612f..caebfbc 100644 --- a/README.md +++ b/README.md @@ -88,15 +88,20 @@ events and builds a document tree (`DOMDocument`) based on the events. 
The serializer takes a data structure (the `DOMDocument`) and transforms it into a character representation -- an HTML5 document. -The serializer is broken into two parts: +The serializer is broken into three parts: +- The `OutputRules` contain the rules to turn DOM elements into strings. The +rules used are configurable with the `OutputRules` being the default. An option +can be set by default or at call time to use a different ruleset that implements +`RulesInterface`. - The `Traverser`, which is a special-purpose tree walker. It visits -each node and transforms it into a string. +each node in the tree and uses the `OutputRules` to transform the node +into a string. - The `Serializer` manages the `Traverser` and stores the resultant data in the correct place. The serializer (`save()`, `saveHTML()`) follows the -[section 8.9 of the HTML 5.0 spec] (http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments). +[section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments). So tags are serialized according to these rules: - A tag with children: <foo>CHILDREN</foo> -- cgit v1.2.3 From fffeafbfe08e306356acd50cf568ec5904da882c Mon Sep 17 00:00:00 2001 From: Matt Farina Date: Thu, 30 May 2013 09:48:05 -0400 Subject: Added tests to make sure we can parse, save, and then parse the saved document. --- test/HTML5/Html5Test.html | 10 ++++++++++ test/HTML5/Html5Test.php | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 test/HTML5/Html5Test.html create mode 100644 test/HTML5/Html5Test.php diff --git a/test/HTML5/Html5Test.html b/test/HTML5/Html5Test.html new file mode 100644 index 0000000..cc30e6b --- /dev/null +++ b/test/HTML5/Html5Test.html @@ -0,0 +1,10 @@ + + + + + Test + + +

This is a test.

+ + \ No newline at end of file diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php new file mode 100644 index 0000000..2d6e005 --- /dev/null +++ b/test/HTML5/Html5Test.php @@ -0,0 +1,36 @@ +assertInstanceOf('\DOMDocument', $dom); + $this->assertEmpty($dom->errors); + } + + public function testLoadHTML() { + $contents = file_get_contents(__DIR__ . '/Html5Test.html'); + $dom = \HTML5::loadHTML($contents); + $this->assertInstanceOf('\DOMDocument', $dom); + $this->assertEmpty($dom->errors); + } + + // This test reads a document into a DOM, turns the DOM into a document, + // then tries to read that document again. This makes sure we are reading + // and generating a document that works at a high level. + public function testItWorks() { + $dom = \HTML5::load(__DIR__ . '/Html5Test.html'); + $this->assertInstanceOf('\DOMDocument', $dom); + $this->assertEmpty($dom->errors); + + $saved = \HTML5::saveHTML($dom); + + $dom2 = \HTML5::loadHTML($saved); + $this->assertInstanceOf('\DOMDocument', $dom2); + $this->assertEmpty($dom2->errors); + } + +} -- cgit v1.2.3