From ed7cc5f4e06eed696cdd829a3197fbe8927c0721 Mon Sep 17 00:00:00 2001 From: John Slegers Date: Thu, 4 Dec 2014 16:51:29 +0100 Subject: Adding "disableHtmlNsInDom" and "targetDocument" options to allow more flexible HTML DOM creation. New Options: * disableHtmlNsInDom = Allows the use of createElement instead of createElementNS for HTML elements. * targetDocument = allows an existing DOMDocument (or subclass thereof) to be passed to the DOMTreeBuilder instead of creating a new one. --- src/HTML5/Parser/DOMTreeBuilder.php | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 0349d60..da3f06b 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -157,13 +157,17 @@ class DOMTreeBuilder implements EventHandler { $this->options = $options; - $impl = new \DOMImplementation(); - // XXX: - // Create the doctype. For now, we are always creating HTML5 - // documents, and attempting to up-convert any older DTDs to HTML5. - $dt = $impl->createDocumentType('html'); - // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); - $this->doc = $impl->createDocument(null, null, $dt); + if (isset($options['targetDocument'])) { + $this->doc = $options['targetDocument']; + } else { + $impl = new \DOMImplementation(); + // XXX: + // Create the doctype. For now, we are always creating HTML5 + // documents, and attempting to up-convert any older DTDs to HTML5. 
+ $dt = $impl->createDocumentType('html'); + // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = $impl->createDocument(null, null, $dt); + } $this->errors = array(); $this->current = $this->doc; // ->documentElement; @@ -345,10 +349,10 @@ class DOMTreeBuilder implements EventHandler $ele = $this->doc->importNode($frag->documentElement, true); } else { - if (isset($this->nsStack[0][$prefix])) { - $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); - } else { + if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options['disableHtmlNsInDom']) && $this->options['disableHtmlNsInDom'])) { $ele = $this->doc->createElement($lname); + } else { + $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); } } @@ -664,4 +668,4 @@ class DOMTreeBuilder implements EventHandler { return $this->current->tagName == $tagname; } -} +} \ No newline at end of file -- cgit v1.2.3 From 23a692b708ed24f69480aaa59a9dfbb3e9a606d2 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Fri, 6 Feb 2015 21:45:13 +0100 Subject: Using constant as options --- src/HTML5/Parser/DOMTreeBuilder.php | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index da3f06b..ccad229 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; + const OPT_DISABLE_HTML_NS = 'disable_html_ns'; + + const OPT_TARGET_DOC = 'target_document'; + + const OPT_IMPLICIT_NS = 'implicit_namespaces'; + /** * Holds the HTML5 element names that causes a namespace switch * @@ -157,8 +163,8 @@ class DOMTreeBuilder implements EventHandler { $this->options = $options; - if (isset($options['targetDocument'])) { - $this->doc = $options['targetDocument']; + if 
(isset($options[self::OPT_TARGET_DOC])) { + $this->doc = $options[self::OPT_TARGET_DOC]; } else { $impl = new \DOMImplementation(); // XXX: @@ -175,8 +181,15 @@ class DOMTreeBuilder implements EventHandler // Create a rules engine for tags. $this->rules = new TreeBuildingRules($this->doc); + $implicitNS = array(); + if (isset($this->options[self::OPT_IMPLICIT_NS])) { + $implicitNS = $this->options[self::OPT_IMPLICIT_NS]; + } elseif (isset($this->options["implicitNamespaces"])) { + $implicitNS = $this->options["implicitNamespaces"]; + } + // Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options - array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array( + array_unshift($this->nsStack, $implicitNS + array( '' => self::NAMESPACE_HTML ) + $this->implicitNamespaces); @@ -349,7 +362,7 @@ class DOMTreeBuilder implements EventHandler $ele = $this->doc->importNode($frag->documentElement, true); } else { - if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options['disableHtmlNsInDom']) && $this->options['disableHtmlNsInDom'])) { + if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) { $ele = $this->doc->createElement($lname); } else { $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); -- cgit v1.2.3 From 46c4738fe893b8250da1531e093f3ccaee0769a9 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Tue, 6 Jan 2015 22:28:44 +0100 Subject: Added tests for "target_document" and "disable_html_ns" options --- test/HTML5/Parser/DOMTreeBuilderTest.php | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 5bba7cc..b2a2d39 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ 
b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -55,6 +55,7 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $this->assertInstanceOf('\DOMDocument', $doc); $this->assertEquals('html', $doc->documentElement->tagName); + $this->assertEquals('http://www.w3.org/1999/xhtml', $doc->documentElement->namespaceURI); } public function testStrangeCapitalization() @@ -78,6 +79,28 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $this->assertEquals("foo", $xpath->query( "//x:script" )->item( 0 )->nodeValue); } + public function testDocumentWithDisabledNamespaces() + { + $html = ""; + $doc = $this->parse($html, array('disable_html_ns' => true)); + + $this->assertInstanceOf('\DOMDocument', $doc); + $this->assertEquals('html', $doc->documentElement->tagName); + $this->assertNull($doc->documentElement->namespaceURI); + } + + public function testDocumentWithATargetDocument() + { + $targetDom = new \DOMDocument(); + + $html = ""; + $doc = $this->parse($html, array('target_document' => $targetDom)); + + $this->assertInstanceOf('\DOMDocument', $doc); + $this->assertSame($doc, $targetDom); + $this->assertEquals('html', $doc->documentElement->tagName); + } + public function testDocumentFakeAttrAbsence() { $html = "foo"; @@ -85,7 +108,6 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $xp = new \DOMXPath($doc); $this->assertEquals(0, $xp->query("//@html5-php-fake-id-attribute")->length); - } public function testFragment() -- cgit v1.2.3 From 014514c3ca31027c094c6a77020685996e975b02 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Tue, 6 Jan 2015 22:31:34 +0100 Subject: New options documented --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index 65d6889..1a2a96b 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,35 @@ $html5->save($dom, 'out.html'); The `$dom` created by the parser is a full `DOMDocument` object. 
And the `save()` and `saveHTML()` methods will take any DOMDocument. +### Options + +It is possible to pass in an array of configuration options when loading +an HTML5 document. + +```php +// An associative array of options +$options = array( + 'option_name' => 'option_value', +); + +// Provide the options to the constructor +$html5 = new HTML5($options); + +$dom = $html5->loadHTML($html); +``` + +The following options are supported: + +* `encode_entities` (boolean): Indicates that the serializer should aggressively + encode characters as entities. Without this, it only encodes the bare + minimum. +* `disable_html_ns` (boolean): Prevents the parser from automatically + assigning the HTML5 namespace to the DOM document. This is for + non-namespace aware DOM tools. +* `target_document` (\DOMDocument): A DOM document that will be used as the + destination for the parsed nodes. +* `implicit_namespaces` (array): An assoc array of namespaces that should be + used by the parser. Name is tag prefix, value is NS URI. ## The Low-Level API -- cgit v1.2.3 From a50e919341b310b30b617a2e1ea5945d94856017 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Fri, 6 Feb 2015 22:14:20 +0100 Subject: Updated release note (preparing the 2.1.0 release) --- RELEASE.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index f87bad5..36d1630 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,8 @@ # Release Notes -X.X.X (XXXX-XX-XX) +2.1.0 (2015-02-01) +- #74: Added `disable_html_ns` and `target_document` dom parsing options +- Unified option names - #73: Fixed alphabet, ß now can be detected - #75 and #76: Allow whitespace in RCDATA tags - #77: Fixed parsing blunder for json embeds -- cgit v1.2.3