diff options
author | Asmir Mustafic <[email protected]> | 2015-02-09 17:22:51 +0100 |
---|---|---|
committer | Asmir Mustafic <[email protected]> | 2015-02-09 17:22:51 +0100 |
commit | 236faa2648f4e89e7f5fce60cfc5f3ed95243f8c (patch) | |
tree | 9c4c443314eedb7d17365e78783d5c99470fa680 | |
parent | dda3253aa29800d12795c291e55578bbabbc4cb4 (diff) | |
parent | a50e919341b310b30b617a2e1ea5945d94856017 (diff) |
Merge pull request #74 from Masterminds/feature/html-parsing-options
HTML parsing options
-rw-r--r-- | README.md | 29 | ||||
-rw-r--r-- | RELEASE.md | 4 | ||||
-rw-r--r-- | src/HTML5/Parser/DOMTreeBuilder.php | 41 | ||||
-rw-r--r-- | test/HTML5/Parser/DOMTreeBuilderTest.php | 24 |
4 files changed, 84 insertions, 14 deletions
@@ -76,6 +76,35 @@ $html5->save($dom, 'out.html'); The `$dom` created by the parser is a full `DOMDocument` object. And the `save()` and `saveHTML()` methods will take any DOMDocument. +### Options + +It is possible to pass in an array of configuration options when loading +an HTML5 document. + +```php +// An associative array of options +$options = array( + 'option_name' => 'option_value', +); + +// Provide the options to the constructor +$html5 = new HTML5($options); + +$dom = $html5->loadHTML($html); +``` + +The following options are supported: + +* `encode_entities` (boolean): Indicates that the serializer should aggressively + encode characters as entities. Without this, it only encodes the bare + minimum. +* `disable_html_ns` (boolean): Prevents the parser from automatically + assigning the HTML5 namespace to the DOM document. This is for + non-namespace aware DOM tools. +* `target_document` (\DOMDocument): A DOM document that will be used as the + destination for the parsed nodes. +* `implicit_namespaces` (array): An assoc array of namespaces that should be + used by the parser. Name is tag prefix, value is NS URI. 
## The Low-Level API @@ -1,6 +1,8 @@ # Release Notes -X.X.X (XXXX-XX-XX) +2.1.0 (2015-02-01) +- #74: Added `disable_html_ns` and `target_document` dom parsing options +- Unified option names - #73: Fixed alphabet, ß now can be detected - #75 and #76: Allow whitespace in RCDATA tags - #77: Fixed parsing blunder for json embeds diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 0349d60..ccad229 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; + const OPT_DISABLE_HTML_NS = 'disable_html_ns'; + + const OPT_TARGET_DOC = 'target_document'; + + const OPT_IMPLICIT_NS = 'implicit_namespaces'; + /** * Holds the HTML5 element names that causes a namespace switch * @@ -157,13 +163,17 @@ class DOMTreeBuilder implements EventHandler { $this->options = $options; - $impl = new \DOMImplementation(); - // XXX: - // Create the doctype. For now, we are always creating HTML5 - // documents, and attempting to up-convert any older DTDs to HTML5. - $dt = $impl->createDocumentType('html'); - // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); - $this->doc = $impl->createDocument(null, null, $dt); + if (isset($options[self::OPT_TARGET_DOC])) { + $this->doc = $options[self::OPT_TARGET_DOC]; + } else { + $impl = new \DOMImplementation(); + // XXX: + // Create the doctype. For now, we are always creating HTML5 + // documents, and attempting to up-convert any older DTDs to HTML5. + $dt = $impl->createDocumentType('html'); + // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = $impl->createDocument(null, null, $dt); + } $this->errors = array(); $this->current = $this->doc; // ->documentElement; @@ -171,8 +181,15 @@ class DOMTreeBuilder implements EventHandler // Create a rules engine for tags. 
$this->rules = new TreeBuildingRules($this->doc); + $implicitNS = array(); + if (isset($this->options[self::OPT_IMPLICIT_NS])) { + $implicitNS = $this->options[self::OPT_IMPLICIT_NS]; + } elseif (isset($this->options["implicitNamespaces"])) { + $implicitNS = $this->options["implicitNamespaces"]; + } + // Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options - array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array( + array_unshift($this->nsStack, $implicitNS + array( '' => self::NAMESPACE_HTML ) + $this->implicitNamespaces); @@ -345,10 +362,10 @@ class DOMTreeBuilder implements EventHandler $ele = $this->doc->importNode($frag->documentElement, true); } else { - if (isset($this->nsStack[0][$prefix])) { - $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); - } else { + if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) { $ele = $this->doc->createElement($lname); + } else { + $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); } } @@ -664,4 +681,4 @@ class DOMTreeBuilder implements EventHandler { return $this->current->tagName == $tagname; } -} +}
\ No newline at end of file diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 5bba7cc..b2a2d39 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -55,6 +55,7 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $this->assertInstanceOf('\DOMDocument', $doc); $this->assertEquals('html', $doc->documentElement->tagName); + $this->assertEquals('http://www.w3.org/1999/xhtml', $doc->documentElement->namespaceURI); } public function testStrangeCapitalization() @@ -78,6 +79,28 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $this->assertEquals("foo", $xpath->query( "//x:script" )->item( 0 )->nodeValue); } + public function testDocumentWithDisabledNamespaces() + { + $html = "<!DOCTYPE html><html></html>"; + $doc = $this->parse($html, array('disable_html_ns' => true)); + + $this->assertInstanceOf('\DOMDocument', $doc); + $this->assertEquals('html', $doc->documentElement->tagName); + $this->assertNull($doc->documentElement->namespaceURI); + } + + public function testDocumentWithATargetDocument() + { + $targetDom = new \DOMDocument(); + + $html = "<!DOCTYPE html><html></html>"; + $doc = $this->parse($html, array('target_document' => $targetDom)); + + $this->assertInstanceOf('\DOMDocument', $doc); + $this->assertSame($doc, $targetDom); + $this->assertEquals('html', $doc->documentElement->tagName); + } + public function testDocumentFakeAttrAbsence() { $html = "<!DOCTYPE html><html xmlns=\"http://www.w3.org/1999/xhtml\"><body>foo</body></html>"; @@ -85,7 +108,6 @@ class DOMTreeBuilderTest extends \Masterminds\HTML5\Tests\TestCase $xp = new \DOMXPath($doc); $this->assertEquals(0, $xp->query("//@html5-php-fake-id-attribute")->length); - } public function testFragment() |