From 01fb375746d7ef9178c4bf651774da67632b7454 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sun, 26 Mar 2017 11:34:29 +0100 Subject: Added normalizeEntities flag. --- src/HTMLParser.php | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 3226904..105e50c 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -100,6 +100,7 @@ class HTMLParser 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, 'substituteEntities' => true, + 'normalizeEntities' => false, 'originalURL' => 'http://fakehost', ]; @@ -208,6 +209,11 @@ class HTMLParser $this->dom->substituteEntities = false; } + if ($this->getConfig()->getOption('normalizeEntities')) { + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); $this->dom->encoding = 'UTF-8'; -- cgit v1.2.3