summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Andres Rey <[email protected]> 2017-03-26 11:34:29 +0100
committer: Andres Rey <[email protected]> 2017-03-26 11:34:29 +0100
commit: 01fb375746d7ef9178c4bf651774da67632b7454 (patch)
tree: 8d141fcefa0dc643d1188ee01ffbf1952ed9406a /src
parent: 361a1f73048a3f68af539344b0c294c9633f5820 (diff)
Added normalizeEntities flag.
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php | 6
1 file changed, 6 insertions, 0 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 3226904..105e50c 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -100,6 +100,7 @@ class HTMLParser
'removeReadabilityTags' => true,
'fixRelativeURLs' => false,
'substituteEntities' => true,
+ 'normalizeEntities' => false,
'originalURL' => 'http://fakehost',
];
@@ -208,6 +209,11 @@ class HTMLParser
$this->dom->substituteEntities = false;
}
+ if ($this->getConfig()->getOption('normalizeEntities')) {
+ // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
+ $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+ }
+
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
$this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->dom->encoding = 'UTF-8';