summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-12-15 12:40:33 +0000
committer Andres Rey <[email protected]> 2016-12-15 12:40:33 +0000
commit 4c8d547cdadf871f09bc8c855bfdab98e3ce434b (patch)
tree 16bcacdcbd67f1099cd352936ee794b5770f8929 /src
parent bd0bfd20b64300278335005f8a6ff745d545dd44 (diff)
Added a hack to load HTML with UTF-8 characters
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 8
1 files changed, 5 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index d9db75c..9cc67a2 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -187,7 +187,8 @@ class HTMLParser
*/
private function loadHTML($html)
{
- $this->dom->loadHTML($html);
+ // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
+ $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->dom->encoding = 'UTF-8';
// In case we need the original HTML to create a fake top candidate
@@ -501,7 +502,8 @@ class HTMLParser
// Move all of the page's children into topCandidate
$neededToCreateTopCandidate = true;
- $topCandidate = new DOMDocument();
+ $topCandidate = new DOMDocument('1.0', 'utf-8');
+ $topCandidate->encoding = 'UTF-8';
$topCandidate->appendChild($topCandidate->createElement('div', ''));
$kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;
@@ -557,7 +559,7 @@ class HTMLParser
* that we removed, etc.
*/
- $articleContent = new DOMDocument();
+ $articleContent = new DOMDocument('1.0', 'utf-8');
$articleContent->createElement('div');
$siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2);