diff options
author | Andres Rey <[email protected]> | 2016-12-15 12:40:33 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-12-15 12:40:33 +0000 |
commit | 4c8d547cdadf871f09bc8c855bfdab98e3ce434b (patch) | |
tree | 16bcacdcbd67f1099cd352936ee794b5770f8929 /src/HTMLParser.php | |
parent | bd0bfd20b64300278335005f8a6ff745d545dd44 (diff) |
Added a hack to load HTML with UTF-8 characters
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r-- | src/HTMLParser.php | 8 |
1 files changed, 5 insertions, 3 deletions
```diff
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index d9db75c..9cc67a2 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -187,7 +187,8 @@ class HTMLParser
      */
     private function loadHTML($html)
     {
-        $this->dom->loadHTML($html);
+        // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
+        $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
         $this->dom->encoding = 'UTF-8';
 
         // In case we need the original HTML to create a fake top candidate
@@ -501,7 +502,8 @@ class HTMLParser
             // Move all of the page's children into topCandidate
             $neededToCreateTopCandidate = true;
 
-            $topCandidate = new DOMDocument();
+            $topCandidate = new DOMDocument('1.0', 'utf-8');
+            $topCandidate->encoding = 'UTF-8';
             $topCandidate->appendChild($topCandidate->createElement('div', ''));
             $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;
 
@@ -557,7 +559,7 @@ class HTMLParser
          * that we removed, etc.
          */
 
-        $articleContent = new DOMDocument();
+        $articleContent = new DOMDocument('1.0', 'utf-8');
         $articleContent->createElement('div');
 
         $siblingScoreThreshold = max(10, $topCandidate->getContentScore() * 0.2);
```