diff options
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r-- | src/HTMLParser.php | 15 |
1 files changed, 14 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5fb27b8..1b1a516 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -35,7 +35,7 @@ class HTMLParser 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/g', + 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', @@ -99,6 +99,8 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, + 'normalizeSpaces' => false, + 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -202,6 +204,11 @@ class HTMLParser */ private function loadHTML($html) { + if (!$this->getConfig()->getOption('substituteEntities')) { + // Keep the original HTML entities + $this->dom->substituteEntities = false; + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); $this->dom->encoding = 'UTF-8'; @@ -334,6 +341,12 @@ class HTMLParser } } + if ($this->getConfig()->getOption('normalizeSpaces')) { + foreach ($article->getElementsByTagName('p') as $node) { + $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); + } + } + return $article; } |