From 3b73cde640956aa08cee59a9be44d941a819b5e6 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 3 Mar 2017 16:58:42 +0000 Subject: Functions to normalize space and disable substitute entities --- src/HTMLParser.php | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5fb27b8..1b1a516 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -35,7 +35,7 @@ class HTMLParser 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/g', + 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', @@ -99,6 +99,8 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, + 'normalizeSpaces' => false, + 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -202,6 +204,11 @@ class HTMLParser */ private function loadHTML($html) { + if (!$this->getConfig()->getOption('substituteEntities')) { + // Keep the original HTML entities + $this->dom->substituteEntities = false; + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('' . $html); $this->dom->encoding = 'UTF-8'; @@ -334,6 +341,12 @@ class HTMLParser } } + if ($this->getConfig()->getOption('normalizeSpaces')) { + foreach ($article->getElementsByTagName('p') as $node) { + $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); + } + } + return $article; } -- cgit v1.2.3