summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-03-03 16:58:42 +0000
committerAndres Rey <[email protected]>2017-03-03 16:58:42 +0000
commit3b73cde640956aa08cee59a9be44d941a819b5e6 (patch)
tree4dd3dc6dd52ca7a67b954231cb18ddf9f660aef9 /src
parent0b2e1c28ca9ad54290fd036afbcda7e808becc9e (diff)
Functions to normalize spaces and disable substitute entities
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php15
1 files changed, 14 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 5fb27b8..1b1a516 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -35,7 +35,7 @@ class HTMLParser
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
'byline' => '/byline|author|dateline|writtenby|p-author/i',
'replaceFonts' => '/<(\/?)font[^>]*>/gi',
- 'normalize' => '/\s{2,}/g',
+ 'normalize' => '/\s{2,}/',
'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
'prevLink' => '/(prev|earl|old|new|<|«)/i',
@@ -99,6 +99,8 @@ class HTMLParser
'weightClasses' => true,
'removeReadabilityTags' => true,
'fixRelativeURLs' => false,
+ 'normalizeSpaces' => false,
+ 'substituteEntities' => true,
'originalURL' => 'http://fakehost',
];
@@ -202,6 +204,11 @@ class HTMLParser
*/
private function loadHTML($html)
{
+ if (!$this->getConfig()->getOption('substituteEntities')) {
+ // Keep the original HTML entities
+ $this->dom->substituteEntities = false;
+ }
+
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
$this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->dom->encoding = 'UTF-8';
@@ -334,6 +341,12 @@ class HTMLParser
}
}
+ if ($this->getConfig()->getOption('normalizeSpaces')) {
+ foreach ($article->getElementsByTagName('p') as $node) {
+ $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue);
+ }
+ }
+
return $article;
}