diff options
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | src/HTMLParser.php | 15 | ||||
-rw-r--r-- | test/HTMLParserTest.php | 23 |
3 files changed, 34 insertions, 8 deletions
@@ -51,7 +51,9 @@ If the parsing process was unsuccessful the HTMLParser will return `false` - **weightClasses**: default value `true`, weight classes during the rating phase. - **removeReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase. - **fixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`. -- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs. +- **normalizeSpaces**: default value `false`, normalize whitespace inside paragraphs. Collapses every run of two or more consecutive whitespace characters into a single space. +- **substituteEntities**: default value `true`, controls the `substituteEntities` flag of libxml. Set it to `false` to disable the flag and avoid substituting HTML entities. Like `&aacute;` to `á`. +- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs. ## Limitations diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 5fb27b8..1b1a516 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -35,7 +35,7 @@ class HTMLParser 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/g', + 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', @@ -99,6 +99,8 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, + 'normalizeSpaces' => false, + 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -202,6 +204,11 @@ class HTMLParser */ private function loadHTML($html) { + if (!$this->getConfig()->getOption('substituteEntities')) { + // Keep the original HTML entities + $this->dom->substituteEntities = 
false; + } + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); $this->dom->encoding = 'UTF-8'; @@ -334,6 +341,12 @@ class HTMLParser } } + if ($this->getConfig()->getOption('normalizeSpaces')) { + foreach ($article->getElementsByTagName('p') as $node) { + $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); + } + } + return $article; } diff --git a/test/HTMLParserTest.php b/test/HTMLParserTest.php index c97745d..55b82b8 100644 --- a/test/HTMLParserTest.php +++ b/test/HTMLParserTest.php @@ -9,12 +9,19 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase /** * @dataProvider getSamplePages */ - public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata) + public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata, $config) { - $readability = new HTMLParser([ - 'originalURL' => 'http://fakehost/test/test.html', - 'fixRelativeURLs' => true - ]); + $options = ['originalURL' => 'http://fakehost/test/test.html', + 'fixRelativeURLs' => true, + 'normalizeSpaces' => false, + 'substituteEntities' => true + ]; + + if ($config) { + $options = $config; + } + + $readability = new HTMLParser($options); $result = $readability->parse($html); $this->assertEquals($expectedResult, $result['html']); @@ -34,8 +41,12 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase $source = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'source.html'); $expectedHTML = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected.html'); $expectedMetadata = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected-metadata.json'); + $config = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 
'config.json'); + if ($config) { + $config = json_decode($config); + } - $pages[$testPage] = [$source, $expectedHTML, $expectedMetadata]; + $pages[$testPage] = [$source, $expectedHTML, $expectedMetadata, $config]; } return $pages; |