summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md4
-rw-r--r--src/HTMLParser.php15
-rw-r--r--test/HTMLParserTest.php23
3 files changed, 34 insertions, 8 deletions
diff --git a/README.md b/README.md
index ab11a1c..44b937e 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,9 @@ If the parsing process was unsuccessful the HTMLParser will return `false`
- **weightClasses**: default value `true`, weight classes during the rating phase.
- **removeReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase.
- **fixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`.
-- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
+- **normalizeSpaces**: default value `false`, normalize whitespace in the resulting paragraphs. Collapses every run of two or more consecutive whitespace characters into a single space.
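+- **substituteEntities**: default value `true`, controls the `substituteEntities` flag of libxml. Set it to `false` to disable the flag and keep the original HTML entities instead of substituting them (for example, `&aacute;` is kept instead of being converted to `á`).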
+- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
## Limitations
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 5fb27b8..1b1a516 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -35,7 +35,7 @@ class HTMLParser
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
'byline' => '/byline|author|dateline|writtenby|p-author/i',
'replaceFonts' => '/<(\/?)font[^>]*>/gi',
- 'normalize' => '/\s{2,}/g',
+ 'normalize' => '/\s{2,}/',
'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
'prevLink' => '/(prev|earl|old|new|<|«)/i',
@@ -99,6 +99,8 @@ class HTMLParser
'weightClasses' => true,
'removeReadabilityTags' => true,
'fixRelativeURLs' => false,
+ 'normalizeSpaces' => false,
+ 'substituteEntities' => true,
'originalURL' => 'http://fakehost',
];
@@ -202,6 +204,11 @@ class HTMLParser
*/
private function loadHTML($html)
{
+ if (!$this->getConfig()->getOption('substituteEntities')) {
+ // Keep the original HTML entities
+ $this->dom->substituteEntities = false;
+ }
+
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
$this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
$this->dom->encoding = 'UTF-8';
@@ -334,6 +341,12 @@ class HTMLParser
}
}
+ if ($this->getConfig()->getOption('normalizeSpaces')) {
+ foreach ($article->getElementsByTagName('p') as $node) {
+ $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue);
+ }
+ }
+
return $article;
}
diff --git a/test/HTMLParserTest.php b/test/HTMLParserTest.php
index c97745d..55b82b8 100644
--- a/test/HTMLParserTest.php
+++ b/test/HTMLParserTest.php
@@ -9,12 +9,19 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase
/**
* @dataProvider getSamplePages
*/
- public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata)
+ public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata, $config)
{
- $readability = new HTMLParser([
- 'originalURL' => 'http://fakehost/test/test.html',
- 'fixRelativeURLs' => true
- ]);
+ $options = ['originalURL' => 'http://fakehost/test/test.html',
+ 'fixRelativeURLs' => true,
+ 'normalizeSpaces' => false,
+ 'substituteEntities' => true
+ ];
+
+ if ($config) {
+ $options = $config;
+ }
+
+ $readability = new HTMLParser($options);
$result = $readability->parse($html);
$this->assertEquals($expectedResult, $result['html']);
@@ -34,8 +41,12 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase
$source = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'source.html');
$expectedHTML = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected.html');
$expectedMetadata = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected-metadata.json');
+ $config = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'config.json');
+ if ($config) {
+ $config = json_decode($config);
+ }
- $pages[$testPage] = [$source, $expectedHTML, $expectedMetadata];
+ $pages[$testPage] = [$source, $expectedHTML, $expectedMetadata, $config];
}
return $pages;