diff options
author | Andres Rey <[email protected]> | 2017-03-07 15:41:24 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-03-07 15:41:24 +0000 |
commit | a0d42e6578d641f83443b863a333e7b1a9d50357 (patch) | |
tree | 79fb320415176329a9321d76b1af8d2b1a3a3d84 | |
parent | 3b73cde640956aa08cee59a9be44d941a819b5e6 (diff) |
Fuck this, we are not going to normalize blank space.
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | src/HTMLParser.php | 7 | ||||
-rw-r--r-- | test/test-pages/normalize-spaces/expected-metadata.json | 6 | ||||
-rw-r--r-- | test/test-pages/normalize-spaces/expected.html | 3 | ||||
-rw-r--r-- | test/test-pages/normalize-spaces/source.html | 35 |
5 files changed, 0 insertions, 52 deletions
@@ -51,7 +51,6 @@ If the parsing process was unsuccessful the HTMLParser will return `false` - **weightClasses**: default value `true`, weight classes during the rating phase. - **removeReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase. - **fixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`. -- **normalizeSpaces**: default value `false`, normalize all spaces. Changes all consecutive spaces to one space. - **substituteEntities**: default value `false`, disables the `substituteEntities` flag of libxml. Will avoid substituting HTML entities. Like `´` to á. - **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs. diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 1b1a516..1ef4489 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -99,7 +99,6 @@ class HTMLParser 'weightClasses' => true, 'removeReadabilityTags' => true, 'fixRelativeURLs' => false, - 'normalizeSpaces' => false, 'substituteEntities' => true, 'originalURL' => 'http://fakehost', ]; @@ -341,12 +340,6 @@ class HTMLParser } } - if ($this->getConfig()->getOption('normalizeSpaces')) { - foreach ($article->getElementsByTagName('p') as $node) { - $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue); - } - } - return $article; } diff --git a/test/test-pages/normalize-spaces/expected-metadata.json b/test/test-pages/normalize-spaces/expected-metadata.json deleted file mode 100644 index 3887fbb..0000000 --- a/test/test-pages/normalize-spaces/expected-metadata.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "title": "Normalize space test", - "byline": null, - "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": false -} diff --git a/test/test-pages/normalize-spaces/expected.html b/test/test-pages/normalize-spaces/expected.html deleted file mode 100644 index c81a738..0000000 --- a/test/test-pages/normalize-spaces/expected.html +++ /dev/null @@ -1,3 +0,0 @@ - <article> - <p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p> - <p> Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
\ No newline at end of file diff --git a/test/test-pages/normalize-spaces/source.html b/test/test-pages/normalize-spaces/source.html deleted file mode 100644 index b230798..0000000 --- a/test/test-pages/normalize-spaces/source.html +++ /dev/null @@ -1,35 +0,0 @@ -<!DOCTYPE html> -<html> -<head> - <meta charset="utf-8"/> - <title>Normalize space test</title> -</head> -<body> - <article> - <h1>Lorem</h1> - <div> - Lorem - ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod - tab here - incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, - quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo - consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse - cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non - proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - </div> - <h2>Foo</h2> - <div> - Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, - quis nostrud exercitation - - - - - ullamco laboris nisi ut aliquip ex ea commodo - consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse - cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non - proident, sunt in culpa qui officia deserunt mollit anim id est laborum. - </div> - </article> -</body> -</html> |