summary refs log tree commit diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-03-07 15:41:24 +0000
committer Andres Rey <[email protected]> 2017-03-07 15:41:24 +0000
commit a0d42e6578d641f83443b863a333e7b1a9d50357 (patch)
tree 79fb320415176329a9321d76b1af8d2b1a3a3d84
parent 3b73cde640956aa08cee59a9be44d941a819b5e6 (diff)
Fuck this, we are not going to normalize blank space.
-rw-r--r-- README.md 1
-rw-r--r-- src/HTMLParser.php 7
-rw-r--r-- test/test-pages/normalize-spaces/expected-metadata.json 6
-rw-r--r-- test/test-pages/normalize-spaces/expected.html 3
-rw-r--r-- test/test-pages/normalize-spaces/source.html 35
5 files changed, 0 insertions, 52 deletions
diff --git a/README.md b/README.md
index 44b937e..77e9565 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,6 @@ If the parsing process was unsuccessful the HTMLParser will return `false`
- **weightClasses**: default value `true`, weight classes during the rating phase.
- **removeReadabilityTags**: default value `true`, remove the data-readability tags inside the nodes that are added during the rating phase.
- **fixRelativeURLs**: default value `false`, convert relative URLs to absolute. Like `/test` to `http://host/test`.
-- **normalizeSpaces**: default value `false`, normalize all spaces. Changes all consecutive spaces to one space.
- **substituteEntities**: default value `false`, disables the `substituteEntities` flag of libxml. Will avoid substituting HTML entities. Like `&acute;` to á.
- **originalURL**: default value `http://fakehost`, original URL from the article used to fix relative URLs.
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 1b1a516..1ef4489 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -99,7 +99,6 @@ class HTMLParser
'weightClasses' => true,
'removeReadabilityTags' => true,
'fixRelativeURLs' => false,
- 'normalizeSpaces' => false,
'substituteEntities' => true,
'originalURL' => 'http://fakehost',
];
@@ -341,12 +340,6 @@ class HTMLParser
}
}
- if ($this->getConfig()->getOption('normalizeSpaces')) {
- foreach ($article->getElementsByTagName('p') as $node) {
- $node->nodeValue = preg_replace($this->regexps['normalize'], ' ', $node->nodeValue);
- }
- }
-
return $article;
}
diff --git a/test/test-pages/normalize-spaces/expected-metadata.json b/test/test-pages/normalize-spaces/expected-metadata.json
deleted file mode 100644
index 3887fbb..0000000
--- a/test/test-pages/normalize-spaces/expected-metadata.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "title": "Normalize space test",
- "byline": null,
- "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",
- "readerable": false
-}
diff --git a/test/test-pages/normalize-spaces/expected.html b/test/test-pages/normalize-spaces/expected.html
deleted file mode 100644
index c81a738..0000000
--- a/test/test-pages/normalize-spaces/expected.html
+++ /dev/null
@@ -1,3 +0,0 @@
- <article>
- <p> Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
- <p> Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p> \ No newline at end of file
diff --git a/test/test-pages/normalize-spaces/source.html b/test/test-pages/normalize-spaces/source.html
deleted file mode 100644
index b230798..0000000
--- a/test/test-pages/normalize-spaces/source.html
+++ /dev/null
@@ -1,35 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
- <meta charset="utf-8"/>
- <title>Normalize space test</title>
-</head>
-<body>
- <article>
- <h1>Lorem</h1>
- <div>
- Lorem
- ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
- tab here
- incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
- proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
- </div>
- <h2>Foo</h2>
- <div>
- Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation
-
-
-
-
- ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
- proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
- </div>
- </article>
-</body>
-</html>