summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFiveFilters.org <[email protected]>2021-08-25 02:35:50 +0200
committerFiveFilters.org <[email protected]>2021-08-25 02:35:50 +0200
commitdc2da4743d4b167be4507178771a1fd1bb4159fc (patch)
tree17da6ee7391e400a024fe8f01af5e315cc82f09b
parent803131a61052fb657022b298944cdda0c5e41b07 (diff)
Allow new test output to be written to disk inside test/changed/
-rw-r--r--.gitignore3
-rw-r--r--README.md40
-rw-r--r--test/ReadabilityTest.php50
3 files changed, 78 insertions, 15 deletions
diff --git a/.gitignore b/.gitignore
index 5608600..52b9f38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
.idea/
vendor
composer.lock
-/test.* \ No newline at end of file
+/test.*
+/test/changed/ \ No newline at end of file
diff --git a/README.md b/README.md
index 61b0f85..8526a42 100644
--- a/README.md
+++ b/README.md
@@ -186,11 +186,24 @@ Readability.php uses
Readability parses all the text with DOMDocument, scans the text nodes and gives them a score, based on the number of words, links and type of element. Then it selects the highest scoring element and creates a new DOMDocument with all its siblings. Each sibling is scored to discard useless elements, like nav bars, empty nodes, etc.
+## Security
+
+If you're going to use Readability with untrusted input (whether in HTML or DOM form), we **strongly** recommend you use a sanitizer library like [HTML Purifier](https://github.com/ezyang/htmlpurifier) to avoid script injection when you use
+the output of Readability. We would also recommend using [CSP](https://developer.mozilla.org/en-US/docs/Web/HTTP/CSP) to add further defense-in-depth
+restrictions to what you allow the resulting content to do. The Firefox integration of
+reader mode uses both of these techniques itself. Sanitizing unsafe content out of the input is explicitly not something we aim to do as part of Readability itself - there are other good sanitizer libraries out there, use them!
+
+## Code porting
+
+Version 2.1.0 - Up to date with Readability.js up to [19 Nov 2018](https://github.com/mozilla/readability/commit/876c81f710711ba2afb36dd83889d4c5b4fc2743).
+
+Master branch - Up to [13 Apr 2020](https://github.com/mozilla/readability/commit/52ab9b5c8916c306a47b2119270dcdabebf9d203).
+
## Testing
-Any version of PHP installed locally should be enough to develop new features and add new test cases. If you want to be 100% sure that your change doesn't create any issues with other versions of PHP, you can use the provided Docker containers to test currently in 7.3, 7.4, and 8.0.
+Any locally installed version of PHP from 7.3 upwards should be enough to develop new features and add new test cases. If you want to be 100% sure that your change doesn't create any issues with other versions of PHP, you can use the provided Docker containers, which currently test against 7.3, 7.4, and 8.0.
-You'll need Docker and Docker Compose for this. To run all the tests in all the available versions just type the following command:
+You'll need Docker and Docker Compose for this. To run all the tests in the three PHP versions above, just type the following command:
```bash
make test-all
@@ -198,20 +211,25 @@ make test-all
This will start all the containers and run all the tests on every supported version of PHP. If you want to test against a specific version, you can use `make test-7.3`, `make test-7.4`, or `make test-8`.
-If you really want to test against every supported version of PHP and every supported version of libxml, run `test-all-versions`. This will test against PHP versions 7.3 to 8 and libxml versions 2.9.4, 2.9.5, 2.9.10, and 2.9.12. Normally you won't need to do this unless you think you've found a bug on an specific version of libxml.
+### Different versions of libxml
-## Security
+If you want to test against supported versions of PHP *AND* multiple versions of libxml, run `test-all-versions`. This will test against PHP versions 7.3 to 8 and libxml versions 2.9.4, 2.9.5, 2.9.10, and 2.9.12. Normally you won't need to do this unless you think you've found a bug in a specific version of libxml.
-If you're going to use Readability with untrusted input (whether in HTML or DOM form), we **strongly** recommend you use a sanitizer library like [HTML Purifier](https://github.com/ezyang/htmlpurifier) to avoid script injection when you use
-the output of Readability. We would also recommend using [CSP](https://developer.mozilla.org/en-US/docs/Web/HTTP/CSP) to add further defense-in-depth
-restrictions to what you allow the resulting content to do. The Firefox integration of
-reader mode uses both of these techniques itself. Sanitizing unsafe content out of the input is explicitly not something we aim to do as part of Readability itself - there are other good sanitizer libraries out there, use them!
+### Updating the expected tests
-## Code porting
+If you've made an improvement to the code, you'll probably want to examine the Readability.php output for the test cases here. To do that, run the following command first from the root of the project folder:
-Version 2.1.0 - Up to date with Readability.js up to [19 Nov 2018](https://github.com/mozilla/readability/commit/876c81f710711ba2afb36dd83889d4c5b4fc2743).
+ docker-compose up -d php-7.4-libxml-2.9.10
-Master branch - Up to [13 Apr 2020](https://github.com/mozilla/readability/commit/52ab9b5c8916c306a47b2119270dcdabebf9d203).
+You should now have a Docker container running with the project root folder mapped to /app/ inside the container (see `docker-compose.yml`). Any changes to these files will be accessible from inside the container from now on.
+
+Next, create a folder called changed/ inside test/ (i.e. test/changed/), then run the following command to run the test suite:
+
+ docker-compose exec -e output-changes=1 -e output-diff=1 php-7.4-libxml-2.9.10 php /app/vendor/phpunit/phpunit/phpunit --configuration /app/phpunit.xml
+
+The two environment variables (`output-changes=1` and `output-diff=1`) will result in new output for any failing test (along with a diff of changes) being written to the changed/ folder.
+
+If you're happy with the changes, remove `output-diff=1` and the diff files will no longer be written, making it easier to copy the new expected output over to the corresponding locations in test/test-pages/.
## License
diff --git a/test/ReadabilityTest.php b/test/ReadabilityTest.php
index 842d259..dafd540 100644
--- a/test/ReadabilityTest.php
+++ b/test/ReadabilityTest.php
@@ -40,6 +40,19 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$expected_no_whitespace = preg_replace($from, $to, $testPage->getExpectedHTML());
$readability_no_whitespace = preg_replace($from, $to, $readability->getContent());
+ if (getenv('output-changes') && $expected_no_whitespace !== $readability_no_whitespace) {
+ @mkdir(__DIR__.'/changed/'.$testPage->getSlug());
+ $new_expected = __DIR__.'/changed/'.$testPage->getSlug().'/expected.html';
+ $old_expected = __DIR__.'/test-pages/'.$testPage->getSlug().'/expected.html';
+ //file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/readability.html', $readability_no_whitespace);
+ //file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/expected-current.html', $expected_no_whitespace);
+ file_put_contents($new_expected, $readability->getContent());
+ if (getenv('output-diff')) {
+ file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/diff-expected.txt', shell_exec(sprintf('diff -u -d %s %s', $old_expected, $new_expected)));
+ }
+
+ }
+
$this->assertSame($expected_no_whitespace, $readability_no_whitespace, 'Parsed text does not match the expected one.');
//$this->assertSame($testPage->getExpectedHTML(), $readability->getContent(), 'Parsed text does not match the expected one.');
@@ -68,6 +81,26 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = new Readability($configuration);
$readability->parse($testPage->getSourceHTML());
+ $metadata = [
+ 'Author' => $readability->getAuthor(),
+ 'Direction' => $readability->getDirection(),
+ 'Excerpt' => $readability->getExcerpt(),
+ 'Image' => $readability->getImage(),
+ 'Title' => $readability->getTitle(),
+ 'SiteName' => $readability->getSiteName()
+ ];
+
+ if (getenv('output-changes') && (array)$testPage->getExpectedMetadata() !== $metadata) {
+ @mkdir(__DIR__.'/changed/'.$testPage->getSlug());
+ $new_expected = __DIR__.'/changed/'.$testPage->getSlug().'/expected-metadata.json';
+ $old_expected = __DIR__.'/test-pages/'.$testPage->getSlug().'/expected-metadata.json';
+ //file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/expected-metadata-current.json', json_encode($testPage->getExpectedMetadata(), JSON_PRETTY_PRINT));
+ file_put_contents($new_expected, json_encode((object)$metadata, JSON_PRETTY_PRINT));
+ if (getenv('output-diff')) {
+ file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/diff-expected-metadata.txt', shell_exec(sprintf('diff -u -d %s %s', $old_expected, $new_expected)));
+ }
+ }
+
$this->assertSame($testPage->getExpectedMetadata()->Author, $readability->getAuthor(), 'Parsed Author does not match expected value.');
$this->assertSame($testPage->getExpectedMetadata()->Direction, $readability->getDirection(), 'Parsed Direction does not match expected value.');
$this->assertSame($testPage->getExpectedMetadata()->Excerpt, $readability->getExcerpt(), 'Parsed Excerpt does not match expected value.');
@@ -95,6 +128,17 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = new Readability($configuration);
$readability->parse($testPage->getSourceHTML());
+ if (getenv('output-changes') && $testPage->getExpectedImages() !== array_values($readability->getImages())) {
+ @mkdir(__DIR__.'/changed/'.$testPage->getSlug());
+ $new_expected = __DIR__.'/changed/'.$testPage->getSlug().'/expected-images.json';
+ $old_expected = __DIR__.'/test-pages/'.$testPage->getSlug().'/expected-images.json';
+ //file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/expected-images-current.json', json_encode($testPage->getExpectedImages(), JSON_PRETTY_PRINT));
+ file_put_contents($new_expected, json_encode(array_values($readability->getImages()), JSON_PRETTY_PRINT));
+ if (getenv('output-diff')) {
+ file_put_contents(__DIR__.'/changed/'.$testPage->getSlug().'/diff-expected-images.txt', shell_exec(sprintf('diff -u -d %s %s', $old_expected, $new_expected)));
+ }
+ }
+
$this->assertSame($testPage->getExpectedImages(), array_values($readability->getImages()));
}
@@ -113,9 +157,9 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$slug = $testPage;
$source = file_get_contents($testCasePath . 'source.html');
- $expectedHTML = file_get_contents($testCasePath . 'expected.html');
- $expectedImages = json_decode(file_get_contents($testCasePath . 'expected-images.json'), true);
- $expectedMetadata = json_decode(file_get_contents($testCasePath . 'expected-metadata.json'));
+ $expectedHTML = file_exists($testCasePath . 'expected.html') ? file_get_contents($testCasePath . 'expected.html') : '';
+ $expectedImages = file_exists($testCasePath . 'expected-images.json') ? json_decode(file_get_contents($testCasePath . 'expected-images.json'), true) : [];
+ $expectedMetadata = file_exists($testCasePath . 'expected-metadata.json') ? json_decode(file_get_contents($testCasePath . 'expected-metadata.json')) : (object)[];
$configuration = file_exists($testCasePath . 'config.json') ? json_decode(file_get_contents($testCasePath . 'config.json'), true) : [];
yield $testPage => [new TestPage($slug, $configuration, $source, $expectedHTML, $expectedImages, $expectedMetadata)];