1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
<?php
namespace andreskrey\Readability\Test;
use andreskrey\Readability\Configuration;
use andreskrey\Readability\Readability;
/**
 * Runs Readability against every fixture directory found in "test-pages"
 * (next to this file) and compares the output with the stored expectations.
 */
class ReadabilityTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Asserts that the 'html' entry of the parse result equals the fixture's expected.html.
     *
     * @dataProvider getSamplePages
     *
     * @param string|false $html             Contents of the fixture's source.html.
     * @param string|false $expectedResult   Contents of the fixture's expected.html.
     * @param string|false $expectedMetadata Contents of expected-metadata.json (not used by this test).
     * @param mixed        $config           Decoded config.json overrides, or a falsy value when absent.
     * @param string|false $expectedImages   Contents of expected-images.json (not used by this test).
     */
    public function testReadabilityParsesHTML($html, $expectedResult, $expectedMetadata, $config, $expectedImages)
    {
        $result = $this->parsePage($html, $config);

        $this->assertEquals($expectedResult, $result['html']);
    }

    /**
     * Asserts that the JSON-encoded 'images' entry of the parse result equals
     * the fixture's expected-images.json.
     *
     * @dataProvider getSamplePages
     *
     * @param string|false $html             Contents of the fixture's source.html.
     * @param string|false $expectedResult   Contents of expected.html (not used by this test).
     * @param string|false $expectedMetadata Contents of expected-metadata.json (not used by this test).
     * @param mixed        $config           Decoded config.json overrides, or a falsy value when absent.
     * @param string|false $expectedImages   Contents of the fixture's expected-images.json.
     */
    public function testHTMLParserParsesImages($html, $expectedResult, $expectedMetadata, $config, $expectedImages)
    {
        $result = $this->parsePage($html, $config);

        $this->assertEquals($expectedImages, json_encode($result['images']));
    }

    /**
     * Data provider: builds one dataset per fixture directory inside "test-pages".
     *
     * Each dataset is keyed by the directory name and has the shape
     * [source.html, expected.html, expected-metadata.json, config, expected-images.json],
     * where every element except config is the raw file contents.
     *
     * @return array
     *
     * @throws \RuntimeException When the "test-pages" directory cannot be listed.
     */
    public function getSamplePages()
    {
        $path = __DIR__ . DIRECTORY_SEPARATOR . 'test-pages';

        $entries = scandir($path);
        if ($entries === false) {
            throw new \RuntimeException('Unable to read test pages directory: ' . $path);
        }

        $pages = [];
        foreach ($entries as $testPage) {
            // Skip "." and ".." by name rather than by position: scandir() sorts
            // entries, and names starting with characters such as "!" or "-"
            // sort before ".", so the dot entries are not guaranteed to be first.
            if ($testPage === '.' || $testPage === '..') {
                continue;
            }

            // Only directories are fixtures; this also ignores stray files
            // such as .DS_Store.
            if (!is_dir($path . DIRECTORY_SEPARATOR . $testPage)) {
                continue;
            }

            $pageDir = $path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR;

            $source = file_get_contents($pageDir . 'source.html');
            $expectedHTML = file_get_contents($pageDir . 'expected.html');
            $expectedMetadata = file_get_contents($pageDir . 'expected-metadata.json');
            $expectedImages = file_get_contents($pageDir . 'expected-images.json');

            // config.json is optional; when present and non-empty it is decoded
            // to an associative array of option overrides.
            $config = null;
            $configFile = $pageDir . 'config.json';
            if (file_exists($configFile)) {
                $config = file_get_contents($configFile);
                if ($config) {
                    $config = json_decode($config, true);
                }
            }

            $pages[$testPage] = [$source, $expectedHTML, $expectedMetadata, $config, $expectedImages];
        }

        return $pages;
    }

    /**
     * Builds a Configuration from the default options plus the fixture's
     * overrides, then parses the given HTML with Readability.
     *
     * @param string|false $html   HTML document to parse.
     * @param mixed        $config Option overrides keyed by option name; ignored when falsy.
     *
     * @return mixed Whatever Readability::parse() returns; the tests read its 'html' and 'images' entries.
     */
    private function parsePage($html, $config)
    {
        $options = [
            'originalURL' => 'http://fakehost/test/test.html',
            'fixRelativeURLs' => true,
            'substituteEntities' => true,
        ];

        if ($config) {
            // Fixture-specific values win over the defaults above.
            $options = array_merge($options, $config);
        }

        $configuration = new Configuration();
        foreach ($options as $key => $value) {
            // Each option is applied through the method named "set" + option key.
            $setter = 'set' . $key;
            $configuration->$setter($value);
        }

        $readability = new Readability($configuration);

        return $readability->parse($html);
    }
}
|