summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/HTMLParser.php 27
-rw-r--r-- test/HTMLParserTest.php 10
2 files changed, 31 insertions, 6 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6705acd..01c581b 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -95,7 +95,8 @@ class HTMLParser
'maxTopCandidates' => 5, // Max amount of top level candidates
'articleByLine' => null,
'stripUnlikelyCandidates' => true,
- 'cleanConditionally' => true
+ 'cleanConditionally' => true,
+ 'removeReadabilityTags' => true
];
$this->environment = Environment::createDefaultEnvironment($defaults);
@@ -140,7 +141,7 @@ class HTMLParser
return [
'title' => isset($this->metadata['title']) ? $this->metadata['title'] : null,
'author' => isset($this->metadata['author']) ? $this->metadata['author'] : null,
- 'image' => isset($this->metadata['image']) ?$this->metadata['image'] : null,
+ 'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
'article' => $result,
'html' => $result->C14N()
];
@@ -617,6 +618,8 @@ class HTMLParser
$this->_cleanExtraParagraphs($article);
+ $this->_cleanReadabilityTags($article);
+
// TODO Remove extra BR nodes that have a P sibling.
return $article;
@@ -629,6 +632,24 @@ class HTMLParser
*
* @return void
*/
+ public function _cleanReadabilityTags(DOMDocument $article)
+ {
+ if ($this->getConfig()->getOption('removeReadabilityTags')) {
+ foreach ($article->getElementsByTagName('*') as $tag) {
+ if ($tag->hasAttribute('data-readability')) {
+ $tag->removeAttribute('data-readability');
+ }
+ }
+ }
+ }
+
+ /**
+ * TODO To be moved to Readability
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ */
public function _cleanExtraParagraphs(DOMDocument $article)
{
foreach ($article->getElementsByTagName('p') as $paragraph) {
@@ -744,7 +765,7 @@ class HTMLParser
if ($isEmbed) {
$attributeValues = [];
foreach ($item->attributes as $name => $value) {
- $attributeValues[] = $value;
+ $attributeValues[] = $value->nodeValue;
}
$attributeValues = implode('|', $attributeValues);
diff --git a/test/HTMLParserTest.php b/test/HTMLParserTest.php
index 7ad4238..6d46a90 100644
--- a/test/HTMLParserTest.php
+++ b/test/HTMLParserTest.php
@@ -10,7 +10,7 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase
/**
* @dataProvider getSamplePages
*/
- public function testHTMLParserParsesHTML($html, $expectedResult)
+ public function testHTMLParserParsesHTML($html, $expectedResult, $expectedMetadata)
{
$readability = new HTMLParser();
$result = $readability->parse($html);
@@ -22,14 +22,18 @@ class HTMLParserTest extends \PHPUnit_Framework_TestCase
{
$path = pathinfo(__FILE__, PATHINFO_DIRNAME) . DIRECTORY_SEPARATOR . 'test-pages';
$testPages = scandir($path);
+ if (in_array('.DS_Store', $testPages)) {
+ unset($testPages[array_search('.DS_Store', $testPages)]);
+ }
$pages = [];
- foreach(array_slice($testPages, 2) as $testPage){
+ foreach (array_slice($testPages, 2) as $testPage) {
$source = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'source.html');
$expectedHTML = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected.html');
+ $expectedMetadata = file_get_contents($path . DIRECTORY_SEPARATOR . $testPage . DIRECTORY_SEPARATOR . 'expected-metadata.json');
- $pages[] = [$source, $expectedHTML];
+ $pages[] = [$source, $expectedHTML, $expectedMetadata];
}
return $pages;