summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Andres Rey <[email protected]> 2016-12-08 15:45:39 +0000
committer: Andres Rey <[email protected]> 2016-12-08 15:45:39 +0000
commit: 1e0cc1cda41d1e9bd223dfba616c4b1351b6934e (patch)
tree: cb20a027e2348a72224b0dcaecf530d9397f00da /src
parent: 2b5e1f66f0ceb8b00e1c29dc9e3cb18ce3554bbd (diff)
Added option to remove the data-readability tags.
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 27
1 files changed, 24 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6705acd..01c581b 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -95,7 +95,8 @@ class HTMLParser
'maxTopCandidates' => 5, // Max amount of top level candidates
'articleByLine' => null,
'stripUnlikelyCandidates' => true,
- 'cleanConditionally' => true
+ 'cleanConditionally' => true,
+ 'removeReadabilityTags' => true
];
$this->environment = Environment::createDefaultEnvironment($defaults);
@@ -140,7 +141,7 @@ class HTMLParser
return [
'title' => isset($this->metadata['title']) ? $this->metadata['title'] : null,
'author' => isset($this->metadata['author']) ? $this->metadata['author'] : null,
- 'image' => isset($this->metadata['image']) ?$this->metadata['image'] : null,
+ 'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
'article' => $result,
'html' => $result->C14N()
];
@@ -617,6 +618,8 @@ class HTMLParser
$this->_cleanExtraParagraphs($article);
+ $this->_cleanReadabilityTags($article);
+
// TODO Remove extra BR nodes that have a P sibling.
return $article;
@@ -629,6 +632,24 @@ class HTMLParser
*
* @return void
*/
+ public function _cleanReadabilityTags(DOMDocument $article)
+ {
+ if ($this->getConfig()->getOption('removeReadabilityTags')) {
+ foreach ($article->getElementsByTagName('*') as $tag) {
+ if ($tag->hasAttribute('data-readability')) {
+ $tag->removeAttribute('data-readability');
+ }
+ }
+ }
+ }
+
+ /**
+ * TODO To be moved to Readability
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ */
public function _cleanExtraParagraphs(DOMDocument $article)
{
foreach ($article->getElementsByTagName('p') as $paragraph) {
@@ -744,7 +765,7 @@ class HTMLParser
if ($isEmbed) {
$attributeValues = [];
foreach ($item->attributes as $name => $value) {
- $attributeValues[] = $value;
+ $attributeValues[] = $value->nodeValue;
}
$attributeValues = implode('|', $attributeValues);