Added an option to remove the data-readability attributes.

author: Andres Rey <[email protected]> 2016-12-08 15:45:39 +0000
committer: Andres Rey <[email protected]> 2016-12-08 15:45:39 +0000
commit: 1e0cc1cda41d1e9bd223dfba616c4b1351b6934e (patch)
tree: cb20a027e2348a72224b0dcaecf530d9397f00da /src
parent: 2b5e1f66f0ceb8b00e1c29dc9e3cb18ce3554bbd (diff)
1 file changed, 24 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6705acd..01c581b 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -95,7 +95,8 @@ class HTMLParser
             'maxTopCandidates' => 5, // Max amount of top level candidates
             'articleByLine' => null,
             'stripUnlikelyCandidates' => true,
-            'cleanConditionally' => true
+            'cleanConditionally' => true,
+            'removeReadabilityTags' => true
         ];
 
         $this->environment = Environment::createDefaultEnvironment($defaults);
@@ -140,7 +141,7 @@ class HTMLParser
         return [
             'title' => isset($this->metadata['title']) ? $this->metadata['title'] : null,
             'author' => isset($this->metadata['author']) ? $this->metadata['author'] : null,
-            'image' => isset($this->metadata['image']) ?$this->metadata['image'] : null,
+            'image' => isset($this->metadata['image']) ? $this->metadata['image'] : null,
             'article' => $result,
             'html' => $result->C14N()
         ];
@@ -617,6 +618,8 @@ class HTMLParser
 
         $this->_cleanExtraParagraphs($article);
 
+        $this->_cleanReadabilityTags($article);
+
         // TODO Remove extra BR nodes that have a P sibling.
 
         return $article;
@@ -629,6 +632,24 @@ class HTMLParser
      *
      * @return void
      */
+    public function _cleanReadabilityTags(DOMDocument $article)
+    {
+        $enabled = $this->getConfig()->getOption('removeReadabilityTags');
+        $elements = $enabled ? $article->getElementsByTagName('*') : []; // scan nothing when the option is off
+        foreach ($elements as $element) {
+            if ($element->hasAttribute('data-readability')) {
+                $element->removeAttribute('data-readability'); // drop the attribute, keep the element
+            }
+        }
+    }
+
+    /**
+     * TODO To be moved to Readability
+     *
+     * @param DOMDocument $article
+     *
+     * @return void
+     */
     public function _cleanExtraParagraphs(DOMDocument $article)
     {
         foreach ($article->getElementsByTagName('p') as $paragraph) {
@@ -744,7 +765,7 @@ class HTMLParser
             if ($isEmbed) {
                 $attributeValues = [];
                 foreach ($item->attributes as $name => $value) {
-                    $attributeValues[] = $value;
+                    $attributeValues[] = $value->nodeValue;
                 }
                 $attributeValues = implode('|', $attributeValues);
author	Andres Rey <[email protected]>	2016-12-08 15:45:39 +0000
committer	Andres Rey <[email protected]>	2016-12-08 15:45:39 +0000
commit	1e0cc1cda41d1e9bd223dfba616c4b1351b6934e (patch)
tree	cb20a027e2348a72224b0dcaecf530d9397f00da /src
parent	2b5e1f66f0ceb8b00e1c29dc9e3cb18ce3554bbd (diff)