Added cleanExtraParagraphs

author: Andres Rey <[email protected]> 2016-11-27 01:44:32 +0000
committer: Andres Rey <[email protected]> 2016-11-27 01:44:32 +0000
commit: 7481a75b39db90d88158e25f27a35196460eb5bc (patch)
tree: 3e8cac78ebac950cb69777cf770dc9682f4b65f2 /src
parent: 17d67164ea2632fc5121f35602bb52386aaeaf27 (diff)
1 files changed, 31 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 85f5aaa..863723a 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -615,6 +615,10 @@ class HTMLParser
         $this->_cleanConditionally($article, 'ul');
         $this->_cleanConditionally($article, 'div');
 
+        $this->_cleanExtraParagraphs($article);
+
+        // TODO Remove extra BR nodes that have a P sibling.
+
         return $article;
     }
 
@@ -625,6 +629,31 @@ class HTMLParser
      *
      * @return void
      */
+    public function _cleanExtraParagraphs(DOMDocument $article)
+    {
+        // getElementsByTagName() returns a live list: snapshot it so removing a node cannot skip its neighbour.
+        foreach (iterator_to_array($article->getElementsByTagName('p')) as $paragraph) {
+            $imgCount = $paragraph->getElementsByTagName('img')->length;
+            $embedCount = $paragraph->getElementsByTagName('embed')->length;
+            $objectCount = $paragraph->getElementsByTagName('object')->length;
+            // At this point, nasty iframes have been removed; only embedded video ones remain.
+            $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
+            $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
+            // Compare with '' explicitly: the string "0" is falsy in PHP, but a paragraph containing it is not empty.
+            if ($totalCount === 0 && trim($paragraph->textContent) === '') {
+                // TODO must be done via readability
+                $paragraph->parentNode->removeChild($paragraph);
+            }
+        }
+    }
+
+    /**
+     * TODO To be moved to Readability
+     *
+     * @param DOMDocument $article
+     *
+     * @return void
+     */
     public function _cleanConditionally(DOMDocument $article, $tag)
     {
         if (!$this->getConfig()->getOption('cleanConditionally')) {
@@ -639,6 +668,7 @@ class HTMLParser
          * without effecting the traversal.
          */
 
+        // TODO Check for node shifting and if the removal function is working as expected
         foreach ($article->getElementsByTagName($tag) as $node) {
             $node = new Readability($node);
             $weight = $node->getClassWeight();
@@ -648,7 +678,7 @@ class HTMLParser
                 continue;
             }
 
-            if (substr_count($node->getTextContent(), ',' < 10)) {
+            if (substr_count($node->getTextContent(), ',') < 10) {
                 /*
                  * If there are not very many commas, and the number of
                  * non-paragraph elements is more than paragraphs or other
author	Andres Rey <[email protected]>	2016-11-27 01:44:32 +0000
committer	Andres Rey <[email protected]>	2016-11-27 01:44:32 +0000
commit	7481a75b39db90d88158e25f27a35196460eb5bc (patch)
tree	3e8cac78ebac950cb69777cf770dc9682f4b65f2 /src
parent	17d67164ea2632fc5121f35602bb52386aaeaf27 (diff)