summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-11-27 01:44:32 +0000
committerAndres Rey <[email protected]>2016-11-27 01:44:32 +0000
commit7481a75b39db90d88158e25f27a35196460eb5bc (patch)
tree3e8cac78ebac950cb69777cf770dc9682f4b65f2 /src
parent17d67164ea2632fc5121f35602bb52386aaeaf27 (diff)
Added cleanExtraParagraphs
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php32
1 files changed, 31 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 85f5aaa..863723a 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -615,6 +615,10 @@ class HTMLParser
$this->_cleanConditionally($article, 'ul');
$this->_cleanConditionally($article, 'div');
+ $this->_cleanExtraParagraphs($article);
+
+ // TODO Remove extra BR nodes that have a P sibling.
+
return $article;
}
@@ -625,6 +629,31 @@ class HTMLParser
*
* @return void
*/
+ public function _cleanExtraParagraphs(DOMDocument $article)
+ {
+     // Purpose: strip every <p> element of $article that contains neither
+     // media (img, embed, object, iframe) nor any non-whitespace text.
+     // $article is modified in place; nothing is returned.
+     //
+     // getElementsByTagName() returns a *live* DOMNodeList. Removing a node
+     // while iterating over that list re-indexes it and makes the loop skip
+     // the element that follows each removed one (e.g. the second of two
+     // consecutive empty paragraphs). Take a static snapshot first.
+     $paragraphs = iterator_to_array($article->getElementsByTagName('p'), false);
+
+     foreach ($paragraphs as $paragraph) {
+         $imgCount = $paragraph->getElementsByTagName('img')->length;
+         $embedCount = $paragraph->getElementsByTagName('embed')->length;
+         $objectCount = $paragraph->getElementsByTagName('object')->length;
+         // At this point, nasty iframes have been removed; only embedded video ones remain.
+         $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
+         $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
+
+         // Compare against '' explicitly: a bare truthiness check would treat
+         // a paragraph whose whole text is "0" as empty and delete it.
+         $isBlank = trim($paragraph->textContent) === '';
+
+         if ($totalCount === 0 && $isBlank && $paragraph->parentNode !== null) {
+             // TODO must be done via readability
+             $paragraph->parentNode->removeChild($paragraph);
+         }
+     }
+ }
+
+ /**
+ * TODO To be moved to Readability
+ *
+ * @param DOMDocument $article
+ * @param string $tag Tag name of the elements to examine
+ *
+ * @return void
+ */
public function _cleanConditionally(DOMDocument $article, $tag)
{
if (!$this->getConfig()->getOption('cleanConditionally')) {
@@ -639,6 +668,7 @@ class HTMLParser
* without affecting the traversal.
*/
+ // TODO Check for node shifting and if the removal function is working as expected
foreach ($article->getElementsByTagName($tag) as $node) {
$node = new Readability($node);
$weight = $node->getClassWeight();
@@ -648,7 +678,7 @@ class HTMLParser
continue;
}
- if (substr_count($node->getTextContent(), ',' < 10)) {
+ if (substr_count($node->getTextContent(), ',') < 10) {
/*
* If there are not very many commas, and the number of
* non-paragraph elements is more than paragraphs or other