summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-11-24 09:42:46 +0000
committer Andres Rey <[email protected]> 2016-11-24 09:42:46 +0000
commit 14cac0b9a7f746c271680c3d6b5b76594267f8dd (patch)
tree 83754776961024e2d3f2e4780edf5d6e9cf75584 /src
parent 8a24f09d3da047c3b489e7789db7da217ec398fa (diff)
Initial approach to prepArticle
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 57
1 files changed, 57 insertions, 0 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 49c220d..ff8d17c 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -574,6 +574,8 @@ class HTMLParser
}
}
+ $articleContent = $this->prepArticle($articleContent);
+
if ($hasContent) {
return $articleContent;
} else {
@@ -582,6 +584,61 @@ class HTMLParser
}
/**
+  * Prepare extracted article content by stripping junk elements from it.
+  *
+  * Runs _clean() on the given document once for each of the tag names
+  * "object", "embed", "h1" and "footer", in that order, and then hands the
+  * document back to the caller.
+  *
+  * @param DOMDocument $article Document holding the article content; it is
+  *                             passed to _clean() for each junk tag name.
+  *
+  * @return DOMDocument The same document instance that was passed in.
+  */
+ public function prepArticle(DOMDocument $article)
+ {
+     // TODO: conditional cleaning ("clean conditionally") is not implemented yet.
+     // Clean out junk from the article content, one tag name at a time.
+     $junkTags = ['object', 'embed', 'h1', 'footer'];
+     foreach ($junkTags as $junkTag) {
+         $this->_clean($article, $junkTag);
+     }
+
+     return $article;
+ }
+
+ /**
+  * Clean a document of all elements of type "tag".
+  *
+  * For the embed-like tags ("object", "embed", "iframe") an element is kept
+  * when either its attribute values or its canonicalized markup match
+  * $this->regexps['videos'] (the pattern is defined elsewhere in the class;
+  * it is meant to let youtube/vimeo videos through. People love movies.)
+  * Every other matching element is handed to $this->removeNode().
+  *
+  * @param DOMDocument $article Document to remove the elements from.
+  * @param string      $tag     Tag name of the elements to clean.
+  *
+  * @return void
+  */
+ public function _clean(DOMDocument $article, $tag)
+ {
+     $isEmbed = in_array($tag, ['object', 'embed', 'iframe'], true);
+
+     $nodes = $article->getElementsByTagName($tag);
+
+     // The list returned by getElementsByTagName() is live: removing a node
+     // re-indexes it, so a forward foreach would skip elements. Walking it
+     // backwards keeps the indexes of the not-yet-visited items stable.
+     for ($i = $nodes->length - 1; $i >= 0; $i--) {
+         $item = $nodes->item($i);
+
+         // Allow youtube and vimeo videos through as people usually want to see those.
+         if ($isEmbed) {
+             // Collect the string value of every attribute; the attribute
+             // nodes themselves cannot be joined with implode().
+             $attributeValues = [];
+             foreach ($item->attributes as $attribute) {
+                 $attributeValues[] = $attribute->value;
+             }
+             $attributeValues = implode('|', $attributeValues);
+
+             // First, check the element's attributes to see if any of them contain youtube or vimeo.
+             if (preg_match($this->regexps['videos'], $attributeValues)) {
+                 continue;
+             }
+
+             // Then check the markup of this element (including its children) for the same.
+             if (preg_match($this->regexps['videos'], $item->C14N())) {
+                 continue;
+             }
+         }
+
+         $this->removeNode($item);
+     }
+ }
+
+ /**
* Checks if the node is a byline.
*
* @param Readability $node