summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-11-24 22:04:24 +0000
committer Andres Rey <[email protected]> 2016-11-24 22:04:24 +0000
commit 7a3716fcbf696bc92ba8914f32e1dfa85301b9ac (patch)
tree 8fdb81520a1c083d91fedfa3ae49a2a796d609b6 /src
parent 14cac0b9a7f746c271680c3d6b5b76594267f8dd (diff)
Progress over prepArticle
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 32
1 files changed, 31 insertions, 1 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index ff8d17c..7706b57 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -599,6 +599,15 @@ class HTMLParser
$this->_clean($article, 'h1');
$this->_clean($article, 'footer');
+ // If there is only one h2, they are probably using it as a header
+ // and not a subheader, so remove it since we already have a header.
+ if ($article->getElementsByTagName('h2')->length === 1){
+ $this->_clean($article, 'h2');
+ }
+
+ $this->_clean($article, 'iframe');
+
+
return $article;
}
@@ -617,7 +626,6 @@ class HTMLParser
foreach ($article->getElementsByTagName($tag) as $item) {
// Allow youtube and vimeo videos through as people usually want to see those.
if ($isEmbed) {
-
$attributeValues = [];
foreach ($item->attributes as $name => $value) {
$attributeValues[] = $value;
@@ -639,6 +647,28 @@ class HTMLParser
}
/**
+ * Clean out spurious headers from an Element. Checks things like classnames and link density.
+ *
+ * @param Element
+ * @return void
+ **/
+// public function _cleanHeaders($article) {
+// for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
+// this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+// return this._getClassWeight(header) < 0;
+// });
+// }
+// }
+
+ /**
+ * Detaches the given node from its parent node.
+ *
+ * If the node has no parent, nothing happens.
+ *
+ * @param \DOMNode $node Node to remove from its parent.
+ * @return void
+ **/
+ public function removeNode(\DOMNode $node)
+ {
+ $parent = $node->parentNode;
+ // Only call removeChild() when a parent actually exists.
+ if ($parent) {
+ $parent->removeChild($node);
+ }
+ }
+
+ /**
* Checks if the node is a byline.
*
* @param Readability $node