summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Andres Rey <[email protected]> 2017-11-11 19:35:12 +0000
committer: Andres Rey <[email protected]> 2017-11-11 19:35:49 +0000
commit: 539668e447940d0b8ea12b863916ce367137c9a6 (patch)
tree: b97b92b1eef67ff2119a08e66c1a957366dde9a0 /src
parent: 731a102a40efda34b180cfb18bd8daccfd404799 (diff)
_cleanMatchedNodes to remove nodes based on regex during final cleanup
Diffstat (limited to 'src')
-rw-r--r-- src/HTMLParser.php 27
1 files changed, 27 insertions, 0 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index d98ac68..5d1aa48 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -1021,6 +1021,11 @@ class HTMLParser
$this->_clean($article, 'h1');
$this->_clean($article, 'footer');
+ // Clean out elements that have "share" in their id/class combinations from the final top candidates,
+ // which means we don't remove the top candidates themselves even if they have "share".
+ foreach ($article->childNodes as $child) {
+ $this->_cleanMatchedNodes($child, '/share/i');
+ }
/*
* If there is only one h2 and its text content substantially equals article title,
@@ -1207,6 +1212,28 @@ class HTMLParser
}
/**
+ * Remove nodes whose combined class/id string matches a regular expression.
+ *
+ * Wraps $node in a Readability object, then walks forward from it with
+ * getNextNode() until the walk yields a falsy value or reaches the
+ * end-of-search marker (the result of getNextNode($node, true)). For each
+ * visited node the string "<class> <id>" (class attribute, one space, id
+ * attribute) is tested against $regex. On a match the node is handed to
+ * removeAndGetNext() and the walk resumes from whatever that call returns;
+ * otherwise the walk advances with getNextNode().
+ *
+ * The first node tested is the one returned by getNextNode($node), so the
+ * starting node itself is not tested here.
+ *
+ * NOTE(review): the end-of-search marker is presumably the first node outside
+ * $node's subtree (second argument true looks like "ignore self and kids",
+ * as in Mozilla's Readability.js) -- confirm against Readability::getNextNode().
+ *
+ * TODO To be moved to Readability.
+ *
+ * @param mixed $node Starting node; it is wrapped in a Readability object before
+ * the walk. The visible caller passes each entry of $article->childNodes
+ * (presumably DOMNode instances -- confirm).
+ * @param string $regex PCRE pattern, delimiters and flags included (the visible
+ * caller passes '/share/i'), matched against the "<class> <id>" string.
+ * @return void
+ **/
+ public function _cleanMatchedNodes($node, $regex)
+ {
+ $node = new Readability($node); // the traversal helpers below are called on this wrapper
+ $endOfSearchMarkerNode = $node->getNextNode($node, true); // stop marker: the walk ends when it reaches this node
+ $next = $node->getNextNode($node); // first node to test
+ while ($next && $next !== $endOfSearchMarkerNode) {
+ if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) {
+ $next = $next->removeAndGetNext($next); // drop the match and continue from the node the helper returns
+ } else {
+ $next = $next->getNextNode($next); // no match: advance to the next node
+ }
+ }
+ }
+
+ /**
* TODO To be moved to Readability.
*
* @param DOMDocument $article