summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-07 20:55:21 +0000
committerAndres Rey <[email protected]>2017-11-07 20:55:21 +0000
commit840cac3acba875d98b245f9d512ef6eebd8f50fd (patch)
tree5b546834dc0fd390d3f7bab2a936770b8944d0f2 /src
parente03d5591b499158a85e21614b1fd5a7f511e5fa8 (diff)
Clean style attributes inside tags
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php34
1 files changed, 32 insertions, 2 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 27dd8e5..273dab9 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -1001,6 +1001,9 @@ class HTMLParser
*/
public function prepArticle(DOMDocument $article)
{
+ $this->_cleanStyles($article);
+ $this->_clean($article, 'style');
+
// Check for data tables before we continue, to avoid removing items in
// those tables, which will often be isolated even though they're
// visually linked to other content-ful elements (text, images, etc.).
@@ -1014,8 +1017,6 @@ class HTMLParser
$this->_clean($article, 'h1');
$this->_clean($article, 'footer');
- // Readability.js cleans styles on prepDocument but we do it here.
- $this->_clean($article, 'style');
/*
* If there is only one h2 and its text content substantially equals article title,
@@ -1176,6 +1177,35 @@ class HTMLParser
}
/**
+ * Remove the style attribute from $node (when it is an element) and from
+ * every element beneath it.
+ * TODO: To be moved to Readability
+ *
+ * @param $node \DOMDocument|\DOMNode
+ **/
+ public function _cleanStyles($node)
+ {
+     // Remove any root styles, if we're able. Only element nodes carry
+     // attributes; documents, text nodes and comments are left untouched.
+     if ($node->nodeType === XML_ELEMENT_NODE) {
+         $node->removeAttribute('style');
+     }
+
+     // firstChild is null for a childless node, in which case the loop body
+     // never runs and null is never dereferenced or handed to another call.
+     $cur = $node->firstChild;
+     while ($cur !== null) {
+         if ($cur->nodeType === XML_ELEMENT_NODE) {
+             // The recursive call strips this element's own style attribute
+             // first and then descends into its children.
+             $this->_cleanStyles($cur);
+         }
+
+         $cur = $cur->nextSibling;
+     }
+ }
+
+ /**
* TODO To be moved to Readability.
*
* @param DOMDocument $article