From f50a8f68414e608fa07f24e4e1238b3d1eb2678b Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sat, 10 Mar 2018 12:05:09 +0000 Subject: Add missing DOMEntity class --- src/Nodes/DOM/DOMDocument.php | 3 ++- src/Nodes/DOM/DOMEntity.php | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 src/Nodes/DOM/DOMEntity.php diff --git a/src/Nodes/DOM/DOMDocument.php b/src/Nodes/DOM/DOMDocument.php index a83f5b9..81e9c7d 100644 --- a/src/Nodes/DOM/DOMDocument.php +++ b/src/Nodes/DOM/DOMDocument.php @@ -20,10 +20,11 @@ class DOMDocument extends \DOMDocument $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); $this->registerNodeClass('DOMElement', DOMElement::class); + $this->registerNodeClass('DOMEntity', DOMEntity::class); + $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); $this->registerNodeClass('DOMNode', DOMNode::class); $this->registerNodeClass('DOMNotation', DOMNotation::class); $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); $this->registerNodeClass('DOMText', DOMText::class); - $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); } } diff --git a/src/Nodes/DOM/DOMEntity.php b/src/Nodes/DOM/DOMEntity.php new file mode 100644 index 0000000..8493e73 --- /dev/null +++ b/src/Nodes/DOM/DOMEntity.php @@ -0,0 +1,10 @@ + Date: Sat, 10 Mar 2018 17:40:39 +0000 Subject: Add _cleanClasses function --- src/Nodes/NodeTrait.php | 4 ++++ src/Readability.php | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index 5a3cd7f..bb848ab 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -7,6 +7,10 @@ use andreskrey\Readability\Nodes\DOM\DOMElement; use andreskrey\Readability\Nodes\DOM\DOMNode; use andreskrey\Readability\Nodes\DOM\DOMText; + +/** + * @method \DOMNode removeAttribute($name) + */ trait NodeTrait { /** diff --git a/src/Readability.php b/src/Readability.php index 91e703c..9a29313 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1479,6 +1479,28 @@ class Readability } } + /** + * Removes the class="" attribute from every element in the given + * subtree. + * + * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes + * here so no need to filter those. + * + * @param DOMDocument|DOMNode $node + * + * @return void + **/ + public function _cleanClasses($node) + { + if ($node->getAttribute('class') !== '') { + $node->removeAttribute('class'); + } + + for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) { + $this->_cleanClasses($node); + } + } + /** * @param DOMDocument $article * @@ -1532,6 +1554,8 @@ class Readability } } + $this->_cleanClasses($article); + return $article; } -- cgit v1.2.3 From 746dd0bcf5f3b0e685d842252c620c01faff19b9 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sat, 10 Mar 2018 17:49:00 +0000 Subject: Remove all class attributes from the tests --- test/test-pages/001/expected.html | 2 +- test/test-pages/002/expected.html | 70 ++-- test/test-pages/ars-1/expected.html | 56 +-- test/test-pages/bbc-1/expected.html | 22 +- test/test-pages/blogger/expected.html | 14 +- test/test-pages/breitbart/expected.html | 16 +- test/test-pages/bug-1255978/expected.html | 46 +-- test/test-pages/buzzfeed-1/expected.html | 34 +- test/test-pages/challenges/expected.html | 2 +- test/test-pages/cnet/expected.html | 6 +- test/test-pages/cnn/expected.html | 26 +- test/test-pages/daringfireball-1/expected.html | 2 +- test/test-pages/ehow-1/expected.html | 100 ++--- test/test-pages/ehow-2/expected.html | 92 ++--- test/test-pages/gmw/expected.html | 4 +- test/test-pages/heise/expected.html | 12 +- test/test-pages/herald-sun-1/expected.html | 14 +- test/test-pages/iab-1/expected.html | 12 +- test/test-pages/ietf-1/expected.html | 188 +++++----- test/test-pages/infobae/expected.html | 2 +- test/test-pages/keep-images/expected.html | 256 ++++++------- test/test-pages/lemonde-1/expected.html | 28 +- test/test-pages/lemonde-2/expected.html | 20 +- test/test-pages/liberation-1/expected.html | 2 +- .../lifehacker-post-comment-load/expected.html | 26 +- test/test-pages/lifehacker-working/expected.html | 26 +- test/test-pages/links-in-tables/expected.html | 6 +- test/test-pages/lwn-1/expected.html | 176 ++++----- test/test-pages/medium-1/expected.html | 206 +++++------ test/test-pages/medium-2/expected.html | 2 +- test/test-pages/medium-3/expected.html | 410 ++++++++++----------- test/test-pages/mozilla-1/expected.html | 64 ++-- test/test-pages/mozilla-2/expected.html | 72 ++-- test/test-pages/msn/expected.html | 12 +- .../needs-entity-normalization/expected.html | 4 +- test/test-pages/nytimes-1/expected.html | 58 +-- test/test-pages/nytimes-2/expected.html | 58 +-- test/test-pages/pixnet/expected.html | 14 +- test/test-pages/salon-1/expected.html | 2 +- test/test-pages/simplyfound-1/expected.html | 2 +- test/test-pages/social-buttons/expected.html | 2 +- .../table-style-attributes/expected.html | 2 +- test/test-pages/telegraph/expected.html | 42 +-- test/test-pages/tmz-1/expected.html | 10 +- test/test-pages/tumblr/expected.html | 4 +- test/test-pages/wapo-1/expected.html | 20 +- test/test-pages/wapo-2/expected.html | 6 +- test/test-pages/webmd-1/expected.html | 10 +- test/test-pages/webmd-2/expected.html | 2 +- test/test-pages/wikia/expected.html | 10 +- test/test-pages/wikipedia/expected.html | 354 +++++++++--------- test/test-pages/wordpress/expected.html | 8 +- test/test-pages/yahoo-1/expected.html | 70 ++-- test/test-pages/yahoo-2/expected.html | 40 +- test/test-pages/yahoo-3/expected.html | 14 +- test/test-pages/yahoo-4/expected.html | 2 +- test/test-pages/youth/expected.html | 6 +- 57 files changed, 1383 insertions(+), 1383 deletions(-) diff --git a/test/test-pages/001/expected.html b/test/test-pages/001/expected.html index c101aec..e05810f 100644 --- a/test/test-pages/001/expected.html +++ b/test/test-pages/001/expected.html @@ -13,7 +13,7 @@ help. I guess.

Actually I've only found one which provides an adapter for Mocha and actually works…

-