summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andrew Dolgov <[email protected]> 2024-04-03 17:38:00 +0000
committer Andrew Dolgov <[email protected]> 2024-04-03 17:38:00 +0000
commit 37c0c2ce76aa90e8adafbd5cb0b0332a54df1523 (patch)
tree e6b44c6309edadf6e933caf81321740a0ace79a1
parent 8ac5abdd497b37d2be4833bcf18d6819bba4d9c9 (diff)
parent 29c099cb9c1a9e2f6871068513fc321828da31d2 (diff)
Merge branch 'feature/ignore-more-elements' into 'master' (HEAD, master)
Add config support for ignoring additional elements. See merge request main/libraries/readability-php!1
-rw-r--r-- README.md 1
-rw-r--r-- src/Configuration.php 25
-rw-r--r-- src/Readability.php 8
3 files changed, 30 insertions, 4 deletions
diff --git a/README.md b/README.md
index c910714..547178c 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,7 @@ Then you pass this Configuration object to Readability. The following options ar
- **KeepClasses**: default value `false`, which removes all `class="..."` attribute values from HTML elements.
- **Parser**: default value `html5`, which uses HTML5-PHP for parsing. Set to `libxml` to use that instead (not recommended for modern HTML documents).
- **SummonCthulhu**: default value `false`, remove all `<script>` nodes via regex. This is not ideal as it might break things, but if you've set the parser to libxml (see above), it might be the only solution to [libxml problems with unescaped javascript](https://github.com/fivefilters/readability.php#known-libxml-parsing-issues).
+- **ExtraIgnoredElements**: default value `[]`, additional DOM elements that should be ignored. `noscript` and `script` are always ignored.
### Debug log
diff --git a/src/Configuration.php b/src/Configuration.php
index 6d1f03f..0659a82 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -84,6 +84,11 @@ class Configuration
protected $disableJSONLD = false;
/**
+ * @var array<string>
+ */
+ protected $extraIgnoredElements = [];
+
+ /**
* Configuration constructor.
*
* @param array $params
@@ -420,4 +425,24 @@ class Configuration
return $this;
}
+
+ /**
+ * @return array<string>
+ */
+ public function getExtraIgnoredElements()
+ {
+ return $this->extraIgnoredElements;
+ }
+
+ /**
+ * @param array<string>
+ *
+ * @return $this
+ */
+ public function setExtraIgnoredElements($extraIgnoredElements)
+ {
+ $this->extraIgnoredElements = $extraIgnoredElements;
+
+ return $this;
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index 6407a92..c5318a4 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -342,7 +342,7 @@ class Readability
// Extract JSON-LD metadata before removing scripts
$this->jsonld = $this->configuration->getDisableJSONLD() ? [] : $this->getJSONLD($dom);
- $this->removeScripts($dom);
+ $this->removeIgnoredElements($dom);
$this->prepDocument($dom);
@@ -1188,13 +1188,13 @@ class Readability
}
/**
- * Removes all the scripts of the html.
+ * Removes elements that should be ignored.
*
* @param DOMDocument $dom
*/
- private function removeScripts(DOMDocument $dom)
+ private function removeIgnoredElements(DOMDocument $dom)
{
- foreach (['script', 'noscript'] as $tag) {
+ foreach (['noscript', 'script', ...$this->configuration->getExtraIgnoredElements()] as $tag) {
$nodes = $dom->getElementsByTagName($tag);
foreach (iterator_to_array($nodes) as $node) {
NodeUtility::removeNode($node);