diff options
author | Andrew Dolgov <[email protected]> | 2024-04-03 17:38:00 +0000 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2024-04-03 17:38:00 +0000 |
commit | 37c0c2ce76aa90e8adafbd5cb0b0332a54df1523 (patch) | |
tree | e6b44c6309edadf6e933caf81321740a0ace79a1 | |
parent | 8ac5abdd497b37d2be4833bcf18d6819bba4d9c9 (diff) | |
parent | 29c099cb9c1a9e2f6871068513fc321828da31d2 (diff) |
Add config support for ignoring additional elements.
See merge request main/libraries/readability-php!1
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | src/Configuration.php | 25 | ||||
-rw-r--r-- | src/Readability.php | 8 |
3 files changed, 30 insertions, 4 deletions
```diff
@@ -117,6 +117,7 @@ Then you pass this Configuration object to Readability. The following options ar
 - **KeepClasses**: default value `false`, which removes all `class="..."` attribute values from HTML elements.
 - **Parser**: default value `html5`, which uses HTML5-PHP for parsing. Set to `libxml` to use that instead (not recommended for modern HTML documents).
 - **SummonCthulhu**: default value `false`, remove all `<script>` nodes via regex. This is not ideal as it might break things, but if you've set the parser to libxml (see above), it might be the only solution to [libxml problems with unescaped javascript](https://github.com/fivefilters/readability.php#known-libxml-parsing-issues).
+- **ExtraIgnoredElements**: default value `[]`, additional DOM elements that should be ignored. `noscript` and `script` are always ignored.
 
 ### Debug log
 
diff --git a/src/Configuration.php b/src/Configuration.php
index 6d1f03f..0659a82 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -84,6 +84,11 @@ class Configuration
     protected $disableJSONLD = false;
 
     /**
+     * @var array<string>
+     */
+    protected $extraIgnoredElements = [];
+
+    /**
      * Configuration constructor.
      *
      * @param array $params
@@ -420,4 +425,24 @@ class Configuration
 
         return $this;
     }
+
+    /**
+     * @return array<string>
+     */
+    public function getExtraIgnoredElements()
+    {
+        return $this->extraIgnoredElements;
+    }
+
+    /**
+     * @param array<string>
+     *
+     * @return $this
+     */
+    public function setExtraIgnoredElements($extraIgnoredElements)
+    {
+        $this->extraIgnoredElements = $extraIgnoredElements;
+
+        return $this;
+    }
 }
diff --git a/src/Readability.php b/src/Readability.php
index 6407a92..c5318a4 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -342,7 +342,7 @@ class Readability
         // Extract JSON-LD metadata before removing scripts
         $this->jsonld = $this->configuration->getDisableJSONLD() ? [] : $this->getJSONLD($dom);
 
-        $this->removeScripts($dom);
+        $this->removeIgnoredElements($dom);
 
         $this->prepDocument($dom);
 
@@ -1188,13 +1188,13 @@ class Readability
     }
 
     /**
-     * Removes all the scripts of the html.
+     * Removes elements that should be ignored.
      *
      * @param DOMDocument $dom
      */
-    private function removeScripts(DOMDocument $dom)
+    private function removeIgnoredElements(DOMDocument $dom)
     {
-        foreach (['script', 'noscript'] as $tag) {
+        foreach (['noscript', 'script', ...$this->configuration->getExtraIgnoredElements()] as $tag) {
             $nodes = $dom->getElementsByTagName($tag);
             foreach (iterator_to_array($nodes) as $node) {
                 NodeUtility::removeNode($node);
```