diff options
author | Andres Rey <[email protected]> | 2017-12-01 21:23:40 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-12-01 21:23:40 +0000 |
commit | 8e6dcdcdb48695fae6a6e57a8d4ddd3762c3c47a (patch) | |
tree | fee37562fef901e3009c889f54469ee1fb0f41b7 /src | |
parent | 7c8ee690e3c33c6a25670fba203ae14d1e1bea6e (diff) |
Move load function below parse function
Diffstat (limited to 'src')
-rw-r--r-- | src/Readability.php | 88 |
1 file changed, 44 insertions, 44 deletions
diff --git a/src/Readability.php b/src/Readability.php index f2617a4..c8c7c05 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -105,50 +105,6 @@ class Readability } /** - * Creates a DOM Document object and loads the provided HTML on it. - * - * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) - * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs - * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both - * objects and ruining the backup. - * - * @param string $html - * - * @return DOMDocument - */ - private function loadHTML($html) - { - // To avoid having a gazillion of errors on malformed HTMLs - libxml_use_internal_errors(true); - - $dom = new DOMDocument('1.0', 'utf-8'); - - if (!$this->configuration->getSubstituteEntities()) { - // Keep the original HTML entities - $dom->substituteEntities = false; - } - - if ($this->configuration->getNormalizeEntities()) { - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - - if ($this->configuration->getSummonCthulhu()) { - $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); - } - - // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $dom->loadHTML('<?xml encoding="UTF-8">' . $html); - $dom->encoding = 'UTF-8'; - - $this->removeScripts($dom); - - $this->prepDocument($dom); - - return $dom; - } - - /** * Main parse function. * * @param $html @@ -214,6 +170,50 @@ class Readability } /** + * Creates a DOM Document object and loads the provided HTML on it. 
+ * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. + * + * @param string $html + * + * @return DOMDocument + */ + private function loadHTML($html) + { + // To avoid throwing a gazillion of errors on malformed HTMLs + libxml_use_internal_errors(true); + + $dom = new DOMDocument('1.0', 'utf-8'); + + if (!$this->configuration->getSubstituteEntities()) { + // Keep the original HTML entities + $dom->substituteEntities = false; + } + + if ($this->configuration->getNormalizeEntities()) { + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + + if ($this->configuration->getSummonCthulhu()) { + $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); + } + + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $dom->encoding = 'UTF-8'; + + $this->removeScripts($dom); + + $this->prepDocument($dom); + + return $dom; + } + + /** * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties. */ private function getMetadata() |