From 8e6dcdcdb48695fae6a6e57a8d4ddd3762c3c47a Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 1 Dec 2017 21:23:40 +0000 Subject: Move load function below parse function --- src/Readability.php | 88 ++++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 44 deletions(-) (limited to 'src') diff --git a/src/Readability.php b/src/Readability.php index f2617a4..c8c7c05 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -104,50 +104,6 @@ class Readability $this->configuration = $configuration; } - /** - * Creates a DOM Document object and loads the provided HTML on it. - * - * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) - * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs - * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both - * objects and ruining the backup. - * - * @param string $html - * - * @return DOMDocument - */ - private function loadHTML($html) - { - // To avoid having a gazillion of errors on malformed HTMLs - libxml_use_internal_errors(true); - - $dom = new DOMDocument('1.0', 'utf-8'); - - if (!$this->configuration->getSubstituteEntities()) { - // Keep the original HTML entities - $dom->substituteEntities = false; - } - - if ($this->configuration->getNormalizeEntities()) { - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - - if ($this->configuration->getSummonCthulhu()) { - $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); - } - - // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $dom->loadHTML('<?xml encoding="UTF-8">' . $html); - $dom->encoding = 'UTF-8'; - - $this->removeScripts($dom); - - $this->prepDocument($dom); - - return $dom; - } - /** * Main parse function. 
* @@ -213,6 +169,50 @@ class Readability return true; } + /** + * Creates a DOM Document object and loads the provided HTML on it. + * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. + * + * @param string $html + * + * @return DOMDocument + */ + private function loadHTML($html) + { + // To avoid throwing a gazillion of errors on malformed HTMLs + libxml_use_internal_errors(true); + + $dom = new DOMDocument('1.0', 'utf-8'); + + if (!$this->configuration->getSubstituteEntities()) { + // Keep the original HTML entities + $dom->substituteEntities = false; + } + + if ($this->configuration->getNormalizeEntities()) { + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + + if ($this->configuration->getSummonCthulhu()) { + $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html); + } + + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $dom->encoding = 'UTF-8'; + + $this->removeScripts($dom); + + $this->prepDocument($dom); + + return $dom; + } + /** * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties. */ -- cgit v1.2.3