From 8a266f2cae5dce8d1fa39c40caac8400406898bb Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 1 Dec 2017 00:09:15 +0000 Subject: Add ParseException --- src/ParseException.php | 9 ++++ src/Readability.php | 119 +++++++++++++++++++++---------------------------- 2 files changed, 61 insertions(+), 67 deletions(-) create mode 100644 src/ParseException.php (limited to 'src') diff --git a/src/ParseException.php b/src/ParseException.php new file mode 100644 index 0000000..335851f --- /dev/null +++ b/src/ParseException.php @@ -0,0 +1,9 @@ +configuration = $configuration; } + /** + * Creates a DOM Document object and loads the provided HTML on it. + * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. + * + * @param string $html + * + * @return DOMDocument + */ + private function loadHTML($html) + { + // To avoid having a gazillion of errors on malformed HTMLs + libxml_use_internal_errors(true); + + $dom = new DOMDocument('1.0', 'utf-8'); + + if (!$this->configuration->getSubstituteEntities()) { + // Keep the original HTML entities + $dom->substituteEntities = false; + } + + if ($this->configuration->getNormalizeEntities()) { + // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content + $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); + } + + if ($this->configuration->getSummonCthulhu()) { + $html = preg_replace('/]*>([\s\S]*?)<\/script>/', '', $html); + } + + // Prepend the XML tag to avoid having issues with special characters. Should be harmless. + $dom->loadHTML('' . $html); + $dom->encoding = 'UTF-8'; + + $this->removeScripts($dom); + + $this->prepDocument($dom); + + return $dom; + } + /** * Main parse function * * @param $html + * @throws ParseException * * @return array|bool */ @@ -129,10 +166,9 @@ class Readability // Checking for minimum HTML to work with. if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) { - return false; + throw new ParseException('Invalid or incomplete HTML.'); } - $parseSuccessful = true; while (true) { $root = $root->firstChild; @@ -148,8 +184,6 @@ class Readability * finding the -right- content. */ - // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff - // that doesn't really count to the quality of the result. $length = 0; foreach ($result->getElementsByTagName('p') as $p) { $length += mb_strlen($p->textContent); @@ -165,18 +199,13 @@ class Readability } elseif ($this->configuration->getCleanConditionally()) { $this->configuration->setCleanConditionally(false); } else { - $parseSuccessful = false; - break; + throw new ParseException('Could not parse text.'); } } else { break; } } - if (!$parseSuccessful) { - return false; - } - $result = $this->postProcessContent($result); $this->setContent($result->C14N()); @@ -184,50 +213,6 @@ class Readability return true; } - /** - * Creates a DOM Document object and loads the provided HTML on it. - * - * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) - * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs - * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both - * objects and ruining the backup. - * - * @param string $html - * - * @return DOMDocument - */ - private function loadHTML($html) - { - // To avoid having a gazillion of errors on malformed HTMLs - libxml_use_internal_errors(true); - - $dom = new DOMDocument('1.0', 'utf-8'); - - if (!$this->configuration->getSubstituteEntities()) { - // Keep the original HTML entities - $dom->substituteEntities = false; - } - - if ($this->configuration->getNormalizeEntities()) { - // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content - $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'); - } - - if ($this->configuration->getSummonCthulhu()) { - $html = preg_replace('/]*>([\s\S]*?)<\/script>/', '', $html); - } - - // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $dom->loadHTML('' . $html); - $dom->encoding = 'UTF-8'; - - $this->removeScripts($dom); - - $this->prepDocument($dom); - - return $dom; - } - /** * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties */ -- cgit v1.2.3