From 8a266f2cae5dce8d1fa39c40caac8400406898bb Mon Sep 17 00:00:00 2001
From: Andres Rey <andreskrey@gmail.com>
Date: Fri, 1 Dec 2017 00:09:15 +0000
Subject: Add ParseException

---
 src/ParseException.php |   9 ++++
 src/Readability.php    | 119 +++++++++++++++++++++----------------------------
 2 files changed, 61 insertions(+), 67 deletions(-)
 create mode 100644 src/ParseException.php

(limited to 'src')

diff --git a/src/ParseException.php b/src/ParseException.php
new file mode 100644
index 0000000..335851f
--- /dev/null
+++ b/src/ParseException.php
@@ -0,0 +1,9 @@
+<?php
+
+namespace andreskrey\Readability;
+
+/** Raised by Readability when the HTML is invalid or incomplete, or when its text could not be parsed. */
+class ParseException extends \Exception
+{
+    // No members of its own: everything is inherited from \Exception.
+}
diff --git a/src/Readability.php b/src/Readability.php
index b149f96..fd6e66b 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,19 +2,11 @@
 
 namespace andreskrey\Readability;
 
-use andreskrey\Readability\NodeClass\DOMDocument;
-use andreskrey\Readability\NodeClass\DOMAttr;
-use andreskrey\Readability\NodeClass\DOMCdataSection;
-use andreskrey\Readability\NodeClass\DOMCharacterData;
-use andreskrey\Readability\NodeClass\DOMComment;
-use andreskrey\Readability\NodeClass\DOMDocumentFragment;
-use andreskrey\Readability\NodeClass\DOMDocumentType;
-use andreskrey\Readability\NodeClass\DOMElement;
-use andreskrey\Readability\NodeClass\DOMNode;
-use andreskrey\Readability\NodeClass\DOMNotation;
-use andreskrey\Readability\NodeClass\DOMProcessingInstruction;
-use andreskrey\Readability\NodeClass\DOMText;
-use andreskrey\Readability\NodeClass\NodeClassTrait;
+use andreskrey\Readability\Nodes\DOMDocument;
+use andreskrey\Readability\Nodes\DOMElement;
+use andreskrey\Readability\Nodes\DOMNode;
+use andreskrey\Readability\Nodes\DOMText;
+use andreskrey\Readability\Nodes\NodeUtility;
 
 /**
  * Class Readability
@@ -112,10 +104,55 @@ class Readability
         $this->configuration = $configuration;
     }
 
+    /**
+     * Builds a fresh DOMDocument from the provided HTML, then runs removeScripts() and prepDocument() on it.
+     *
+     * Used for the first load of Readability and for subsequent reloads (when disabling flags and rescanning the
+     * text). Previous versions of Readability called this once and cloned the DOM to keep a backup. That caused bugs
+     * because a cloned DOM object keeps a relation with the original one, so changes were applied to both objects
+     * and the backup was ruined.
+     *
+     * @param string $html Raw HTML to load.
+     *
+     * @return DOMDocument The loaded document, after removeScripts() and prepDocument() have processed it.
+     */
+    private function loadHTML($html)
+    {
+        // Buffer libxml errors internally so malformed HTML does not emit a flood of warnings.
+        libxml_use_internal_errors(true);
+
+        $dom = new DOMDocument('1.0', 'utf-8');
+
+        if (!$this->configuration->getSubstituteEntities()) {
+            // Keep the original HTML entities
+            $dom->substituteEntities = false;
+        }
+
+        if ($this->configuration->getNormalizeEntities()) {
+            // Replace UTF-8 characters with the HTML entity equivalent. Useful to fix HTML with mixed content.
+            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+        }
+
+        // HTML tag names are case-insensitive, so the pattern carries the "i" flag to also catch <SCRIPT> blocks.
+        if ($this->configuration->getSummonCthulhu()) {
+            $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);
+        }
+
+        // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
+        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
+        $dom->encoding = 'UTF-8';
+
+        $this->removeScripts($dom);
+        $this->prepDocument($dom);
+
+        return $dom;
+    }
+
     /**
      * Main parse function
      *
      * @param $html
+     * @throws ParseException
      *
      * @return array|bool
      */
@@ -129,10 +166,9 @@ class Readability
 
         // Checking for minimum HTML to work with.
         if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
-            return false;
+            throw new ParseException('Invalid or incomplete HTML.');
         }
 
-        $parseSuccessful = true;
         while (true) {
             $root = $root->firstChild;
 
@@ -148,8 +184,6 @@ class Readability
              * finding the -right- content.
              */
 
-            // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
-            // that doesn't really count to the quality of the result.
             $length = 0;
             foreach ($result->getElementsByTagName('p') as $p) {
                 $length += mb_strlen($p->textContent);
@@ -165,18 +199,13 @@ class Readability
                 } elseif ($this->configuration->getCleanConditionally()) {
                     $this->configuration->setCleanConditionally(false);
                 } else {
-                    $parseSuccessful = false;
-                    break;
+                    throw new ParseException('Could not parse text.');
                 }
             } else {
                 break;
             }
         }
 
-        if (!$parseSuccessful) {
-            return false;
-        }
-
         $result = $this->postProcessContent($result);
 
         $this->setContent($result->C14N());
@@ -184,50 +213,6 @@ class Readability
         return true;
     }
 
-    /**
-     * Creates a DOM Document object and loads the provided HTML on it.
-     *
-     * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
-     * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
-     * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
-     * objects and ruining the backup.
-     *
-     * @param string $html
-     *
-     * @return DOMDocument
-     */
-    private function loadHTML($html)
-    {
-        // To avoid having a gazillion of errors on malformed HTMLs
-        libxml_use_internal_errors(true);
-
-        $dom = new DOMDocument('1.0', 'utf-8');
-
-        if (!$this->configuration->getSubstituteEntities()) {
-            // Keep the original HTML entities
-            $dom->substituteEntities = false;
-        }
-
-        if ($this->configuration->getNormalizeEntities()) {
-            // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
-            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
-        }
-
-        if ($this->configuration->getSummonCthulhu()) {
-            $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
-        }
-
-        // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
-        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
-        $dom->encoding = 'UTF-8';
-
-        $this->removeScripts($dom);
-
-        $this->prepDocument($dom);
-
-        return $dom;
-    }
-
     /**
      * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties
      */
-- 
cgit v1.2.3