summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-12-01 00:09:15 +0000
committer Andres Rey <[email protected]> 2017-12-01 00:09:15 +0000
commit 8a266f2cae5dce8d1fa39c40caac8400406898bb (patch)
tree ade21455ed4703f1ae36e221ac8f10d8b7d3487c /src
parent 7d34c6ac98e952782ab8665eaab774e1a5d29f5d (diff)
Add ParseException
Diffstat (limited to 'src')
-rw-r--r-- src/ParseException.php 9
-rw-r--r-- src/Readability.php 119
2 files changed, 61 insertions, 67 deletions
diff --git a/src/ParseException.php b/src/ParseException.php
new file mode 100644
index 0000000..335851f
--- /dev/null
+++ b/src/ParseException.php
@@ -0,0 +1,9 @@
+<?php
+
+namespace andreskrey\Readability;
+
+
+class ParseException extends \Exception
+{
+
+}
diff --git a/src/Readability.php b/src/Readability.php
index b149f96..fd6e66b 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,19 +2,11 @@
namespace andreskrey\Readability;
-use andreskrey\Readability\NodeClass\DOMDocument;
-use andreskrey\Readability\NodeClass\DOMAttr;
-use andreskrey\Readability\NodeClass\DOMCdataSection;
-use andreskrey\Readability\NodeClass\DOMCharacterData;
-use andreskrey\Readability\NodeClass\DOMComment;
-use andreskrey\Readability\NodeClass\DOMDocumentFragment;
-use andreskrey\Readability\NodeClass\DOMDocumentType;
-use andreskrey\Readability\NodeClass\DOMElement;
-use andreskrey\Readability\NodeClass\DOMNode;
-use andreskrey\Readability\NodeClass\DOMNotation;
-use andreskrey\Readability\NodeClass\DOMProcessingInstruction;
-use andreskrey\Readability\NodeClass\DOMText;
-use andreskrey\Readability\NodeClass\NodeClassTrait;
+use andreskrey\Readability\Nodes\DOMDocument;
+use andreskrey\Readability\Nodes\DOMElement;
+use andreskrey\Readability\Nodes\DOMNode;
+use andreskrey\Readability\Nodes\DOMText;
+use andreskrey\Readability\Nodes\NodeUtility;
/**
* Class Readability
@@ -113,9 +105,54 @@ class Readability
}
/**
+ * Creates a DOM Document object and loads the provided HTML on it.
+ *
+ * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
+ * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
+ * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
+ * objects and ruining the backup.
+ *
+ * @param string $html
+ *
+ * @return DOMDocument
+ */
+ private function loadHTML($html)
+ {
+ // To avoid having a gazillion of errors on malformed HTMLs
+ libxml_use_internal_errors(true);
+
+ $dom = new DOMDocument('1.0', 'utf-8');
+
+ if (!$this->configuration->getSubstituteEntities()) {
+ // Keep the original HTML entities
+ $dom->substituteEntities = false;
+ }
+
+ if ($this->configuration->getNormalizeEntities()) {
+ // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
+ $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+ }
+
+ if ($this->configuration->getSummonCthulhu()) {
+ $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
+ }
+
+ // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
+ $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
+ $dom->encoding = 'UTF-8';
+
+ $this->removeScripts($dom);
+
+ $this->prepDocument($dom);
+
+ return $dom;
+ }
+
+ /**
* Main parse function
*
* @param $html
+ * @throws ParseException
*
* @return array|bool
*/
@@ -129,10 +166,9 @@ class Readability
// Checking for minimum HTML to work with.
if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
- return false;
+ throw new ParseException('Invalid or incomplete HTML.');
}
- $parseSuccessful = true;
while (true) {
$root = $root->firstChild;
@@ -148,8 +184,6 @@ class Readability
* finding the -right- content.
*/
- // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
- // that doesn't really count to the quality of the result.
$length = 0;
foreach ($result->getElementsByTagName('p') as $p) {
$length += mb_strlen($p->textContent);
@@ -165,18 +199,13 @@ class Readability
} elseif ($this->configuration->getCleanConditionally()) {
$this->configuration->setCleanConditionally(false);
} else {
- $parseSuccessful = false;
- break;
+ throw new ParseException('Could not parse text.');
}
} else {
break;
}
}
- if (!$parseSuccessful) {
- return false;
- }
-
$result = $this->postProcessContent($result);
$this->setContent($result->C14N());
@@ -185,50 +214,6 @@ class Readability
}
/**
- * Creates a DOM Document object and loads the provided HTML on it.
- *
- * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
- * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
- * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
- * objects and ruining the backup.
- *
- * @param string $html
- *
- * @return DOMDocument
- */
- private function loadHTML($html)
- {
- // To avoid having a gazillion of errors on malformed HTMLs
- libxml_use_internal_errors(true);
-
- $dom = new DOMDocument('1.0', 'utf-8');
-
- if (!$this->configuration->getSubstituteEntities()) {
- // Keep the original HTML entities
- $dom->substituteEntities = false;
- }
-
- if ($this->configuration->getNormalizeEntities()) {
- // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
- $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
- }
-
- if ($this->configuration->getSummonCthulhu()) {
- $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
- }
-
- // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
- $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
- $dom->encoding = 'UTF-8';
-
- $this->removeScripts($dom);
-
- $this->prepDocument($dom);
-
- return $dom;
- }
-
- /**
* Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties
*/
private function getMetadata()