summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/NodeClass/DOMAttr.php | 8
-rw-r--r-- src/NodeClass/DOMCdataSection.php | 8
-rw-r--r-- src/NodeClass/DOMCharacterData.php | 8
-rw-r--r-- src/NodeClass/DOMComment.php | 8
-rw-r--r-- src/NodeClass/DOMDocumentFragment.php | 8
-rw-r--r-- src/NodeClass/DOMDocumentType.php | 8
-rw-r--r-- src/NodeClass/DOMElement.php | 8
-rw-r--r-- src/NodeClass/DOMNotation.php | 8
-rw-r--r-- src/NodeClass/DOMText.php | 8
-rw-r--r-- src/Nodes/DOMAttr.php | 8
-rw-r--r-- src/Nodes/DOMCdataSection.php | 8
-rw-r--r-- src/Nodes/DOMCharacterData.php | 8
-rw-r--r-- src/Nodes/DOMComment.php | 8
-rw-r--r-- src/Nodes/DOMDocument.php (renamed from src/NodeClass/DOMDocument.php) | 4
-rw-r--r-- src/Nodes/DOMDocumentFragment.php | 8
-rw-r--r-- src/Nodes/DOMDocumentType.php | 8
-rw-r--r-- src/Nodes/DOMElement.php | 8
-rw-r--r-- src/Nodes/DOMNode.php | 14
-rw-r--r-- src/Nodes/DOMNotation.php | 8
-rw-r--r-- src/Nodes/DOMProcessingInstruction.php (renamed from src/NodeClass/DOMProcessingInstruction.php) | 4
-rw-r--r-- src/Nodes/DOMText.php | 8
-rw-r--r-- src/Nodes/NodeTrait.php (renamed from src/NodeClass/NodeClassTrait.php) | 6
-rw-r--r-- src/Nodes/NodeUtility.php (renamed from src/NodeUtility.php) | 6
-rw-r--r-- src/ParseException.php | 9
-rw-r--r-- src/Readability.php | 104
25 files changed, 151 insertions, 140 deletions
diff --git a/src/NodeClass/DOMAttr.php b/src/NodeClass/DOMAttr.php
deleted file mode 100644
index ea8672d..0000000
--- a/src/NodeClass/DOMAttr.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMAttr extends \DOMAttr
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMCdataSection.php b/src/NodeClass/DOMCdataSection.php
deleted file mode 100644
index 438ac99..0000000
--- a/src/NodeClass/DOMCdataSection.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMCdataSection extends \DOMCdataSection
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMCharacterData.php b/src/NodeClass/DOMCharacterData.php
deleted file mode 100644
index 480980e..0000000
--- a/src/NodeClass/DOMCharacterData.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMCharacterData extends \DOMCharacterData
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMComment.php b/src/NodeClass/DOMComment.php
deleted file mode 100644
index 416460b..0000000
--- a/src/NodeClass/DOMComment.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMComment extends \DOMComment
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMDocumentFragment.php b/src/NodeClass/DOMDocumentFragment.php
deleted file mode 100644
index cc8b753..0000000
--- a/src/NodeClass/DOMDocumentFragment.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMDocumentFragment extends \DOMDocumentFragment
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMDocumentType.php b/src/NodeClass/DOMDocumentType.php
deleted file mode 100644
index 13f7829..0000000
--- a/src/NodeClass/DOMDocumentType.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMDocumentType extends \DOMDocumentType
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMElement.php b/src/NodeClass/DOMElement.php
deleted file mode 100644
index a7dc36a..0000000
--- a/src/NodeClass/DOMElement.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMElement extends \DOMElement
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMNotation.php b/src/NodeClass/DOMNotation.php
deleted file mode 100644
index 3e09bbc..0000000
--- a/src/NodeClass/DOMNotation.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMNotation extends \DOMNotation
-{
- use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMText.php b/src/NodeClass/DOMText.php
deleted file mode 100644
index 80ef6c8..0000000
--- a/src/NodeClass/DOMText.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMText extends \DOMText
-{
- use NodeClassTrait;
-}
diff --git a/src/Nodes/DOMAttr.php b/src/Nodes/DOMAttr.php
new file mode 100644
index 0000000..c31517a
--- /dev/null
+++ b/src/Nodes/DOMAttr.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMAttr extends \DOMAttr
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMCdataSection.php b/src/Nodes/DOMCdataSection.php
new file mode 100644
index 0000000..f3a56f0
--- /dev/null
+++ b/src/Nodes/DOMCdataSection.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMCdataSection extends \DOMCdataSection
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMCharacterData.php b/src/Nodes/DOMCharacterData.php
new file mode 100644
index 0000000..e5087d9
--- /dev/null
+++ b/src/Nodes/DOMCharacterData.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMCharacterData extends \DOMCharacterData
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMComment.php b/src/Nodes/DOMComment.php
new file mode 100644
index 0000000..fd2b8b5
--- /dev/null
+++ b/src/Nodes/DOMComment.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMComment extends \DOMComment
+{
+ use NodeTrait;
+}
diff --git a/src/NodeClass/DOMDocument.php b/src/Nodes/DOMDocument.php
index 98b1215..510cdf2 100644
--- a/src/NodeClass/DOMDocument.php
+++ b/src/Nodes/DOMDocument.php
@@ -1,10 +1,10 @@
<?php
-namespace andreskrey\Readability\NodeClass;
+namespace andreskrey\Readability\Nodes;
class DOMDocument extends \DOMDocument
{
- use NodeClassTrait;
+ use NodeTrait;
public function __construct($version, $encoding)
{
diff --git a/src/Nodes/DOMDocumentFragment.php b/src/Nodes/DOMDocumentFragment.php
new file mode 100644
index 0000000..d5f013e
--- /dev/null
+++ b/src/Nodes/DOMDocumentFragment.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMDocumentFragment extends \DOMDocumentFragment
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMDocumentType.php b/src/Nodes/DOMDocumentType.php
new file mode 100644
index 0000000..81e426b
--- /dev/null
+++ b/src/Nodes/DOMDocumentType.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMDocumentType extends \DOMDocumentType
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMElement.php b/src/Nodes/DOMElement.php
new file mode 100644
index 0000000..6ca0a29
--- /dev/null
+++ b/src/Nodes/DOMElement.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMElement extends \DOMElement
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMNode.php b/src/Nodes/DOMNode.php
new file mode 100644
index 0000000..79a352b
--- /dev/null
+++ b/src/Nodes/DOMNode.php
@@ -0,0 +1,14 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+/**
+ * Class DOMNode
+ *
+ * @method getAttribute($attribute)
+ * @package andreskrey\Readability\Nodes
+ */
+class DOMNode extends \DOMNode
+{
+ use NodeTrait;
+}
diff --git a/src/Nodes/DOMNotation.php b/src/Nodes/DOMNotation.php
new file mode 100644
index 0000000..a4802e0
--- /dev/null
+++ b/src/Nodes/DOMNotation.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMNotation extends \DOMNotation
+{
+ use NodeTrait;
+}
diff --git a/src/NodeClass/DOMProcessingInstruction.php b/src/Nodes/DOMProcessingInstruction.php
index 0c615c6..bd80997 100644
--- a/src/NodeClass/DOMProcessingInstruction.php
+++ b/src/Nodes/DOMProcessingInstruction.php
@@ -1,8 +1,8 @@
<?php
-namespace andreskrey\Readability\NodeClass;
+namespace andreskrey\Readability\Nodes;
class DOMProcessingInstruction extends \DOMProcessingInstruction
{
- use NodeClassTrait;
+ use NodeTrait;
}
diff --git a/src/Nodes/DOMText.php b/src/Nodes/DOMText.php
new file mode 100644
index 0000000..43d2ba9
--- /dev/null
+++ b/src/Nodes/DOMText.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMText extends \DOMText
+{
+ use NodeTrait;
+}
diff --git a/src/NodeClass/NodeClassTrait.php b/src/Nodes/NodeTrait.php
index ae3eeb1..5847178 100644
--- a/src/NodeClass/NodeClassTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -1,10 +1,8 @@
<?php
-namespace andreskrey\Readability\NodeClass;
+namespace andreskrey\Readability\Nodes;
-use andreskrey\Readability\NodeUtility;
-
-trait NodeClassTrait
+trait NodeTrait
{
/**
* Content score of the node. Used to determine the value of the content.
diff --git a/src/NodeUtility.php b/src/Nodes/NodeUtility.php
index 7fbdd45..8938a49 100644
--- a/src/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -1,10 +1,6 @@
<?php
-namespace andreskrey\Readability;
-
-use andreskrey\Readability\NodeClass\DOMDocument;
-use andreskrey\Readability\NodeClass\DOMElement;
-use andreskrey\Readability\NodeClass\DOMNode;
+namespace andreskrey\Readability\Nodes;
/**
* Class NodeUtility.
diff --git a/src/ParseException.php b/src/ParseException.php
new file mode 100644
index 0000000..335851f
--- /dev/null
+++ b/src/ParseException.php
@@ -0,0 +1,9 @@
+<?php
+
+namespace andreskrey\Readability;
+
+
+class ParseException extends \Exception
+{
+
+}
diff --git a/src/Readability.php b/src/Readability.php
index db0774a..60a314a 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,10 +2,11 @@
namespace andreskrey\Readability;
-use andreskrey\Readability\NodeClass\DOMDocument;
-use andreskrey\Readability\NodeClass\DOMElement;
-use andreskrey\Readability\NodeClass\DOMNode;
-use andreskrey\Readability\NodeClass\DOMText;
+use andreskrey\Readability\Nodes\DOMDocument;
+use andreskrey\Readability\Nodes\DOMElement;
+use andreskrey\Readability\Nodes\DOMNode;
+use andreskrey\Readability\Nodes\DOMText;
+use andreskrey\Readability\Nodes\NodeUtility;
/**
* Class Readability.
@@ -101,15 +102,57 @@ class Readability
public function __construct(Configuration $configuration)
{
$this->configuration = $configuration;
+ }
+ /**
+ * Creates a DOM Document object and loads the provided HTML on it.
+ *
+ * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
+ * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
+ * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
+ * objects and ruining the backup.
+ *
+ * @param string $html
+ *
+ * @return DOMDocument
+ */
+ private function loadHTML($html)
+ {
// To avoid having a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);
+
+ $dom = new DOMDocument('1.0', 'utf-8');
+
+ if (!$this->configuration->getSubstituteEntities()) {
+ // Keep the original HTML entities
+ $dom->substituteEntities = false;
+ }
+
+ if ($this->configuration->getNormalizeEntities()) {
+ // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
+ $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+ }
+
+ if ($this->configuration->getSummonCthulhu()) {
+ $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
+ }
+
+ // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
+ $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
+ $dom->encoding = 'UTF-8';
+
+ $this->removeScripts($dom);
+
+ $this->prepDocument($dom);
+
+ return $dom;
}
/**
* Main parse function.
*
* @param $html
+ * @throws ParseException
*
* @return array|bool
*/
@@ -123,10 +166,9 @@ class Readability
// Checking for minimum HTML to work with.
if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
- return false;
+ throw new ParseException('Invalid or incomplete HTML.');
}
- $parseSuccessful = true;
while (true) {
$root = $root->firstChild;
@@ -142,8 +184,6 @@ class Readability
* finding the -right- content.
*/
- // TODO Better way to count resulting text. Textcontent usually has alt titles and that stuff
- // that doesn't really count to the quality of the result.
$length = 0;
foreach ($result->getElementsByTagName('p') as $p) {
$length += mb_strlen($p->textContent);
@@ -159,18 +199,13 @@ class Readability
} elseif ($this->configuration->getCleanConditionally()) {
$this->configuration->setCleanConditionally(false);
} else {
- $parseSuccessful = false;
- break;
+ throw new ParseException('Could not parse text.');
}
} else {
break;
}
}
- if (!$parseSuccessful) {
- return false;
- }
-
$result = $this->postProcessContent($result);
$this->setContent($result->C14N());
@@ -179,47 +214,6 @@ class Readability
}
/**
- * Creates a DOM Document object and loads the provided HTML on it.
- *
- * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
- * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
- * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
- * objects and ruining the backup.
- *
- * @param string $html
- *
- * @return DOMDocument
- */
- private function loadHTML($html)
- {
- $dom = new DOMDocument('1.0', 'utf-8');
-
- if (!$this->configuration->getSubstituteEntities()) {
- // Keep the original HTML entities
- $dom->substituteEntities = false;
- }
-
- if ($this->configuration->getNormalizeEntities()) {
- // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
- $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
- }
-
- if ($this->configuration->getSummonCthulhu()) {
- $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
- }
-
- // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
- $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
- $dom->encoding = 'UTF-8';
-
- $this->removeScripts($dom);
-
- $this->prepDocument($dom);
-
- return $dom;
- }
-
- /**
* Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
*/
private function getMetadata()