summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-05-20 12:25:44 +0100
committerAndres Rey <[email protected]>2017-05-20 12:25:44 +0100
commitbba656d6733818c450be40292a7c803b76e8973f (patch)
treef159f90586374d7b4b04e5344a375e9d0cbfcfb5 /src
parentf5dcfedf9f9f20c6a98739a4d5d23704de89bf26 (diff)
Add new backup approach. Cloning the original DOM object is not useful for keeping a backup of it, because there seems to be a connection between the original object and the clone. Making a change to the original object propagates it to the backup as well, so the HTML must be reloaded every time the algorithm cycles.
Diffstat (limited to 'src')
-rw-r--r--src/HTMLParser.php33
1 files changed, 18 insertions, 15 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 288d8a8..9be6ff3 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -17,11 +17,6 @@ class HTMLParser
private $dom = null;
/**
- * @var DOMDocument
- */
- private $backupdom = null;
-
- /**
* @var array
*/
private $metadata = [];
@@ -107,8 +102,6 @@ class HTMLParser
$this->environment->getConfig()->merge($options);
- $this->dom = new DOMDocument('1.0', 'utf-8');
-
// To avoid having a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);
}
@@ -122,15 +115,12 @@ class HTMLParser
*/
public function parse($html)
{
- $this->loadHTML($html);
+ $this->dom = $this->loadHTML($html);
$this->removeScripts();
$this->prepDocument();
- // In case we need the original HTML to create a fake top candidate
- $this->backupdom = clone $this->dom;
-
$this->metadata = $this->getMetadata();
$this->metadata['image'] = $this->getMainImage();
@@ -165,7 +155,8 @@ class HTMLParser
$length += mb_strlen($p->textContent);
}
if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) {
- $root = $this->backupdom->getElementsByTagName('body')->item(0);
+ $this->dom = $this->loadHTML($html);
+ $root = $this->dom->getElementsByTagName('body')->item(0);
if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
$this->getConfig()->setOption('stripUnlikelyCandidates', false);
@@ -203,18 +194,30 @@ class HTMLParser
}
/**
+ * Creates a DOM Document object and loads the provided HTML on it.
+ *
+ * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
+ * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
+ * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
+ * objects and ruining the backup.
+ *
* @param string $html
+ * @return DOMDocument
*/
private function loadHTML($html)
{
+ $dom = new DOMDocument('1.0', 'utf-8');
+
if (!$this->getConfig()->getOption('substituteEntities')) {
// Keep the original HTML entities
- $this->dom->substituteEntities = false;
+ $dom->substituteEntities = false;
}
// Prepend the XML tag to avoid having issues with special characters. Should be harmless.
- $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
- $this->dom->encoding = 'UTF-8';
+ $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
+ $dom->encoding = 'UTF-8';
+
+ return $dom;
}
/**