From bba656d6733818c450be40292a7c803b76e8973f Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sat, 20 May 2017 12:25:44 +0100 Subject: Add new backup approach. Cloning the original DOM object is not useful to keep a backup of it because there seems to be a connection between the original object and the clone. Making a change on the original object propagates it to the backup one, so the HTML must be reloaded every time the algorithm cycles. --- src/HTMLParser.php | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 288d8a8..9be6ff3 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -16,11 +16,6 @@ class HTMLParser */ private $dom = null; - /** - * @var DOMDocument - */ - private $backupdom = null; - /** * @var array */ @@ -107,8 +102,6 @@ class HTMLParser $this->environment->getConfig()->merge($options); - $this->dom = new DOMDocument('1.0', 'utf-8'); - // To avoid having a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); } @@ -122,15 +115,12 @@ class HTMLParser */ public function parse($html) { - $this->loadHTML($html); + $this->dom = $this->loadHTML($html); $this->removeScripts(); $this->prepDocument(); - // In case we need the original HTML to create a fake top candidate - $this->backupdom = clone $this->dom; - $this->metadata = $this->getMetadata(); $this->metadata['image'] = $this->getMainImage(); @@ -165,7 +155,8 @@ class HTMLParser $length += mb_strlen($p->textContent); } if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { - $root = $this->backupdom->getElementsByTagName('body')->item(0); + $this->dom = $this->loadHTML($html); + $root = $this->dom->getElementsByTagName('body')->item(0); if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { $this->getConfig()->setOption('stripUnlikelyCandidates', false); @@ -203,18 +194,30 @@ class HTMLParser } /** + * Creates a DOM Document object and 
loads the provided HTML on it. + * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. + * * @param string $html + * @return DOMDocument */ private function loadHTML($html) { + $dom = new DOMDocument('1.0', 'utf-8'); + if (!$this->getConfig()->getOption('substituteEntities')) { // Keep the original HTML entities - $this->dom->substituteEntities = false; + $dom->substituteEntities = false; } // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $this->dom->loadHTML('' . $html); - $this->dom->encoding = 'UTF-8'; + $dom->loadHTML('' . $html); + $dom->encoding = 'UTF-8'; + + return $dom; } /** -- cgit v1.2.3