From 8bb2fd7c67786ae8d945e1baaeb3dd8d0e9195b1 Mon Sep 17 00:00:00 2001 From: David Fricker Date: Sat, 4 Feb 2017 23:00:17 +0000 Subject: prevents an exception being thrown prevents an exception being thrown by postProcessContent when $result is a bool not a DOM object. --- src/HTMLParser.php | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index a8c28ff..0313b2a 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -183,6 +183,10 @@ class HTMLParser if (!$parseSuccessful) { return false; } + + if (!$result) { + return false; + } $result = $this->postProcessContent($result); -- cgit v1.2.3 From 16ad941225aa543a9f05ba2488e222dc9246a026 Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 10 Mar 2017 11:12:15 +0000 Subject: Apply fixes from StyleCI --- src/HTMLParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index bc9aa9f..288d8a8 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -185,7 +185,7 @@ class HTMLParser if (!$parseSuccessful) { return false; } - + if (!$result) { return false; } -- cgit v1.2.3 From bba656d6733818c450be40292a7c803b76e8973f Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Sat, 20 May 2017 12:25:44 +0100 Subject: Add new backup approach. Cloning the original DOM object is not useful to keep a backup of it because there seems to be a connection between the original object and the clone. Making a change on the original object propagates it to the backup, so the HTML must be reloaded every time the algorithm cycles. 
--- src/HTMLParser.php | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 288d8a8..9be6ff3 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -16,11 +16,6 @@ class HTMLParser */ private $dom = null; - /** - * @var DOMDocument - */ - private $backupdom = null; - /** * @var array */ @@ -107,8 +102,6 @@ class HTMLParser $this->environment->getConfig()->merge($options); - $this->dom = new DOMDocument('1.0', 'utf-8'); - // To avoid having a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); } @@ -122,15 +115,12 @@ class HTMLParser */ public function parse($html) { - $this->loadHTML($html); + $this->dom = $this->loadHTML($html); $this->removeScripts(); $this->prepDocument(); - // In case we need the original HTML to create a fake top candidate - $this->backupdom = clone $this->dom; - $this->metadata = $this->getMetadata(); $this->metadata['image'] = $this->getMainImage(); @@ -165,7 +155,8 @@ class HTMLParser $length += mb_strlen($p->textContent); } if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { - $root = $this->backupdom->getElementsByTagName('body')->item(0); + $this->dom = $this->loadHTML($html); + $root = $this->dom->getElementsByTagName('body')->item(0); if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { $this->getConfig()->setOption('stripUnlikelyCandidates', false); @@ -203,18 +194,30 @@ class HTMLParser } /** + * Creates a DOM Document object and loads the provided HTML on it. + * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. 
+ * @param string $html + * @return DOMDocument */ private function loadHTML($html) { + $dom = new DOMDocument('1.0', 'utf-8'); + if (!$this->getConfig()->getOption('substituteEntities')) { // Keep the original HTML entities - $this->dom->substituteEntities = false; + $dom->substituteEntities = false; } // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); - $this->dom->encoding = 'UTF-8'; + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $dom->encoding = 'UTF-8'; + + return $dom; } /** -- cgit v1.2.3