diff options
-rw-r--r-- | src/HTMLParser.php | 33 |
1 files changed, 18 insertions, 15 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index 288d8a8..9be6ff3 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -17,11 +17,6 @@ class HTMLParser private $dom = null; /** - * @var DOMDocument - */ - private $backupdom = null; - - /** * @var array */ private $metadata = []; @@ -107,8 +102,6 @@ class HTMLParser $this->environment->getConfig()->merge($options); - $this->dom = new DOMDocument('1.0', 'utf-8'); - // To avoid having a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); } @@ -122,15 +115,12 @@ class HTMLParser */ public function parse($html) { - $this->loadHTML($html); + $this->dom = $this->loadHTML($html); $this->removeScripts(); $this->prepDocument(); - // In case we need the original HTML to create a fake top candidate - $this->backupdom = clone $this->dom; - $this->metadata = $this->getMetadata(); $this->metadata['image'] = $this->getMainImage(); @@ -165,7 +155,8 @@ class HTMLParser $length += mb_strlen($p->textContent); } if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < 500) { - $root = $this->backupdom->getElementsByTagName('body')->item(0); + $this->dom = $this->loadHTML($html); + $root = $this->dom->getElementsByTagName('body')->item(0); if ($this->getConfig()->getOption('stripUnlikelyCandidates')) { $this->getConfig()->setOption('stripUnlikelyCandidates', false); @@ -203,18 +194,30 @@ class HTMLParser } /** + * Creates a DOM Document object and loads the provided HTML on it. + * + * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text) + * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs + * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both + * objects and ruining the backup. + * * @param string $html + * @return DOMDocument */ private function loadHTML($html) { + $dom = new DOMDocument('1.0', 'utf-8'); + if (!$this->getConfig()->getOption('substituteEntities')) { // Keep the original HTML entities - $this->dom->substituteEntities = false; + $dom->substituteEntities = false; } // Prepend the XML tag to avoid having issues with special characters. Should be harmless. - $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html); - $this->dom->encoding = 'UTF-8'; + $dom->loadHTML('<?xml encoding="UTF-8">' . $html); + $dom->encoding = 'UTF-8'; + + return $dom; } /** |