diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Nodes/NodeTrait.php | 41 | ||||
-rw-r--r-- | src/Readability.php | 30 |
2 files changed, 43 insertions, 28 deletions
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index cfd5ebb..f3f127b 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\DOM\DOMDocument; use andreskrey\Readability\Nodes\DOM\DOMElement; use andreskrey\Readability\Nodes\DOM\DOMNode; use andreskrey\Readability\Nodes\DOM\DOMText; +use DOMNodeList; /** * @method \DOMNode removeAttribute($name) @@ -516,4 +517,44 @@ trait NodeTrait return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); } + + /** + * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. + * + * In the JS version of getElementsByTagName, if you remove a node it will not appear during the + * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an + * orphan node and will give an exception if you try to do anything with it. + * + * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are + * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that + * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries + * to access node 6) + * + * This function solves this by searching for the nodes on every loop and keeping track of the count differences. + * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be + * used only when the results of the search are going to be used to remove the nodes. + * + * @param string $tag + * + * @return \Generator + */ + public function shiftingAwareGetElementsByTagName($tag) + { + /** @var $nodes DOMNodeList */ + $nodes = $this->getElementsByTagName($tag); + $count = count($nodes); + + for ($i = 0; $i < $count; $i = max(++$i, 0)) { + yield $nodes->item($i); + + // Search for all the nodes again + $nodes = $this->getElementsByTagName($tag); + + // Subtract the amount of nodes removed from the current index + $i -= $count - count($nodes); + + // Subtract the amount of nodes removes from the current count + $count -= ($count - count($nodes)); + } + } } diff --git a/src/Readability.php b/src/Readability.php index f5a7b70..22aa437 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -814,25 +814,7 @@ class Readability { $this->logger->info('[PrepDocument] Preparing document for parsing...'); - /* - * This is a very ugly hack that is probably causing a big performance hit, but after fighting with it for like - * 4 days this is the best solution I've came up with. - * - * Because we have find the first BR and then remove the following ones, nodes shift in a different way than - * they do in the JS version. In the JS version, even if you remove a node, it will still appear during the - * foreach. This does not happen in DOMDocument, because if you remove the BR, the one in the foreach becomes - * orphan and gives an exception if you try to do anything with it. - * - * Shifting also occurs when we convert a P parent node to DIV, which remove the BRs from the "pool" - * of the foreach. - * - * So the solution is to find every BR on each loop and keep track of the ones we removed (by tweaking the value - * of $i) - */ - $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br')); - $length = count($DOMNodeList); - for ($i = 0; $i < $length; $i < 0 ? $i = 0 : $i++) { - $br = $DOMNodeList[$i]; + foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) { $next = $br->nextSibling; /* @@ -853,10 +835,6 @@ class Readability $brSibling = $next->nextSibling; $next->parentNode->removeChild($next); $next = $brSibling; - - // We just removed a BR and we need to "go back" one step because that node will not be there - // anymore when we search for all the BRs at the end of this loop. - $i--; } /* @@ -899,10 +877,6 @@ class Readability NodeUtility::setNodeTag($p->parentNode, 'div'); } } - - // Search for all the BRs again and tweak the length of the for loop - $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br')); - $length = count($DOMNodeList); } // Replace font tags with span @@ -1290,7 +1264,7 @@ class Readability } // Remove single-cell tables - foreach (iterator_to_array($article->getElementsByTagName('table')) as $table) { + foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) { /** @var DOMNode $table */ $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table; if ($tbody->hasSingleTagInsideElement('tr')) { |