summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Nodes/NodeTrait.php41
-rw-r--r--src/Readability.php30
2 files changed, 43 insertions, 28 deletions
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index cfd5ebb..f3f127b 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -6,6 +6,7 @@ use andreskrey\Readability\Nodes\DOM\DOMDocument;
use andreskrey\Readability\Nodes\DOM\DOMElement;
use andreskrey\Readability\Nodes\DOM\DOMNode;
use andreskrey\Readability\Nodes\DOM\DOMText;
+use DOMNodeList;
/**
* @method \DOMNode removeAttribute($name)
@@ -516,4 +517,44 @@ trait NodeTrait
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br');
}
+
+ /**
+ * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
+ *
+ * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
+ * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
+ * orphan node and will give an exception if you try to do anything with it.
+ *
+ * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are
+ * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that
+ * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries
+ * to access node 6)
+ *
+ * This function solves this by searching for the nodes on every loop and keeping track of the count differences.
+ * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
+ * used only when the results of the search are going to be used to remove the nodes.
+ *
+ * @param string $tag
+ *
+ * @return \Generator
+ */
+ public function shiftingAwareGetElementsByTagName($tag)
+ {
+ /** @var $nodes DOMNodeList */
+ $nodes = $this->getElementsByTagName($tag);
+ $count = count($nodes);
+
+ for ($i = 0; $i < $count; $i = max(++$i, 0)) {
+ yield $nodes->item($i);
+
+ // Search for all the nodes again
+ $nodes = $this->getElementsByTagName($tag);
+
+ // Subtract the amount of nodes removed from the current index
+ $i -= $count - count($nodes);
+
+ // Subtract the amount of nodes removes from the current count
+ $count -= ($count - count($nodes));
+ }
+ }
}
diff --git a/src/Readability.php b/src/Readability.php
index f5a7b70..22aa437 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -814,25 +814,7 @@ class Readability
{
$this->logger->info('[PrepDocument] Preparing document for parsing...');
- /*
- * This is a very ugly hack that is probably causing a big performance hit, but after fighting with it for like
- * 4 days this is the best solution I've came up with.
- *
- * Because we have find the first BR and then remove the following ones, nodes shift in a different way than
- * they do in the JS version. In the JS version, even if you remove a node, it will still appear during the
- * foreach. This does not happen in DOMDocument, because if you remove the BR, the one in the foreach becomes
- * orphan and gives an exception if you try to do anything with it.
- *
- * Shifting also occurs when we convert a P parent node to DIV, which remove the BRs from the "pool"
- * of the foreach.
- *
- * So the solution is to find every BR on each loop and keep track of the ones we removed (by tweaking the value
- * of $i)
- */
- $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br'));
- $length = count($DOMNodeList);
- for ($i = 0; $i < $length; $i < 0 ? $i = 0 : $i++) {
- $br = $DOMNodeList[$i];
+ foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
$next = $br->nextSibling;
/*
@@ -853,10 +835,6 @@ class Readability
$brSibling = $next->nextSibling;
$next->parentNode->removeChild($next);
$next = $brSibling;
-
- // We just removed a BR and we need to "go back" one step because that node will not be there
- // anymore when we search for all the BRs at the end of this loop.
- $i--;
}
/*
@@ -899,10 +877,6 @@ class Readability
NodeUtility::setNodeTag($p->parentNode, 'div');
}
}
-
- // Search for all the BRs again and tweak the length of the for loop
- $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br'));
- $length = count($DOMNodeList);
}
// Replace font tags with span
@@ -1290,7 +1264,7 @@ class Readability
}
// Remove single-cell tables
- foreach (iterator_to_array($article->getElementsByTagName('table')) as $table) {
+ foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
/** @var DOMNode $table */
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table;
if ($tbody->hasSingleTagInsideElement('tr')) {