summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Nodes/NodeUtility.php2
-rw-r--r--src/Readability.php32
2 files changed, 26 insertions, 8 deletions
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
index b5f66ba..a3d8509 100644
--- a/src/Nodes/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -46,7 +46,7 @@ class NodeUtility
$next = $node;
while ($next
&& $next->nodeType !== XML_ELEMENT_NODE
- && preg_match(NodeUtility::$regexps['whitespace'], $next->textContent)) {
+ && $next->isWhitespace()) {
$next = $next->nextSibling;
}
diff --git a/src/Readability.php b/src/Readability.php
index 02938e3..83236c7 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -815,14 +815,24 @@ class Readability
$this->logger->info('[PrepDocument] Preparing document for parsing...');
/*
- * DOMNodeList must be converted to an array before looping over it.
- * This is done to avoid node shifting when removing nodes.
+ * This is a very ugly hack that is probably causing a big performance hit, but after fighting with it for like
+ * 4 days this is the best solution I've came up with.
*
- * Reverse traversing cannot be done here because we need to find brs that are right next to other brs.
- * (If we go the other way around we need to search for previous nodes forcing the creation of new functions
- * that will be used only here)
+ * Because we have find the first BR and then remove the following ones, nodes shift in a different way than
+ * they do in the JS version. In the JS version, even if you remove a node, it will still appear during the
+ * foreach. This does not happen in DOMDocument, because if you remove the BR, the one in the foreach becomes
+ * orphan and gives an exception if you try to do anything with it.
+ *
+ * Shifting also occurs when we convert a P parent node to DIV, which remove the BRs from the "pool"
+ * of the foreach.
+ *
+ * So the solution is to find every BR on each loop and keep track of the ones we removed (by tweaking the value
+ * of $i)
*/
- foreach (iterator_to_array($dom->getElementsByTagName('br')) as $br) {
+ $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br'));
+ $length = count($DOMNodeList);
+ for ($i = 0; $i < $length; $i++) {
+ $br = $DOMNodeList[$i];
$next = $br->nextSibling;
/*
@@ -843,6 +853,10 @@ class Readability
$brSibling = $next->nextSibling;
$next->parentNode->removeChild($next);
$next = $brSibling;
+
+ // We just removed a BR and we need to "go back" one step because that node will not be there
+ // anymore when we search for all the BRs at the end of this loop.
+ $i--;
}
/*
@@ -885,6 +899,10 @@ class Readability
NodeUtility::setNodeTag($p->parentNode, 'div');
}
}
+
+ // Search for all the BRs again and tweak the length of the for loop
+ $DOMNodeList = iterator_to_array($dom->getElementsByTagName('br'));
+ $length = count($DOMNodeList);
}
// Replace font tags with span
@@ -1460,7 +1478,7 @@ class Readability
$node = $DOMNodeList->item($length - 1 - $i);
// First check if we're in a data table, in which case don't remove us.
- if ($node->hasAncestorTag('table', -1, function($node){
+ if ($node->hasAncestorTag('table', -1, function ($node) {
return $node->isReadabilityDataTable();
})) {
continue;