diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Nodes/DOM/DOMNodeList.php | 108 | ||||
-rw-r--r-- | src/Nodes/NodeTrait.php | 26 | ||||
-rw-r--r-- | src/Nodes/NodeUtility.php | 18 | ||||
-rw-r--r-- | src/Readability.php | 11 |
4 files changed, 135 insertions, 28 deletions
diff --git a/src/Nodes/DOM/DOMNodeList.php b/src/Nodes/DOM/DOMNodeList.php new file mode 100644 index 0000000..28d4d42 --- /dev/null +++ b/src/Nodes/DOM/DOMNodeList.php @@ -0,0 +1,108 @@ +<?php + +namespace andreskrey\Readability\Nodes\DOM; + +/** + * Class DOMNodeList + * + * This is a fake DOMNodeList class that allows adding items to the list. The original class is static and the nodes + * are defined automagically when instantiating it. This fake version behaves exactly the same way but adds the function + * add() that allows to insert new DOMNodes into the DOMNodeList. + * + * It cannot extend the original DOMNodeList class because the functionality behind the property ->length is hidden + * from the user and cannot be extended, changed, or tweaked. + * + * @package andreskrey\Readability\Nodes\DOM + */ +class DOMNodeList implements \ArrayAccess, \Countable, \IteratorAggregate +{ + /** + * @var array + */ + protected $items = []; + + /** + * @var int + */ + protected $length = 0; + + /** + * To allow access to length in the same way that DOMNodeList allows + * + * {@inheritDoc} + */ + public function __get($name) + { + switch ($name) { + case 'length': + return $this->length; + default: + trigger_error(sprintf('Undefined property: %s::%s', static::class, $name)); + } + } + + /** + * @param \DOMNode $node + * + * @return DOMNodeList + */ + public function add(\DOMNode $node) + { + $this->items[] = $node; + $this->length++; + + return $this; + } + + /** + * @return int|void + */ + public function count() + { + return $this->length; + } + + /** + * To make it compatible with iterator_to_array() function + * + * {@inheritDoc} + */ + public function getIterator() + { + return new \ArrayIterator($this->items); + } + + /** + * {@inheritDoc} + */ + public function offsetExists($offset) + { + return isset($this->items[$offset]); + } + + /** + * {@inheritDoc} + */ + public function offsetGet($offset) + { + return $this->items[$offset]; + } + + /** + * {@inheritDoc} + */ + public function offsetSet($offset, $value) + { + $this->items[$offset] = $value; + $this->length = count($this->items); + } + + /** + * {@inheritDoc} + */ + public function offsetUnset($offset) + { + unset($this->items[$offset]); + $this->length--; + } +} diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index 9ef1fa2..2ce4383 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -313,26 +313,6 @@ trait NodeTrait } /** - * Returns the children of the current node. - * - * @param bool $filterEmptyDOMText Filter empty DOMText nodes? - * - * @return array - */ - public function getChildren($filterEmptyDOMText = false) - { - $ret = iterator_to_array($this->childNodes); - if ($filterEmptyDOMText) { - // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number - $ret = array_values(array_filter($ret, function ($node) { - return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); - })); - } - - return $ret; - } - - /** * Return an array indicating how many rows and columns this table has. * * @return array @@ -418,12 +398,12 @@ trait NodeTrait public function hasSingleTagInsideElement($tag) { // There should be exactly 1 element child with given tag - if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) { + if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children[0]->nodeName !== $tag) { return false; } // And there should be no text nodes with real content - return array_reduce($children, function ($carry, $child) { + return array_reduce(iterator_to_array($children), function ($carry, $child) { if (!$carry === false) { return false; } @@ -443,7 +423,7 @@ trait NodeTrait { $result = false; if ($this->hasChildNodes()) { - foreach ($this->getChildren() as $child) { + foreach ($this->childNodes as $child) { if (in_array($child->nodeName, $this->divToPElements)) { $result = true; } else { diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php index 7a1f18e..631a0aa 100644 --- a/src/Nodes/NodeUtility.php +++ b/src/Nodes/NodeUtility.php @@ -5,6 +5,7 @@ namespace andreskrey\Readability\Nodes; use andreskrey\Readability\Nodes\DOM\DOMDocument; use andreskrey\Readability\Nodes\DOM\DOMElement; use andreskrey\Readability\Nodes\DOM\DOMNode; +use andreskrey\Readability\Nodes\DOM\DOMNodeList; /** * Class NodeUtility. @@ -157,4 +158,21 @@ class NodeUtility return ($originalNode) ? $originalNode->nextSibling : $originalNode; } + + /** + * Remove all empty DOMNodes from DOMNodeLists + * + * @param \DOMNodeList $list + * @return DOMNodeList + */ + public static function filterTextNodes(\DOMNodeList $list) + { + $newList = new DOMNodeList(); + foreach($list as $node){ + if($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))){ + $newList->add($node); + } + } + return $newList; + } } diff --git a/src/Readability.php b/src/Readability.php index c8a321b..8890183 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -735,7 +735,7 @@ class Readability */ if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) { $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); - $pNode = $node->getChildren(true)[0]; + $pNode = NodeUtility::filterTextNodes($node->childNodes)[0]; $node->parentNode->replaceChild($pNode, $node); $node = $pNode; $elementsToScore[] = $node; @@ -1095,7 +1095,7 @@ class Readability // If the top candidate is the only child, use parent instead. This will help sibling // joining logic when adjacent content is actually located in parent's sibling node. $parentOfTopCandidate = $topCandidate->parentNode; - while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) { + while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) { $topCandidate = $parentOfTopCandidate; $parentOfTopCandidate = $topCandidate->parentNode; } @@ -1115,14 +1115,16 @@ class Readability $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2); // Keep potential top candidate's parent node to try to get text direction of it later. $parentOfTopCandidate = $topCandidate->parentNode; - $siblings = $parentOfTopCandidate->getChildren(); + $siblings = $parentOfTopCandidate->childNodes; $hasContent = false; $this->logger->info('[Rating] Adding top candidate siblings...'); /** @var DOMElement $sibling */ - foreach ($siblings as $sibling) { + // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items + for ($i = 0; $i < $siblings->length; $i++) { + $sibling = $siblings[$i]; $append = false; if ($sibling === $topCandidate) { @@ -1160,7 +1162,6 @@ class Readability * We have a node that isn't a common block level element, like a form or td tag. * Turn it into a div so it doesn't get filtered out later by accident. */ - $sibling = NodeUtility::setNodeTag($sibling, 'div'); } |