initialized; }

    /**
     * Tells whether this node has been flagged as a data table.
     *
     * The flag lives in a DOM attribute rather than in an object property. This is a workaround:
     * although the base DOMElement is extended with custom properties, those properties get lost
     * when elements are looked up again with getElementsByTagName, so a mark set in a previous step
     * would read back with its default value.
     *
     * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
     *
     * @return bool
     */
    public function isReadabilityDataTable()
    {
        if (!$this->hasAttribute('readabilityDataTable')) {
            return false;
        }

        return $this->getAttribute('readabilityDataTable') === '1';
    }

    /**
     * Flags (or un-flags) this node as a data table.
     *
     * @param bool $param
     */
    public function setReadabilityDataTable($param)
    {
        // Stored as the strings '1'/'0': it can't be "true" because DOMDocument casts it to "1".
        $flag = $param ? '1' : '0';

        $this->setAttribute('readabilityDataTable', $flag);
    }

    /**
     * Initializer. Calculates the starting content score of the node (only once) and returns the node.
     *
     * @ TODO: I don't like the weightClasses param. How can we get the config here?
     *
     * @param $weightClasses bool Add the class/id weight to the score?
     *
     * @return static
     */
    public function initializeNode($weightClasses)
    {
        if ($this->isInitialized()) {
            return $this;
        }

        // Base score per tag name. Tags that are not listed start at 0.
        $baseScores = [
            'div'        => 5,
            'pre'        => 3,
            'td'         => 3,
            'blockquote' => 3,
            'address'    => -3,
            'ol'         => -3,
            'ul'         => -3,
            'dl'         => -3,
            'dd'         => -3,
            'dt'         => -3,
            'li'         => -3,
            'form'       => -3,
            'h1'         => -5,
            'h2'         => -5,
            'h3'         => -5,
            'h4'         => -5,
            'h5'         => -5,
            'h6'         => -5,
            'th'         => -5,
        ];

        $tag = $this->nodeName;
        $score = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;

        if ($weightClasses) {
            $score += $this->getClassWeight();
        }

        $this->contentScore = $score;
        $this->initialized = true;

        return $this;
    }

    /**
     * Override for native getAttribute method.
Some nodes have the getAttribute method, some don't, so we need
     * to check first the existence of the attributes property.
     *
     * @param $attributeName string Attribute to retrieve
     *
     * @return string The attribute value, or an empty string when the node has no attributes property
     */
    public function getAttribute($attributeName)
    {
        return is_null($this->attributes) ? '' : parent::getAttribute($attributeName);
    }

    /**
     * Override for native hasAttribute.
     *
     * @see getAttribute
     *
     * @param $attributeName
     *
     * @return bool Always false when the node has no attributes property
     */
    public function hasAttribute($attributeName)
    {
        return is_null($this->attributes) ? false : parent::hasAttribute($attributeName);
    }

    /**
     * Get the ancestors of the current node, closest first.
     *
     * The walk follows parentNode and stops when there is no parent left or a DOMDocument is reached
     * (the document itself is not included).
     *
     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
     *
     * @return array
     */
    public function getNodeAncestors($maxLevel = 3)
    {
        $chain = [];

        for ($current = $this->parentNode; $current && !($current instanceof DOMDocument); $current = $current->parentNode) {
            $chain[] = $current;

            // The comparison is strict, so a false $maxLevel never matches and the walk is unbounded.
            if (count($chain) === $maxLevel) {
                break;
            }
        }

        return $chain;
    }

    /**
     * Returns all links (<a> elements) found under the current element.
     *
     * @return array
     */
    public function getAllLinks()
    {
        $anchors = $this->getElementsByTagName('a');

        return iterator_to_array($anchors);
    }

    /**
     * Get the density of links of the content.
     * This is the amount of (normalized) text that is inside a link divided by the total
     * (normalized) text in the node, so the result is a ratio and not a value out of 100.
     *
     * @return int|float 0 when the node has no text at all
     */
    public function getLinkDensity()
    {
        $totalLength = mb_strlen($this->getTextContent(true));

        if (!$totalLength) {
            return 0;
        }

        $anchorLength = 0;

        /** @var DOMElement $anchor */
        foreach ($this->getAllLinks() as $anchor) {
            $anchorLength += mb_strlen($anchor->getTextContent(true));
        }

        return $anchorLength / $totalLength;
    }

    /**
     * Calculates the weight of the class/id of the current element.
*
     * @return int
     */
    public function getClassWeight()
    {
        $weight = 0;

        // The class name and the ID are scored with exactly the same rules, one after the other.
        foreach (['class', 'id'] as $attributeName) {
            $value = $this->getAttribute($attributeName);

            if (!trim($value)) {
                continue;
            }

            if (preg_match(NodeUtility::$regexps['negative'], $value)) {
                $weight -= 25;
            }

            if (preg_match(NodeUtility::$regexps['positive'], $value)) {
                $weight += 25;
            }
        }

        return $weight;
    }

    /**
     * Returns the full text of the node (its nodeValue).
     *
     * @param bool $normalize When true, runs of two or more whitespace characters are collapsed
     *                        into a single space and the result is trimmed
     *
     * @return string
     */
    public function getTextContent($normalize = false)
    {
        $text = $this->nodeValue;

        if (!$normalize) {
            return $text;
        }

        return trim(preg_replace('/\s{2,}/', ' ', $text));
    }

    /**
     * Returns the children of the current node as a plain array.
     *
     * @param bool $filterEmptyDOMText Leave out text nodes that are empty or whitespace only?
     *
     * @return array
     */
    public function getChildren($filterEmptyDOMText = false)
    {
        $children = iterator_to_array($this->childNodes);

        if (!$filterEmptyDOMText) {
            return $children;
        }

        // Collected into a fresh list so the keys go from 0 upwards without skipping any number.
        $kept = [];
        foreach ($children as $child) {
            if ($child->nodeName !== '#text' || mb_strlen(trim($child->nodeValue))) {
                $kept[] = $child;
            }
        }

        return $kept;
    }

    /**
     * Return an array indicating how many rows and columns this table has.
*
     * Every descendant `tr` adds its `rowspan` to the row total and every `td` inside a row adds its
     * `colspan` to that row's column total. A missing, empty, non-numeric or lower-than-one span
     * counts as 1. The reported column count is the widest row found.
     *
     * @return array Array with the integer keys 'rows' and 'columns'
     */
    public function getRowAndColumnCount()
    {
        $rows = $columns = 0;

        foreach ($this->getElementsByTagName('tr') as $tr) {
            /** @var \DOMElement $tr */
            // In PHP `($rowspan || 1)` evaluates to the boolean true, which would always add exactly
            // one and ignore the attribute, so the span is parsed to an integer explicitly.
            $rowspan = (int) $tr->getAttribute('rowspan');
            $rows += $rowspan > 0 ? $rowspan : 1;

            // Now look for column-related info
            $columnsInThisRow = 0;
            foreach ($tr->getElementsByTagName('td') as $cell) {
                /** @var \DOMElement $cell */
                $colspan = (int) $cell->getAttribute('colspan');
                $columnsInThisRow += $colspan > 0 ? $colspan : 1;
            }

            $columns = max($columns, $columnsInThisRow);
        }

        return ['rows' => $rows, 'columns' => $columns];
    }

    /**
     * Creates a new node based on the text content of the original node.
     *
     * The new element is created in the owner document of the original node and is not attached
     * anywhere in the tree.
     *
     * @param $originalNode DOMNode
     * @param $tagName string
     *
     * @return DOMElement
     */
    public function createNode($originalNode, $tagName)
    {
        $text = $originalNode->getTextContent();
        $document = $originalNode->ownerDocument;

        // The value argument of createElement() is not escaped: a bare "&" in the text raises a
        // warning and truncates the content. A text node keeps the text intact.
        $newNode = $document->createElement($tagName);
        if ((string) $text !== '') {
            $newNode->appendChild($document->createTextNode($text));
        }

        return $newNode;
    }

    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     *
     * @param string $tagName Node name to look for, compared strictly against each ancestor's nodeName
     * @param int $maxDepth With a positive value, the search gives up once more than $maxDepth levels
     *                      have been climbed (so up to $maxDepth + 1 ancestors are inspected).
     *                      Zero or a negative value removes the limit
     * @param callable $filterFn Optional extra check; receives the matching ancestor and must return
     *                           a truthy value for the match to count
     *
     * @return bool
     */
    public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
    {
        $depth = 0;
        $node = $this;

        while ($node->parentNode) {
            if ($maxDepth > 0 && $depth > $maxDepth) {
                return false;
            }

            $parent = $node->parentNode;
            if ($parent->nodeName === $tagName && (!$filterFn || $filterFn($parent))) {
                return true;
            }

            $node = $parent;
            $depth++;
        }

        return false;
    }

    /**
     * Check if this node has only whitespace and a single element with given tag.
     * Returns false if this node contains non-empty text nodes
     * or if it contains no element with given tag or more than 1 element.
*
     * @param $tag string Name of tag
     *
     * @return bool
     */
    public function hasSingleTagInsideElement($tag)
    {
        // Empty text nodes are filtered out, so exactly one child must remain...
        $children = $this->getChildren(true);
        if (count($children) !== 1) {
            return false;
        }

        // ...and that child has to be the requested tag
        $onlyChild = $children[0];
        if ($onlyChild->nodeName !== $tag) {
            return false;
        }

        /* @var DOMNode $onlyChild */
        // Last check on the remaining child: a text node whose content does not end in a
        // non-whitespace character makes the whole test fail.
        $isTextWithoutTrailingContent = $onlyChild->nodeType === XML_TEXT_NODE
            && !preg_match('/\S$/', $onlyChild->getTextContent());

        return !$isTextWithoutTrailingContent;
    }

    /**
     * Check if the current element has a block element among its children, at any depth.
     * Block elements are the ones defined in the divToPElements array.
     *
     * @return bool
     */
    public function hasSingleChildBlockElement()
    {
        if (!$this->hasChildNodes()) {
            return false;
        }

        foreach ($this->getChildren() as $child) {
            /** @var $child DOMElement */
            // A child counts when it is a block element itself or when any of its own descendants is.
            if (in_array($child->nodeName, $this->divToPElements) || $child->hasSingleChildBlockElement()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
     *
     * @return bool
     */
    public function isElementWithoutContent()
    {
        if (!($this instanceof DOMElement)) {
            return false;
        }

        $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
        if (mb_strlen($visibleText) !== 0) {
            return false;
        }

        $childCount = $this->childNodes->length;
        if ($childCount === 0) {
            return true;
        }

        $dividers = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

        /*
         * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
         * If there's an empty tag with a space inside and a BR (for example "<p> <br/></p>") counting only
         * BRs and HRs will say that the example has 2 nodes, instead of one. This happens because in
         * DOMDocument, DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how
         * many DOMText we are dealing with (And at this point we know they are empty or are just whitespace,
         * because of the mb_strlen check above).
         */
        $textNodes = 0;
        foreach (iterator_to_array($this->childNodes) as $child) {
            if ($child instanceof DOMText) {
                $textNodes++;
            }
        }

        return $childCount === $dividers + $textNodes;
    }

    /**
     * Determine if a node qualifies as phrasing content.
     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
     *
     * A node qualifies when it is a text node, when its name is listed in phrasing_elems, or when it is
     * an 'a', 'del' or 'ins' node whose children all qualify.
     *
     * @return bool
     */
    public function isPhrasingContent()
    {
        if ($this->nodeType === XML_TEXT_NODE) {
            return true;
        }

        if (in_array($this->nodeName, $this->phrasing_elems) !== false) {
            return true;
        }

        if (is_null($this->childNodes)) {
            return false;
        }

        if (!($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins')) {
            return false;
        }

        // Every child is evaluated, even after one of them has already failed.
        $allPhrasing = true;
        foreach (iterator_to_array($this->childNodes) as $child) {
            $allPhrasing = $child->isPhrasingContent() && $allPhrasing;
        }

        return $allPhrasing;
    }

    /**
     * Tells whether the node looks visible, judging only by its own attributes.
     *
     * @return bool
     */
    public function isProbablyVisible()
    {
        /*
         * In the original JS project they check if the node has the style display=none, which unfortunately
         * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
         *
         * Might be a good idea to check for classes or other attributes like 'aria-hidden'
         */
        $style = $this->getAttribute('style');
        if (preg_match('/display:( )?none/', $style)) {
            return false;
        }

        return !$this->hasAttribute('hidden');
    }

    /**
     * Tells whether the node is a text node without visible content or a BR element.
     *
     * @return bool
     */
    public function isWhitespace()
    {
        if ($this->nodeType === XML_TEXT_NODE) {
            return mb_strlen(trim($this->textContent)) === 0;
        }

        return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
    }

    /**
     * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
     *
     * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
     * foreach.
This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
     * orphan node and will give an exception if you try to do anything with it.
     *
     * Shifting also occurs when converting parent nodes (like a P to a DIV), in which case the found nodes
     * are removed from the foreach "pool" but the internal index of the foreach is not aware of it and skips
     * over nodes that were never looped over. (index is at position 5, 2 nodes are removed, next one should
     * be node 3, but the foreach tries to access node 6)
     *
     * This function solves this by searching for the nodes again on every loop and keeping track of the
     * count differences. Because on every loop we call getElementsByTagName again, this could cause a
     * performance impact and should be used only when the results of the search are going to be used to
     * remove the nodes.
     *
     * @param string $tag
     *
     * @return \Generator
     */
    public function shiftingAwareGetElementsByTagName($tag)
    {
        /** @var $matches DOMNodeList */
        $matches = $this->getElementsByTagName($tag);
        $known = $matches->length;
        $index = 0;

        while ($index < $known) {
            yield $matches->item($index);

            // Search for all the nodes again
            $matches = $this->getElementsByTagName($tag);

            // Move the index back by the amount of nodes that disappeared while the caller had control
            $index -= $known - $matches->length;

            // From now on work with the refreshed amount of nodes
            $known = $matches->length;

            // Step to the next node, never going below the first position
            $index = max($index + 1, 0);
        }
    }
}