From 2aaefbfa54447c37a74aaf126f864fac629e9bd5 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Wed, 20 Jun 2018 14:58:09 +0300 Subject: update autoloader to consider namespaces for third party libraries: placed and loaded from vendor/namespace/classpath.php update readability to a newer implementation based on Readability.js (https://github.com/andreskrey/readability.php) add vendor/Psr/Log interface required for the above --- vendor/andreskrey/Readability/Nodes/NodeTrait.php | 434 ++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 vendor/andreskrey/Readability/Nodes/NodeTrait.php (limited to 'vendor/andreskrey/Readability/Nodes/NodeTrait.php') diff --git a/vendor/andreskrey/Readability/Nodes/NodeTrait.php b/vendor/andreskrey/Readability/Nodes/NodeTrait.php new file mode 100644 index 000000000..13611c9e7 --- /dev/null +++ b/vendor/andreskrey/Readability/Nodes/NodeTrait.php @@ -0,0 +1,434 @@ +initialized; + } + + /** + * Returns the current value of the readabilityDataTable flag. + * + * @return bool + */ + public function isReadabilityDataTable() + { + return $this->readabilityDataTable; + } + + /** + * Stores the given value in the readabilityDataTable flag. + * + * @param bool $param New flag value + */ + public function setReadabilityDataTable($param) + { + $this->readabilityDataTable = $param; + } + + /** + * Initializer. Calculates the initial content score of the node (only on the first call) and returns the node itself. + * + * @todo I don't like the weightClasses param. How can we get the config here? + * + * @param bool $weightClasses Whether the class/id weight is added to the score 
+     *
+     * @return static
+     */
+    public function initializeNode($weightClasses)
+    {
+        // Scoring happens only once per node; later calls return immediately.
+        if ($this->isInitialized()) {
+            return $this;
+        }
+
+        // Starting score by tag name. Tags that are not listed start at 0.
+        $baseScores = [
+            'div' => 5,
+
+            'pre' => 3,
+            'td' => 3,
+            'blockquote' => 3,
+
+            'address' => -3,
+            'ol' => -3,
+            'ul' => -3,
+            'dl' => -3,
+            'dd' => -3,
+            'dt' => -3,
+            'li' => -3,
+            'form' => -3,
+
+            'h1' => -5,
+            'h2' => -5,
+            'h3' => -5,
+            'h4' => -5,
+            'h5' => -5,
+            'h6' => -5,
+            'th' => -5,
+        ];
+
+        $tag = $this->nodeName;
+        $score = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;
+        if ($weightClasses) {
+            $score += $this->getClassWeight();
+        }
+
+        $this->contentScore = $score;
+        $this->initialized = true;
+
+        return $this;
+    }
+
+    /**
+     * Replacement for the native getAttribute method. Not every node type carries attributes,
+     * so the attributes property is checked before delegating to the parent implementation.
+     *
+     * @param string $attributeName Attribute to retrieve
+     *
+     * @return string The parent's result, or an empty string when the node has no attributes property
+     */
+    public function getAttribute($attributeName)
+    {
+        return is_null($this->attributes) ? '' : parent::getAttribute($attributeName);
+    }
+
+    /**
+     * Collects the ancestors of the current node, nearest first, stopping before the document node.
+     *
+     * @param int|bool $maxLevel Maximum number of ancestors to collect. False for all of them
+     *
+     * @return array
+     */
+    public function getNodeAncestors($maxLevel = 3)
+    {
+        $chain = [];
+
+        for ($current = $this->parentNode; $current && !($current instanceof DOMDocument); $current = $current->parentNode) {
+            $chain[] = $current;
+
+            // Strict comparison: a $maxLevel of false never matches, so the walk goes up to the root.
+            if (count($chain) === $maxLevel) {
+                break;
+            }
+        }
+
+        return $chain;
+    }
+
+    /**
+     * Returns every "a" element found below the current element.
+     *
+     * @return array
+     */
+    public function getAllLinks()
+    {
+        $anchors = $this->getElementsByTagName('a');
+
+        return iterator_to_array($anchors);
+    }
+
+    /**
+     * Get the density of links as a percentage of the content
+     * This is the amount of text that is inside a link divided by the total text in the node. 
+     *
+     * @return int|float 0 when the node has no text, otherwise link text length / total text length
+     */
+    public function getLinkDensity()
+    {
+        $totalLength = mb_strlen($this->getTextContent(true));
+
+        // Nothing to divide by: a node without text has no link density.
+        if (!$totalLength) {
+            return 0;
+        }
+
+        $linkedLength = 0;
+        /** @var DOMElement $anchor */
+        foreach ($this->getAllLinks() as $anchor) {
+            $linkedLength += mb_strlen($anchor->getTextContent(true));
+        }
+
+        return $linkedLength / $totalLength;
+    }
+
+    /**
+     * Calculates the weight of the current element from its class and id attributes.
+     * Each attribute loses 25 points when it matches the "negative" pattern and gains 25 when it
+     * matches the "positive" pattern; both may apply to the same attribute.
+     *
+     * @return int
+     */
+    public function getClassWeight()
+    {
+        $weight = 0;
+
+        // The class name and the ID are scored with exactly the same rules.
+        foreach (['class', 'id'] as $attribute) {
+            $value = $this->getAttribute($attribute);
+            if (!trim($value)) {
+                continue;
+            }
+
+            if (preg_match(NodeUtility::$regexps['negative'], $value)) {
+                $weight -= 25;
+            }
+
+            if (preg_match(NodeUtility::$regexps['positive'], $value)) {
+                $weight += 25;
+            }
+        }
+
+        return $weight;
+    }
+
+    /**
+     * Returns the text of the node (its nodeValue).
+     *
+     * @param bool $normalize When true, runs of two or more whitespace characters are collapsed
+     *                        into a single space and the result is trimmed
+     *
+     * @return string
+     */
+    public function getTextContent($normalize = false)
+    {
+        $text = $this->nodeValue;
+
+        return $normalize ? trim(preg_replace('/\s{2,}/', ' ', $text)) : $text;
+    }
+
+    /**
+     * Returns the child nodes of the current node as an array.
+     *
+     * @param bool $filterEmptyDOMText Drop text nodes that are empty or whitespace-only?
+     *
+     * @return array
+     */
+    public function getChildren($filterEmptyDOMText = false)
+    {
+        $ret = iterator_to_array($this->childNodes);
+        if ($filterEmptyDOMText) {
+            // Array values is used to discard the key order. 
Needs to be 0 to whatever without skipping any number
+            $ret = array_values(array_filter($ret, function ($node) {
+                return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue));
+            }));
+        }
+
+        return $ret;
+    }
+
+    /**
+     * Return an array indicating how many rows and columns this table has.
+     *
+     * Every "tr" below the current element adds its rowspan to the row total, and every "td" below
+     * a row adds its colspan to that row's column total. A missing, non-numeric or non-positive
+     * span counts as 1. The reported column count is the widest row found.
+     *
+     * @return array Shape: ['rows' => int, 'columns' => int]
+     */
+    public function getRowAndColumnCount()
+    {
+        $rows = $columns = 0;
+
+        foreach ($this->getElementsByTagName('tr') as $tr) {
+            /** @var \DOMElement $tr */
+            // In PHP `$rowspan || 1` is a boolean (always true here), not "the span or 1" as in
+            // JavaScript, so the attribute has to be converted to a number explicitly.
+            $rowspan = (int) $tr->getAttribute('rowspan');
+            $rows += $rowspan > 0 ? $rowspan : 1;
+
+            // Now look for column-related info
+            $columnsInThisRow = 0;
+            foreach ($tr->getElementsByTagName('td') as $cell) {
+                /** @var \DOMElement $cell */
+                $colspan = (int) $cell->getAttribute('colspan');
+                $columnsInThisRow += $colspan > 0 ? $colspan : 1;
+            }
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+    /**
+     * Creates a new element, in the original node's document, holding the text content of the
+     * original node. The new element is not attached to the tree.
+     *
+     * @param DOMNode $originalNode Node whose text is copied
+     * @param string  $tagName      Tag name of the element to create
+     *
+     * @return DOMElement
+     */
+    public function createNode($originalNode, $tagName)
+    {
+        $text = $originalNode->getTextContent();
+        $document = $originalNode->ownerDocument;
+        $newNode = $document->createElement($tagName);
+
+        // The value argument of createElement() is not escaped: text containing "&" raises a
+        // warning and gets truncated. A text node stores the string literally.
+        if ($text !== '' && $text !== null) {
+            $newNode->appendChild($document->createTextNode($text));
+        }
+
+        return $newNode;
+    }
+
+    /**
+     * Check if a given node has one of its ancestor tag name matching the
+     * provided one.
+     *
+     * @param DOMElement $node     Node whose ancestors are inspected
+     * @param string     $tagName  Tag name to look for (compared with nodeName as is)
+     * @param int        $maxDepth Depth limit for the walk. Zero or a negative value disables the limit
+     *
+     * @return bool
+     */
+    public function hasAncestorTag($node, $tagName, $maxDepth = 3)
+    {
+        $depth = 0;
+        while ($node->parentNode) {
+            // The limit check runs before the comparison, so ancestors at depths 0..$maxDepth are examined.
+            if ($maxDepth > 0 && $depth > $maxDepth) {
+                return false;
+            }
+            if ($node->parentNode->nodeName === $tagName) {
+                return true;
+            }
+            $node = $node->parentNode;
+            $depth++;
+        }
+
+        return false;
+    }
+
+    /**
+     * Checks if the current node has a single child and if that child is a P node.
+     * Useful to convert

<div><p> nodes to a single <p>

node and avoid confusing the scoring system since div with p
+     * tags are, in practice, paragraphs.
+     *
+     * @return bool
+     */
+    public function hasSinglePNode()
+    {
+        // getChildren(true) drops empty and whitespace-only text nodes. A text node with real
+        // content therefore stays in the list and pushes the count above one, so a single
+        // remaining child named "p" is the whole condition.
+        $children = $this->getChildren(true);
+
+        return count($children) === 1 && $children[0]->nodeName === 'p';
+    }
+
+    /**
+     * Check if the current element contains a block element, either as a direct child or
+     * anywhere below one of its children.
+     * Block elements are the ones defined in the divToPElements array.
+     *
+     * @return bool
+     */
+    public function hasSingleChildBlockElement()
+    {
+        if (!$this->hasChildNodes()) {
+            return false;
+        }
+
+        foreach ($this->getChildren() as $child) {
+            // The first hit decides the answer, so the remaining children need not be visited.
+            if (in_array($child->nodeName, $this->divToPElements, true)) {
+                return true;
+            }
+
+            /** @var $child DOMElement */
+            if ($child->hasSingleChildBlockElement()) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
+     *
+     * @return bool
+     */
+    public function isElementWithoutContent()
+    {
+        if (!($this instanceof DOMElement)) {
+            return false;
+        }
+
+        // Any text left after removing whitespace means the element has content.
+        if (mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) !== 0) {
+            return false;
+        }
+
+        $childCount = $this->childNodes->length;
+        if ($childCount === 0) {
+            return true;
+        }
+
+        /*
+         * Special PHP DOMDocument case: we also need to count how many DOMText nodes are inside the node.
+         * For an otherwise empty tag holding a space and a BR (for example "<p> <br/></p>"), counting only
+         * BRs and HRs gives one, while childNodes reports two nodes. This happens because in DOMDocument
+         * DOMTexts are also nodes (which doesn't happen in JS). At this point every DOMText is known to be
+         * empty or whitespace-only, because of the mb_strlen check above.
+         */
+        $textNodeCount = 0;
+        foreach ($this->childNodes as $child) {
+            if ($child instanceof DOMText) {
+                $textNodeCount++;
+            }
+        }
+
+        $dividerCount = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;
+
+        return $childCount === $dividerCount + $textNodeCount;
+    }
+}
-- cgit v1.2.3