From 0226e0ca0dc70f9a0310b3eef045ee1c1e0ca3ac Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Tue, 13 Dec 2022 20:00:46 +0300 Subject: split into a separate repo --- .../readability.php/src/Nodes/DOM/DOMAttr.php | 10 + .../src/Nodes/DOM/DOMCdataSection.php | 10 + .../src/Nodes/DOM/DOMCharacterData.php | 10 + .../readability.php/src/Nodes/DOM/DOMComment.php | 10 + .../readability.php/src/Nodes/DOM/DOMDocument.php | 30 ++ .../src/Nodes/DOM/DOMDocumentFragment.php | 10 + .../src/Nodes/DOM/DOMDocumentType.php | 10 + .../readability.php/src/Nodes/DOM/DOMElement.php | 46 ++ .../readability.php/src/Nodes/DOM/DOMEntity.php | 10 + .../src/Nodes/DOM/DOMEntityReference.php | 10 + .../readability.php/src/Nodes/DOM/DOMNode.php | 14 + .../readability.php/src/Nodes/DOM/DOMNodeList.php | 82 +++ .../readability.php/src/Nodes/DOM/DOMNotation.php | 10 + .../src/Nodes/DOM/DOMProcessingInstruction.php | 10 + .../readability.php/src/Nodes/DOM/DOMText.php | 10 + .../readability.php/src/Nodes/NodeTrait.php | 566 +++++++++++++++++++++ .../readability.php/src/Nodes/NodeUtility.php | 192 +++++++ 17 files changed, 1040 insertions(+) create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMCdataSection.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMCharacterData.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMComment.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocument.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentType.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMElement.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntityReference.php create mode 
100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNode.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNodeList.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMProcessingInstruction.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMText.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/NodeTrait.php create mode 100644 vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php (limited to 'vendor/fivefilters/readability.php/src/Nodes') diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php new file mode 100644 index 0000000..1bdf395 --- /dev/null +++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php @@ -0,0 +1,10 @@ +registerNodeClass('DOMAttr', DOMAttr::class); + $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); + $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); + $this->registerNodeClass('DOMComment', DOMComment::class); + $this->registerNodeClass('DOMDocument', self::class); + $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); + $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); + $this->registerNodeClass('DOMElement', DOMElement::class); + $this->registerNodeClass('DOMEntity', DOMEntity::class); + $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); + $this->registerNodeClass('DOMNode', DOMNode::class); + $this->registerNodeClass('DOMNotation', DOMNotation::class); + $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); + $this->registerNodeClass('DOMText', DOMText::class); + } +} diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php new 
file mode 100644 index 0000000..33a3f95 --- /dev/null +++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php @@ -0,0 +1,10 @@ +childNodes as $node) { + if ($node->nodeType === XML_ELEMENT_NODE) { + $newList->add($node); + } + } + return $newList; + } + + /** + * Returns the closest preceding sibling of this node that is an element node, or null if no earlier sibling is an element. + * Non-element siblings (text, comments, ...) are skipped by walking the previousSibling chain. + * + * @see https://wiki.php.net/rfc/dom_living_standard_api + * @return DOMElement|null + */ + public function previousElementSibling() + { + $previous = $this->previousSibling; + while ($previous) { + if ($previous->nodeType === XML_ELEMENT_NODE) { + return $previous; + } + $previous = $previous->previousSibling; + } + return null; + } +} diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php new file mode 100644 index 0000000..751b59c --- /dev/null +++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php @@ -0,0 +1,10 @@ +length is hidden + * from the user and cannot be extended, changed, or tweaked. + */ +class DOMNodeList implements \Countable, \IteratorAggregate +{ + /** + * @var array Nodes held by this list, in the order they were appended + */ + protected $items = []; + + /** + * @var int Number of nodes held in $items; exposed read-only through __get('length') + */ + protected $length = 0; + + /** + * To allow access to length in the same way that DOMNodeList allows. 
+     *
+     * {@inheritdoc}
+     *
+     * @param string $name Property being read; only 'length' is supported
+     *
+     * @return int|null The node count for 'length'; null (after raising a notice) for any other name
+     */
+    public function __get($name)
+    {
+        switch ($name) {
+            case 'length':
+                return $this->length;
+            default:
+                trigger_error(sprintf('Undefined property: %s::%s', static::class, $name));
+
+                // Make the result of the unsupported case explicit instead of an implicit null.
+                return null;
+        }
+    }
+
+    /**
+     * Appends a node to the list and keeps the length counter in sync.
+     *
+     * @param DOMNode|DOMElement|DOMComment $node
+     *
+     * @return DOMNodeList The list itself, so calls can be chained
+     */
+    public function add($node)
+    {
+        $this->items[] = $node;
+        $this->length++;
+
+        return $this;
+    }
+
+    /**
+     * Returns the node stored at the given position.
+     *
+     * @param int $offset Zero-based position in the list
+     *
+     * @return DOMNode|DOMElement|DOMComment|null The node, or null when nothing is stored at $offset
+     */
+    public function item(int $offset)
+    {
+        // The native DOMNodeList::item() returns null for an invalid index. Reading the missing
+        // array key directly would raise an "Undefined offset/array key" warning instead.
+        return $this->items[$offset] ?? null;
+    }
+
+    /**
+     * Number of nodes in the list (Countable implementation).
+     *
+     * @return int
+     */
+    public function count(): int
+    {
+        return $this->length;
+    }
+
+    /**
+     * To make it compatible with iterator_to_array() function.
+     *
+     * {@inheritdoc}
+     */
+    public function getIterator(): \ArrayIterator
+    {
+        return new \ArrayIterator($this->items);
+    }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php
new file mode 100644
index 0000000..d276e42
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php
@@ -0,0 +1,10 @@
+initialized;
+    }
+
+    /**
+     * Tells whether this node carries the 'readabilityDataTable' marker attribute set to '1'.
+     *
+     * @return bool
+     */
+    public function isReadabilityDataTable()
+    {
+        /*
+         * This is a workaround that I'd like to remove in the future.
+         * Seems that although we are extending the base DOMElement and adding custom properties (like this one,
+         * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName.
+         * This means that even if we mark the tables in a previous step, when we want to retrieve that information,
+         * all the custom properties are in their default values. Somehow we need to find a way to make these properties
+         * permanent across the whole DOM.
+         *
+         * Until then the flag is stored as a real DOM attribute instead of an object property.
+         *
+         * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
+         */
+        return $this->hasAttribute('readabilityDataTable')
+            && $this->getAttribute('readabilityDataTable') === '1';
+    }
+
+    /**
+     * Stores the data-table flag on the node as the 'readabilityDataTable' attribute ('1' or '0').
+     *
+     * @param bool $param
+     */
+    public function setReadabilityDataTable($param)
+    {
+        // Can't be "true" because DOMDocument casts it to "1"
+        $this->setAttribute('readabilityDataTable', $param ? '1' : '0');
+    }
+
+    /**
+     * Initializer. Calculates the current score of the node and returns a full Readability object.
+     * The score is a per-tag base value plus, optionally, the class/id weight. Nodes that are
+     * already initialized are returned unchanged.
+     *
+     * @ TODO: I don't like the weightClasses param. How can we get the config here?
+     *
+     * @param $weightClasses bool Weight classes?
+     *
+     * @return static
+     */
+    public function initializeNode($weightClasses)
+    {
+        if (!$this->isInitialized()) {
+            $contentScore = 0;
+
+            switch ($this->nodeName) {
+                case 'div':
+                    $contentScore += 5;
+                    break;
+
+                case 'pre':
+                case 'td':
+                case 'blockquote':
+                    $contentScore += 3;
+                    break;
+
+                case 'address':
+                case 'ol':
+                case 'ul':
+                case 'dl':
+                case 'dd':
+                case 'dt':
+                case 'li':
+                case 'form':
+                    $contentScore -= 3;
+                    break;
+
+                case 'h1':
+                case 'h2':
+                case 'h3':
+                case 'h4':
+                case 'h5':
+                case 'h6':
+                case 'th':
+                    $contentScore -= 5;
+                    break;
+            }
+
+            $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0);
+
+            $this->initialized = true;
+        }
+
+        return $this;
+    }
+
+    /**
+     * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
+     * to check first the existence of the attributes property.
+     *
+     * @param $attributeName string Attribute to retrieve
+     *
+     * @return string The attribute value, or '' when the node has no attributes collection
+     */
+    #[\ReturnTypeWillChange]
+    public function getAttribute($attributeName)
+    {
+        if (!is_null($this->attributes)) {
+            return parent::getAttribute($attributeName);
+        }
+
+        return '';
+    }
+
+    /**
+     * Override for native hasAttribute.
+     *
+     * @param $attributeName
+     *
+     * @return bool
+     *
+     * @see getAttribute
+     */
+    #[\ReturnTypeWillChange]
+    public function hasAttribute($attributeName)
+    {
+        // Nodes without an attributes collection can never have the attribute.
+        if (is_null($this->attributes)) {
+            return false;
+        }
+
+        return parent::hasAttribute($attributeName);
+    }
+
+    /**
+     * Collects the ancestors of this node, nearest first, stopping before the owning document.
+     *
+     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
+     *
+     * @return array
+     */
+    public function getNodeAncestors($maxLevel = 3)
+    {
+        $chain = [];
+        $depth = 0;
+
+        for ($current = $this->parentNode; $current && !($current instanceof DOMDocument); $current = $current->parentNode) {
+            $chain[] = $current;
+            $depth++;
+
+            // Strict comparison: a non-integer $maxLevel (e.g. false) never matches, so the walk is unbounded.
+            if ($depth === $maxLevel) {
+                break;
+            }
+        }
+
+        return $chain;
+    }
+
+    /**
+     * Returns every "a" element found below the current element, as a plain array.
+     *
+     * @return array
+     */
+    public function getAllLinks()
+    {
+        $anchors = $this->getElementsByTagName('a');
+
+        return iterator_to_array($anchors);
+    }
+
+    /**
+     * Get the density of links as a fraction of the content.
+     * This is the amount of text that is inside a link divided by the total text in the node.
+     * Links whose href matches the 'hashUrl' pattern only count for 30% of their text length.
+     *
+     * @return int|float 0 when the node has no text, otherwise the link-text / total-text ratio
+     */
+    public function getLinkDensity()
+    {
+        $totalLength = mb_strlen($this->getTextContent(true));
+        if ($totalLength === 0) {
+            return 0;
+        }
+
+        $weightedLinkLength = 0;
+
+        /** @var DOMElement $anchor */
+        foreach ($this->getAllLinks() as $anchor) {
+            $href = $anchor->getAttribute('href');
+            $isHashLink = $href && preg_match(NodeUtility::$regexps['hashUrl'], $href);
+            $weightedLinkLength += mb_strlen($anchor->getTextContent(true)) * ($isHashLink ? 0.3 : 1);
+        }
+
+        return $weightedLinkLength / $totalLength;
+    }
+
+    /**
+     * Calculates the weight of the class/id of the current element.
+     *
+     * @return int
+     */
+    public function getClassWeight()
+    {
+        $weight = 0;
+
+        // The class and the id are scored the same way: -25 for a "negative" name, +25 for a "positive" one.
+        foreach (['class', 'id'] as $attributeName) {
+            $value = $this->getAttribute($attributeName);
+
+            // Compare against '' explicitly: a plain truthiness test would treat the value "0" as empty.
+            if (trim($value) === '') {
+                continue;
+            }
+
+            if (preg_match(NodeUtility::$regexps['negative'], $value)) {
+                $weight -= 25;
+            }
+
+            if (preg_match(NodeUtility::$regexps['positive'], $value)) {
+                $weight += 25;
+            }
+        }
+
+        return $weight;
+    }
+
+    /**
+     * Returns the full, trimmed text of the node.
+     *
+     * @param bool $normalize Normalize white space? (replaces matches of the 'normalize' pattern with one space)
+     *
+     * @return string
+     */
+    public function getTextContent($normalize = true)
+    {
+        $nodeValue = trim($this->textContent);
+        if ($normalize) {
+            $nodeValue = preg_replace(NodeUtility::$regexps['normalize'], ' ', $nodeValue);
+        }
+
+        return $nodeValue;
+    }
+
+    /**
+     * Return an array indicating how many rows and columns this table has.
+     * Rows honour the "rowspan" attribute of each tr, columns the "colspan" attribute of each td;
+     * a missing, empty, zero or non-numeric span counts as 1.
+     *
+     * @return array ['rows' => int, 'columns' => int]
+     */
+    public function getRowAndColumnCount()
+    {
+        $rows = $columns = 0;
+        $trs = $this->getElementsByTagName('tr');
+        foreach ($trs as $tr) {
+            /** @var \DOMElement $tr */
+            // In PHP "$span || 1" is a boolean (always true, i.e. 1 when added), so the span value was
+            // ignored. Cast to int and fall back to 1, mirroring JS "parseInt(span, 10) || 1".
+            $rows += ((int) $tr->getAttribute('rowspan')) ?: 1;
+
+            // Now look for column-related info
+            $columnsInThisRow = 0;
+            $cells = $tr->getElementsByTagName('td');
+            foreach ($cells as $cell) {
+                /** @var \DOMElement $cell */
+                $columnsInThisRow += ((int) $cell->getAttribute('colspan')) ?: 1;
+            }
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+    /**
+     * Creates a new node based on the text content of the original node.
+     *
+     * @param $originalNode DOMNode
+     * @param $tagName string
+     *
+     * @return DOMElement
+     */
+    public function createNode($originalNode, $tagName)
+    {
+        $text = $originalNode->getTextContent(false);
+        $document = $originalNode->ownerDocument;
+        $newNode = $document->createElement($tagName);
+
+        // DOMDocument::createElement() does not escape its $value argument, so text containing "&"
+        // raises a warning and gets cut off. A text node is escaped properly.
+        if ($text !== '') {
+            $newNode->appendChild($document->createTextNode($text));
+        }
+
+        return $newNode;
+    }
+
+    /**
+     * Check if a given node has one of its ancestor tag name matching the
+     * provided one.
+     *
+     * @param string $tagName Node name to look for among the ancestors
+     * @param int $maxDepth Levels to climb before giving up; 0 or less means no limit
+     * @param callable|null $filterFn Optional predicate a matching ancestor must also satisfy
+     *
+     * @return bool
+     */
+    public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = null)
+    {
+        $depth = 0;
+        $node = $this;
+
+        while ($node->parentNode) {
+            if ($maxDepth > 0 && $depth > $maxDepth) {
+                return false;
+            }
+
+            if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) {
+                return true;
+            }
+
+            $node = $node->parentNode;
+            $depth++;
+        }
+
+        return false;
+    }
+
+    /**
+     * Check if this node has only whitespace and a single element with given tag
+     * or if it contains no element with given tag or more than 1 element.
+     *
+     * @param $tag string Name of tag
+     *
+     * @return bool
+     */
+    public function hasSingleTagInsideElement($tag)
+    {
+        // Whitespace-only text nodes are dropped here, so anything left is real content.
+        $children = NodeUtility::filterTextNodes($this->childNodes);
+
+        // There should be exactly 1 child left, and it must have the given tag
+        if (count($children) !== 1) {
+            return false;
+        }
+
+        /* @var DOMNode $onlyChild */
+        $onlyChild = $children->item(0);
+        if ($onlyChild->nodeName !== $tag) {
+            return false;
+        }
+
+        // And that child must not be a text node with real content
+        return !($onlyChild->nodeType === XML_TEXT_NODE && preg_match(NodeUtility::$regexps['hasContent'], $onlyChild->textContent));
+    }
+
+    /**
+     * Check if the current element has a single child block element.
+     * Block elements are the ones defined in the divToPElements array.
+     *
+     * @return bool
+     */
+    public function hasSingleChildBlockElement()
+    {
+        if (!$this->hasChildNodes()) {
+            return false;
+        }
+
+        foreach ($this->childNodes as $child) {
+            // A direct child that is itself a block element settles the question.
+            if (in_array($child->nodeName, $this->divToPElements)) {
+                return true;
+            }
+
+            // Otherwise look for a block element further down this child's subtree.
+            /** @var $child DOMElement */
+            if ($child->hasSingleChildBlockElement()) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
+     *
+     * @return bool
+     */
+    public function isElementWithoutContent()
+    {
+        if (!($this instanceof DOMElement)) {
+            return false;
+        }
+
+        $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
+        if (mb_strlen($visibleText) !== 0) {
+            return false;
+        }
+
+        $childCount = $this->childNodes->length;
+        if ($childCount === 0) {
+            return true;
+        }
+
+        /*
+         * Special PHP DOMDocument case: DOMText children are nodes too (which doesn't happen in JS), so an
+         * element such as "<p> <br/></p>" has two child nodes although only one of them is a BR. Counting
+         * only BRs and HRs would never add up to childNodes->length, so the DOMText children are added to
+         * the tally. At this point we know they are empty or just whitespace, because of the text-length
+         * check above.
+         */
+        $fillerCount = $this->getElementsByTagName('br')->length
+            + $this->getElementsByTagName('hr')->length
+            + count(array_filter(iterator_to_array($this->childNodes), function ($child) {
+                return $child instanceof DOMText;
+            }));
+
+        return $childCount === $fillerCount;
+    }
+
+    /**
+     * Determine if a node qualifies as phrasing content.
+     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
+     *
+     * @return bool
+     */
+    public function isPhrasingContent()
+    {
+        // Text nodes and the tags listed in phrasing_elems always qualify.
+        if ($this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false) {
+            return true;
+        }
+
+        // "a", "del" and "ins" qualify only when every one of their children does.
+        $isTransparentTag = $this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins';
+        if (is_null($this->childNodes) || !$isTransparentTag) {
+            return false;
+        }
+
+        $allPhrasing = true;
+        foreach (iterator_to_array($this->childNodes) as $node) {
+            $allPhrasing = $node->isPhrasingContent() && $allPhrasing;
+        }
+
+        return $allPhrasing;
+    }
+
+    /**
+     * In the original JS project they check if the node has the style display=none, which unfortunately
+     * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
+     *
+     * @return bool
+     */
+    public function isProbablyVisible()
+    {
+        // Inline "display: none" hides the node.
+        if (preg_match('/display:( )?none/i', $this->getAttribute('style'))) {
+            return false;
+        }
+
+        // So does the "hidden" attribute.
+        if ($this->hasAttribute('hidden')) {
+            return false;
+        }
+
+        // Anything not marked aria-hidden="true" is considered visible.
+        if (!$this->hasAttribute('aria-hidden') || $this->getAttribute('aria-hidden') !== 'true') {
+            return true;
+        }
+
+        //check for "fallback-image" so that wikimedia math images are displayed
+        return $this->hasAttribute('class') && strpos($this->getAttribute('class'), 'fallback-image') !== false;
+    }
+
+    /**
+     * A node counts as whitespace when it is a text node with no non-blank characters or a "br" element.
+     *
+     * @return bool
+     */
+    public function isWhitespace()
+    {
+        if ($this->nodeType === XML_TEXT_NODE) {
+            return mb_strlen(trim($this->textContent)) === 0;
+        }
+
+        return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
+    }
+
+    /**
+     * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
+     *
+     * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
+     * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
+     * orphan node and will give an exception if you try to do anything with it.
+     *
+     * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are
+     * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that
+     * were never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the
+     * foreach tries to access node 6)
+     *
+     * This function solves this by searching for the nodes on every loop and keeping track of the count differences.
+     * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
+     * used only when the results of the search are going to be used to remove the nodes.
+ * + * @param string $tag Tag name to search for + * + * @return \Generator Yields the matching nodes one at a time + */ + public function shiftingAwareGetElementsByTagName($tag) + { + /** @var $nodes DOMNodeList */ + $nodes = $this->getElementsByTagName($tag); + $count = $nodes->length; + + // max(++$i, 0) keeps the next index from going below 0 after the removal adjustment made in the loop body + for ($i = 0; $i < $count; $i = max(++$i, 0)) { + yield $nodes->item($i); + + // Search for all the nodes again + $nodes = $this->getElementsByTagName($tag); + + // Subtract the amount of nodes removed from the current index + $i -= $count - $nodes->length; + + // Subtract the amount of nodes removed from the current count + $count -= ($count - $nodes->length); + } + } + + /** + * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this + * function to get the first one that is a DOMElement node. + * + * @return \DOMElement|null The first child that is a \DOMElement, or null when there is none + */ + public function getFirstElementChild() + { + if ($this->childNodes instanceof \Traversable) { + foreach ($this->childNodes as $node) { + if ($node instanceof \DOMElement) { + return $node; + } + } + } + + return null; + } +} diff --git a/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php b/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php new file mode 100644 index 0000000..56de705 --- /dev/null +++ b/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php @@ -0,0 +1,192 @@ + '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'okMaybeItsACandidate' => '/and|article|body|column|content|main|shadow/i', + 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', + 'byline' => '/byline|author|dateline|writtenby|p-author/i', + 'replaceFonts' => '/<(\/?)font[^>]*>/i', + 'normalize' => '/\s{2,}/', + 'videos' => 
'/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', + 'shareElements' => '/(\b|_)(share|sharedaddy)(\b|_)/i', + 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', + 'prevLink' => '/(prev|earl|old|new|<|«)/i', + 'tokenize' => '/\W+/', + 'whitespace' => '/^\s*$/', + 'hasContent' => '/\S$/', + 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', + 'negative' => '/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', + // \x{00A0} is the unicode code point of the non-breaking space (&nbsp;) + 'onlyWhitespace' => '/\x{00A0}|\s+/u', + 'hashUrl' => '/^#.+/', + 'srcsetUrl' => '/(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/', + 'b64DataUrl' => '/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i', + // See: https://schema.org/Article + // NOTE(review): "^" only anchors the first alternative and "$" only the last one, so the types in between match anywhere in the string - confirm this is intended + 'jsonLdArticleTypes' => '/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/' + + ]; + + /** + * Finds the next node, starting from the given node, and ignoring + * whitespace in between. If the given node is an element, the same node is + * returned. + * + * Imported from the Element class on league\html-to-markdown. + * + * @param $node Node to start from; skipped nodes are followed through nextSibling + * + * @return DOMNode|null The first node that is an element or is not whitespace, or null when the siblings run out + */ + public static function nextNode($node) + { + $next = $node; + while ($next + && $next->nodeType !== XML_ELEMENT_NODE + && $next->isWhitespace()) { + $next = $next->nextSibling; + } + + return $next; + } + + /** + * Changes the node tag name. 
Since tagName on DOMElement is a read only value, this must be done creating a new
+     * element with the new tag name and importing it to the main DOMDocument.
+     *
+     * @param DOMNode $node Node to rename; it is replaced in its parent when it has one
+     * @param string $value New tag name
+     * @param bool $importAttributes Copy the attributes of the original node to the new one?
+     *
+     * @return DOMNode The new node, owned by the same document as $node
+     */
+    public static function setNodeTag($node, $value, $importAttributes = true)
+    {
+        $new = new DOMDocument('1.0', 'utf-8');
+        $new->appendChild($new->createElement($value));
+
+        $children = $node->childNodes;
+        /** @var $children \DOMNodeList $i */
+        for ($i = 0; $i < $children->length; $i++) {
+            $import = $new->importNode($children->item($i), true);
+            $new->firstChild->appendChild($import);
+        }
+
+        // Only element nodes expose an attributes collection; it is null on every other node type.
+        if ($importAttributes && $node->attributes !== null) {
+            // Import attributes from the original node.
+            foreach ($node->attributes as $attribute) {
+                $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
+            }
+        }
+
+        // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
+        $import = $node->ownerDocument->importNode($new->firstChild, true);
+
+        // A detached node has nothing to be replaced in; the renamed copy is still returned.
+        if ($node->parentNode !== null) {
+            $node->parentNode->replaceChild($import, $node);
+        }
+
+        return $import;
+    }
+
+    /**
+     * Removes the current node and returns the next node to be parsed (child, sibling or parent).
+     * The next node is resolved before the removal, skipping the removed node's own children.
+     *
+     * @param DOMNode $node
+     *
+     * @return DOMNode
+     */
+    public static function removeAndGetNext($node)
+    {
+        $nextNode = self::getNextNode($node, true);
+
+        // removeNode() tolerates nodes that are already detached, unlike a direct parentNode->removeChild().
+        self::removeNode($node);
+
+        return $nextNode;
+    }
+
+    /**
+     * Remove the selected node from its parent. Does nothing when the node has no parent.
+     *
+     * @param $node DOMElement
+     *
+     * @return void
+     **/
+    public static function removeNode($node)
+    {
+        $parent = $node->parentNode;
+        if ($parent) {
+            $parent->removeChild($node);
+        }
+    }
+
+    /**
+     * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
+     * for parents.
+     *
+     * @param DOMNode $originalNode
+     * @param bool $ignoreSelfAndKids
+     *
+     * @return DOMNode
+     */
+    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
+    {
+        /*
+         * One step of a depth-first walk over the DOM, starting at the node passed in.
+         * With $ignoreSelfAndKids set, the node and its subtree are treated as going away,
+         * so the walk continues with whatever comes after them.
+         */
+
+        // Descend into the first child unless the subtree is being skipped
+        $descend = !$ignoreSelfAndKids && $originalNode->firstChild;
+        if ($descend) {
+            return $originalNode->firstChild;
+        }
+
+        // Otherwise move sideways when possible
+        if ($originalNode->nextSibling) {
+            return $originalNode->nextSibling;
+        }
+
+        // Otherwise climb until an ancestor with a following sibling is found. The ancestors
+        // themselves have already been visited, so it is their sibling that comes next.
+        $ancestor = $originalNode->parentNode;
+        while ($ancestor && !$ancestor->nextSibling) {
+            $ancestor = $ancestor->parentNode;
+        }
+
+        return ($ancestor) ? $ancestor->nextSibling : $ancestor;
+    }
+
+    /**
+     * Copies a native node list into a DOMNodeList, leaving out text nodes that are empty
+     * or contain only whitespace.
+     *
+     * @param \DOMNodeList $list
+     *
+     * @return DOMNodeList
+     */
+    public static function filterTextNodes(\DOMNodeList $list)
+    {
+        $kept = new DOMNodeList();
+        foreach ($list as $candidate) {
+            $isBlankText = $candidate->nodeType === XML_TEXT_NODE && !mb_strlen(trim($candidate->nodeValue));
+            if (!$isBlankText) {
+                $kept->add($candidate);
+            }
+        }
+
+        return $kept;
+    }
+}
-- cgit v1.2.3