summaryrefslogtreecommitdiff
path: root/vendor/andreskrey/Readability/Nodes/NodeUtility.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/andreskrey/Readability/Nodes/NodeUtility.php')
-rw-r--r--vendor/andreskrey/Readability/Nodes/NodeUtility.php160
1 files changed, 160 insertions, 0 deletions
diff --git a/vendor/andreskrey/Readability/Nodes/NodeUtility.php b/vendor/andreskrey/Readability/Nodes/NodeUtility.php
new file mode 100644
index 0000000..7a1f18e
--- /dev/null
+++ b/vendor/andreskrey/Readability/Nodes/NodeUtility.php
@@ -0,0 +1,160 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+use andreskrey\Readability\Nodes\DOM\DOMDocument;
+use andreskrey\Readability\Nodes\DOM\DOMElement;
+use andreskrey\Readability\Nodes\DOM\DOMNode;
+
+/**
+ * Class NodeUtility.
+ */
+class NodeUtility
+{
+ /**
+ * Collection of regexps to check the node usability.
+ *
+ * @var array
+ */
+ public static $regexps = [
+ 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+ 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+ 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
+ 'byline' => '/byline|author|dateline|writtenby|p-author/i',
+ 'replaceFonts' => '/<(\/?)font[^>]*>/gi',
+ 'normalize' => '/\s{2,}/',
+ 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
+ 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
+ 'prevLink' => '/(prev|earl|old|new|<|«)/i',
+ 'whitespace' => '/^\s*$/',
+ 'hasContent' => '/\S$/',
+ 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+ 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+ // \x{00A0} is the unicode version of &nbsp;
+ 'onlyWhitespace' => '/\x{00A0}|\s+/u'
+ ];
+
+ /**
+ * Imported from the Element class on league\html-to-markdown.
+ *
+ * @param $node
+ *
+ * @return DOMElement
+ */
+ public static function nextElement($node)
+ {
+ $next = $node;
+ while ($next
+ && $next->nodeType !== XML_ELEMENT_NODE
+ && $next->isWhitespace()) {
+ $next = $next->nextSibling;
+ }
+
+ return $next;
+ }
+
+ /**
+ * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
+ * element with the new tag name and importing it to the main DOMDocument.
+ *
+ * @param DOMNode $node
+ * @param string $value
+ * @param bool $importAttributes
+ *
+ * @return DOMNode
+ */
+ public static function setNodeTag($node, $value, $importAttributes = true)
+ {
+ $new = new DOMDocument('1.0', 'utf-8');
+ $new->appendChild($new->createElement($value));
+
+ $children = $node->childNodes;
+ /** @var $children \DOMNodeList $i */
+ for ($i = 0; $i < $children->length; $i++) {
+ $import = $new->importNode($children->item($i), true);
+ $new->firstChild->appendChild($import);
+ }
+
+ if ($importAttributes) {
+ // Import attributes from the original node.
+ foreach ($node->attributes as $attribute) {
+ $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
+ }
+ }
+
+ // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
+ $import = $node->ownerDocument->importNode($new->firstChild, true);
+ $node->parentNode->replaceChild($import, $node);
+
+ return $import;
+ }
+
+ /**
+ * Removes the current node and returns the next node to be parsed (child, sibling or parent).
+ *
+ * @param DOMNode $node
+ *
+ * @return DOMNode
+ */
+ public static function removeAndGetNext($node)
+ {
+ $nextNode = self::getNextNode($node, true);
+ $node->parentNode->removeChild($node);
+
+ return $nextNode;
+ }
+
+ /**
+ * Remove the selected node.
+ *
+ * @param $node DOMElement
+ *
+ * @return void
+ **/
+ public static function removeNode($node)
+ {
+ $parent = $node->parentNode;
+ if ($parent) {
+ $parent->removeChild($node);
+ }
+ }
+
+ /**
+ * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
+ * for parents.
+ *
+ * @param DOMNode $originalNode
+ * @param bool $ignoreSelfAndKids
+ *
+ * @return DOMNode
+ */
+ public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
+ {
+ /*
+ * Traverse the DOM from node to node, starting at the node passed in.
+ * Pass true for the second parameter to indicate this node itself
+ * (and its kids) are going away, and we want the next node over.
+ *
+ * Calling this in a loop will traverse the DOM depth-first.
+ */
+
+ // First check for kids if those aren't being ignored
+ if (!$ignoreSelfAndKids && $originalNode->firstChild) {
+ return $originalNode->firstChild;
+ }
+
+ // Then for siblings...
+ if ($originalNode->nextSibling) {
+ return $originalNode->nextSibling;
+ }
+
+ // And finally, move up the parent chain *and* find a sibling
+ // (because this is depth-first traversal, we will have already
+ // seen the parent nodes themselves).
+ do {
+ $originalNode = $originalNode->parentNode;
+ } while ($originalNode && !$originalNode->nextSibling);
+
+ return ($originalNode) ? $originalNode->nextSibling : $originalNode;
+ }
+}