NodeUtility.php 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. <?php
  2. namespace andreskrey\Readability\Nodes;
  3. use andreskrey\Readability\Nodes\DOM\DOMDocument;
  4. use andreskrey\Readability\Nodes\DOM\DOMElement;
  5. use andreskrey\Readability\Nodes\DOM\DOMNode;
  6. /**
  7. * Class NodeUtility.
  8. */
  9. class NodeUtility
  10. {
  11. /**
  12. * Collection of regexps to check the node usability.
  13. *
  14. * @var array
  15. */
  16. public static $regexps = [
  17. 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
  18. 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
  19. 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
  20. 'byline' => '/byline|author|dateline|writtenby|p-author/i',
  21. 'replaceFonts' => '/<(\/?)font[^>]*>/gi',
  22. 'normalize' => '/\s{2,}/',
  23. 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
  24. 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
  25. 'prevLink' => '/(prev|earl|old|new|<|«)/i',
  26. 'whitespace' => '/^\s*$/',
  27. 'hasContent' => '/\S$/',
  28. 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
  29. 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
  30. // \x{00A0} is the unicode version of &nbsp;
  31. 'onlyWhitespace' => '/\x{00A0}|\s+/u'
  32. ];
  33. /**
  34. * Imported from the Element class on league\html-to-markdown.
  35. *
  36. * @param $node
  37. *
  38. * @return DOMElement
  39. */
  40. public static function nextElement($node)
  41. {
  42. $next = $node;
  43. while ($next
  44. && $next->nodeType !== XML_ELEMENT_NODE
  45. && $next->isWhitespace()) {
  46. $next = $next->nextSibling;
  47. }
  48. return $next;
  49. }
  50. /**
  51. * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
  52. * element with the new tag name and importing it to the main DOMDocument.
  53. *
  54. * @param DOMNode $node
  55. * @param string $value
  56. * @param bool $importAttributes
  57. *
  58. * @return DOMNode
  59. */
  60. public static function setNodeTag($node, $value, $importAttributes = true)
  61. {
  62. $new = new DOMDocument('1.0', 'utf-8');
  63. $new->appendChild($new->createElement($value));
  64. $children = $node->childNodes;
  65. /** @var $children \DOMNodeList $i */
  66. for ($i = 0; $i < $children->length; $i++) {
  67. $import = $new->importNode($children->item($i), true);
  68. $new->firstChild->appendChild($import);
  69. }
  70. if ($importAttributes) {
  71. // Import attributes from the original node.
  72. foreach ($node->attributes as $attribute) {
  73. $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
  74. }
  75. }
  76. // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
  77. $import = $node->ownerDocument->importNode($new->firstChild, true);
  78. $node->parentNode->replaceChild($import, $node);
  79. return $import;
  80. }
  81. /**
  82. * Removes the current node and returns the next node to be parsed (child, sibling or parent).
  83. *
  84. * @param DOMNode $node
  85. *
  86. * @return DOMNode
  87. */
  88. public static function removeAndGetNext($node)
  89. {
  90. $nextNode = self::getNextNode($node, true);
  91. $node->parentNode->removeChild($node);
  92. return $nextNode;
  93. }
  94. /**
  95. * Remove the selected node.
  96. *
  97. * @param $node DOMElement
  98. *
  99. * @return void
  100. **/
  101. public static function removeNode($node)
  102. {
  103. $parent = $node->parentNode;
  104. if ($parent) {
  105. $parent->removeChild($node);
  106. }
  107. }
  108. /**
  109. * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
  110. * for parents.
  111. *
  112. * @param DOMNode $originalNode
  113. * @param bool $ignoreSelfAndKids
  114. *
  115. * @return DOMNode
  116. */
  117. public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
  118. {
  119. /*
  120. * Traverse the DOM from node to node, starting at the node passed in.
  121. * Pass true for the second parameter to indicate this node itself
  122. * (and its kids) are going away, and we want the next node over.
  123. *
  124. * Calling this in a loop will traverse the DOM depth-first.
  125. */
  126. // First check for kids if those aren't being ignored
  127. if (!$ignoreSelfAndKids && $originalNode->firstChild) {
  128. return $originalNode->firstChild;
  129. }
  130. // Then for siblings...
  131. if ($originalNode->nextSibling) {
  132. return $originalNode->nextSibling;
  133. }
  134. // And finally, move up the parent chain *and* find a sibling
  135. // (because this is depth-first traversal, we will have already
  136. // seen the parent nodes themselves).
  137. do {
  138. $originalNode = $originalNode->parentNode;
  139. } while ($originalNode && !$originalNode->nextSibling);
  140. return ($originalNode) ? $originalNode->nextSibling : $originalNode;
  141. }
  142. }