NodeTrait.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. <?php
  2. namespace andreskrey\Readability\Nodes;
  3. use andreskrey\Readability\Nodes\DOM\DOMDocument;
  4. use andreskrey\Readability\Nodes\DOM\DOMElement;
  5. use andreskrey\Readability\Nodes\DOM\DOMNode;
  6. use andreskrey\Readability\Nodes\DOM\DOMText;
  7. use DOMNodeList;
  8. /**
  9. * @method \DOMNode removeAttribute($name)
  10. */
  11. trait NodeTrait
  12. {
  /**
   * Readability score of this node; used to judge how valuable its content is.
   *
   * @var int
   */
  public $contentScore = 0;

  /**
   * True once initializeNode() has computed the starting score for this node.
   *
   * @var bool
   */
  private $initialized = false;

  /**
   * Data-table marker. The accessors of this trait currently keep the flag in
   * the 'readabilityDataTable' DOM attribute instead of this property.
   *
   * @var bool
   */
  private $readabilityDataTable = false;

  /**
   * Tag names that hasSingleChildBlockElement() treats as block elements.
   *
   * @var array
   */
  private $divToPElements = [
    'a', 'blockquote', 'dl', 'div', 'img', 'ol',
    'p', 'pre', 'table', 'ul', 'select',
  ];

  /**
   * Tag names that isPhrasingContent() accepts as phrasing content.
   *
   * The commented out elements qualify as phrasing content but tend to be
   * removed by readability when put into paragraphs, so we ignore them here.
   *
   * @var array
   */
  private $phrasing_elems = [
    // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO',
    'abbr',
    'audio',
    'b',
    'bdo',
    'br',
    'button',
    'cite',
    'code',
    'data',
    'datalist',
    'dfn',
    'em',
    'embed',
    'i',
    'img',
    'input',
    'kbd',
    'label',
    'mark',
    'math',
    'meter',
    'noscript',
    'object',
    'output',
    'progress',
    'q',
    'ruby',
    'samp',
    'script',
    'select',
    'small',
    'span',
    'strong',
    'sub',
    'sup',
    'textarea',
    'time',
    'var',
    'wbr',
  ];
  /**
   * Tells whether initializeNode() has already run on this node.
   *
   * @return bool Current value of the initialized flag
   */
  public function isInitialized()
  {
    return $this->initialized;
  }
  /**
   * Tells whether this node has been marked as a data table.
   *
   * The mark lives in the 'readabilityDataTable' attribute and is only
   * considered set when that attribute holds exactly '1'.
   *
   * @return bool
   */
  public function isReadabilityDataTable()
  {
    /*
     * Workaround that should go away in the future.
     * Although we extend the base DOMElement with custom properties (such as 'readabilityDataTable'),
     * those properties are lost when elements are looked up again with getElementsByTagName: every
     * custom property is back at its default value, even if an earlier step had marked the table.
     * Until the properties can be made permanent across the whole DOM, the mark is read from an
     * attribute of the node instead.
     *
     * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
     */
    if (!$this->hasAttribute('readabilityDataTable')) {
      return false;
    }

    return $this->getAttribute('readabilityDataTable') === '1';
  }
  /**
   * Marks or unmarks this node as a data table.
   *
   * The mark is written to the 'readabilityDataTable' attribute as '1' (truthy
   * $param) or '0' (falsy $param).
   *
   * @param bool $param Whether the node is a data table
   */
  public function setReadabilityDataTable($param)
  {
    // Stored as a string: a boolean true would be cast to "1" by DOMDocument anyway.
    $flag = '0';
    if ($param) {
      $flag = '1';
    }

    $this->setAttribute('readabilityDataTable', $flag);
  }
  /**
   * Computes the starting content score of this node, once.
   *
   * The base score depends on the tag name; when $weightClasses is truthy the
   * result of getClassWeight() is added on top. Calling this again on a node
   * that is already initialized changes nothing.
   *
   * @ TODO: I don't like the weightClasses param. How can we get the config here?
   *
   * @param bool $weightClasses Add the class/id weight to the base score?
   *
   * @return static The node itself
   */
  public function initializeNode($weightClasses)
  {
    if ($this->isInitialized()) {
      return $this;
    }

    // Base score per tag name; tags that are not listed start at 0.
    $baseScores = [
      'div' => 5,
      'pre' => 3,
      'td' => 3,
      'blockquote' => 3,
      'address' => -3,
      'ol' => -3,
      'ul' => -3,
      'dl' => -3,
      'dd' => -3,
      'dt' => -3,
      'li' => -3,
      'form' => -3,
      'h1' => -5,
      'h2' => -5,
      'h3' => -5,
      'h4' => -5,
      'h5' => -5,
      'h6' => -5,
      'th' => -5,
    ];

    $tagScore = isset($baseScores[$this->nodeName]) ? $baseScores[$this->nodeName] : 0;
    $classScore = $weightClasses ? $this->getClassWeight() : 0;

    $this->contentScore = $tagScore + $classScore;
    $this->initialized = true;

    return $this;
  }
  /**
   * Null-safe wrapper around the native getAttribute().
   *
   * Not every node type carries attributes, so the 'attributes' property is
   * checked first; when it is null an empty string is returned instead of
   * delegating to the parent implementation.
   *
   * @param string $attributeName Attribute to retrieve
   *
   * @return string
   */
  public function getAttribute($attributeName)
  {
    return $this->attributes === null
      ? ''
      : parent::getAttribute($attributeName);
  }
  /**
   * Null-safe wrapper around the native hasAttribute().
   *
   * Returns false without delegating when the node has no 'attributes'
   * property value (null).
   *
   * @see getAttribute
   *
   * @param string $attributeName Attribute to look for
   *
   * @return bool
   */
  public function hasAttribute($attributeName)
  {
    return $this->attributes === null
      ? false
      : parent::hasAttribute($attributeName);
  }
  /**
   * Collects the ancestors of this node, nearest first.
   *
   * The walk ends when there is no parent left, when the parent is a
   * DOMDocument instance (the class imported by this file), or as soon as
   * $maxLevel ancestors have been collected. The limit is compared strictly
   * against an integer counter, so passing false never stops the walk early.
   *
   * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
   *
   * @return array
   */
  public function getNodeAncestors($maxLevel = 3)
  {
    $collected = [];

    for ($current = $this->parentNode; $current && !($current instanceof DOMDocument); $current = $current->parentNode) {
      $collected[] = $current;

      if (count($collected) === $maxLevel) {
        break;
      }
    }

    return $collected;
  }
  /**
   * Lists every 'a' element found by getElementsByTagName() on this node.
   *
   * @return array The matching nodes as a plain array
   */
  public function getAllLinks()
  {
    $anchors = $this->getElementsByTagName('a');

    return iterator_to_array($anchors);
  }
  /**
   * Get the density of links as a fraction of the content.
   *
   * This is the length of the normalized text found inside links divided by
   * the length of the normalized text of the whole node. A node without any
   * text has a density of 0.
   *
   * @return int|float
   */
  public function getLinkDensity()
  {
    $totalLength = mb_strlen($this->getTextContent(true));
    if (!$totalLength) {
      return 0;
    }

    $anchorLengths = array_map(function ($anchor) {
      /** @var DOMElement $anchor */
      return mb_strlen($anchor->getTextContent(true));
    }, $this->getAllLinks());

    return array_sum($anchorLengths) / $totalLength;
  }
  /**
   * Calculates the weight of the class and id attributes of this element.
   *
   * Each of the two attributes is checked against the 'negative' and
   * 'positive' patterns of NodeUtility::$regexps: a negative match subtracts
   * 25, a positive match adds 25. An attribute whose trimmed value is falsy
   * is skipped.
   *
   * @return int
   */
  public function getClassWeight()
  {
    $weight = 0;

    foreach (['class', 'id'] as $attributeName) {
      $value = $this->getAttribute($attributeName);
      if (!trim($value)) {
        continue;
      }

      if (preg_match(NodeUtility::$regexps['negative'], $value)) {
        $weight -= 25;
      }

      if (preg_match(NodeUtility::$regexps['positive'], $value)) {
        $weight += 25;
      }
    }

    return $weight;
  }
  /**
   * Returns the text of the node (its nodeValue).
   *
   * With $normalize enabled, every run of two or more whitespace characters
   * is replaced by a single space and the result is trimmed.
   *
   * @param bool $normalize Normalize white space?
   *
   * @return string
   */
  public function getTextContent($normalize = false)
  {
    if (!$normalize) {
      return $this->nodeValue;
    }

    $collapsed = preg_replace('/\s{2,}/', ' ', $this->nodeValue);

    return trim($collapsed);
  }
  /**
   * Returns the child nodes of this node as an array.
   *
   * With $filterEmptyDOMText enabled, '#text' children whose trimmed value is
   * empty are dropped and the remaining nodes are indexed from 0 without gaps.
   *
   * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
   *
   * @return array
   */
  public function getChildren($filterEmptyDOMText = false)
  {
    $nodes = iterator_to_array($this->childNodes);
    if (!$filterEmptyDOMText) {
      return $nodes;
    }

    // Appending to a fresh array keeps the keys sequential (0 to whatever, no number skipped).
    $kept = [];
    foreach ($nodes as $node) {
      if ($node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue))) {
        $kept[] = $node;
      }
    }

    return $kept;
  }
  /**
   * Return an array indicating how many rows and columns this table has.
   *
   * Every 'tr' found below this node adds its 'rowspan' to the row count, and
   * the column count is the largest sum of 'colspan' values over the 'td'
   * elements found below a single 'tr'. A missing, non-numeric or non-positive
   * span counts as 1.
   *
   * @return array ['rows' => int, 'columns' => int]
   */
  public function getRowAndColumnCount()
  {
    // `$span || 1` is a JavaScript idiom: in PHP it evaluates to a boolean, which made every
    // row and cell count as exactly 1. Read the attribute as an integer instead.
    $spanOf = function ($node, $attribute) {
      return max(1, (int) $node->getAttribute($attribute));
    };

    $rows = 0;
    $columns = 0;

    foreach ($this->getElementsByTagName('tr') as $tr) {
      /** @var \DOMElement $tr */
      $rows += $spanOf($tr, 'rowspan');

      // Now look for column-related info
      $columnsInThisRow = 0;
      foreach ($tr->getElementsByTagName('td') as $cell) {
        /** @var \DOMElement $cell */
        $columnsInThisRow += $spanOf($cell, 'colspan');
      }

      $columns = max($columns, $columnsInThisRow);
    }

    return ['rows' => $rows, 'columns' => $columns];
  }
  /**
   * Creates a new element holding the text content of the original node.
   *
   * The element is created through the owner document of $originalNode and is
   * not inserted anywhere. The text is attached as a text node rather than via
   * the value argument of createElement(), which does not escape '&' and would
   * emit an "unterminated entity reference" warning and lose content for text
   * that contains an ampersand.
   *
   * @param DOMNode $originalNode Node providing the text and the owner document
   * @param string $tagName Tag name of the element to create
   *
   * @return DOMElement
   */
  public function createNode($originalNode, $tagName)
  {
    $text = (string) $originalNode->getTextContent();
    $document = $originalNode->ownerDocument;

    $newNode = $document->createElement($tagName);

    // Keep the element childless for empty text, as createElement($tagName, '') would.
    if ($text !== '') {
      $newNode->appendChild($document->createTextNode($text));
    }

    return $newNode;
  }
  /**
   * Checks whether one of the ancestors of this node has the given tag name.
   *
   * Ancestors are visited from the parent upwards, with the parent at depth 0.
   * When $maxDepth is greater than 0 the search gives up as soon as the depth
   * exceeds it; any other value disables the limit. If a filter is given, an
   * ancestor with the right tag name only counts when the filter returns a
   * truthy value for it.
   *
   * @param string $tagName Node name to look for (compared case-sensitively)
   * @param int $maxDepth Depth limit, 0 or less for no limit
   * @param callable $filterFn Optional extra check, receives the candidate ancestor
   *
   * @return bool
   */
  public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
  {
    for ($depth = 0, $ancestor = $this->parentNode; $ancestor; $ancestor = $ancestor->parentNode, $depth++) {
      if ($maxDepth > 0 && $depth > $maxDepth) {
        return false;
      }

      if ($ancestor->nodeName !== $tagName) {
        continue;
      }

      if (!$filterFn || $filterFn($ancestor)) {
        return true;
      }
    }

    return false;
  }
  /**
   * Checks whether the only meaningful child of this node is a node with the
   * given name.
   *
   * Whitespace-only text children are ignored (see getChildren(true)). The
   * result is false when, after ignoring them, there is not exactly one child
   * left or that child has a different node name.
   *
   * @param string $tag Name of tag
   *
   * @return bool
   */
  public function hasSingleTagInsideElement($tag)
  {
    $children = $this->getChildren(true);

    // There should be exactly 1 child left, and it must carry the given name
    if (count($children) !== 1) {
      return false;
    }

    /* @var DOMNode $onlyChild */
    $onlyChild = $children[0];
    if ($onlyChild->nodeName !== $tag) {
      return false;
    }

    // Final check on the remaining child: a text node only passes when its content matches /\S$/
    $isRejectedText = $onlyChild->nodeType === XML_TEXT_NODE
      && !preg_match('/\S$/', $onlyChild->getTextContent());

    return !$isRejectedText;
  }
  /**
   * Checks whether a block element exists below this node.
   *
   * Block elements are the ones listed in the divToPElements array. A child
   * that is not a block element itself is searched recursively, so a match at
   * any depth makes the result true.
   *
   * @return bool
   */
  public function hasSingleChildBlockElement()
  {
    if (!$this->hasChildNodes()) {
      return false;
    }

    foreach ($this->getChildren() as $child) {
      if (in_array($child->nodeName, $this->divToPElements, true)) {
        return true;
      }

      // Not a block element itself: a match further down is just as good.
      /** @var $child DOMElement */
      if ($child->hasSingleChildBlockElement()) {
        return true;
      }
    }

    return false;
  }
  /**
   * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
   *
   * The node must be a DOMElement whose text is empty once everything matched
   * by the 'onlyWhitespace' pattern of NodeUtility::$regexps is removed. It
   * then qualifies if it has no child nodes at all, or if the number of child
   * nodes equals the number of 'br' and 'hr' elements found below it plus the
   * number of its DOMText children.
   *
   * @return bool
   */
  public function isElementWithoutContent()
  {
    if (!($this instanceof DOMElement)) {
      return false;
    }

    $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
    if (mb_strlen($remainingText) !== 0) {
      return false;
    }

    $childCount = $this->childNodes->length;
    if ($childCount === 0) {
      return true;
    }

    $dividers = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

    /*
     * Special PHP DOMDocument case: DOMText nodes are child nodes too (which is not how the JS version
     * counts them). For an element such as "<p> <br/></p>", counting only BRs and HRs gives 1 while the
     * element has 2 child nodes. So the DOMText children are counted as well; at this point they are
     * known to be empty or whitespace only, because of the text length check above.
     */
    $textNodes = 0;
    foreach ($this->childNodes as $child) {
      if ($child instanceof DOMText) {
        $textNodes++;
      }
    }

    return $childCount === $dividers + $textNodes;
  }
  /**
   * Determine if a node qualifies as phrasing content.
   * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
   *
   * A node qualifies when it is a text node, when its name is listed in the
   * phrasing_elems array, or when it is an 'a', 'del' or 'ins' node whose
   * children all qualify themselves.
   *
   * @return bool
   */
  public function isPhrasingContent()
  {
    if ($this->nodeType === XML_TEXT_NODE) {
      return true;
    }

    if (in_array($this->nodeName, $this->phrasing_elems)) {
      return true;
    }

    $isTransparentWrapper = $this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins';
    if (is_null($this->childNodes) || !$isTransparentWrapper) {
      return false;
    }

    $allChildrenQualify = true;
    foreach ($this->childNodes as $child) {
      // The child is always asked first, so every child gets evaluated even after a failure.
      $allChildrenQualify = $child->isPhrasingContent() && $allChildrenQualify;
    }

    return $allChildrenQualify;
  }
  /**
   * Guesses whether the node is visible.
   *
   * The node counts as hidden when its style attribute matches
   * /display:( )?none/ or when it has a 'hidden' attribute.
   *
   * @return bool
   */
  public function isProbablyVisible()
  {
    /*
     * The original JS project inspects the computed style of the node (display=none), which we have
     * no way of knowing here. So only the inline style and the hidden attribute are checked.
     *
     * Might be a good idea to check for classes or other attributes like 'aria-hidden'
     */
    $hiddenByStyle = preg_match('/display:( )?none/', $this->getAttribute('style'));
    if ($hiddenByStyle) {
      return false;
    }

    return !$this->hasAttribute('hidden');
  }
  /**
   * Tells whether the node is whitespace: either a text node whose trimmed
   * text content is empty, or a 'br' element.
   *
   * @return bool
   */
  public function isWhitespace()
  {
    if ($this->nodeType === XML_TEXT_NODE) {
      return mb_strlen(trim($this->textContent)) === 0;
    }

    return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
  }
  /**
   * Iterates over the elements with the given tag name while tolerating changes to the result
   * set made by the consumer between two iterations.
   *
   * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
   *
   * In the JS version of getElementsByTagName, a removed node will not appear during the foreach.
   * With PHP DOMDocument it still appears, but as an orphan node that gives an exception as soon
   * as you try to do anything with it.
   *
   * Shifting also occurs when converting parent nodes (like a P to a DIV): the found nodes are
   * removed from the foreach "pool", but the internal index of the foreach is not aware of it and
   * skips over nodes that were never looped over (index is at position 5, 2 nodes are removed, the
   * next one should be node 3, but the foreach tries to access node 6).
   *
   * To cope with that, the search is repeated after every yielded node and the position is moved
   * back by the amount of nodes that disappeared in the meantime. Searching again on every loop
   * could cause a performance impact, so this should be used only when the results of the search
   * are going to be used to remove the nodes.
   *
   * @param string $tag
   *
   * @return \Generator
   */
  public function shiftingAwareGetElementsByTagName($tag)
  {
    /** @var $matches DOMNodeList */
    $matches = $this->getElementsByTagName($tag);
    $known = $matches->length;
    $position = 0;

    while ($position < $known) {
      yield $matches->item($position);

      // Search for all the nodes again
      $matches = $this->getElementsByTagName($tag);
      $removed = $known - $matches->length;

      // Step back by the amount of nodes removed, then advance by one without going below 0
      $position = max($position - $removed + 1, 0);

      // The amount of nodes left to consider is whatever the new search found
      $known -= $removed;
    }
  }
  492. }