From 7d34c6ac98e952782ab8665eaab774e1a5d29f5d Mon Sep 17 00:00:00 2001 From: Andres Rey Date: Fri, 1 Dec 2017 00:09:08 +0000 Subject: Rename NodeClass namespace to Nodes --- src/NodeClass/DOMAttr.php | 8 - src/NodeClass/DOMCdataSection.php | 8 - src/NodeClass/DOMCharacterData.php | 8 - src/NodeClass/DOMComment.php | 8 - src/NodeClass/DOMDocument.php | 26 -- src/NodeClass/DOMDocumentFragment.php | 8 - src/NodeClass/DOMDocumentType.php | 8 - src/NodeClass/DOMElement.php | 8 - src/NodeClass/DOMNode.php | 14 - src/NodeClass/DOMNotation.php | 8 - src/NodeClass/DOMProcessingInstruction.php | 8 - src/NodeClass/DOMText.php | 8 - src/NodeClass/NodeClassTrait.php | 406 ----------------------------- src/NodeUtility.php | 164 ------------ src/Nodes/DOMAttr.php | 8 + src/Nodes/DOMCdataSection.php | 8 + src/Nodes/DOMCharacterData.php | 8 + src/Nodes/DOMComment.php | 8 + src/Nodes/DOMDocument.php | 26 ++ src/Nodes/DOMDocumentFragment.php | 8 + src/Nodes/DOMDocumentType.php | 8 + src/Nodes/DOMElement.php | 8 + src/Nodes/DOMNode.php | 14 + src/Nodes/DOMNotation.php | 8 + src/Nodes/DOMProcessingInstruction.php | 8 + src/Nodes/DOMText.php | 8 + src/Nodes/NodeTrait.php | 404 ++++++++++++++++++++++++++++ src/Nodes/NodeUtility.php | 160 ++++++++++++ 28 files changed, 684 insertions(+), 690 deletions(-) delete mode 100644 src/NodeClass/DOMAttr.php delete mode 100644 src/NodeClass/DOMCdataSection.php delete mode 100644 src/NodeClass/DOMCharacterData.php delete mode 100644 src/NodeClass/DOMComment.php delete mode 100644 src/NodeClass/DOMDocument.php delete mode 100644 src/NodeClass/DOMDocumentFragment.php delete mode 100644 src/NodeClass/DOMDocumentType.php delete mode 100644 src/NodeClass/DOMElement.php delete mode 100644 src/NodeClass/DOMNode.php delete mode 100644 src/NodeClass/DOMNotation.php delete mode 100644 src/NodeClass/DOMProcessingInstruction.php delete mode 100644 src/NodeClass/DOMText.php delete mode 100644 src/NodeClass/NodeClassTrait.php delete mode 100644 src/NodeUtility.php create mode 100644 src/Nodes/DOMAttr.php create mode 100644 src/Nodes/DOMCdataSection.php create mode 100644 src/Nodes/DOMCharacterData.php create mode 100644 src/Nodes/DOMComment.php create mode 100644 src/Nodes/DOMDocument.php create mode 100644 src/Nodes/DOMDocumentFragment.php create mode 100644 src/Nodes/DOMDocumentType.php create mode 100644 src/Nodes/DOMElement.php create mode 100644 src/Nodes/DOMNode.php create mode 100644 src/Nodes/DOMNotation.php create mode 100644 src/Nodes/DOMProcessingInstruction.php create mode 100644 src/Nodes/DOMText.php create mode 100644 src/Nodes/NodeTrait.php create mode 100644 src/Nodes/NodeUtility.php (limited to 'src') diff --git a/src/NodeClass/DOMAttr.php b/src/NodeClass/DOMAttr.php deleted file mode 100644 index ea8672d..0000000 --- a/src/NodeClass/DOMAttr.php +++ /dev/null @@ -1,8 +0,0 @@ -registerNodeClass('DOMAttr', DOMAttr::class); - $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); - $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); - $this->registerNodeClass('DOMComment', DOMComment::class); - $this->registerNodeClass('DOMDocument', DOMDocument::class); - $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); - $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); - $this->registerNodeClass('DOMElement', DOMElement::class); - $this->registerNodeClass('DOMNode', DOMNode::class); - $this->registerNodeClass('DOMNotation', DOMNotation::class); - $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); - $this->registerNodeClass('DOMText', DOMText::class); - } -} diff --git a/src/NodeClass/DOMDocumentFragment.php b/src/NodeClass/DOMDocumentFragment.php deleted file mode 100644 index cc8b753..0000000 --- a/src/NodeClass/DOMDocumentFragment.php +++ /dev/null @@ -1,8 +0,0 @@ -initialized; - } - - /** - * Initializer. Calculates the current score of the node and returns a full Readability object. - * - * @ TODO: I don't like the weightClasses param. How can we get the config here? - * - * @param $weightClasses bool Weight classes? - * @return static - */ - public function initializeNode($weightClasses) - { - if (!$this->isInitialized()) { - $contentScore = 0; - - switch ($this->nodeName) { - case 'div': - $contentScore += 5; - break; - - case 'pre': - case 'td': - case 'blockquote': - $contentScore += 3; - break; - - case 'address': - case 'ol': - case 'ul': - case 'dl': - case 'dd': - case 'dt': - case 'li': - case 'form': - $contentScore -= 3; - break; - - case 'h1': - case 'h2': - case 'h3': - case 'h4': - case 'h5': - case 'h6': - case 'th': - $contentScore -= 5; - break; - } - - $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0); - - $this->initialized = true; - } - - return $this; - } - - /** - * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need - * to check first the existence of the attributes property. - * - * @param $attributeName string Attribute to retrieve - * - * @return string - */ - public function getAttribute($attributeName) - { - if (!is_null($this->attributes)) { - return parent::getAttribute($attributeName); - } - - return ''; - } - - /** - * Get the ancestors of the current node. - * - * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them - * - * @return array - */ - public function getNodeAncestors($maxLevel = 3) - { - $ancestors = []; - $level = 0; - - $node = $this->parentNode; - - while ($node) { - $ancestors[] = $node; - $level++; - if ($level === $maxLevel) { - break; - } - $node = $node->parentNode; - } - - return $ancestors; - } - - /** - * Returns all links from the current element. - * - * @return array - */ - public function getAllLinks() - { - return iterator_to_array($this->getElementsByTagName('a')); - } - - /** - * Get the density of links as a percentage of the content - * This is the amount of text that is inside a link divided by the total text in the node. - * - * @return int - */ - public function getLinkDensity() - { - $linkLength = 0; - $textLength = mb_strlen($this->getTextContent(true)); - - if (!$textLength) { - return 0; - } - - $links = $this->getAllLinks(); - - if ($links) { - /** @var DOMElement $link */ - foreach ($links as $link) { - $linkLength += mb_strlen($link->getTextContent(true)); - } - } - - return $linkLength / $textLength; - } - - - /** - * Calculates the weight of the class/id of the current element. - * - * @return int - */ - public function getClassWeight() - { - $weight = 0; - - // Look for a special classname - $class = $this->getAttribute('class'); - if (trim($class)) { - if (preg_match(NodeUtility::$regexps['negative'], $class)) { - $weight -= 25; - } - - if (preg_match(NodeUtility::$regexps['positive'], $class)) { - $weight += 25; - } - } - - // Look for a special ID - $id = $this->getAttribute('id'); - if (trim($id)) { - if (preg_match(NodeUtility::$regexps['negative'], $id)) { - $weight -= 25; - } - - if (preg_match(NodeUtility::$regexps['positive'], $id)) { - $weight += 25; - } - } - - return $weight; - } - - /** - * Returns the full text of the node. - * - * @param bool $normalize Normalize white space? - * - * @return string - */ - public function getTextContent($normalize = false) - { - $nodeValue = $this->nodeValue; - if ($normalize) { - $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); - } - - return $nodeValue; - } - - /** - * Returns the children of the current node. - * - * @param bool $filterEmptyDOMText Filter empty DOMText nodes? - * - * @return array - */ - public function getChildren($filterEmptyDOMText = false) - { - $ret = iterator_to_array($this->childNodes); - if ($filterEmptyDOMText) { - // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number - $ret = array_values(array_filter($ret, function ($node) { - return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); - })); - } - - return $ret; - } - - /** - * Return an array indicating how many rows and columns this table has. - * - * @return array - */ - public function getRowAndColumnCount() - { - $rows = $columns = 0; - $trs = $this->getElementsByTagName('tr'); - foreach ($trs as $tr) { - /** @var \DOMElement $tr */ - $rowspan = $tr->getAttribute('rowspan'); - $rows += ($rowspan || 1); - - // Now look for column-related info - $columnsInThisRow = 0; - $cells = $tr->getElementsByTagName('td'); - foreach ($cells as $cell) { - /** @var \DOMElement $cell */ - $colspan = $cell->getAttribute('colspan'); - $columnsInThisRow += ($colspan || 1); - } - $columns = max($columns, $columnsInThisRow); - } - - return ['rows' => $rows, 'columns' => $columns]; - } - - - /** - * Creates a new node based on the text content of the original node. - * - * @param $originalNode DOMElement - * @param $tagName string - * - * @return DOMElement - */ - public function createNode($originalNode, $tagName) - { - $text = $originalNode->getTextContent(); - $newNode = $originalNode->ownerDocument->createElement($tagName, $text); - - return $newNode; - } - - /** - * Check if a given node has one of its ancestor tag name matching the - * provided one. - * - * @param DOMElement $node - * @param string $tagName - * @param int $maxDepth - * - * @return bool - */ - public function hasAncestorTag($node, $tagName, $maxDepth = 3) - { - $depth = 0; - while ($node->parentNode) { - if ($maxDepth > 0 && $depth > $maxDepth) { - return false; - } - if ($node->parentNode->nodeName === $tagName) { - return true; - } - $node = $node->parentNode; - $depth++; - } - - return false; - } - - /** - * Checks if the current node has a single child and if that child is a P node. - * Useful to convert

nodes to a single

node and avoid confusing the scoring system since div with p - * tags are, in practice, paragraphs. - * - * @param DOMNode $node - * - * @return bool - */ - public function hasSinglePNode() - { - // There should be exactly 1 element child which is a P: - if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') { - return false; - } - - // And there should be no text nodes with real content (param true on ->getChildren) - foreach ($children as $child) { - /** @var $child DOMNode */ - if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) { - return false; - } - } - - return true; - } - - /** - * Check if the current element has a single child block element. - * Block elements are the ones defined in the divToPElements array. - * - * @return bool - */ - public function hasSingleChildBlockElement() - { - $result = false; - if ($this->hasChildNodes()) { - foreach ($this->getChildren() as $child) { - if (in_array($child->nodeName, $this->divToPElements)) { - $result = true; - } else { - // If any of the hasSingleChildBlockElement calls return true, return true then. - /** @var $child DOMElement */ - $result = ($result || $child->hasSingleChildBlockElement()); - } - } - } - - return $result; - } - - /** - * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. - * - * @return bool - */ - public function isElementWithoutContent() - { - return $this instanceof DOMElement && - mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && - ($this->childNodes->length === 0 || - $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length - /* - * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node. - * If there's an empty tag with an space inside and a BR (for example "


) counting only BRs and - * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument, - * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we - * are dealing with (And at this point we know they are empty or are just whitespace, because of the - * mb_strlen in this chain of checks). - */ - + count(array_filter(iterator_to_array($this->childNodes), function ($child) { - return $child instanceof DOMText; - })) - - ); - } -} diff --git a/src/NodeUtility.php b/src/NodeUtility.php deleted file mode 100644 index d0796dd..0000000 --- a/src/NodeUtility.php +++ /dev/null @@ -1,164 +0,0 @@ - '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', - 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', - 'byline' => '/byline|author|dateline|writtenby|p-author/i', - 'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/', - 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', - 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', - 'prevLink' => '/(prev|earl|old|new|<|«)/i', - 'whitespace' => '/^\s*$/', - 'hasContent' => '/\S$/', - 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', - 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', - // \x{00A0} is the unicode version of   - 'onlyWhitespace' => '/\x{00A0}|\s+/u' - ]; - - - /** - * - * Imported from the Element class on league\html-to-markdown - * - * @param $node - * @return DOMElement - */ - public static function nextElement($node) - { - $next = $node; - while ($next - && $next->nodeName !== '#text' - && trim($next->textContent)) { - $next = $next->nextSibling; - } - - return $next; - } - - - /** - * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new - * element with the new tag name and importing it to the main DOMDocument. - * - * @param string $value - * @param bool $importAttributes - * @return DOMNode - */ - public static function setNodeTag($node, $value, $importAttributes = false) - { - $new = new DOMDocument('1.0', 'utf-8'); - $new->appendChild($new->createElement($value)); - - $children = $node->childNodes; - /** @var $children \DOMNodeList $i */ - - for ($i = 0; $i < $children->length; $i++) { - $import = $new->importNode($children->item($i), true); - $new->firstChild->appendChild($import); - } - - if ($importAttributes) { - // Import attributes from the original node. - foreach ($node->attributes as $attribute) { - $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); - } - } - - // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. - $import = $node->ownerDocument->importNode($new->firstChild, true); - $node->parentNode->replaceChild($import, $node); - - return $import; - } - - /** - * Removes the current node and returns the next node to be parsed (child, sibling or parent). - * - * @param DOMNode $node - * - * @return DOMNode - */ - public static function removeAndGetNext($node) - { - $nextNode = self::getNextNode($node, true); - $node->parentNode->removeChild($node); - - return $nextNode; - } - - /** - * Remove the selected node. - * - * @param $node DOMElement - * - * @return void - **/ - public static function removeNode($node) - { - $parent = $node->parentNode; - if ($parent) { - $parent->removeChild($node); - } - } - - - /** - * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally - * for parents. - * - * @param DOMNode $originalNode - * @param bool $ignoreSelfAndKids - * - * @return DOMNode - */ - public static function getNextNode($originalNode, $ignoreSelfAndKids = false) - { - /* - * Traverse the DOM from node to node, starting at the node passed in. - * Pass true for the second parameter to indicate this node itself - * (and its kids) are going away, and we want the next node over. - * - * Calling this in a loop will traverse the DOM depth-first. - */ - - // First check for kids if those aren't being ignored - if (!$ignoreSelfAndKids && $originalNode->firstChild) { - return $originalNode->firstChild; - } - - // Then for siblings... - if ($originalNode->nextSibling) { - return $originalNode->nextSibling; - } - - // And finally, move up the parent chain *and* find a sibling - // (because this is depth-first traversal, we will have already - // seen the parent nodes themselves). - do { - $originalNode = $originalNode->parentNode; - } while ($originalNode && !$originalNode->nextSibling); - - return ($originalNode) ? $originalNode->nextSibling : $originalNode; - } -} diff --git a/src/Nodes/DOMAttr.php b/src/Nodes/DOMAttr.php new file mode 100644 index 0000000..c31517a --- /dev/null +++ b/src/Nodes/DOMAttr.php @@ -0,0 +1,8 @@ +registerNodeClass('DOMAttr', DOMAttr::class); + $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); + $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); + $this->registerNodeClass('DOMComment', DOMComment::class); + $this->registerNodeClass('DOMDocument', DOMDocument::class); + $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); + $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); + $this->registerNodeClass('DOMElement', DOMElement::class); + $this->registerNodeClass('DOMNode', DOMNode::class); + $this->registerNodeClass('DOMNotation', DOMNotation::class); + $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); + $this->registerNodeClass('DOMText', DOMText::class); + } +} diff --git a/src/Nodes/DOMDocumentFragment.php b/src/Nodes/DOMDocumentFragment.php new file mode 100644 index 0000000..d5f013e --- /dev/null +++ b/src/Nodes/DOMDocumentFragment.php @@ -0,0 +1,8 @@ +initialized; + } + + /** + * Initializer. Calculates the current score of the node and returns a full Readability object. + * + * @ TODO: I don't like the weightClasses param. How can we get the config here? + * + * @param $weightClasses bool Weight classes? + * @return static + */ + public function initializeNode($weightClasses) + { + if (!$this->isInitialized()) { + $contentScore = 0; + + switch ($this->nodeName) { + case 'div': + $contentScore += 5; + break; + + case 'pre': + case 'td': + case 'blockquote': + $contentScore += 3; + break; + + case 'address': + case 'ol': + case 'ul': + case 'dl': + case 'dd': + case 'dt': + case 'li': + case 'form': + $contentScore -= 3; + break; + + case 'h1': + case 'h2': + case 'h3': + case 'h4': + case 'h5': + case 'h6': + case 'th': + $contentScore -= 5; + break; + } + + $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0); + + $this->initialized = true; + } + + return $this; + } + + /** + * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need + * to check first the existence of the attributes property. + * + * @param $attributeName string Attribute to retrieve + * + * @return string + */ + public function getAttribute($attributeName) + { + if (!is_null($this->attributes)) { + return parent::getAttribute($attributeName); + } + + return ''; + } + + /** + * Get the ancestors of the current node. + * + * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them + * + * @return array + */ + public function getNodeAncestors($maxLevel = 3) + { + $ancestors = []; + $level = 0; + + $node = $this->parentNode; + + while ($node) { + $ancestors[] = $node; + $level++; + if ($level === $maxLevel) { + break; + } + $node = $node->parentNode; + } + + return $ancestors; + } + + /** + * Returns all links from the current element. + * + * @return array + */ + public function getAllLinks() + { + return iterator_to_array($this->getElementsByTagName('a')); + } + + /** + * Get the density of links as a percentage of the content + * This is the amount of text that is inside a link divided by the total text in the node. + * + * @return int + */ + public function getLinkDensity() + { + $linkLength = 0; + $textLength = mb_strlen($this->getTextContent(true)); + + if (!$textLength) { + return 0; + } + + $links = $this->getAllLinks(); + + if ($links) { + /** @var DOMElement $link */ + foreach ($links as $link) { + $linkLength += mb_strlen($link->getTextContent(true)); + } + } + + return $linkLength / $textLength; + } + + + /** + * Calculates the weight of the class/id of the current element. + * + * @return int + */ + public function getClassWeight() + { + $weight = 0; + + // Look for a special classname + $class = $this->getAttribute('class'); + if (trim($class)) { + if (preg_match(NodeUtility::$regexps['negative'], $class)) { + $weight -= 25; + } + + if (preg_match(NodeUtility::$regexps['positive'], $class)) { + $weight += 25; + } + } + + // Look for a special ID + $id = $this->getAttribute('id'); + if (trim($id)) { + if (preg_match(NodeUtility::$regexps['negative'], $id)) { + $weight -= 25; + } + + if (preg_match(NodeUtility::$regexps['positive'], $id)) { + $weight += 25; + } + } + + return $weight; + } + + /** + * Returns the full text of the node. + * + * @param bool $normalize Normalize white space? + * + * @return string + */ + public function getTextContent($normalize = false) + { + $nodeValue = $this->nodeValue; + if ($normalize) { + $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); + } + + return $nodeValue; + } + + /** + * Returns the children of the current node. + * + * @param bool $filterEmptyDOMText Filter empty DOMText nodes? + * + * @return array + */ + public function getChildren($filterEmptyDOMText = false) + { + $ret = iterator_to_array($this->childNodes); + if ($filterEmptyDOMText) { + // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number + $ret = array_values(array_filter($ret, function ($node) { + return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); + })); + } + + return $ret; + } + + /** + * Return an array indicating how many rows and columns this table has. + * + * @return array + */ + public function getRowAndColumnCount() + { + $rows = $columns = 0; + $trs = $this->getElementsByTagName('tr'); + foreach ($trs as $tr) { + /** @var \DOMElement $tr */ + $rowspan = $tr->getAttribute('rowspan'); + $rows += ($rowspan || 1); + + // Now look for column-related info + $columnsInThisRow = 0; + $cells = $tr->getElementsByTagName('td'); + foreach ($cells as $cell) { + /** @var \DOMElement $cell */ + $colspan = $cell->getAttribute('colspan'); + $columnsInThisRow += ($colspan || 1); + } + $columns = max($columns, $columnsInThisRow); + } + + return ['rows' => $rows, 'columns' => $columns]; + } + + + /** + * Creates a new node based on the text content of the original node. + * + * @param $originalNode DOMElement + * @param $tagName string + * + * @return DOMElement + */ + public function createNode($originalNode, $tagName) + { + $text = $originalNode->getTextContent(); + $newNode = $originalNode->ownerDocument->createElement($tagName, $text); + + return $newNode; + } + + /** + * Check if a given node has one of its ancestor tag name matching the + * provided one. + * + * @param DOMElement $node + * @param string $tagName + * @param int $maxDepth + * + * @return bool + */ + public function hasAncestorTag($node, $tagName, $maxDepth = 3) + { + $depth = 0; + while ($node->parentNode) { + if ($maxDepth > 0 && $depth > $maxDepth) { + return false; + } + if ($node->parentNode->nodeName === $tagName) { + return true; + } + $node = $node->parentNode; + $depth++; + } + + return false; + } + + /** + * Checks if the current node has a single child and if that child is a P node. + * Useful to convert

nodes to a single

node and avoid confusing the scoring system since div with p + * tags are, in practice, paragraphs. + * + * @param DOMNode $node + * + * @return bool + */ + public function hasSinglePNode() + { + // There should be exactly 1 element child which is a P: + if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') { + return false; + } + + // And there should be no text nodes with real content (param true on ->getChildren) + foreach ($children as $child) { + /** @var $child DOMNode */ + if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) { + return false; + } + } + + return true; + } + + /** + * Check if the current element has a single child block element. + * Block elements are the ones defined in the divToPElements array. + * + * @return bool + */ + public function hasSingleChildBlockElement() + { + $result = false; + if ($this->hasChildNodes()) { + foreach ($this->getChildren() as $child) { + if (in_array($child->nodeName, $this->divToPElements)) { + $result = true; + } else { + // If any of the hasSingleChildBlockElement calls return true, return true then. + /** @var $child DOMElement */ + $result = ($result || $child->hasSingleChildBlockElement()); + } + } + } + + return $result; + } + + /** + * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. + * + * @return bool + */ + public function isElementWithoutContent() + { + return $this instanceof DOMElement && + mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && + ($this->childNodes->length === 0 || + $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length + /* + * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node. + * If there's an empty tag with an space inside and a BR (for example "


) counting only BRs and + * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument, + * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we + * are dealing with (And at this point we know they are empty or are just whitespace, because of the + * mb_strlen in this chain of checks). + */ + + count(array_filter(iterator_to_array($this->childNodes), function ($child) { + return $child instanceof DOMText; + })) + + ); + } +} diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php new file mode 100644 index 0000000..f35e9c5 --- /dev/null +++ b/src/Nodes/NodeUtility.php @@ -0,0 +1,160 @@ + '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', + 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', + 'byline' => '/byline|author|dateline|writtenby|p-author/i', + 'replaceFonts' => '/<(\/?)font[^>]*>/gi', + 'normalize' => '/\s{2,}/', + 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', + 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', + 'prevLink' => '/(prev|earl|old|new|<|«)/i', + 'whitespace' => '/^\s*$/', + 'hasContent' => '/\S$/', + 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', + 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', + // \x{00A0} is the unicode version of   + 'onlyWhitespace' => '/\x{00A0}|\s+/u' + ]; + + + /** + * + * Imported from the Element class on league\html-to-markdown + * + * @param $node + * @return DOMElement + */ + public static function nextElement($node) + { + $next = $node; + while ($next + && $next->nodeName !== '#text' + && trim($next->textContent)) { + $next = $next->nextSibling; + } + + return $next; + } + + + /** + * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new + * element with the new tag name and importing it to the main DOMDocument. + * + * @param string $value + * @param bool $importAttributes + * @return DOMNode + */ + public static function setNodeTag($node, $value, $importAttributes = false) + { + $new = new DOMDocument('1.0', 'utf-8'); + $new->appendChild($new->createElement($value)); + + $children = $node->childNodes; + /** @var $children \DOMNodeList $i */ + + for ($i = 0; $i < $children->length; $i++) { + $import = $new->importNode($children->item($i), true); + $new->firstChild->appendChild($import); + } + + if ($importAttributes) { + // Import attributes from the original node. + foreach ($node->attributes as $attribute) { + $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); + } + } + + // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. + $import = $node->ownerDocument->importNode($new->firstChild, true); + $node->parentNode->replaceChild($import, $node); + + return $import; + } + + /** + * Removes the current node and returns the next node to be parsed (child, sibling or parent). + * + * @param DOMNode $node + * + * @return DOMNode + */ + public static function removeAndGetNext($node) + { + $nextNode = self::getNextNode($node, true); + $node->parentNode->removeChild($node); + + return $nextNode; + } + + /** + * Remove the selected node. + * + * @param $node DOMElement + * + * @return void + **/ + public static function removeNode($node) + { + $parent = $node->parentNode; + if ($parent) { + $parent->removeChild($node); + } + } + + + /** + * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally + * for parents. + * + * @param DOMNode $originalNode + * @param bool $ignoreSelfAndKids + * + * @return DOMNode + */ + public static function getNextNode($originalNode, $ignoreSelfAndKids = false) + { + /* + * Traverse the DOM from node to node, starting at the node passed in. + * Pass true for the second parameter to indicate this node itself + * (and its kids) are going away, and we want the next node over. + * + * Calling this in a loop will traverse the DOM depth-first. + */ + + // First check for kids if those aren't being ignored + if (!$ignoreSelfAndKids && $originalNode->firstChild) { + return $originalNode->firstChild; + } + + // Then for siblings... + if ($originalNode->nextSibling) { + return $originalNode->nextSibling; + } + + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + do { + $originalNode = $originalNode->parentNode; + } while ($originalNode && !$originalNode->nextSibling); + + return ($originalNode) ? $originalNode->nextSibling : $originalNode; + } +} -- cgit v1.2.3