From 7d34c6ac98e952782ab8665eaab774e1a5d29f5d Mon Sep 17 00:00:00 2001
From: Andres Rey <andreskrey@gmail.com>
Date: Fri, 1 Dec 2017 00:09:08 +0000
Subject: Rename NodeClass namespace to Nodes

Also rename NodeClassTrait to NodeTrait and move NodeUtility.php into
src/Nodes.

---
 src/NodeClass/DOMAttr.php                  |   8 -
 src/NodeClass/DOMCdataSection.php          |   8 -
 src/NodeClass/DOMCharacterData.php         |   8 -
 src/NodeClass/DOMComment.php               |   8 -
 src/NodeClass/DOMDocument.php              |  26 --
 src/NodeClass/DOMDocumentFragment.php      |   8 -
 src/NodeClass/DOMDocumentType.php          |   8 -
 src/NodeClass/DOMElement.php               |   8 -
 src/NodeClass/DOMNode.php                  |  14 -
 src/NodeClass/DOMNotation.php              |   8 -
 src/NodeClass/DOMProcessingInstruction.php |   8 -
 src/NodeClass/DOMText.php                  |   8 -
 src/NodeClass/NodeClassTrait.php           | 406 -----------------------------
 src/NodeUtility.php                        | 164 ------------
 src/Nodes/DOMAttr.php                      |   8 +
 src/Nodes/DOMCdataSection.php              |   8 +
 src/Nodes/DOMCharacterData.php             |   8 +
 src/Nodes/DOMComment.php                   |   8 +
 src/Nodes/DOMDocument.php                  |  26 ++
 src/Nodes/DOMDocumentFragment.php          |   8 +
 src/Nodes/DOMDocumentType.php              |   8 +
 src/Nodes/DOMElement.php                   |   8 +
 src/Nodes/DOMNode.php                      |  14 +
 src/Nodes/DOMNotation.php                  |   8 +
 src/Nodes/DOMProcessingInstruction.php     |   8 +
 src/Nodes/DOMText.php                      |   8 +
 src/Nodes/NodeTrait.php                    | 404 ++++++++++++++++++++++++++++
 src/Nodes/NodeUtility.php                  | 160 ++++++++++++
 28 files changed, 684 insertions(+), 690 deletions(-)
 delete mode 100644 src/NodeClass/DOMAttr.php
 delete mode 100644 src/NodeClass/DOMCdataSection.php
 delete mode 100644 src/NodeClass/DOMCharacterData.php
 delete mode 100644 src/NodeClass/DOMComment.php
 delete mode 100644 src/NodeClass/DOMDocument.php
 delete mode 100644 src/NodeClass/DOMDocumentFragment.php
 delete mode 100644 src/NodeClass/DOMDocumentType.php
 delete mode 100644 src/NodeClass/DOMElement.php
 delete mode 100644 src/NodeClass/DOMNode.php
 delete mode 100644 src/NodeClass/DOMNotation.php
 delete mode 100644 src/NodeClass/DOMProcessingInstruction.php
 delete mode 100644 src/NodeClass/DOMText.php
 delete mode 100644 src/NodeClass/NodeClassTrait.php
 delete mode 100644 src/NodeUtility.php
 create mode 100644 src/Nodes/DOMAttr.php
 create mode 100644 src/Nodes/DOMCdataSection.php
 create mode 100644 src/Nodes/DOMCharacterData.php
 create mode 100644 src/Nodes/DOMComment.php
 create mode 100644 src/Nodes/DOMDocument.php
 create mode 100644 src/Nodes/DOMDocumentFragment.php
 create mode 100644 src/Nodes/DOMDocumentType.php
 create mode 100644 src/Nodes/DOMElement.php
 create mode 100644 src/Nodes/DOMNode.php
 create mode 100644 src/Nodes/DOMNotation.php
 create mode 100644 src/Nodes/DOMProcessingInstruction.php
 create mode 100644 src/Nodes/DOMText.php
 create mode 100644 src/Nodes/NodeTrait.php
 create mode 100644 src/Nodes/NodeUtility.php

(limited to 'src')

diff --git a/src/NodeClass/DOMAttr.php b/src/NodeClass/DOMAttr.php
deleted file mode 100644
index ea8672d..0000000
--- a/src/NodeClass/DOMAttr.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMAttr extends \DOMAttr
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMCdataSection.php b/src/NodeClass/DOMCdataSection.php
deleted file mode 100644
index 438ac99..0000000
--- a/src/NodeClass/DOMCdataSection.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMCdataSection extends \DOMCdataSection
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMCharacterData.php b/src/NodeClass/DOMCharacterData.php
deleted file mode 100644
index 480980e..0000000
--- a/src/NodeClass/DOMCharacterData.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMCharacterData extends \DOMCharacterData
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMComment.php b/src/NodeClass/DOMComment.php
deleted file mode 100644
index 416460b..0000000
--- a/src/NodeClass/DOMComment.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMComment extends \DOMComment
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMDocument.php b/src/NodeClass/DOMDocument.php
deleted file mode 100644
index f379268..0000000
--- a/src/NodeClass/DOMDocument.php
+++ /dev/null
@@ -1,26 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMDocument extends \DOMDocument
-{
-    use NodeClassTrait;
-
-    public function __construct($version, $encoding)
-    {
-        parent::__construct($version, $encoding);
-
-        $this->registerNodeClass('DOMAttr', DOMAttr::class);
-        $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
-        $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
-        $this->registerNodeClass('DOMComment', DOMComment::class);
-        $this->registerNodeClass('DOMDocument', DOMDocument::class);
-        $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
-        $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
-        $this->registerNodeClass('DOMElement', DOMElement::class);
-        $this->registerNodeClass('DOMNode', DOMNode::class);
-        $this->registerNodeClass('DOMNotation', DOMNotation::class);
-        $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
-        $this->registerNodeClass('DOMText', DOMText::class);
-    }
-}
diff --git a/src/NodeClass/DOMDocumentFragment.php b/src/NodeClass/DOMDocumentFragment.php
deleted file mode 100644
index cc8b753..0000000
--- a/src/NodeClass/DOMDocumentFragment.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMDocumentFragment extends \DOMDocumentFragment
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMDocumentType.php b/src/NodeClass/DOMDocumentType.php
deleted file mode 100644
index 13f7829..0000000
--- a/src/NodeClass/DOMDocumentType.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMDocumentType extends \DOMDocumentType
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMElement.php b/src/NodeClass/DOMElement.php
deleted file mode 100644
index a7dc36a..0000000
--- a/src/NodeClass/DOMElement.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMElement extends \DOMElement
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMNode.php b/src/NodeClass/DOMNode.php
deleted file mode 100644
index c9ed1c3..0000000
--- a/src/NodeClass/DOMNode.php
+++ /dev/null
@@ -1,14 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-/**
- * Class DOMNode
- *
- * @method getAttribute($attribute)
- * @package andreskrey\Readability\NodeClass
- */
-class DOMNode extends \DOMNode
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMNotation.php b/src/NodeClass/DOMNotation.php
deleted file mode 100644
index 3e09bbc..0000000
--- a/src/NodeClass/DOMNotation.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMNotation extends \DOMNotation
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMProcessingInstruction.php b/src/NodeClass/DOMProcessingInstruction.php
deleted file mode 100644
index 0c615c6..0000000
--- a/src/NodeClass/DOMProcessingInstruction.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMProcessingInstruction extends \DOMProcessingInstruction
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/DOMText.php b/src/NodeClass/DOMText.php
deleted file mode 100644
index 80ef6c8..0000000
--- a/src/NodeClass/DOMText.php
+++ /dev/null
@@ -1,8 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-class DOMText extends \DOMText
-{
-    use NodeClassTrait;
-}
diff --git a/src/NodeClass/NodeClassTrait.php b/src/NodeClass/NodeClassTrait.php
deleted file mode 100644
index a1382d1..0000000
--- a/src/NodeClass/NodeClassTrait.php
+++ /dev/null
@@ -1,406 +0,0 @@
-<?php
-
-namespace andreskrey\Readability\NodeClass;
-
-use andreskrey\Readability\NodeUtility;
-
-trait NodeClassTrait
-{
-    /**
-     * Content score of the node. Used to determine the value of the content
-     *
-     * @var int
-     */
-    public $contentScore = 0;
-
-    /**
-     * Flag for initialized status
-     *
-     * @var bool
-     */
-    private $initialized = false;
-
-    /**
-     * @var array
-     */
-    private $divToPElements = [
-        'a',
-        'blockquote',
-        'dl',
-        'div',
-        'img',
-        'ol',
-        'p',
-        'pre',
-        'table',
-        'ul',
-        'select',
-    ];
-
-    /**
-     * initialized getter
-     *
-     * @return bool
-     */
-    public function isInitialized()
-    {
-        return $this->initialized;
-    }
-
-    /**
-     * Initializer. Calculates the current score of the node and returns a full Readability object.
-     *
-     * @ TODO: I don't like the weightClasses param. How can we get the config here?
-     *
-     * @param $weightClasses bool Weight classes?
-     * @return static
-     */
-    public function initializeNode($weightClasses)
-    {
-        if (!$this->isInitialized()) {
-            $contentScore = 0;
-
-            switch ($this->nodeName) {
-                case 'div':
-                    $contentScore += 5;
-                    break;
-
-                case 'pre':
-                case 'td':
-                case 'blockquote':
-                    $contentScore += 3;
-                    break;
-
-                case 'address':
-                case 'ol':
-                case 'ul':
-                case 'dl':
-                case 'dd':
-                case 'dt':
-                case 'li':
-                case 'form':
-                    $contentScore -= 3;
-                    break;
-
-                case 'h1':
-                case 'h2':
-                case 'h3':
-                case 'h4':
-                case 'h5':
-                case 'h6':
-                case 'th':
-                    $contentScore -= 5;
-                    break;
-            }
-
-            $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0);
-
-            $this->initialized = true;
-        }
-
-        return $this;
-    }
-
-    /**
-     * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need
-     * to check first the existence of the attributes property.
-     *
-     * @param $attributeName string Attribute to retrieve
-     *
-     * @return string
-     */
-    public function getAttribute($attributeName)
-    {
-        if (!is_null($this->attributes)) {
-            return parent::getAttribute($attributeName);
-        }
-
-        return '';
-    }
-
-    /**
-     * Get the ancestors of the current node.
-     *
-     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
-     *
-     * @return array
-     */
-    public function getNodeAncestors($maxLevel = 3)
-    {
-        $ancestors = [];
-        $level = 0;
-
-        $node = $this->parentNode;
-
-        while ($node) {
-            $ancestors[] = $node;
-            $level++;
-            if ($level === $maxLevel) {
-                break;
-            }
-            $node = $node->parentNode;
-        }
-
-        return $ancestors;
-    }
-
-    /**
-     * Returns all links from the current element.
-     *
-     * @return array
-     */
-    public function getAllLinks()
-    {
-        return iterator_to_array($this->getElementsByTagName('a'));
-    }
-
-    /**
-     * Get the density of links as a percentage of the content
-     * This is the amount of text that is inside a link divided by the total text in the node.
-     *
-     * @return int
-     */
-    public function getLinkDensity()
-    {
-        $linkLength = 0;
-        $textLength = mb_strlen($this->getTextContent(true));
-
-        if (!$textLength) {
-            return 0;
-        }
-
-        $links = $this->getAllLinks();
-
-        if ($links) {
-            /** @var DOMElement $link */
-            foreach ($links as $link) {
-                $linkLength += mb_strlen($link->getTextContent(true));
-            }
-        }
-
-        return $linkLength / $textLength;
-    }
-
-
-    /**
-     * Calculates the weight of the class/id of the current element.
-     *
-     * @return int
-     */
-    public function getClassWeight()
-    {
-        $weight = 0;
-
-        // Look for a special classname
-        $class = $this->getAttribute('class');
-        if (trim($class)) {
-            if (preg_match(NodeUtility::$regexps['negative'], $class)) {
-                $weight -= 25;
-            }
-
-            if (preg_match(NodeUtility::$regexps['positive'], $class)) {
-                $weight += 25;
-            }
-        }
-
-        // Look for a special ID
-        $id = $this->getAttribute('id');
-        if (trim($id)) {
-            if (preg_match(NodeUtility::$regexps['negative'], $id)) {
-                $weight -= 25;
-            }
-
-            if (preg_match(NodeUtility::$regexps['positive'], $id)) {
-                $weight += 25;
-            }
-        }
-
-        return $weight;
-    }
-
-    /**
-     * Returns the full text of the node.
-     *
-     * @param bool $normalize Normalize white space?
-     *
-     * @return string
-     */
-    public function getTextContent($normalize = false)
-    {
-        $nodeValue = $this->nodeValue;
-        if ($normalize) {
-            $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue));
-        }
-
-        return $nodeValue;
-    }
-
-    /**
-     * Returns the children of the current node.
-     *
-     * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
-     *
-     * @return array
-     */
-    public function getChildren($filterEmptyDOMText = false)
-    {
-        $ret = iterator_to_array($this->childNodes);
-        if ($filterEmptyDOMText) {
-            // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
-            $ret = array_values(array_filter($ret, function ($node) {
-                return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue));
-            }));
-        }
-
-        return $ret;
-    }
-
-    /**
-     * Return an array indicating how many rows and columns this table has.
-     *
-     * @return array
-     */
-    public function getRowAndColumnCount()
-    {
-        $rows = $columns = 0;
-        $trs = $this->getElementsByTagName('tr');
-        foreach ($trs as $tr) {
-            /** @var \DOMElement $tr */
-            $rowspan = $tr->getAttribute('rowspan');
-            $rows += ($rowspan || 1);
-
-            // Now look for column-related info
-            $columnsInThisRow = 0;
-            $cells = $tr->getElementsByTagName('td');
-            foreach ($cells as $cell) {
-                /** @var \DOMElement $cell */
-                $colspan = $cell->getAttribute('colspan');
-                $columnsInThisRow += ($colspan || 1);
-            }
-            $columns = max($columns, $columnsInThisRow);
-        }
-
-        return ['rows' => $rows, 'columns' => $columns];
-    }
-
-
-    /**
-     * Creates a new node based on the text content of the original node.
-     *
-     * @param $originalNode DOMElement
-     * @param $tagName string
-     *
-     * @return DOMElement
-     */
-    public function createNode($originalNode, $tagName)
-    {
-        $text = $originalNode->getTextContent();
-        $newNode = $originalNode->ownerDocument->createElement($tagName, $text);
-
-        return $newNode;
-    }
-
-    /**
-     * Check if a given node has one of its ancestor tag name matching the
-     * provided one.
-     *
-     * @param DOMElement $node
-     * @param string $tagName
-     * @param int $maxDepth
-     *
-     * @return bool
-     */
-    public function hasAncestorTag($node, $tagName, $maxDepth = 3)
-    {
-        $depth = 0;
-        while ($node->parentNode) {
-            if ($maxDepth > 0 && $depth > $maxDepth) {
-                return false;
-            }
-            if ($node->parentNode->nodeName === $tagName) {
-                return true;
-            }
-            $node = $node->parentNode;
-            $depth++;
-        }
-
-        return false;
-    }
-
-    /**
-     * Checks if the current node has a single child and if that child is a P node.
-     * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
-     * tags are, in practice, paragraphs.
-     *
-     * @param DOMNode $node
-     *
-     * @return bool
-     */
-    public function hasSinglePNode()
-    {
-        // There should be exactly 1 element child which is a P:
-        if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== 'p') {
-            return false;
-        }
-
-        // And there should be no text nodes with real content (param true on ->getChildren)
-        foreach ($children as $child) {
-            /** @var $child DOMNode */
-            if ($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())) {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    /**
-     * Check if the current element has a single child block element.
-     * Block elements are the ones defined in the divToPElements array.
-     *
-     * @return bool
-     */
-    public function hasSingleChildBlockElement()
-    {
-        $result = false;
-        if ($this->hasChildNodes()) {
-            foreach ($this->getChildren() as $child) {
-                if (in_array($child->nodeName, $this->divToPElements)) {
-                    $result = true;
-                } else {
-                    // If any of the hasSingleChildBlockElement calls return true, return true then.
-                    /** @var $child DOMElement */
-                    $result = ($result || $child->hasSingleChildBlockElement());
-                }
-            }
-        }
-
-        return $result;
-    }
-
-    /**
-     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
-     *
-     * @return bool
-     */
-    public function isElementWithoutContent()
-    {
-        return $this instanceof DOMElement &&
-            mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 &&
-            ($this->childNodes->length === 0 ||
-                $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length
-                /*
-                 * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
-                 * If there's an empty tag with an space inside and a BR (for example "<p> <br/></p>) counting only BRs and
-                 * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
-                 * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
-                 * are dealing with (And at this point we know they are empty or are just whitespace, because of the
-                 * mb_strlen in this chain of checks).
-                 */
-                + count(array_filter(iterator_to_array($this->childNodes), function ($child) {
-                    return $child instanceof DOMText;
-                }))
-
-            );
-    }
-}
diff --git a/src/NodeUtility.php b/src/NodeUtility.php
deleted file mode 100644
index d0796dd..0000000
--- a/src/NodeUtility.php
+++ /dev/null
@@ -1,164 +0,0 @@
-<?php
-
-namespace andreskrey\Readability;
-
-use andreskrey\Readability\NodeClass\DOMDocument;
-use andreskrey\Readability\NodeClass\DOMElement;
-use andreskrey\Readability\NodeClass\DOMNode;
-
-/**
- * Class NodeUtility
- * @package andreskrey\Readability
- */
-class NodeUtility
-{
-
-    /**
-     * Collection of regexps to check the node usability
-     *
-     * @var array
-     */
-    public static $regexps = [
-        'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
-        'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-        'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
-        'byline' => '/byline|author|dateline|writtenby|p-author/i',
-        'replaceFonts' => '/<(\/?)font[^>]*>/gi',
-        'normalize' => '/\s{2,}/',
-        'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
-        'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
-        'prevLink' => '/(prev|earl|old|new|<|«)/i',
-        'whitespace' => '/^\s*$/',
-        'hasContent' => '/\S$/',
-        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
-        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
-        // \x{00A0} is the unicode version of &nbsp;
-        'onlyWhitespace' => '/\x{00A0}|\s+/u'
-    ];
-
-
-    /**
-     *
-     * Imported from the Element class on league\html-to-markdown
-     *
-     * @param $node
-     * @return DOMElement
-     */
-    public static function nextElement($node)
-    {
-        $next = $node;
-        while ($next
-            && $next->nodeName !== '#text'
-            && trim($next->textContent)) {
-            $next = $next->nextSibling;
-        }
-
-        return $next;
-    }
-
-
-    /**
-     * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
-     * element with the new tag name and importing it to the main DOMDocument.
-     *
-     * @param string $value
-     * @param bool $importAttributes
-     * @return DOMNode
-     */
-    public static function setNodeTag($node, $value, $importAttributes = false)
-    {
-        $new = new DOMDocument('1.0', 'utf-8');
-        $new->appendChild($new->createElement($value));
-
-        $children = $node->childNodes;
-        /** @var $children \DOMNodeList $i */
-
-        for ($i = 0; $i < $children->length; $i++) {
-            $import = $new->importNode($children->item($i), true);
-            $new->firstChild->appendChild($import);
-        }
-
-        if ($importAttributes) {
-            // Import attributes from the original node.
-            foreach ($node->attributes as $attribute) {
-                $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
-            }
-        }
-
-        // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
-        $import = $node->ownerDocument->importNode($new->firstChild, true);
-        $node->parentNode->replaceChild($import, $node);
-
-        return $import;
-    }
-
-    /**
-     * Removes the current node and returns the next node to be parsed (child, sibling or parent).
-     *
-     * @param DOMNode $node
-     *
-     * @return DOMNode
-     */
-    public static function removeAndGetNext($node)
-    {
-        $nextNode = self::getNextNode($node, true);
-        $node->parentNode->removeChild($node);
-
-        return $nextNode;
-    }
-
-    /**
-     * Remove the selected node.
-     *
-     * @param $node DOMElement
-     *
-     * @return void
-     **/
-    public static function removeNode($node)
-    {
-        $parent = $node->parentNode;
-        if ($parent) {
-            $parent->removeChild($node);
-        }
-    }
-
-
-    /**
-     * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
-     * for parents.
-     *
-     * @param DOMNode $originalNode
-     * @param bool $ignoreSelfAndKids
-     *
-     * @return DOMNode
-     */
-    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
-    {
-        /*
-         * Traverse the DOM from node to node, starting at the node passed in.
-         * Pass true for the second parameter to indicate this node itself
-         * (and its kids) are going away, and we want the next node over.
-         *
-         * Calling this in a loop will traverse the DOM depth-first.
-         */
-
-        // First check for kids if those aren't being ignored
-        if (!$ignoreSelfAndKids && $originalNode->firstChild) {
-            return $originalNode->firstChild;
-        }
-
-        // Then for siblings...
-        if ($originalNode->nextSibling) {
-            return $originalNode->nextSibling;
-        }
-
-        // And finally, move up the parent chain *and* find a sibling
-        // (because this is depth-first traversal, we will have already
-        // seen the parent nodes themselves).
-        do {
-            $originalNode = $originalNode->parentNode;
-        } while ($originalNode && !$originalNode->nextSibling);
-
-        return ($originalNode) ? $originalNode->nextSibling : $originalNode;
-    }
-}
diff --git a/src/Nodes/DOMAttr.php b/src/Nodes/DOMAttr.php
new file mode 100644
index 0000000..c31517a
--- /dev/null
+++ b/src/Nodes/DOMAttr.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMAttr extends \DOMAttr
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMCdataSection.php b/src/Nodes/DOMCdataSection.php
new file mode 100644
index 0000000..f3a56f0
--- /dev/null
+++ b/src/Nodes/DOMCdataSection.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMCdataSection extends \DOMCdataSection
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMCharacterData.php b/src/Nodes/DOMCharacterData.php
new file mode 100644
index 0000000..e5087d9
--- /dev/null
+++ b/src/Nodes/DOMCharacterData.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMCharacterData extends \DOMCharacterData
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMComment.php b/src/Nodes/DOMComment.php
new file mode 100644
index 0000000..fd2b8b5
--- /dev/null
+++ b/src/Nodes/DOMComment.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMComment extends \DOMComment
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMDocument.php b/src/Nodes/DOMDocument.php
new file mode 100644
index 0000000..f954f7d
--- /dev/null
+++ b/src/Nodes/DOMDocument.php
@@ -0,0 +1,26 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMDocument extends \DOMDocument
+{
+    use NodeTrait;
+
+    public function __construct($version, $encoding)
+    {
+        parent::__construct($version, $encoding);
+
+        $this->registerNodeClass('DOMAttr', DOMAttr::class);
+        $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
+        $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
+        $this->registerNodeClass('DOMComment', DOMComment::class);
+        $this->registerNodeClass('DOMDocument', DOMDocument::class);
+        $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
+        $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
+        $this->registerNodeClass('DOMElement', DOMElement::class);
+        $this->registerNodeClass('DOMNode', DOMNode::class);
+        $this->registerNodeClass('DOMNotation', DOMNotation::class);
+        $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
+        $this->registerNodeClass('DOMText', DOMText::class);
+    }
+}
diff --git a/src/Nodes/DOMDocumentFragment.php b/src/Nodes/DOMDocumentFragment.php
new file mode 100644
index 0000000..d5f013e
--- /dev/null
+++ b/src/Nodes/DOMDocumentFragment.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMDocumentFragment extends \DOMDocumentFragment
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMDocumentType.php b/src/Nodes/DOMDocumentType.php
new file mode 100644
index 0000000..81e426b
--- /dev/null
+++ b/src/Nodes/DOMDocumentType.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMDocumentType extends \DOMDocumentType
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMElement.php b/src/Nodes/DOMElement.php
new file mode 100644
index 0000000..6ca0a29
--- /dev/null
+++ b/src/Nodes/DOMElement.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMElement extends \DOMElement
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMNode.php b/src/Nodes/DOMNode.php
new file mode 100644
index 0000000..79a352b
--- /dev/null
+++ b/src/Nodes/DOMNode.php
@@ -0,0 +1,14 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+/**
+ * Class DOMNode
+ *
+ * @method getAttribute($attribute)
+ * @package andreskrey\Readability\Nodes
+ */
+class DOMNode extends \DOMNode
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMNotation.php b/src/Nodes/DOMNotation.php
new file mode 100644
index 0000000..a4802e0
--- /dev/null
+++ b/src/Nodes/DOMNotation.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMNotation extends \DOMNotation
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMProcessingInstruction.php b/src/Nodes/DOMProcessingInstruction.php
new file mode 100644
index 0000000..bd80997
--- /dev/null
+++ b/src/Nodes/DOMProcessingInstruction.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMProcessingInstruction extends \DOMProcessingInstruction
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/DOMText.php b/src/Nodes/DOMText.php
new file mode 100644
index 0000000..43d2ba9
--- /dev/null
+++ b/src/Nodes/DOMText.php
@@ -0,0 +1,8 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+class DOMText extends \DOMText
+{
+    use NodeTrait;
+}
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
new file mode 100644
index 0000000..3294612
--- /dev/null
+++ b/src/Nodes/NodeTrait.php
@@ -0,0 +1,404 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+trait NodeTrait
+{
+    /**
+     * Content score of the node; initializeNode() seeds it from the tag name and class/id weight.
+     *
+     * @var int
+     */
+    public $contentScore = 0;
+
+    /**
+     * Set to true by initializeNode() once the content score has been calculated.
+     *
+     * @var bool
+     */
+    private $initialized = false;
+
+    /**
+     * @var array Tag names that hasSingleChildBlockElement() treats as block elements.
+     */
+    private $divToPElements = [
+        'a',
+        'blockquote',
+        'dl',
+        'div',
+        'img',
+        'ol',
+        'p',
+        'pre',
+        'table',
+        'ul',
+        'select',
+    ];
+
+    /**
+     * Tells whether initializeNode() has already run on this node.
+     *
+     * @return bool
+     */
+    public function isInitialized()
+    {
+        return $this->initialized;
+    }
+
+    /**
+     * Initializer. Calculates the starting score of the node (only once) and returns the node itself.
+     *
+     * The score is a base value picked by tag name plus, when $weightClasses is truthy, the
+     * class/id weight reported by getClassWeight().
+     *
+     * @ TODO: I don't like the weightClasses param. How can we get the config here?
+     *
+     * @param $weightClasses bool Weight classes?
+     * @return static
+     */
+    public function initializeNode($weightClasses)
+    {
+        if ($this->isInitialized()) {
+            return $this;
+        }
+
+        // Base score per tag name; tags that are not listed start at 0.
+        $baseScores = [
+            'div' => 5,
+            'pre' => 3,
+            'td' => 3,
+            'blockquote' => 3,
+            'address' => -3,
+            'ol' => -3,
+            'ul' => -3,
+            'dl' => -3,
+            'dd' => -3,
+            'dt' => -3,
+            'li' => -3,
+            'form' => -3,
+            'h1' => -5,
+            'h2' => -5,
+            'h3' => -5,
+            'h4' => -5,
+            'h5' => -5,
+            'h6' => -5,
+            'th' => -5,
+        ];
+
+        $tagName = $this->nodeName;
+        $score = isset($baseScores[$tagName]) ? $baseScores[$tagName] : 0;
+
+        if ($weightClasses) {
+            $score += $this->getClassWeight();
+        }
+
+        // Store the result and flag the node so later calls return right away.
+        $this->contentScore = $score;
+        $this->initialized = true;
+
+        return $this;
+    }
+
+    /**
+     * Safe wrapper around the native getAttribute(). Nodes whose `attributes` property is null
+     * yield an empty string instead of reaching the parent implementation.
+     *
+     * @param $attributeName string Attribute to retrieve
+     *
+     * @return string
+     */
+    public function getAttribute($attributeName)
+    {
+        if ($this->attributes === null) {
+            return '';
+        }
+
+        return parent::getAttribute($attributeName);
+    }
+
+    /**
+     * Get the ancestors of the current node, closest one first.
+     *
+     * The walk stops as soon as $maxLevel ancestors have been collected or the top of the tree
+     * is reached. The limit is compared strictly against an integer counter, so false never
+     * matches and every ancestor is returned.
+     *
+     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
+     *
+     * @return array
+     */
+    public function getNodeAncestors($maxLevel = 3)
+    {
+        $collected = [];
+
+        for ($ancestor = $this->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
+            $collected[] = $ancestor;
+            // Stop once the requested amount has been gathered.
+            if (count($collected) === $maxLevel) {
+                break;
+            }
+        }
+
+        return $collected;
+    }
+
+    /**
+     * Collects every <a> descendant of the current node into a plain array.
+     *
+     * @return array
+     */
+    public function getAllLinks()
+    {
+        return iterator_to_array($this->getElementsByTagName('a'));
+    }
+
+    /**
+     * Get the density of links as a ratio of the content.
+     * This is the amount of text that is inside a link divided by the total text in the node,
+     * both measured on whitespace-normalized text.
+     *
+     * @return int|float 0 when the node has no text at all
+     */
+    public function getLinkDensity()
+    {
+        $totalLength = mb_strlen($this->getTextContent(true));
+
+        if (!$totalLength) {
+            return 0;
+        }
+
+        // Add up the normalized text length of every link found inside the node. An empty
+        // list of links simply leaves the total at 0.
+        $anchorLength = 0;
+
+        /** @var DOMElement $anchor */
+        foreach ($this->getAllLinks() as $anchor) {
+            $anchorLength += mb_strlen($anchor->getTextContent(true));
+        }
+
+        return $anchorLength / $totalLength;
+    }
+
+
+    /**
+     * Calculates the weight of the class/id of the current element.
+     *
+     * Each of the two attributes is scored on its own: a match against the "negative" regexp
+     * takes 25 points away and a match against the "positive" regexp adds 25 points. Both
+     * can apply to the same value, and blank attributes are skipped.
+     *
+     * @return int
+     */
+    public function getClassWeight()
+    {
+        $total = 0;
+        $negative = NodeUtility::$regexps['negative'];
+        $positive = NodeUtility::$regexps['positive'];
+
+        // Look for a special classname first, then for a special ID
+        foreach (['class', 'id'] as $attributeName) {
+            $value = $this->getAttribute($attributeName);
+
+            // Nothing to score on a blank attribute.
+            if (!trim($value)) {
+                continue;
+            }
+
+            if (preg_match($negative, $value)) {
+                $total -= 25;
+            }
+
+            if (preg_match($positive, $value)) {
+                $total += 25;
+            }
+        }
+
+        return $total;
+    }
+
+    /**
+     * Returns the full text of the node (its nodeValue). When $normalize is set, every run of
+     * two or more whitespace characters becomes a single space and the result is trimmed.
+     *
+     * @param bool $normalize Normalize white space?
+     *
+     * @return string
+     */
+    public function getTextContent($normalize = false)
+    {
+        if (!$normalize) {
+            return $this->nodeValue;
+        }
+
+        return trim(preg_replace('/\s{2,}/', ' ', $this->nodeValue));
+    }
+
+    /**
+     * Returns the children of the current node as a plain array.
+     *
+     * @param bool $filterEmptyDOMText Leave out text nodes that are empty or whitespace only?
+     *
+     * @return array
+     */
+    public function getChildren($filterEmptyDOMText = false)
+    {
+        $nodes = iterator_to_array($this->childNodes);
+        if (!$filterEmptyDOMText) {
+            return $nodes;
+        }
+
+        // array_values() renumbers the surviving nodes from 0 without skipping any key.
+        return array_values(array_filter($nodes, function ($child) {
+            return $child->nodeName !== '#text' || mb_strlen(trim($child->nodeValue));
+        }));
+    }
+
+    /**
+     * Return an array indicating how many rows and columns this table has.
+     *
+     * Every <tr> adds its rowspan to the row count and every <td> adds its colspan to the
+     * column count of its row; a missing, empty or non-positive span counts as 1.
+     *
+     * @return array
+     */
+    public function getRowAndColumnCount()
+    {
+        $rows = $columns = 0;
+        foreach ($this->getElementsByTagName('tr') as $tr) {
+            /** @var \DOMElement $tr */
+            // `$span || 1` is a boolean in PHP (it always adds 1), hence the int cast and max().
+            $rows += max(1, (int)$tr->getAttribute('rowspan'));
+
+            // Now look for column-related info
+            $columnsInThisRow = 0;
+            foreach ($tr->getElementsByTagName('td') as $cell) {
+                /** @var \DOMElement $cell */
+                $columnsInThisRow += max(1, (int)$cell->getAttribute('colspan'));
+            }
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+
+    /**
+     * Creates a $tagName element with the original node's text, added as a text node because createElement() does not escape its value.
+     * @param $originalNode DOMElement
+     * @param $tagName string
+     * @return DOMElement
+     */
+    public function createNode($originalNode, $tagName)
+    {
+        $newNode = $originalNode->ownerDocument->createElement($tagName);
+        $text = (string)$originalNode->getTextContent();
+        if ($text !== '') {
+            $newNode->appendChild($originalNode->ownerDocument->createTextNode($text));
+        }
+        return $newNode;
+    }
+
+    /**
+     * Check if a given node has one of its ancestor tag name matching the provided one.
+     *
+     * Ancestors are inspected from the closest one upwards. With a positive $maxDepth the search gives up
+     * once more than $maxDepth ancestors have been checked without a match; zero or less lifts the limit.
+     *
+     * @param DOMElement $node
+     * @param string $tagName
+     * @param int $maxDepth
+     *
+     * @return bool
+     */
+    public function hasAncestorTag($node, $tagName, $maxDepth = 3)
+    {
+        $ancestor = $node->parentNode;
+        for ($depth = 0; $ancestor; $depth++, $ancestor = $ancestor->parentNode) {
+            if ($maxDepth > 0 && $depth > $maxDepth) {
+                return false;
+            }
+            if ($ancestor->nodeName === $tagName) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Checks if the current node has a single child and if that child is a P node.
+     * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
+     * tags are, in practice, paragraphs.
+     *
+     * Whitespace-only text nodes are not counted as children.
+     * A text node with real content is counted, so a div holding text next to its
+     * paragraph does not qualify.
+     *
+     * @return bool
+     */
+    public function hasSinglePNode()
+    {
+        // getChildren(true) leaves out whitespace-only text nodes, so a text node that is still
+        // in the list has real content and counts as a child.
+        $children = $this->getChildren(true);
+
+        // There should be exactly 1 child...
+        if (count($children) !== 1) {
+            return false;
+        }
+
+        // ...which has to be a P. No extra scan for text nodes is required, since a lone
+        // child named "p" can never be a text node.
+        return $children[0]->nodeName === 'p';
+    }
+
+    /**
+     * Check if the current element contains a block element at any depth.
+     * Block elements are the ones defined in the divToPElements array.
+     *
+     * Despite the name, one matching descendant is enough; it does not have to be the only one.
+     *
+     * @return bool
+     */
+    public function hasSingleChildBlockElement()
+    {
+        if (!$this->hasChildNodes()) {
+            return false;
+        }
+
+        foreach ($this->getChildren() as $child) {
+            /** @var $child DOMElement */
+            if (in_array($child->nodeName, $this->divToPElements) || $child->hasSingleChildBlockElement()) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
+     *
+     * @return bool
+     */
+    public function isElementWithoutContent()
+    {
+        $blankElement = $this instanceof DOMElement
+            && mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0;
+        if (!$blankElement) {
+            return false;
+        }
+
+        // Special PHP DOMDocument case: DOMTexts are child nodes too (which doesn't happen in JS), so
+        // for "<p> <br/></p>" the BR/HR count alone would never match. Text children are counted as
+        // well; the check above already guarantees they are empty or just whitespace.
+        $ignorable = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;
+        foreach ($this->childNodes as $child) {
+            if ($child instanceof DOMText) {
+                $ignorable++;
+            }
+        }
+
+        return $this->childNodes->length === 0 || $this->childNodes->length === $ignorable;
+    }
+}
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
new file mode 100644
index 0000000..f35e9c5
--- /dev/null
+++ b/src/Nodes/NodeUtility.php
@@ -0,0 +1,160 @@
+<?php
+
+namespace andreskrey\Readability\Nodes;
+
+/**
+ * Class NodeUtility
+ * @package andreskrey\Readability\Nodes
+ */
+class NodeUtility
+{
+
+    /**
+     * Collection of regexps to check the node usability. All of them are PCRE patterns, so only
+     * PHP modifiers are allowed (JavaScript's "g" flag is not one and makes preg_* calls fail).
+     * @var array
+     */
+    public static $regexps = [
+        'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+        'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+        'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
+        'byline' => '/byline|author|dateline|writtenby|p-author/i',
+        'replaceFonts' => '/<(\/?)font[^>]*>/i',
+        'normalize' => '/\s{2,}/',
+        'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
+        'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
+        'prevLink' => '/(prev|earl|old|new|<|«)/i',
+        'whitespace' => '/^\s*$/',
+        'hasContent' => '/\S$/',
+        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+        // \x{00A0} is the unicode version of &nbsp;
+        'onlyWhitespace' => '/\x{00A0}|\s+/u'
+    ];
+
+
+    /**
+     * Starting at $node itself, skips the siblings that are not elements and hold only whitespace,
+     * and returns the node where the scan stopped (null when the siblings run out).
+     * Imported from the Element class on league\html-to-markdown
+     * @param $node
+     * @return DOMNode|null
+     */
+    public static function nextElement($node)
+    {
+        $next = $node;
+        while ($next
+            && $next->nodeType !== XML_ELEMENT_NODE
+            && preg_match(self::$regexps['whitespace'], $next->textContent)) {
+            $next = $next->nextSibling;
+        }
+
+        return $next;
+    }
+
+
+    /**
+     * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
+     * element with the new tag name and importing it to the main DOMDocument.
+     *
+     * @param DOMNode $node Node to retag; it gets replaced inside its parent
+     * @param string $value New tag name
+     * @param bool $importAttributes Copy the attributes of the original node to the new one?
+     *
+     * @return DOMNode The replacement node, imported into the document of the original node
+     */
+    public static function setNodeTag($node, $value, $importAttributes = false)
+    {
+        // Build the replacement in a scratch document first.
+        $scratch = new DOMDocument('1.0', 'utf-8');
+        $replacement = $scratch->appendChild($scratch->createElement($value));
+
+        // Deep-copy every child of the original node into the replacement.
+        foreach ($node->childNodes as $child) {
+            $replacement->appendChild($scratch->importNode($child, true));
+        }
+
+        if ($importAttributes) {
+            // Import attributes from the original node.
+            foreach ($node->attributes as $attribute) {
+                $replacement->setAttribute($attribute->nodeName, $attribute->nodeValue);
+            }
+        }
+
+        // Only the element is imported back, since $scratch is a DOMDocument and not a DOMElement.
+        $imported = $node->ownerDocument->importNode($replacement, true);
+        $node->parentNode->replaceChild($imported, $node);
+
+        return $imported;
+    }
+
+    /**
+     * Removes the current node and returns the next node to be parsed (a sibling, or the sibling of an
+     * ancestor). Removal goes through removeNode(), so a node without parent no longer causes an error.
+     *
+     * @param DOMNode $node
+     * @return DOMNode|null
+     */
+    public static function removeAndGetNext($node)
+    {
+        $nextNode = self::getNextNode($node, true);
+        self::removeNode($node);
+
+        return $nextNode;
+    }
+
+    /**
+     * Detaches the given node from its parent. Nodes without a parent are left as they are.
+     *
+     * @param $node DOMElement
+     *
+     * @return void
+     **/
+    public static function removeNode($node)
+    {
+        if (!$node->parentNode) {
+            return;
+        }
+        $node->parentNode->removeChild($node);
+    }
+
+
+    /**
+     * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
+     * for parents.
+     *
+     * Traverse the DOM from node to node, starting at the node passed in.
+     * Pass true for the second parameter to indicate this node itself
+     * (and its kids) are going away, and we want the next node over.
+     *
+     * Calling this in a loop will traverse the DOM depth-first.
+     *
+     * @param DOMNode $originalNode
+     * @param bool $ignoreSelfAndKids
+     *
+     * @return DOMNode|null Null once there is nothing left to visit
+     */
+    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
+    {
+        // First check for kids if those aren't being ignored
+        $mayDescend = !$ignoreSelfAndKids;
+        if ($mayDescend && $originalNode->firstChild) {
+            return $originalNode->firstChild;
+        }
+
+        // Then look for a sibling, first on the node itself and after that on each ancestor
+        // in turn, moving up the parent chain. The ancestors themselves are never returned
+        // (because this is depth-first traversal, we will have already seen the parent
+        // nodes themselves).
+        $current = $originalNode;
+        while ($current) {
+            if ($current->nextSibling) {
+                return $current->nextSibling;
+            }
+            $current = $current->parentNode;
+        }
+
+        // Ran out of ancestors: nothing comes after the original node.
+        return null;
+    }
+}
-- 
cgit v1.2.3