summaryrefslogtreecommitdiff
path: root/src/Readability.php
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-10-21 20:20:48 +0100
committerAndres Rey <[email protected]>2016-10-21 20:20:48 +0100
commit0c11d557755ddbbafeeccd71e2d70b5c1d1458ab (patch)
treefcef7e334ff98025c1b60e2a1b3600296097e45b /src/Readability.php
parentfe73c4d7cf71023c4cc5654afb0aea47b179c084 (diff)
Major refactor, Readability now is an extension of Element.
Diffstat (limited to 'src/Readability.php')
-rw-r--r--src/Readability.php128
1 files changed, 107 insertions, 21 deletions
diff --git a/src/Readability.php b/src/Readability.php
index 7f3b1c1..a15b664 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -2,17 +2,25 @@
namespace andreskrey\Readability;
-class Readability implements ReadabilityInterface
+use League\HTMLToMarkdown\Element;
+
+/**
+ * Class Readability.
+ *
+ * This is an extension of the original Element class from League\HTMLToMarkdown\Element.
+ * This class adds functions specific to Readability.php and overrides some of the parent's functions to fit the purpose of this project.
+ */
+class Readability extends Element implements ReadabilityInterface
{
/**
- * @var int
+ * @var \DOMNode
*/
- protected $contentScore = 0;
+ protected $node;
/**
- * @var null
+ * @var int
*/
- protected $node;
+ protected $contentScore = 0;
/**
* @var array
@@ -25,11 +33,99 @@ class Readability implements ReadabilityInterface
/**
* Constructor.
*
- * @param DOMElement $node
+ * @param \DOMNode $node Selected element from DOMDocument
+ */
+ public function __construct(\DOMNode $node)
+ {
+ parent::__construct($node);
+ }
+
+ /**
+ * Checks for the tag name. Case insensitive.
+ *
+ * @param string $value Name to compare to the current tag
+ *
+ * @return bool
*/
- public function __construct($node)
+ public function tagNameEqualsTo($value)
{
- $this->node = $node;
+ $tagName = $this->getTagName();
+ if (strtolower($value) === strtolower($tagName)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Checks if the current node has a single child and if that child is a P node.
+ * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system since div with p
+ * tags are, in practice, paragraphs.
+ *
+ * @return bool
+ */
+ public function hasSinglePNode()
+ {
+ if ($this->hasChildren()) {
+ $children = $this->getChildren();
+
+ if (count($children) === 1) {
+ if (strtolower($children[0]->getTagName()) === 'p') {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Get the ancestors of the current node.
+ *
+ * @param int $maxLevel Maximum number of ancestors to get.
+ *
+ * @return array
+ */
+ public function getNodeAncestors($maxLevel = 3)
+ {
+ $ancestors = [];
+ $level = 0;
+
+ $node = $this;
+
+ while ($node && $node->getParent()) {
+ $ancestors[] = new static($node->node);
+ $level++;
+ if ($level >= $maxLevel) {
+ break;
+ }
+ $node = $node->getParent();
+ }
+
+ return $ancestors;
+ }
+
+ /**
+ * Overrides the getParent function from League\HTMLToMarkdown\Element to work around a bug that occurs when the
+ * selected element has no more parents.
+ *
+ * @return Readability|null
+ */
+ public function getParent()
+ {
+ $node = $this->node->parentNode;
+
+ return ($node) ? new static($node) : null;
+ }
+
+ /**
+ * Returns all links from the current element.
+ *
+ * @return \DOMNodeList|null
+ */
+ public function getAllLinks()
+ {
+ return ($this->isText()) ? null : $this->node->getElementsByTagName('a');
}
/**
@@ -39,7 +135,7 @@ class Readability implements ReadabilityInterface
*/
public function initializeNode()
{
- switch ($this->node->getTagName()) {
+ switch ($this->getTagName()) {
case 'div':
$this->contentScore += 5;
break;
@@ -91,7 +187,7 @@ class Readability implements ReadabilityInterface
$weight = 0;
// Look for a special classname
- $class = $this->node->getAttribute('class');
+ $class = $this->getAttribute('class');
if (trim($class)) {
if (preg_match($this->regexps['negative'], $class)) {
$weight -= 25;
@@ -103,7 +199,7 @@ class Readability implements ReadabilityInterface
}
// Look for a special ID
- $id = $this->node->getAttribute('class');
+ $id = $this->getAttribute('class');
if (trim($id)) {
if (preg_match($this->regexps['negative'], $id)) {
$weight -= 25;
@@ -150,14 +246,4 @@ class Readability implements ReadabilityInterface
{
return $this->node->getChildrenAsString();
}
-
- /**
- * Returns all links from the current element.
- *
- * @return DOMElement|null
- */
- public function getAllLinks()
- {
- return $this->node->getAllLinks();
- }
}