summaryrefslogtreecommitdiff
path: root/src/Readability.php
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-26 17:56:12 +0000
committerAndres Rey <[email protected]>2017-11-26 17:56:12 +0000
commit8c36112c0bbac0ab2e8e897388219917ecfa103c (patch)
tree545ec6cf88a8280ec88d4e1d50b213f4a542e2dd /src/Readability.php
parent3687f2f5cd4f0390613bb5879cbfce1a94307a33 (diff)
Making room for v1
Diffstat (limited to 'src/Readability.php')
-rw-r--r--src/Readability.php548
1 files changed, 0 insertions, 548 deletions
diff --git a/src/Readability.php b/src/Readability.php
index bf8cdf8..e69de29 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -1,548 +0,0 @@
-<?php
-
-namespace andreskrey\Readability;
-
-use League\HTMLToMarkdown\Element;
-
-/**
- * Class DOMElement.
- *
- * This is a extension of the original Element class from League\HTMLToMarkdown\Element.
- * This class adds functions specific to Readability.php and overloads some of them to fit the purpose of this project.
- */
-class Readability extends Element implements ReadabilityInterface
-{
- /**
- * @var \DOMNode|\DOMElement
- */
- protected $node;
-
- /**
- * @var int
- */
- protected $contentScore = 0;
-
- /**
- * @var int
- */
- protected $initialized = false;
-
- /**
- * @var array
- */
- private $regexps = [
- 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
- 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
- ];
-
- /**
- * Constructor.
- *
- * @param \DOMNode $node Selected element from DOMDocument
- */
- public function __construct(\DOMNode $node)
- {
- parent::__construct($node);
-
- /*
- * Restore the score if the object has been already scored.
- *
- * An if must be added before calling the getAttribute function, because if we reach the DOMDocument
- * by getting the node parents we'll get a undefined function fatal error
- */
- if (method_exists($node, 'getAttribute')) {
- if ($node->hasAttribute('data-readability')) {
- // Node was initialized previously. Restoring score and setting flag.
- $this->initialized = true;
- $score = $node->getAttribute('data-readability');
- $this->setContentScore($score);
- }
- }
- }
-
- /**
- * Checks for the tag name. Case insensitive.
- *
- * @param string $value Name to compare to the current tag
- *
- * @return bool
- */
- public function tagNameEqualsTo($value)
- {
- $tagName = $this->getTagName();
- if (strtolower($value) === strtolower($tagName)) {
- return true;
- }
-
- return false;
- }
-
- /**
- * Checks for the node type.
- *
- * @param string $value Type of node to compare to
- *
- * @return bool
- */
- public function nodeTypeEqualsTo($value)
- {
- return $this->node->nodeType === $value;
- }
-
- /**
- * Get the ancestors of the current node.
- *
- * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
- *
- * @return array
- */
- public function getNodeAncestors($maxLevel = 3)
- {
- $ancestors = [];
- $level = 0;
-
- $node = $this->getParent();
-
- while ($node) {
- $ancestors[] = $node;
- $level++;
- if ($level === $maxLevel) {
- break;
- }
- $node = $node->getParent();
- }
-
- return $ancestors;
- }
-
- /**
- * Overloading the getParent function from League\HTMLToMarkdown\Element due to a bug when there are no more parents
- * on the selected element.
- *
- * @return Readability|null
- */
- public function getParent()
- {
- $node = $this->node->parentNode;
-
- return ($node) ? new self($node) : null;
- }
-
- /**
- * Returns all links from the current element.
- *
- * @return array|null
- */
- public function getAllLinks()
- {
- if (($this->isText())) {
- return null;
- } else {
- $links = [];
- foreach ($this->node->getElementsByTagName('a') as $link) {
- $links[] = new self($link);
- }
-
- return $links;
- }
- }
-
- /**
- * Initializer. Calculates the current score of the node and returns a full Readability object.
- *
- * @return Readability
- */
- public function initializeNode()
- {
- if (!$this->initialized) {
- $contentScore = 0;
-
- switch ($this->getTagName()) {
- case 'div':
- $contentScore += 5;
- break;
-
- case 'pre':
- case 'td':
- case 'blockquote':
- $contentScore += 3;
- break;
-
- case 'address':
- case 'ol':
- case 'ul':
- case 'dl':
- case 'dd':
- case 'dt':
- case 'li':
- case 'form':
- $contentScore -= 3;
- break;
-
- case 'h1':
- case 'h2':
- case 'h3':
- case 'h4':
- case 'h5':
- case 'h6':
- case 'th':
- $contentScore -= 5;
- break;
- }
-
- $this->setContentScore($contentScore + $this->getClassWeight());
-
- $this->initialized = true;
- }
-
- return $this;
- }
-
- /**
- * Calculates the weight of the class/id of the current element.
- *
- * @todo check for flag that lets this function run or not
- *
- * @return int
- */
- public function getClassWeight()
- {
- // TODO To implement. How to get config from html parser from readability
-// if ($this->getConfig()->getOption('weightClasses')) {
-// return 0;
-// }
-//
- $weight = 0;
-
- // Look for a special classname
- $class = $this->getAttribute('class');
- if (trim($class)) {
- if (preg_match($this->regexps['negative'], $class)) {
- $weight -= 25;
- }
-
- if (preg_match($this->regexps['positive'], $class)) {
- $weight += 25;
- }
- }
-
- // Look for a special ID
- $id = $this->getAttribute('id');
- if (trim($id)) {
- if (preg_match($this->regexps['negative'], $id)) {
- $weight -= 25;
- }
-
- if (preg_match($this->regexps['positive'], $id)) {
- $weight += 25;
- }
- }
-
- return $weight;
- }
-
- /**
- * Returns the current score of the Readability object.
- *
- * @return int
- */
- public function getContentScore()
- {
- return $this->contentScore;
- }
-
- /**
- * Returns the current score of the Readability object.
- *
- * @param int $score
- *
- * @return int
- */
- public function setContentScore($score)
- {
- // Check if the setAttribute method exists, as some elements lack of it (and calling it anyway throws an exception)
- if (method_exists($this->node, 'setAttribute')) {
- $this->contentScore = (float)$score;
-
- // Set score in an attribute of the tag to prevent losing it while creating new Readability objects.
- $this->node->setAttribute('data-readability', $this->contentScore);
-
- return $this->contentScore;
- }
-
- return 0;
- }
-
- /**
- * Returns the full text of the node.
- *
- * @param bool $normalize Normalize white space?
- *
- * @return string
- */
- public function getTextContent($normalize = false)
- {
- $nodeValue = $this->node->nodeValue;
- if ($normalize) {
- $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue));
- }
-
- return $nodeValue;
- }
-
- /**
- * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
- * element with the new tag name and importing it to the main DOMDocument.
- *
- * @param string $value
- * @param bool $importAttributes
- */
- public function setNodeTag($value, $importAttributes = false)
- {
- $new = new \DOMDocument();
- $new->appendChild($new->createElement($value));
-
- $childs = $this->node->childNodes;
- for ($i = 0; $i < $childs->length; $i++) {
- $import = $new->importNode($childs->item($i), true);
- $new->firstChild->appendChild($import);
- }
-
- if ($importAttributes) {
- // Import attributes from the original node.
- foreach ($this->node->attributes as $attribute) {
- $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
- }
- }
-
- // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement.
- $import = $this->node->ownerDocument->importNode($new->firstChild, true);
- $this->node->parentNode->replaceChild($import, $this->node);
-
- $this->node = $import;
- }
-
- /**
- * Returns the current DOMNode.
- *
- * @return \DOMNode
- */
- public function getDOMNode()
- {
- return $this->node;
- }
-
- /**
- * Removes the current node and returns the next node to be parsed (child, sibling or parent).
- *
- * @param Readability $node
- *
- * @return Readability
- */
- public function removeAndGetNext($node)
- {
- $nextNode = $this->getNextNode($node, true);
- $node->node->parentNode->removeChild($node->node);
-
- return $nextNode;
- }
-
- /**
- * Returns the next node. First checks for childs (if the flag allows it), then for siblings, and finally
- * for parents.
- *
- * @param Readability $originalNode
- * @param bool $ignoreSelfAndKids
- *
- * @return Readability
- */
- public function getNextNode($originalNode, $ignoreSelfAndKids = false)
- {
- /*
- * Traverse the DOM from node to node, starting at the node passed in.
- * Pass true for the second parameter to indicate this node itself
- * (and its kids) are going away, and we want the next node over.
- *
- * Calling this in a loop will traverse the DOM depth-first.
- */
-
- // First check for kids if those aren't being ignored
- if (!$ignoreSelfAndKids && $originalNode->node->firstChild) {
- return new self($originalNode->node->firstChild);
- }
-
- // Then for siblings...
- if ($originalNode->node->nextSibling) {
- return new self($originalNode->node->nextSibling);
- }
-
- // And finally, move up the parent chain *and* find a sibling
- // (because this is depth-first traversal, we will have already
- // seen the parent nodes themselves).
- do {
- $originalNode = $originalNode->getParent();
- } while ($originalNode && !$originalNode->node->nextSibling);
-
- return ($originalNode) ? new self($originalNode->node->nextSibling) : $originalNode;
- }
-
- /**
- * Compares nodes. Checks for tag name and text content.
- *
- * It's a replacement of the original JS code, which looked like this:
- *
- * $node1 == $node2
- *
- * I'm not sure this works the same in PHP, so I created a mock function to check the actual content of the node.
- * Should serve the same porpuse as the original comparison.
- *
- * @param Readability $node1
- * @param Readability $node2
- *
- * @return bool
- */
- public function compareNodes($node1, $node2)
- {
- if ($node1->getTagName() !== $node2->getTagName()) {
- return false;
- }
-
- if ($node1->getTextContent(true) !== $node2->getTextContent(true)) {
- return false;
- }
-
- return true;
- }
-
- /**
- * Replaces child node with a new one.
- *
- * @param Readability $newNode
- */
- public function replaceChild(self $newNode)
- {
- $this->node->parentNode->replaceChild($newNode->node, $this->node);
- }
-
- /**
- * Creates a new node based on the text content of the original node.
- *
- * @param Readability $originalNode
- * @param string $tagName
- *
- * @return Readability
- */
- public function createNode(self $originalNode, $tagName)
- {
- $text = $originalNode->getTextContent();
- $newNode = $originalNode->node->ownerDocument->createElement($tagName, $text);
-
- return new static($newNode);
- }
-
- /**
- * Checks if the object is initialized.
- *
- * @return bool
- */
- public function isInitialized()
- {
- return $this->initialized;
- }
-
- /**
- * Reloads the score stores in the data-readability tag.
- *
- * @return int|bool
- */
- public function reloadScore()
- {
- if (method_exists($this->node, 'getAttribute')) {
- if ($this->node->hasAttribute('data-readability')) {
- $this->initialized = true;
- $score = $this->node->getAttribute('data-readability');
- $this->setContentScore($score);
-
- return $score;
- }
- }
-
- return false;
- }
-
- /**
- * Check if a given node has one of its ancestor tag name matching the
- * provided one.
- *
- * @param Readability $node
- * @param string $tagName
- * @param int $maxDepth
- *
- * @return bool
- */
- public function hasAncestorTag(self $node, $tagName, $maxDepth = 3)
- {
- $depth = 0;
- while ($node->getParent()) {
- if ($maxDepth > 0 && $depth > $maxDepth) {
- return false;
- }
- if ($node->getParent()->tagNameEqualsTo($tagName)) {
- return true;
- }
- $node = $node->getParent();
- $depth++;
- }
-
- return false;
- }
-
- /**
- * Returns the children of the current node.
- *
- * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
- *
- * @return array
- */
- public function getChildren($filterEmptyDOMText = false)
- {
- $ret = [];
- /** @var \DOMNode $node */
- foreach ($this->node->childNodes as $node) {
- if ($filterEmptyDOMText && $node->nodeName === '#text' && !trim($node->nodeValue)) {
- continue;
- }
-
- $ret[] = new static($node);
- }
-
- return $ret;
- }
-
- /**
- * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
- *
- * @return bool
- */
- public function isElementWithoutContent()
- {
- return $this->node instanceof \DOMElement &&
- // /\x{00A0}|\s+/u TODO to be replaced with regexps array
- mb_strlen(preg_replace('/\x{00A0}|\s+/u', '', $this->node->textContent)) === 0 &&
- ($this->node->childNodes->length === 0 ||
- $this->node->childNodes->length === $this->node->getElementsByTagName('br')->length + $this->node->getElementsByTagName('hr')->length
- /*
- * Special DOMDocument case: We also need to count how many DOMText we have inside the node.
- * If there's an empty tag with an space inside and a BR (for example "<p> <br/></p>) counting only BRs and
- * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument,
- * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
- * are dealing with (And at this point we know they are empty or are just whitespace, because of the
- * mb_strlen in this chain of checks).
- */
- + count(array_filter(iterator_to_array($this->node->childNodes), function ($child) {
- return $child instanceof \DOMText;
- }))
-
- );
- }
-}