From 0226e0ca0dc70f9a0310b3eef045ee1c1e0ca3ac Mon Sep 17 00:00:00 2001
From: Andrew Dolgov
Date: Tue, 13 Dec 2022 20:00:46 +0300
Subject: split into a separate repo
---
.../readability.php/src/Configuration.php | 423 ++++
.../readability.php/src/Nodes/DOM/DOMAttr.php | 10 +
.../src/Nodes/DOM/DOMCdataSection.php | 10 +
.../src/Nodes/DOM/DOMCharacterData.php | 10 +
.../readability.php/src/Nodes/DOM/DOMComment.php | 10 +
.../readability.php/src/Nodes/DOM/DOMDocument.php | 30 +
.../src/Nodes/DOM/DOMDocumentFragment.php | 10 +
.../src/Nodes/DOM/DOMDocumentType.php | 10 +
.../readability.php/src/Nodes/DOM/DOMElement.php | 46 +
.../readability.php/src/Nodes/DOM/DOMEntity.php | 10 +
.../src/Nodes/DOM/DOMEntityReference.php | 10 +
.../readability.php/src/Nodes/DOM/DOMNode.php | 14 +
.../readability.php/src/Nodes/DOM/DOMNodeList.php | 82 +
.../readability.php/src/Nodes/DOM/DOMNotation.php | 10 +
.../src/Nodes/DOM/DOMProcessingInstruction.php | 10 +
.../readability.php/src/Nodes/DOM/DOMText.php | 10 +
.../readability.php/src/Nodes/NodeTrait.php | 566 +++++
.../readability.php/src/Nodes/NodeUtility.php | 192 ++
.../readability.php/src/ParseException.php | 7 +
.../readability.php/src/Readability.php | 2410 ++++++++++++++++++++
20 files changed, 3880 insertions(+)
create mode 100644 vendor/fivefilters/readability.php/src/Configuration.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMCdataSection.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMCharacterData.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMComment.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocument.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentType.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMElement.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntityReference.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNode.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNodeList.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMProcessingInstruction.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/DOM/DOMText.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/NodeTrait.php
create mode 100644 vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php
create mode 100644 vendor/fivefilters/readability.php/src/ParseException.php
create mode 100644 vendor/fivefilters/readability.php/src/Readability.php
(limited to 'vendor/fivefilters/readability.php/src')
diff --git a/vendor/fivefilters/readability.php/src/Configuration.php b/vendor/fivefilters/readability.php/src/Configuration.php
new file mode 100644
index 0000000..6d1f03f
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Configuration.php
@@ -0,0 +1,423 @@
+ $value) {
+ $setter = sprintf('set%s', $key);
+ if (method_exists($this, $setter)) {
+ call_user_func([$this, $setter], $value);
+ }
+ }
+ }
+
+    /**
+     * Exports the configuration as a plain array.
+     *
+     * Walks over every property of this object. A property is exported only when its current value is not an
+     * object and a matching "get<property>" method exists; the exported value is whatever that getter returns.
+     *
+     * @return array Map of property name => getter result
+     */
+    public function toArray()
+    {
+        $exported = [];
+
+        foreach ($this as $property => $current) {
+            // Object-valued properties are never exported.
+            if (is_object($current)) {
+                continue;
+            }
+
+            $accessor = 'get' . $property;
+            if (!method_exists($this, $accessor)) {
+                continue;
+            }
+
+            $exported[$property] = $this->$accessor();
+        }
+
+        return $exported;
+    }
+
+    /**
+     * Returns the logger to use for diagnostics.
+     *
+     * @return LoggerInterface The logger stored on this configuration, or a fresh NullLogger when none is stored
+     */
+    public function getLogger()
+    {
+        // No logger stored: hand out a NullLogger so callers can always log unconditionally.
+        return $this->logger === null ? new NullLogger() : $this->logger;
+    }
+
+ /**
+ * Stores the logger that getLogger() will hand out from now on.
+ *
+ * @param LoggerInterface $logger
+ *
+ * @return Configuration The same instance, so calls can be chained
+ */
+ public function setLogger(LoggerInterface $logger)
+ {
+ $this->logger = $logger;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the maxTopCandidates setting.
+ *
+ * @return int
+ */
+ public function getMaxTopCandidates()
+ {
+ return $this->maxTopCandidates;
+ }
+
+ /**
+ * Stores the given value in the maxTopCandidates setting as-is (no validation or casting happens here).
+ *
+ * @param int $maxTopCandidates
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setMaxTopCandidates($maxTopCandidates)
+ {
+ $this->maxTopCandidates = $maxTopCandidates;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the charThreshold setting.
+ *
+ * @return int
+ */
+ public function getCharThreshold()
+ {
+ return $this->charThreshold;
+ }
+
+ /**
+ * Stores the given value in the charThreshold setting as-is (no validation or casting happens here).
+ *
+ * @param int $charThreshold
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setCharThreshold($charThreshold)
+ {
+ $this->charThreshold = $charThreshold;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the articleByLine setting.
+ *
+ * @return bool
+ */
+ public function getArticleByLine()
+ {
+ return $this->articleByLine;
+ }
+
+ /**
+ * Stores the given value in the articleByLine setting as-is (no validation or casting happens here).
+ *
+ * @param bool $articleByLine
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setArticleByLine($articleByLine)
+ {
+ $this->articleByLine = $articleByLine;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the stripUnlikelyCandidates setting.
+ *
+ * @return bool
+ */
+ public function getStripUnlikelyCandidates()
+ {
+ return $this->stripUnlikelyCandidates;
+ }
+
+ /**
+ * Stores the given value in the stripUnlikelyCandidates setting as-is (no validation or casting happens here).
+ *
+ * @param bool $stripUnlikelyCandidates
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setStripUnlikelyCandidates($stripUnlikelyCandidates)
+ {
+ $this->stripUnlikelyCandidates = $stripUnlikelyCandidates;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the cleanConditionally setting.
+ *
+ * @return bool
+ */
+ public function getCleanConditionally()
+ {
+ return $this->cleanConditionally;
+ }
+
+ /**
+ * Stores the given value in the cleanConditionally setting as-is (no validation or casting happens here).
+ *
+ * @param bool $cleanConditionally
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setCleanConditionally($cleanConditionally)
+ {
+ $this->cleanConditionally = $cleanConditionally;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the weightClasses setting.
+ *
+ * @return bool
+ */
+ public function getWeightClasses()
+ {
+ return $this->weightClasses;
+ }
+
+ /**
+ * Stores the given value in the weightClasses setting as-is (no validation or casting happens here).
+ *
+ * @param bool $weightClasses
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setWeightClasses($weightClasses)
+ {
+ $this->weightClasses = $weightClasses;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the fixRelativeURLs setting.
+ *
+ * @return bool
+ */
+ public function getFixRelativeURLs()
+ {
+ return $this->fixRelativeURLs;
+ }
+
+ /**
+ * Stores the given value in the fixRelativeURLs setting as-is (no validation or casting happens here).
+ *
+ * @param bool $fixRelativeURLs
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setFixRelativeURLs($fixRelativeURLs)
+ {
+ $this->fixRelativeURLs = $fixRelativeURLs;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the substituteEntities setting.
+ *
+ * @return bool
+ */
+ public function getSubstituteEntities()
+ {
+ return $this->substituteEntities;
+ }
+
+ /**
+ * Stores the given value in the substituteEntities setting as-is (no validation or casting happens here).
+ *
+ * @param bool $substituteEntities
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setSubstituteEntities($substituteEntities)
+ {
+ $this->substituteEntities = $substituteEntities;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the normalizeEntities setting.
+ *
+ * @return bool
+ */
+ public function getNormalizeEntities()
+ {
+ return $this->normalizeEntities;
+ }
+
+ /**
+ * Stores the given value in the normalizeEntities setting as-is (no validation or casting happens here).
+ *
+ * @param bool $normalizeEntities
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setNormalizeEntities($normalizeEntities)
+ {
+ $this->normalizeEntities = $normalizeEntities;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the originalURL setting.
+ *
+ * @return string
+ */
+ public function getOriginalURL()
+ {
+ return $this->originalURL;
+ }
+
+ /**
+ * Stores the given value in the originalURL setting as-is (no validation or normalization happens here).
+ *
+ * @param string $originalURL
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setOriginalURL($originalURL)
+ {
+ $this->originalURL = $originalURL;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the parser setting.
+ *
+ * @return string
+ */
+ public function getParser()
+ {
+ return $this->parser;
+ }
+
+ /**
+ * Stores the given value in the parser setting as-is (the value is not checked against any list here).
+ *
+ * @param string $parser
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setParser($parser)
+ {
+ $this->parser = $parser;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the keepClasses setting.
+ *
+ * @return bool
+ */
+ public function getKeepClasses()
+ {
+ return $this->keepClasses;
+ }
+
+ /**
+ * Stores the given value in the keepClasses setting as-is (no validation or casting happens here).
+ *
+ * @param bool $keepClasses
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setKeepClasses($keepClasses)
+ {
+ $this->keepClasses = $keepClasses;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the disableJSONLD setting.
+ *
+ * @return bool
+ */
+ public function getDisableJSONLD()
+ {
+ return $this->disableJSONLD;
+ }
+
+ /**
+ * Stores the given value in the disableJSONLD setting as-is (no validation or casting happens here).
+ *
+ * @param bool $disableJSONLD
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setDisableJSONLD($disableJSONLD)
+ {
+ $this->disableJSONLD = $disableJSONLD;
+
+ return $this;
+ }
+
+ /**
+ * Returns the current value of the summonCthulhu setting.
+ *
+ * @return bool
+ */
+ public function getSummonCthulhu()
+ {
+ return $this->summonCthulhu;
+ }
+
+ /**
+ * Stores the given value in the summonCthulhu setting as-is (no validation or casting happens here).
+ *
+ * @param bool $summonCthulhu
+ *
+ * @return $this The same instance, so calls can be chained
+ */
+ public function setSummonCthulhu($summonCthulhu)
+ {
+ $this->summonCthulhu = $summonCthulhu;
+
+ return $this;
+ }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php
new file mode 100644
index 0000000..1bdf395
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php
@@ -0,0 +1,10 @@
+registerNodeClass('DOMAttr', DOMAttr::class);
+ $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
+ $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
+ $this->registerNodeClass('DOMComment', DOMComment::class);
+ $this->registerNodeClass('DOMDocument', self::class);
+ $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
+ $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
+ $this->registerNodeClass('DOMElement', DOMElement::class);
+ $this->registerNodeClass('DOMEntity', DOMEntity::class);
+ $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class);
+ $this->registerNodeClass('DOMNode', DOMNode::class);
+ $this->registerNodeClass('DOMNotation', DOMNotation::class);
+ $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class);
+ $this->registerNodeClass('DOMText', DOMText::class);
+ }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php
new file mode 100644
index 0000000..33a3f95
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMDocumentFragment.php
@@ -0,0 +1,10 @@
+childNodes as $node) {
+ if ($node->nodeType === XML_ELEMENT_NODE) {
+ $newList->add($node);
+ }
+ }
+ return $newList;
+ }
+
+    /**
+     * Finds the closest preceding sibling that is an element node.
+     *
+     * Walks backwards through previousSibling, skipping every node whose type is not XML_ELEMENT_NODE.
+     *
+     * @see https://wiki.php.net/rfc/dom_living_standard_api
+     * @return DOMElement|null The preceding element, or null when there is none
+     */
+    public function previousElementSibling()
+    {
+        for ($candidate = $this->previousSibling; $candidate; $candidate = $candidate->previousSibling) {
+            if ($candidate->nodeType === XML_ELEMENT_NODE) {
+                return $candidate;
+            }
+        }
+
+        return null;
+    }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php
new file mode 100644
index 0000000..751b59c
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php
@@ -0,0 +1,10 @@
+length is hidden
+ * from the user and cannot be extended, changed, or tweaked.
+ */
+class DOMNodeList implements \Countable, \IteratorAggregate
+{
+    /**
+     * @var array Nodes stored in this list, in the order they were added
+     */
+    protected $items = [];
+
+    /**
+     * @var int Number of stored nodes; incremented by add()
+     */
+    protected $length = 0;
+
+    /**
+     * To allow access to length in the same way that DOMNodeList allows.
+     *
+     * Any other property name raises an "Undefined property" error through trigger_error() and yields null.
+     *
+     * {@inheritdoc}
+     */
+    public function __get($name)
+    {
+        switch ($name) {
+            case 'length':
+                return $this->length;
+            default:
+                trigger_error(sprintf('Undefined property: %s::%s', static::class, $name));
+
+                return null;
+        }
+    }
+
+    /**
+     * Appends a node to the end of the list and bumps the length counter.
+     *
+     * @param DOMNode|DOMElement|DOMComment $node
+     *
+     * @return DOMNodeList The same list, so calls can be chained
+     */
+    public function add($node)
+    {
+        $this->items[] = $node;
+        $this->length++;
+
+        return $this;
+    }
+
+    /**
+     * Returns the node stored at the given position.
+     *
+     * An offset that is not present in the list yields null instead of raising an
+     * "undefined offset" warning.
+     *
+     * @param int $offset Zero-based position
+     *
+     * @return DOMNode|DOMElement|DOMComment|null
+     */
+    public function item(int $offset)
+    {
+        return $this->items[$offset] ?? null;
+    }
+
+    /**
+     * Number of nodes in the list (Countable implementation).
+     *
+     * @return int
+     */
+    public function count(): int
+    {
+        return $this->length;
+    }
+
+    /**
+     * To make it compatible with iterator_to_array() function.
+     *
+     * {@inheritdoc}
+     */
+    public function getIterator(): \ArrayIterator
+    {
+        return new \ArrayIterator($this->items);
+    }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php
new file mode 100644
index 0000000..d276e42
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMNotation.php
@@ -0,0 +1,10 @@
+initialized;
+ }
+
+    /**
+     * Tells whether this node has been flagged as a data table.
+     *
+     * @return bool True only when the 'readabilityDataTable' attribute exists and equals '1'
+     */
+    public function isReadabilityDataTable()
+    {
+        /*
+         * Workaround that should be removed in the future: the flag lives in a DOM attribute rather than in an
+         * object property. Although the base DOMElement is extended with custom properties, those properties get
+         * lost when elements are searched with getElementsByTagName, so a table marked in a previous step would
+         * show the default value again when retrieved later. Attributes survive, properties do not.
+         *
+         * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names
+         */
+        if (!$this->hasAttribute('readabilityDataTable')) {
+            return false;
+        }
+
+        return $this->getAttribute('readabilityDataTable') === '1';
+    }
+
+    /**
+     * Flags (or un-flags) this node as a data table by writing the 'readabilityDataTable' attribute.
+     *
+     * @param bool $param Truthy values are stored as '1', everything else as '0'
+     */
+    public function setReadabilityDataTable($param)
+    {
+        // Stored as the strings '1' / '0': a boolean true would be cast to "1" by DOMDocument anyway.
+        $flag = $param ? '1' : '0';
+
+        $this->setAttribute('readabilityDataTable', $flag);
+    }
+
+    /**
+     * Initializer. Assigns the starting content score of the node, once.
+     *
+     * The base score depends on the tag name; the class/id weight is added on top when requested. Nodes that
+     * report themselves as already initialized are left untouched.
+     *
+     * @ TODO: I don't like the weightClasses param. How can we get the config here?
+     *
+     * @param $weightClasses bool Add the result of getClassWeight() to the base score?
+     *
+     * @return static
+     */
+    public function initializeNode($weightClasses)
+    {
+        if ($this->isInitialized()) {
+            return $this;
+        }
+
+        // Base score per tag name; tags not listed here start at 0.
+        $baseScores = [
+            'div' => 5,
+
+            'pre' => 3,
+            'td' => 3,
+            'blockquote' => 3,
+
+            'address' => -3,
+            'ol' => -3,
+            'ul' => -3,
+            'dl' => -3,
+            'dd' => -3,
+            'dt' => -3,
+            'li' => -3,
+            'form' => -3,
+
+            'h1' => -5,
+            'h2' => -5,
+            'h3' => -5,
+            'h4' => -5,
+            'h5' => -5,
+            'h6' => -5,
+            'th' => -5,
+        ];
+
+        $baseScore = isset($baseScores[$this->nodeName]) ? $baseScores[$this->nodeName] : 0;
+        $classBonus = $weightClasses ? $this->getClassWeight() : 0;
+
+        $this->contentScore = $baseScore + $classBonus;
+        $this->initialized = true;
+
+        return $this;
+    }
+
+    /**
+     * Null-safe wrapper around the native getAttribute method.
+     *
+     * Some node types have no attributes collection; for those an empty string is returned instead of
+     * delegating to the parent implementation.
+     *
+     * @param $attributeName string Attribute to retrieve
+     *
+     * @return string
+     */
+    #[\ReturnTypeWillChange]
+    public function getAttribute($attributeName)
+    {
+        if ($this->attributes === null) {
+            return '';
+        }
+
+        return parent::getAttribute($attributeName);
+    }
+
+    /**
+     * Null-safe wrapper around the native hasAttribute method.
+     *
+     * Nodes without an attributes collection never have the attribute.
+     *
+     * @param $attributeName
+     *
+     * @return bool
+     *
+     * @see getAttribute
+     */
+    #[\ReturnTypeWillChange]
+    public function hasAttribute($attributeName)
+    {
+        if ($this->attributes === null) {
+            return false;
+        }
+
+        return parent::hasAttribute($attributeName);
+    }
+
+    /**
+     * Collects the ancestors of the current node, nearest first.
+     *
+     * The walk stops at the document node (which is never included) or as soon as $maxLevel ancestors have been
+     * collected. The limit is compared strictly against an integer counter, so only an int limit ever stops it.
+     *
+     * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
+     *
+     * @return array
+     */
+    public function getNodeAncestors($maxLevel = 3)
+    {
+        $chain = [];
+
+        for ($current = $this->parentNode; $current && !($current instanceof DOMDocument); $current = $current->parentNode) {
+            $chain[] = $current;
+
+            if (count($chain) === $maxLevel) {
+                break;
+            }
+        }
+
+        return $chain;
+    }
+
+    /**
+     * Returns every "a" element found below the current element, as a plain array.
+     *
+     * @return array
+     */
+    public function getAllLinks()
+    {
+        $anchors = $this->getElementsByTagName('a');
+
+        return iterator_to_array($anchors);
+    }
+
+    /**
+     * Get the density of links as a fraction of the content.
+     * This is the amount of text that is inside a link divided by the total text in the node. Links whose href
+     * matches the 'hashUrl' pattern only count for 30% of their text length.
+     *
+     * @return int|float 0 when the node has no text, otherwise link text length / total text length
+     */
+    public function getLinkDensity()
+    {
+        $totalChars = mb_strlen($this->getTextContent(true));
+        if ($totalChars === 0) {
+            return 0;
+        }
+
+        $linkedChars = 0;
+
+        /** @var DOMElement $anchor */
+        foreach ($this->getAllLinks() as $anchor) {
+            $target = $anchor->getAttribute('href');
+            $isHashLink = $target && preg_match(NodeUtility::$regexps['hashUrl'], $target);
+            $factor = $isHashLink ? 0.3 : 1;
+
+            $linkedChars += mb_strlen($anchor->getTextContent(true)) * $factor;
+        }
+
+        return $linkedChars / $totalChars;
+    }
+
+    /**
+     * Calculates the weight of the class/id of the current element.
+     *
+     * The class and the id attribute are each tested against the 'negative' and 'positive' patterns; every
+     * match moves the weight by 25 in the matching direction.
+     *
+     * @return int
+     */
+    public function getClassWeight()
+    {
+        $negative = NodeUtility::$regexps['negative'];
+        $positive = NodeUtility::$regexps['positive'];
+        $total = 0;
+
+        // Look for a special classname
+        $className = $this->getAttribute('class');
+        if (trim($className)) {
+            $total -= preg_match($negative, $className) ? 25 : 0;
+            $total += preg_match($positive, $className) ? 25 : 0;
+        }
+
+        // Look for a special ID
+        $identifier = $this->getAttribute('id');
+        if (trim($identifier) !== '') {
+            $total -= preg_match($negative, $identifier) ? 25 : 0;
+            $total += preg_match($positive, $identifier) ? 25 : 0;
+        }
+
+        return $total;
+    }
+
+    /**
+     * Returns the trimmed text of the node.
+     *
+     * @param bool $normalize When true, every match of the 'normalize' pattern is replaced by a single space
+     *
+     * @return string
+     */
+    public function getTextContent($normalize = true)
+    {
+        $text = trim($this->textContent);
+
+        return $normalize
+            ? preg_replace(NodeUtility::$regexps['normalize'], ' ', $text)
+            : $text;
+    }
+
+    /**
+     * Return an array indicating how many rows and columns this table has.
+     *
+     * Every "tr" contributes its rowspan to the row total and every "td" contributes its colspan to the width
+     * of its row; the reported column count is the widest row. A missing, non-numeric or non-positive span
+     * counts as 1.
+     *
+     * @return array ['rows' => int, 'columns' => int]
+     */
+    public function getRowAndColumnCount()
+    {
+        // PHP's "||" yields a boolean (unlike JS, where "span || 1" yields the span), so the attribute has to
+        // be converted to an integer explicitly or every span would be counted as 1.
+        $spanOf = static function ($element, $attribute) {
+            $span = (int) $element->getAttribute($attribute);
+
+            return $span > 0 ? $span : 1;
+        };
+
+        $rows = $columns = 0;
+
+        /** @var \DOMElement $tr */
+        foreach ($this->getElementsByTagName('tr') as $tr) {
+            $rows += $spanOf($tr, 'rowspan');
+
+            // Now look for column-related info
+            $columnsInThisRow = 0;
+            /** @var \DOMElement $cell */
+            foreach ($tr->getElementsByTagName('td') as $cell) {
+                $columnsInThisRow += $spanOf($cell, 'colspan');
+            }
+            $columns = max($columns, $columnsInThisRow);
+        }
+
+        return ['rows' => $rows, 'columns' => $columns];
+    }
+
+    /**
+     * Creates a new node based on the text content of the original node.
+     *
+     * The new element is created in the owner document of $originalNode but is not attached to the tree.
+     * The text is added through a text node rather than through the value argument of createElement(),
+     * because that argument is not escaped: text containing "&" would raise a warning and be lost.
+     *
+     * @param $originalNode DOMNode Node whose (non-normalized) text content is copied
+     * @param $tagName string Tag name of the element to create
+     *
+     * @return DOMElement
+     */
+    public function createNode($originalNode, $tagName)
+    {
+        $document = $originalNode->ownerDocument;
+        $text = $originalNode->getTextContent(false);
+
+        $newNode = $document->createElement($tagName);
+
+        // Skip the text node for empty content so the element stays childless.
+        if ($text !== '') {
+            $newNode->appendChild($document->createTextNode($text));
+        }
+
+        return $newNode;
+    }
+
+    /**
+     * Check if a given node has one of its ancestor tag name matching the
+     * provided one.
+     *
+     * Walks up through parentNode. An ancestor matches when its nodeName equals $tagName and, if a filter is
+     * given, the filter returns a truthy value for it.
+     *
+     * @param string        $tagName  Node name to look for
+     * @param int           $maxDepth Depth limit for the walk; a value of 0 or less disables the limit
+     * @param callable|null $filterFn Optional predicate that receives the candidate ancestor
+     *
+     * @return bool
+     */
+    public function hasAncestorTag($tagName, $maxDepth = 3, ?callable $filterFn = null)
+    {
+        $depth = 0;
+        $node = $this;
+
+        while ($node->parentNode) {
+            // Give up once the walk has gone deeper than the limit.
+            if ($maxDepth > 0 && $depth > $maxDepth) {
+                return false;
+            }
+
+            if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) {
+                return true;
+            }
+
+            $node = $node->parentNode;
+            $depth++;
+        }
+
+        return false;
+    }
+
+    /**
+     * Check whether this node contains exactly one meaningful child and that child has the given tag.
+     *
+     * Blank text nodes are ignored (see NodeUtility::filterTextNodes). The result is false when, after that
+     * filtering, there is no child, more than one child, a child with another node name, or a text node with
+     * real content.
+     *
+     * @param $tag string Name of tag
+     *
+     * @return bool
+     */
+    public function hasSingleTagInsideElement($tag)
+    {
+        // There should be exactly 1 element child with given tag
+        $children = NodeUtility::filterTextNodes($this->childNodes);
+        if (count($children) !== 1 || $children->item(0)->nodeName !== $tag) {
+            return false;
+        }
+
+        // And there should be no text nodes with real content
+        /* @var DOMNode $child */
+        foreach ($children as $child) {
+            if ($child->nodeType === XML_TEXT_NODE && preg_match(NodeUtility::$regexps['hasContent'], $child->textContent)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Check if the current element contains a block element.
+     * Block elements are the ones defined in the divToPElements array. Children that are not block elements
+     * themselves are searched recursively.
+     *
+     * @return bool
+     */
+    public function hasSingleChildBlockElement()
+    {
+        if (!$this->hasChildNodes()) {
+            return false;
+        }
+
+        /** @var $child DOMElement */
+        foreach ($this->childNodes as $child) {
+            // A direct block child, or a block element somewhere below this child, settles the answer.
+            if (in_array($child->nodeName, $this->divToPElements) || $child->hasSingleChildBlockElement()) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
+     *
+     * @return bool
+     */
+    public function isElementWithoutContent()
+    {
+        if (!($this instanceof DOMElement)) {
+            return false;
+        }
+
+        // Any text left after stripping whitespace means the element has content.
+        $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
+        if (mb_strlen($visibleText) !== 0) {
+            return false;
+        }
+
+        $childCount = $this->childNodes->length;
+        if ($childCount === 0) {
+            return true;
+        }
+
+        /*
+         * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node.
+         * If there's an empty tag with a space inside and a BR (for example "<p> <br/></p>"), counting only BRs
+         * and HRs would not add up to the example's 2 child nodes. This happens because in DOMDocument,
+         * DOMTexts are also nodes (which doesn't happen in JS). So we need to also count how many DOMText we
+         * are dealing with (and at this point we know they are empty or are just whitespace, because of the
+         * mb_strlen check above).
+         */
+        $textNodes = array_filter(iterator_to_array($this->childNodes), function ($child) {
+            return $child instanceof DOMText;
+        });
+
+        $ignorable = $this->getElementsByTagName('br')->length
+            + $this->getElementsByTagName('hr')->length
+            + count($textNodes);
+
+        return $childCount === $ignorable;
+    }
+
+    /**
+     * Determine if a node qualifies as phrasing content.
+     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
+     *
+     * Text nodes and the names listed in phrasing_elems always qualify; "a", "del" and "ins" qualify only when
+     * every one of their children does.
+     *
+     * @return bool
+     */
+    public function isPhrasingContent()
+    {
+        if ($this->nodeType === XML_TEXT_NODE) {
+            return true;
+        }
+
+        if (in_array($this->nodeName, $this->phrasing_elems) !== false) {
+            return true;
+        }
+
+        if (is_null($this->childNodes)) {
+            return false;
+        }
+
+        $isTransparent = $this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins';
+        if (!$isTransparent) {
+            return false;
+        }
+
+        $allPhrasing = true;
+        foreach (iterator_to_array($this->childNodes) as $child) {
+            $allPhrasing = $child->isPhrasingContent() && $allPhrasing;
+        }
+
+        return $allPhrasing;
+    }
+
+    /**
+     * Best-effort visibility check based on attributes only.
+     *
+     * The original JS project inspects the computed style (display=none), which is not available here, so
+     * only the inline style, the "hidden" attribute and "aria-hidden" are looked at.
+     *
+     * @return bool
+     */
+    public function isProbablyVisible()
+    {
+        if (preg_match('/display:( )?none/i', $this->getAttribute('style'))) {
+            return false;
+        }
+
+        if ($this->hasAttribute('hidden')) {
+            return false;
+        }
+
+        if (!$this->hasAttribute('aria-hidden') || $this->getAttribute('aria-hidden') !== 'true') {
+            return true;
+        }
+
+        // aria-hidden="true": still treated as visible for "fallback-image" so that wikimedia math images are displayed
+        return $this->hasAttribute('class') && strpos($this->getAttribute('class'), 'fallback-image') !== false;
+    }
+
+    /**
+     * Tells whether the node is a text node with nothing left after trimming, or a "br" element.
+     *
+     * @return bool
+     */
+    public function isWhitespace()
+    {
+        if ($this->nodeType === XML_TEXT_NODE) {
+            return mb_strlen(trim($this->textContent)) === 0;
+        }
+
+        return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
+    }
+
+ /**
+ * This is a hack that overcomes the issue of node shifting when scanning and removing nodes.
+ *
+ * In the JS version of getElementsByTagName, if you remove a node it will not appear during the
+ * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an
+ * orphan node and will give an exception if you try to do anything with it.
+ *
+ * Shifting also occurs when converting parent nodes (like a P to a DIV), in which case the found nodes are
+ * removed from the foreach "pool" but the internal index of the foreach is not aware of it and skips over nodes
+ * that were never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the
+ * foreach tries to access node 6)
+ *
+ * This function solves this by searching for the nodes on every loop and keeping track of the count differences.
+ * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be
+ * used only when the results of the search are going to be used to remove the nodes.
+ *
+ * @param string $tag Tag name passed to getElementsByTagName
+ *
+ * @return \Generator Yields one matching node per step
+ */
+ public function shiftingAwareGetElementsByTagName($tag)
+ {
+ /** @var $nodes DOMNodeList */
+ $nodes = $this->getElementsByTagName($tag);
+ $count = $nodes->length;
+
+ // The step expression increments $i and clamps it at 0, because the adjustment below can push it negative.
+ for ($i = 0; $i < $count; $i = max(++$i, 0)) {
+ // Execution pauses here; the consumer may change the DOM before the generator resumes.
+ yield $nodes->item($i);
+
+ // Search for all the nodes again
+ $nodes = $this->getElementsByTagName($tag);
+
+ // Subtract the amount of nodes removed from the current index
+ $i -= $count - $nodes->length;
+
+ // Subtract the amount of nodes removed from the current count
+ $count -= ($count - $nodes->length);
+ }
+ }
+
+    /**
+     * Mimics JS's firstElementChild property. PHP's firstChild may be any type of DOMNode; this returns the
+     * first child that is a DOMElement instead.
+     *
+     * @return \DOMElement|null Null when there are no children or none of them is an element
+     */
+    public function getFirstElementChild()
+    {
+        $children = $this->childNodes;
+        if (!($children instanceof \Traversable)) {
+            return null;
+        }
+
+        foreach ($children as $child) {
+            if ($child instanceof \DOMElement) {
+                return $child;
+            }
+        }
+
+        return null;
+    }
+}
diff --git a/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php b/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php
new file mode 100644
index 0000000..56de705
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php
@@ -0,0 +1,192 @@
+ '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+ 'okMaybeItsACandidate' => '/and|article|body|column|content|main|shadow/i',
+ 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
+ 'byline' => '/byline|author|dateline|writtenby|p-author/i',
+ 'replaceFonts' => '/<(\/?)font[^>]*>/i',
+ 'normalize' => '/\s{2,}/',
+ 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
+ 'shareElements' => '/(\b|_)(share|sharedaddy)(\b|_)/i',
+ 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
+ 'prevLink' => '/(prev|earl|old|new|<|«)/i',
+ 'tokenize' => '/\W+/',
+ 'whitespace' => '/^\s*$/',
+ 'hasContent' => '/\S$/',
+ 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+ 'negative' => '/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+ // \x{00A0} is the unicode version of &nbsp;
+ 'onlyWhitespace' => '/\x{00A0}|\s+/u',
+ 'hashUrl' => '/^#.+/',
+ 'srcsetUrl' => '/(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/',
+ 'b64DataUrl' => '/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i',
+ // See: https://schema.org/Article
+ 'jsonLdArticleTypes' => '/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/'
+
+ ];
+
+    /**
+     * Finds the next node, starting from the given node, and ignoring
+     * whitespace in between. If the given node is an element, the same node is
+     * returned.
+     *
+     * Imported from the Element class on league\html-to-markdown.
+     *
+     * @param $node
+     *
+     * @return DOMNode The first node that is an element or is not whitespace; a falsy value when the
+     *                 siblings run out
+     */
+    public static function nextNode($node)
+    {
+        for ($current = $node; $current; $current = $current->nextSibling) {
+            // Stop on the first element, or on anything that does not report itself as whitespace.
+            if ($current->nodeType === XML_ELEMENT_NODE || !$current->isWhitespace()) {
+                break;
+            }
+        }
+
+        return $current;
+    }
+
+    /**
+     * Changes the node tag name. Since tagName on DOMElement is a read only value, this is done by building a
+     * new element with the new tag name in a scratch document, importing it into the node's document and
+     * swapping it in place of the original node.
+     *
+     * @param DOMNode $node             Node to replace; it must have a parent
+     * @param string  $value            New tag name
+     * @param bool    $importAttributes Copy the attributes of the original node onto the new one?
+     *
+     * @return DOMNode The node that replaced $node
+     */
+    public static function setNodeTag($node, $value, $importAttributes = true)
+    {
+        $scratch = new DOMDocument('1.0', 'utf-8');
+        $scratch->appendChild($scratch->createElement($value));
+
+        // Deep-copy every child of the original node under the new element.
+        foreach ($node->childNodes as $child) {
+            $scratch->firstChild->appendChild($scratch->importNode($child, true));
+        }
+
+        if ($importAttributes) {
+            // Import attributes from the original node.
+            foreach ($node->attributes as $attribute) {
+                $scratch->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue);
+            }
+        }
+
+        // The import must be done on the firstChild of the scratch document, since that document itself is a
+        // DOMDocument and not a DOMElement.
+        $replacement = $node->ownerDocument->importNode($scratch->firstChild, true);
+        $node->parentNode->replaceChild($replacement, $node);
+
+        return $replacement;
+    }
+
+    /**
+     * Removes the current node and returns the next node to be parsed (sibling or an ancestor's sibling).
+     *
+     * The next node is looked up before the removal, skipping the node itself and its children. A node that
+     * has no parent is left as it is instead of triggering an error.
+     *
+     * @param DOMNode $node
+     *
+     * @return DOMNode
+     */
+    public static function removeAndGetNext($node)
+    {
+        $nextNode = self::getNextNode($node, true);
+
+        // removeNode() tolerates a node that is already detached from its parent.
+        self::removeNode($node);
+
+        return $nextNode;
+    }
+
+    /**
+     * Detaches the given node from its parent. Does nothing when the node has no parent.
+     *
+     * @param $node DOMElement
+     *
+     * @return void
+     **/
+    public static function removeNode($node)
+    {
+        $owner = $node->parentNode;
+        if (!$owner) {
+            return;
+        }
+
+        $owner->removeChild($node);
+    }
+
+    /**
+     * Returns the next node of a depth-first walk. First checks for children (if the flag allows it), then for
+     * siblings, and finally for the siblings of the ancestors.
+     *
+     * Calling this in a loop will traverse the DOM depth-first. Pass true as the second parameter to indicate
+     * that the node itself (and its kids) are going away, and the next node over is wanted.
+     *
+     * @param DOMNode $originalNode
+     * @param bool    $ignoreSelfAndKids
+     *
+     * @return DOMNode The next node, or a falsy value when the walk is over
+     */
+    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
+    {
+        // First check for kids if those aren't being ignored
+        if (!$ignoreSelfAndKids && $originalNode->firstChild) {
+            return $originalNode->firstChild;
+        }
+
+        // Then look for a sibling on the node itself and, failing that, on each ancestor in turn. The ancestors
+        // themselves are not returned: in a depth-first traversal they have already been seen.
+        for ($cursor = $originalNode; $cursor; $cursor = $cursor->parentNode) {
+            if ($cursor->nextSibling) {
+                return $cursor->nextSibling;
+            }
+        }
+
+        return $cursor;
+    }
+
+    /**
+     * Copies a native node list into a new DOMNodeList, leaving out the text nodes that are empty after
+     * trimming. Every other node is kept, in the original order.
+     *
+     * @param \DOMNodeList $list
+     *
+     * @return DOMNodeList
+     */
+    public static function filterTextNodes(\DOMNodeList $list)
+    {
+        $kept = new DOMNodeList();
+
+        foreach ($list as $candidate) {
+            $isBlankText = $candidate->nodeType === XML_TEXT_NODE && !mb_strlen(trim($candidate->nodeValue));
+            if ($isBlankText) {
+                continue;
+            }
+
+            $kept->add($candidate);
+        }
+
+        return $kept;
+    }
+}
diff --git a/vendor/fivefilters/readability.php/src/ParseException.php b/vendor/fivefilters/readability.php/src/ParseException.php
new file mode 100644
index 0000000..587da33
--- /dev/null
+++ b/vendor/fivefilters/readability.php/src/ParseException.php
@@ -0,0 +1,7 @@
+ '<',
+ 'gt' => '>',
+ 'amp' => '&',
+ 'quot' => '"',
+ 'apos' => '\'',
+ ];
+
+    /**
+     * Builds a Readability instance around the given configuration.
+     *
+     * The logger is taken from the configuration and kept on the instance.
+     *
+     * @param Configuration $configuration
+     */
+    public function __construct(Configuration $configuration)
+    {
+        $this->logger = $configuration->getLogger();
+        $this->configuration = $configuration;
+    }
+
+    /**
+     * Main parse function.
+     *
+     * Loads the HTML, collects metadata and the main image, and then scores the document in a loop.
+     * While the extracted text stays below the configured character threshold, the HTML is reloaded
+     * and one flag is switched off per pass, in this order: StripUnlikelyCandidates, WeightClasses,
+     * CleanConditionally. Once every flag is off, the longest attempt seen during this call is used.
+     *
+     * Note that the flags are switched off on the Configuration object itself and are not restored
+     * by this method.
+     *
+     * @param string $html
+     *
+     * @throws ParseException When the document has no usable body, or when no attempt produced any text
+     *
+     * @return bool True on success, false when the scoring pass produced no result at all
+     */
+    public function parse($html)
+    {
+        $this->logger->info('*** Starting parse process...');
+
+        // Start from a clean slate: attempts are only ever appended to below, so leftovers from an
+        // earlier parse() call on this instance could otherwise win the "longest attempt" fallback.
+        $this->attempts = [];
+
+        $this->dom = $this->loadHTML($html);
+
+        // Checking for minimum HTML to work with.
+        if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
+            $this->logger->emergency('No body tag present or body tag empty');
+
+            throw new ParseException('Invalid or incomplete HTML.');
+        }
+
+        $this->getMetadata();
+
+        $this->getMainImage();
+
+        while (true) {
+            $this->logger->debug('Starting parse loop');
+            // $root is the body element at this point (initially and after every reload below)
+            $root = $root->firstChild;
+
+            $elementsToScore = $this->getNodes($root);
+            $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));
+
+            $result = $this->rateNodes($elementsToScore);
+
+            /*
+             * Now that we've gone through the full algorithm, check to see if
+             * we got any meaningful content. If we didn't, we may need to re-run
+             * grabArticle with different flags set. This gives us a higher likelihood of
+             * finding the content, and the sieve approach gives us a higher likelihood of
+             * finding the -right- content.
+             */
+
+            // Only touch textContent when there is a result; an empty result counts as zero length
+            // and falls through to the "parse failed" exit after the loop.
+            // The value is a character count (despite the wording of the log message below).
+            $length = $result
+                ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
+                : 0;
+
+            $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));
+
+            if (!$result || $length >= $this->configuration->getCharThreshold()) {
+                // Either nothing was extracted at all, or the threshold was met: stop retrying.
+                break;
+            }
+
+            // Threshold not met: reload a pristine DOM and remember this attempt before relaxing a flag.
+            $this->dom = $this->loadHTML($html);
+            $root = $this->dom->getElementsByTagName('body')->item(0);
+            $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
+
+            if ($this->configuration->getStripUnlikelyCandidates()) {
+                $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
+                $this->configuration->setStripUnlikelyCandidates(false);
+            } elseif ($this->configuration->getWeightClasses()) {
+                $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
+                $this->configuration->setWeightClasses(false);
+            } elseif ($this->configuration->getCleanConditionally()) {
+                $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
+                $this->configuration->setCleanConditionally(false);
+            } else {
+                $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');
+
+                // No luck after removing flags, just return the longest text we found during the different loops
+                usort($this->attempts, function ($a, $b) {
+                    return $b['textLength'] - $a['textLength'];
+                });
+
+                // But first check if we actually have something
+                if (!$this->attempts[0]['textLength']) {
+                    $this->logger->emergency('[Parsing] Could not parse text, giving up :(');
+
+                    throw new ParseException('Could not parse text.');
+                }
+
+                $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');
+
+                $result = $this->attempts[0]['articleContent'];
+                break;
+            }
+        }
+
+        if (!$result) {
+            $this->logger->info('*** Parse failed :(');
+            return false;
+        }
+
+        $result = $this->postProcessContent($result);
+
+        // If we haven't found an excerpt in the article's metadata, use the article's
+        // first paragraph as the excerpt. This can be used for displaying a preview of
+        // the article's content.
+        if (!$this->getExcerpt()) {
+            $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
+            $paragraphs = $result->getElementsByTagName('p');
+            if ($paragraphs->length > 0) {
+                $this->setExcerpt(trim($paragraphs->item(0)->textContent));
+            }
+        }
+
+        $this->setContent($result);
+
+        $this->logger->info('*** Parse successful :)');
+
+        return true;
+    }
+
+ /**
+ * Creates a DOM Document object and loads the provided HTML on it.
+ *
+ * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
+ * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
+ * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
+ * objects and ruining the backup.
+ *
+ * @param string $html
+ *
+ * @return DOMDocument
+ */
+ private function loadHTML($html)
+ {
+ $this->logger->debug('[Loading] Loading HTML...');
+
+ // To avoid throwing a gazillion of errors on malformed HTMLs
+ libxml_use_internal_errors(true);
+
+ //$html = preg_replace('/(
]*>[ \n\r\t]*){2,}/i', '
', $html);
+
+ if ($this->configuration->getParser() === 'html5') {
+ $this->logger->debug('[Loading] Using HTML5 parser...');
+ $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]);
+ $dom = $html5->loadHTML($html);
+ //TODO: Improve this so it looks inside
, not just any
+ $base = $dom->getElementsByTagName('base');
+ if ($base->length > 0) {
+ $base = $base->item(0);
+ $base = $base->getAttribute('href');
+ if ($base != '') {
+ $this->baseURI = $base;
+ }
+ }
+ } else {
+ $this->logger->debug('[Loading] Using libxml parser...');
+ $dom = new DOMDocument('1.0', 'utf-8');
+ if ($this->configuration->getNormalizeEntities()) {
+ $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
+ // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
+ $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
+ }
+ }
+
+ if (!$this->configuration->getSubstituteEntities()) {
+ // Keep the original HTML entities
+ $dom->substituteEntities = false;
+ }
+
+ if ($this->configuration->getSummonCthulhu()) {
+ $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
+ $html = preg_replace('/