summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2017-11-28 19:12:29 +0000
committerAndres Rey <[email protected]>2017-11-28 19:12:29 +0000
commitd1bc52aad329f6bf1d4f88758ea8dc1ba3f37ab8 (patch)
tree74880cd3c65e2eab6928c27b1623d1d4e41b3e7f /src
parent79f1356c18955c56d289a5f17288e9d24136821f (diff)
Import and fix more code
Diffstat (limited to 'src')
-rw-r--r--src/NodeClass/NodeClassTrait.php33
-rw-r--r--src/NodeUtility.php21
-rw-r--r--src/Readability.php385
3 files changed, 419 insertions, 20 deletions
diff --git a/src/NodeClass/NodeClassTrait.php b/src/NodeClass/NodeClassTrait.php
index 0d36039..b2c0fcc 100644
--- a/src/NodeClass/NodeClassTrait.php
+++ b/src/NodeClass/NodeClassTrait.php
@@ -282,12 +282,12 @@ trait NodeClassTrait
/**
* Creates a new node based on the text content of the original node.
*
- * @param $originalNode self
+ * @param $originalNode DOMElement
* @param $tagName string
*
- * @return self
+ * @return DOMElement
*/
- public function createNode(self $originalNode, $tagName)
+ public function createNode($originalNode, $tagName)
{
$text = NodeUtility::getTextContent($originalNode);
$newNode = $originalNode->ownerDocument->createElement($tagName, $text);
@@ -370,4 +370,31 @@ trait NodeClassTrait
);
}
+ /**
+ * Return an array indicating how many rows and columns this table has.
+ *
+ * Every <tr> descendant adds its numeric "rowspan" (at least 1) to the row total.
+ * The column total is the widest row found, where every <td> of a row adds its
+ * numeric "colspan" (at least 1). Missing, empty or non-numeric values count as 1.
+ *
+ * @return array ['rows' => int, 'columns' => int]
+ */
+ public function _getRowAndColumnCount()
+ {
+ $rows = $columns = 0;
+ $trs = $this->getElementsByTagName('tr');
+ foreach ($trs as $tr) {
+ /** @var \DOMElement $tr */
+ // In PHP `$rowspan || 1` is a boolean (always true, i.e. 1), so the span value
+ // was ignored. Cast the attribute and use 1 as the lower bound instead.
+ $rows += max((int) $tr->getAttribute('rowspan'), 1);
+
+ // Now look for column-related info
+ $columnsInThisRow = 0;
+ $cells = $tr->getElementsByTagName('td');
+ foreach ($cells as $cell) {
+ /** @var \DOMElement $cell */
+ $columnsInThisRow += max((int) $cell->getAttribute('colspan'), 1);
+ }
+ $columns = max($columns, $columnsInThisRow);
+ }
+
+ return ['rows' => $rows, 'columns' => $columns];
+ }
}
diff --git a/src/NodeUtility.php b/src/NodeUtility.php
index 52f2d09..4621f21 100644
--- a/src/NodeUtility.php
+++ b/src/NodeUtility.php
@@ -2,6 +2,7 @@
namespace andreskrey\Readability;
use andreskrey\Readability\NodeClass\DOMDocument;
+use andreskrey\Readability\NodeClass\DOMElement;
use andreskrey\Readability\NodeClass\DOMNode;
/**
@@ -100,6 +101,22 @@ class NodeUtility
}
/**
+ * Detach the given node from its parent.
+ *
+ * @param DOMElement $node Node to detach; nothing happens when it has no parent.
+ *
+ * @return void
+ **/
+ public static function removeNode(DOMElement $node)
+ {
+ if (!$node->parentNode) {
+ return;
+ }
+
+ $node->parentNode->removeChild($node);
+ }
+
+
+ /**
* Returns the next node. First checks for childs (if the flag allows it), then for siblings, and finally
* for parents.
*
@@ -132,7 +149,7 @@ class NodeUtility
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
- $originalNode = $originalNode->getParent();
+ $originalNode = $originalNode->parentNode;
} while ($originalNode && !$originalNode->nextSibling);
return ($originalNode) ? $originalNode->nextSibling : $originalNode;
@@ -189,7 +206,7 @@ class NodeUtility
/**
* Returns the full text of the node.
*
- * @param $node DOMNode
+ * @param $node DOMNode|DOMElement
* @param bool $normalize Normalize white space?
* @return string
*/
diff --git a/src/Readability.php b/src/Readability.php
index add5366..452bf95 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -14,6 +14,7 @@ use andreskrey\Readability\NodeClass\DOMNode;
use andreskrey\Readability\NodeClass\DOMNotation;
use andreskrey\Readability\NodeClass\DOMProcessingInstruction;
use andreskrey\Readability\NodeClass\DOMText;
+use andreskrey\Readability\NodeClass\NodeClassTrait;
/**
* Class Readability
@@ -134,16 +135,16 @@ class Readability
foreach ($result->getElementsByTagName('p') as $p) {
$length += mb_strlen($p->textContent);
}
- if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->getConfig()->getOption('wordThreshold')) {
+ if ($result && mb_strlen(preg_replace('/\s/', '', $result->textContent)) < $this->configuration->getWordThreshold()) {
$this->dom = $this->loadHTML($html);
$root = $this->dom->getElementsByTagName('body')->item(0);
- if ($this->getConfig()->getOption('stripUnlikelyCandidates')) {
- $this->getConfig()->setOption('stripUnlikelyCandidates', false);
- } elseif ($this->getConfig()->getOption('weightClasses')) {
- $this->getConfig()->setOption('weightClasses', false);
- } elseif ($this->getConfig()->getOption('cleanConditionally')) {
- $this->getConfig()->setOption('cleanConditionally', false);
+ if ($this->configuration->getStripUnlikelyCandidates()) {
+ $this->configuration->setStripUnlikelyCandidates(false);
+ } elseif ($this->configuration->getWeightClasses()) {
+ $this->configuration->setWeightClasses(false);
+ } elseif ($this->configuration->getCleanConditionally()) {
+ $this->configuration->setCleanConditionally(false);
} else {
$parseSuccessful = false;
break;
@@ -286,6 +287,37 @@ class Readability
}
/**
+ * Collects image URLs for the current document: the metadata image (when set),
+ * followed by the "src" attribute of every <img> element of the loaded DOM.
+ *
+ * When no DOM is loaded, only the metadata image (if any) is returned, untouched.
+ * Otherwise every URL goes through toAbsoluteURI() when the fixRelativeURLs option
+ * is enabled, and empty values and duplicates are dropped.
+ *
+ * @return array Sequentially indexed list of image URLs.
+ */
+ public function getImages()
+ {
+ $result = [];
+ if (!empty($this->metadata['image'])) {
+ $result[] = $this->metadata['image'];
+ }
+ if (null === $this->dom) {
+ return $result;
+ }
+
+ foreach ($this->dom->getElementsByTagName('img') as $img) {
+ if ($src = $img->getAttribute('src')) {
+ $result[] = $src;
+ }
+ }
+
+ if ($this->configuration->getFixRelativeURLs()) {
+ // array_map instead of a by-reference foreach, so no dangling reference to the
+ // last element is left behind.
+ $result = array_map([$this, 'toAbsoluteURI'], $result);
+ }
+
+ // array_filter() and array_unique() preserve keys; reindex so callers get a plain list.
+ return array_values(array_unique(array_filter($result)));
+ }
+
+
+ /**
* Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
* find a correct image.
*
@@ -758,13 +790,13 @@ class Readability
* tree.
*/
- $parentOfTopCandidate = $topCandidate->getParent();
+ $parentOfTopCandidate = $topCandidate->parentNode;
$lastScore = $topCandidate->getContentScore();
// The scores shouldn't get too low.
$scoreThreshold = $lastScore / 3;
- /* @var Readability $parentOfTopCandidate */
+ /* @var DOMElement $parentOfTopCandidate */
while (!$parentOfTopCandidate->tagNameEqualsTo('body')) {
$parentScore = $parentOfTopCandidate->getContentScore();
if ($parentScore < $scoreThreshold) {
@@ -777,15 +809,15 @@ class Readability
break;
}
$lastScore = $parentOfTopCandidate->getContentScore();
- $parentOfTopCandidate = $parentOfTopCandidate->getParent();
+ $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
}
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
- $parentOfTopCandidate = $topCandidate->getParent();
+ $parentOfTopCandidate = $topCandidate->parentNode;
while (!$parentOfTopCandidate->tagNameEqualsTo('body') && count($parentOfTopCandidate->getChildren(true)) === 1) {
$topCandidate = $parentOfTopCandidate;
- $parentOfTopCandidate = $topCandidate->getParent();
+ $parentOfTopCandidate = $topCandidate->parentNode;
}
}
@@ -841,7 +873,7 @@ class Readability
* Turn it into a div so it doesn't get filtered out later by accident.
*/
- $sibling->setNodeTag('div');
+ NodeUtility::setNodeTag($siblings, 'div');
}
$import = $articleContent->importNode($sibling, true);
@@ -942,8 +974,6 @@ class Readability
$this->_cleanExtraParagraphs($article);
- $this->_cleanReadabilityTags($article);
-
foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
$next = $br->nextSibling;
if ($next && $next->nodeName === 'p') {
@@ -954,6 +984,331 @@ class Readability
return $article;
}
+ /**
+ * Look for 'data' (as opposed to 'layout') tables, for which we use
+ * similar checks as
+ * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
+ *
+ * Every <table> in the document gets a boolean "readabilityDataTable" property. The checks
+ * run in a fixed order and the first one that matches decides the value.
+ *
+ * TODO To be moved to Readability. WARNING: check if we actually keep the "readabilityDataTable" param and
+ * maybe switch to a readability data-tag?
+ *
+ * NOTE(review): the flag is stored as a dynamic property on the node object, not as a DOM
+ * attribute. Confirm that it is still readable when the same node is fetched again later.
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ */
+ public function _markDataTables(DOMDocument $article)
+ {
+ $tables = $article->getElementsByTagName('table');
+ foreach ($tables as $table) {
+ /** @var DOMElement $table */
+ // role="presentation" explicitly marks a layout table.
+ $role = $table->getAttribute('role');
+ if ($role === 'presentation') {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+ // datatable="0" also opts out of data-table treatment.
+ $datatable = $table->getAttribute('datatable');
+ if ($datatable == '0') {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+ // A non-empty summary attribute is taken as a sign of a data table.
+ $summary = $table->getAttribute('summary');
+ if ($summary) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+
+ // So is a <caption> that has at least one child node (only the first caption is inspected).
+ $caption = $table->getElementsByTagName('caption');
+ if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+
+ // If the table has a descendant with any of these tags, consider a data table:
+ foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
+ if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
+ $table->readabilityDataTable = true;
+ // "continue 2" moves on to the next table, not to the next tag name.
+ continue 2;
+ }
+ }
+
+ // Nested tables indicate a layout table:
+ if ($table->getElementsByTagName('table')->length > 0) {
+ $table->readabilityDataTable = false;
+ continue;
+ }
+
+ // Large tables (10 or more rows, or more than 4 columns) are treated as data tables.
+ $sizeInfo = $table->_getRowAndColumnCount();
+ if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
+ $table->readabilityDataTable = true;
+ continue;
+ }
+ // Now just go by size entirely:
+ $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10;
+ }
+ }
+
+
+ /**
+ * Strips the `style` attribute and the deprecated presentational attributes from the given
+ * node and, recursively, from all of its descendants. An <svg> node and everything below it
+ * is left untouched.
+ * TODO: To be moved to Readability.
+ *
+ * @param $node \DOMDocument|\DOMNode
+ **/
+ public function _cleanStyles($node)
+ {
+ $hasTagName = property_exists($node, 'tagName');
+ if ($hasTagName && $node->tagName === 'svg') {
+ return;
+ }
+
+ // Nodes without a removeAttribute method are only traversed, not cleaned
+ if (method_exists($node, 'removeAttribute')) {
+ // `style` plus the deprecated presentational attributes
+ foreach (['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'] as $attributeName) {
+ $node->removeAttribute($attributeName);
+ }
+
+ // Explicit sizes are dropped only on these elements
+ if ($hasTagName && in_array($node->tagName, ['table', 'th', 'td', 'hr', 'pre'])) {
+ $node->removeAttribute('width');
+ $node->removeAttribute('height');
+ }
+ }
+
+ for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
+ $this->_cleanStyles($child);
+ }
+ }
+
+ /**
+ * Clean out elements whose id/class combinations match specific string.
+ *
+ * Starting at the node returned by getNextNode($node), each visited node has its
+ * "<class> <id>" string tested against $regex. Matching nodes are removed; the walk stops
+ * at the node returned by getNextNode($node, true) or when no next node is left.
+ *
+ * TODO To be moved to readability
+ *
+ * @param $node DOMElement Node to clean
+ * @param $regex string Match id/class combination.
+ *
+ * @return void
+ **/
+ public function _cleanMatchedNodes($node, $regex)
+ {
+ $endOfSearchMarkerNode = $node->getNextNode($node, true);
+ $next = $node->getNextNode($node);
+ while ($next && $next !== $endOfSearchMarkerNode) {
+ if (preg_match($regex, sprintf('%s %s', $next->getAttribute('class'), $next->getAttribute('id')))) {
+ // Remove the node that matched, not the root node being cleaned, and carry on
+ // from whatever removeAndGetNext() hands back.
+ $next = NodeUtility::removeAndGetNext($next);
+ } else {
+ $next = $next->getNextNode($next);
+ }
+ }
+ }
+
+ /**
+ * Removes every <p> that contains no img, embed, object or iframe element and whose text
+ * is empty once the 'onlyWhitespace' pattern has been stripped from it.
+ *
+ * TODO To be moved to Readability.
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ */
+ public function _cleanExtraParagraphs(DOMDocument $article)
+ {
+ $paragraphs = $article->getElementsByTagName('p');
+ $length = $paragraphs->length;
+
+ // Walk the live list backwards so removals do not shift the indexes still to be visited.
+ for ($i = 0; $i < $length; $i++) {
+ $paragraph = $paragraphs->item($length - 1 - $i);
+
+ $imgCount = $paragraph->getElementsByTagName('img')->length;
+ $embedCount = $paragraph->getElementsByTagName('embed')->length;
+ $objectCount = $paragraph->getElementsByTagName('object')->length;
+ // At this point, nasty iframes have been removed, only remain embedded video ones.
+ $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
+ $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
+
+ if ($totalCount !== 0) {
+ continue;
+ }
+
+ // Compare against the empty string explicitly: a plain truthiness test would also
+ // drop a paragraph whose whole text is "0". A null result (PCRE error) is still
+ // treated as empty, as before.
+ $remainingText = preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent);
+ if ($remainingText === '' || $remainingText === null) {
+ // TODO must be done via readability
+ $paragraph->parentNode->removeChild($paragraph);
+ }
+ }
+ }
+
+ /**
+ * Removes elements of the given tag name that look like non-content, based on class weight,
+ * comma count, counts of embedded elements, link density and text length.
+ * Does nothing when the cleanConditionally option is disabled.
+ *
+ * TODO To be moved to Readability.
+ *
+ * @param DOMDocument $article
+ * @param string $tag Tag name of the elements to examine
+ *
+ * @return void
+ */
+ public function _cleanConditionally(DOMDocument $article, $tag)
+ {
+ if (!$this->configuration->getCleanConditionally()) {
+ return;
+ }
+
+ // Lists are exempt from several of the removal rules below.
+ $isList = in_array($tag, ['ul', 'ol']);
+
+ /*
+ * Gather counts for other typical elements embedded within.
+ * Traverse backwards so we can remove nodes at the same time
+ * without affecting the traversal.
+ */
+
+ $DOMNodeList = $article->getElementsByTagName($tag);
+ $length = $DOMNodeList->length;
+ for ($i = 0; $i < $length; $i++) {
+ /** @var $node DOMElement */
+ $node = $DOMNodeList->item($length - 1 - $i);
+
+ // First check if we're in a data table, in which case don't remove us.
+ // NOTE(review): the flag is read from $node itself rather than from the ancestor table,
+ // and isset() is also true when the flag is false. Confirm this is intended.
+ if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) {
+ continue;
+ }
+
+ $weight = $node->getClassWeight();
+
+ // A negative class weight removes the node outright.
+ if ($weight < 0) {
+ NodeUtility::removeNode($node);
+ continue;
+ }
+
+ if (substr_count($node->getTextContent(), ',') < 10) {
+ /*
+ * If there are not very many commas, and the number of
+ * non-paragraph elements is more than paragraphs or other
+ * ominous signs, remove the element.
+ */
+
+ // TODO Horrible hack, must be removed once this function is inside Readability
+ $p = $node->getElementsByTagName('p')->length;
+ $img = $node->getElementsByTagName('img')->length;
+ // 100 is subtracted from the <li> count, so "$li > $p" below only holds when the
+ // node has over 100 more list items than paragraphs.
+ $li = $node->getElementsByTagName('li')->length - 100;
+ $input = $node->getElementsByTagName('input')->length;
+
+ // Only <embed> elements whose canonical XML does NOT match the 'videos' pattern are counted.
+ // NOTE(review): this looks inverted - $embedCount is incremented only when the pattern
+ // matches. Confirm against the upstream implementation.
+ $embedCount = 0;
+ $embeds = $node->getElementsByTagName('embed');
+
+ foreach ($embeds as $embedNode) {
+ if (preg_match($this->regexps['videos'], $embedNode->C14N())) {
+ $embedCount++;
+ }
+ }
+
+ $linkDensity = $node->getLinkDensity();
+ $contentLength = mb_strlen($node->getTextContent(true));
+
+ // Any single rule being true is enough to remove the node.
+ // "$p / $img" is only evaluated when $img > 1, so it cannot divide by zero.
+ $haveToRemove =
+ ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag($node, 'figure')) ||
+ (!$isList && $li > $p) ||
+ ($input > floor($p / 3)) ||
+ (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag($node, 'figure')) ||
+ (!$isList && $weight < 25 && $linkDensity > 0.2) ||
+ ($weight >= 25 && $linkDensity > 0.5) ||
+ (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);
+
+ if ($haveToRemove) {
+ NodeUtility::removeNode($node);
+ }
+ }
+ }
+ }
+
+ /**
+ * Removes every element named $tag from the article.
+ * For object, embed and iframe elements, those matching the 'videos' pattern (in their
+ * attribute values or in their canonical XML) are kept.
+ *
+ * TODO To be moved to Readability
+ *
+ * @param $article DOMDocument
+ * @param $tag string tag to clean
+ *
+ * @return void
+ **/
+ public function _clean(DOMDocument $article, $tag)
+ {
+ $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);
+
+ $candidates = $article->getElementsByTagName($tag);
+ // Walk the live list from its tail so removals do not shift the indexes still to visit.
+ for ($index = $candidates->length - 1; $index >= 0; $index--) {
+ $element = $candidates->item($index);
+
+ if ($isEmbed) {
+ $values = [];
+ foreach ($element->attributes as $attribute) {
+ $values[] = $attribute->nodeValue;
+ }
+
+ // Keep the element when its attribute values, or failing that its own markup,
+ // match the video pattern.
+ if (preg_match($this->regexps['videos'], implode('|', $values))
+ || preg_match($this->regexps['videos'], $element->C14N())) {
+ continue;
+ }
+ }
+
+ NodeUtility::removeNode($element);
+ }
+ }
+
+ /**
+ * Clean out spurious headers from an Element: every <h1> and <h2> whose class weight is
+ * negative is removed.
+ *
+ * TODO To be moved to Readability
+ *
+ * @param DOMDocument $article
+ *
+ * @return void
+ **/
+ public function _cleanHeaders(DOMDocument $article)
+ {
+ for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
+ // Iterate over a snapshot: getElementsByTagName() returns a live list, and removing
+ // nodes while iterating it directly makes the loop skip elements.
+ $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));
+ foreach ($headers as $header) {
+ /** @var DOMElement $header */
+ if ($header->getClassWeight() < 0) {
+ NodeUtility::removeNode($header);
+ }
+ }
+ }
+ }
+
+ /**
+ * Post-processes the extracted article. When the fixRelativeURLs option is enabled, links
+ * with a `javascript:` href are replaced by their text content, and every other link href
+ * and image src is passed through toAbsoluteURI().
+ *
+ * @param DOMDocument $article
+ *
+ * @return DOMDocument The same document instance that was passed in.
+ */
+ public function postProcessContent(DOMDocument $article)
+ {
+ // Readability cannot open relative uris so we convert them to absolute uris.
+ if (!$this->configuration->getFixRelativeURLs()) {
+ return $article;
+ }
+
+ // Work on a snapshot of the links, since some of them get replaced below.
+ $links = iterator_to_array($article->getElementsByTagName('a'));
+ foreach ($links as $link) {
+ /** @var DOMElement $link */
+ $href = $link->getAttribute('href');
+ if (!$href) {
+ continue;
+ }
+
+ if (strpos($href, 'javascript:') === 0) {
+ // javascript: URIs won't work after scripts have been removed from the page,
+ // so only the link text is kept.
+ $link->parentNode->replaceChild($article->createTextNode($link->textContent), $link);
+ } else {
+ $link->setAttribute('href', $this->toAbsoluteURI($href));
+ }
+ }
+
+ foreach ($article->getElementsByTagName('img') as $img) {
+ /** @var DOMElement $img */
+ $src = $img->getAttribute('src');
+ if ($src) {
+ $img->setAttribute('src', $this->toAbsoluteURI($src));
+ }
+ }
+
+ return $article;
+ }
+
/**
* @return null|string