summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-12-01 20:44:35 +0000
committer Andres Rey <[email protected]> 2017-12-01 20:44:35 +0000
commit ecebda06241ff34eac70b247c74175b162902582 (patch)
tree c919efcf1b013cbd7f1221596328f93e0b54e367
parent 8a266f2cae5dce8d1fa39c40caac8400406898bb (diff)
parent a2eded4e4df7e48a6f20d9e0449c52bf8003c471 (diff)
Merge remote-tracking branch 'origin/v1.0' into v1.0
# Conflicts:
#   src/NodeClass/DOMNode.php
#   src/Readability.php
-rw-r--r-- src/Configuration.php 26
-rw-r--r-- src/NodeClass/DOMNode.php 13
-rw-r--r-- src/Nodes/DOMDocument.php 2
-rw-r--r-- src/Nodes/NodeTrait.php 34
-rw-r--r-- src/Nodes/NodeUtility.php 15
-rw-r--r-- src/Readability.php 64
-rw-r--r-- test/ReadabilityTest.php 5
7 files changed, 101 insertions, 58 deletions
diff --git a/src/Configuration.php b/src/Configuration.php
index 6ce8b5b..1a405de 100644
--- a/src/Configuration.php
+++ b/src/Configuration.php
@@ -3,7 +3,7 @@
namespace andreskrey\Readability;
/**
- * Class Configuration
+ * Class Configuration.
*/
class Configuration
{
@@ -62,11 +62,13 @@ class Configuration
/**
* @param int $maxTopCandidates
+ *
* @return $this
*/
public function setMaxTopCandidates($maxTopCandidates)
{
$this->maxTopCandidates = $maxTopCandidates;
+
return $this;
}
@@ -80,11 +82,13 @@ class Configuration
/**
* @param int $wordThreshold
+ *
* @return $this
*/
public function setWordThreshold($wordThreshold)
{
$this->wordThreshold = $wordThreshold;
+
return $this;
}
@@ -98,11 +102,13 @@ class Configuration
/**
* @param bool $articleByLine
+ *
* @return $this
*/
public function setArticleByLine($articleByLine)
{
$this->articleByLine = $articleByLine;
+
return $this;
}
@@ -116,11 +122,13 @@ class Configuration
/**
* @param bool $stripUnlikelyCandidates
+ *
* @return $this
*/
public function setStripUnlikelyCandidates($stripUnlikelyCandidates)
{
$this->stripUnlikelyCandidates = $stripUnlikelyCandidates;
+
return $this;
}
@@ -134,11 +142,13 @@ class Configuration
/**
* @param bool $cleanConditionally
+ *
* @return $this
*/
public function setCleanConditionally($cleanConditionally)
{
$this->cleanConditionally = $cleanConditionally;
+
return $this;
}
@@ -152,11 +162,13 @@ class Configuration
/**
* @param bool $weightClasses
+ *
* @return $this
*/
public function setWeightClasses($weightClasses)
{
$this->weightClasses = $weightClasses;
+
return $this;
}
@@ -170,11 +182,13 @@ class Configuration
/**
* @param bool $removeReadabilityTags
+ *
* @return $this
*/
public function setRemoveReadabilityTags($removeReadabilityTags)
{
$this->removeReadabilityTags = $removeReadabilityTags;
+
return $this;
}
@@ -188,11 +202,13 @@ class Configuration
/**
* @param bool $fixRelativeURLs
+ *
* @return $this
*/
public function setFixRelativeURLs($fixRelativeURLs)
{
$this->fixRelativeURLs = $fixRelativeURLs;
+
return $this;
}
@@ -206,11 +222,13 @@ class Configuration
/**
* @param bool $substituteEntities
+ *
* @return $this
*/
public function setSubstituteEntities($substituteEntities)
{
$this->substituteEntities = $substituteEntities;
+
return $this;
}
@@ -224,11 +242,13 @@ class Configuration
/**
* @param bool $normalizeEntities
+ *
* @return $this
*/
public function setNormalizeEntities($normalizeEntities)
{
$this->normalizeEntities = $normalizeEntities;
+
return $this;
}
@@ -242,11 +262,13 @@ class Configuration
/**
* @param string $originalURL
+ *
* @return $this
*/
public function setOriginalURL($originalURL)
{
$this->originalURL = $originalURL;
+
return $this;
}
@@ -260,11 +282,13 @@ class Configuration
/**
* @param bool $summonCthulhu
+ *
* @return $this
*/
public function setSummonCthulhu($summonCthulhu)
{
$this->summonCthulhu = $summonCthulhu;
+
return $this;
}
diff --git a/src/NodeClass/DOMNode.php b/src/NodeClass/DOMNode.php
new file mode 100644
index 0000000..eb5e93e
--- /dev/null
+++ b/src/NodeClass/DOMNode.php
@@ -0,0 +1,13 @@
+<?php
+
+namespace andreskrey\Readability\NodeClass;
+
+/**
+ * Class DOMNode.
+ *
+ * @method getAttribute($attribute)
+ */
+class DOMNode extends \DOMNode
+{
+ use NodeClassTrait;
+}
diff --git a/src/Nodes/DOMDocument.php b/src/Nodes/DOMDocument.php
index f954f7d..510cdf2 100644
--- a/src/Nodes/DOMDocument.php
+++ b/src/Nodes/DOMDocument.php
@@ -14,7 +14,7 @@ class DOMDocument extends \DOMDocument
$this->registerNodeClass('DOMCdataSection', DOMCdataSection::class);
$this->registerNodeClass('DOMCharacterData', DOMCharacterData::class);
$this->registerNodeClass('DOMComment', DOMComment::class);
- $this->registerNodeClass('DOMDocument', DOMDocument::class);
+ $this->registerNodeClass('DOMDocument', self::class);
$this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class);
$this->registerNodeClass('DOMDocumentType', DOMDocumentType::class);
$this->registerNodeClass('DOMElement', DOMElement::class);
diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php
index 3294612..5847178 100644
--- a/src/Nodes/NodeTrait.php
+++ b/src/Nodes/NodeTrait.php
@@ -5,20 +5,27 @@ namespace andreskrey\Readability\Nodes;
trait NodeTrait
{
/**
- * Content score of the node. Used to determine the value of the content
+ * Content score of the node. Used to determine the value of the content.
*
* @var int
*/
public $contentScore = 0;
/**
- * Flag for initialized status
+ * Flag for initialized status.
*
* @var bool
*/
private $initialized = false;
/**
+ * Flag data tables.
+ *
+ * @var bool
+ */
+ private $readabilityDataTable = false;
+
+ /**
* @var array
*/
private $divToPElements = [
@@ -36,7 +43,7 @@ trait NodeTrait
];
/**
- * initialized getter
+ * initialized getter.
*
* @return bool
*/
@@ -46,11 +53,28 @@ trait NodeTrait
}
/**
+ * @return bool
+ */
+ public function isReadabilityDataTable()
+ {
+ return $this->readabilityDataTable;
+ }
+
+ /**
+ * @param bool $param
+ */
+ public function setReadabilityDataTable($param)
+ {
+ $this->readabilityDataTable = $param;
+ }
+
+ /**
* Initializer. Calculates the current score of the node and returns a full Readability object.
*
* @ TODO: I don't like the weightClasses param. How can we get the config here?
*
* @param $weightClasses bool Weight classes?
+ *
* @return static
*/
public function initializeNode($weightClasses)
@@ -179,7 +203,6 @@ trait NodeTrait
return $linkLength / $textLength;
}
-
/**
* Calculates the weight of the class/id of the current element.
*
@@ -281,11 +304,10 @@ trait NodeTrait
return ['rows' => $rows, 'columns' => $columns];
}
-
/**
* Creates a new node based on the text content of the original node.
*
- * @param $originalNode DOMElement
+ * @param $originalNode DOMNode
* @param $tagName string
*
* @return DOMElement
diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php
index f35e9c5..8938a49 100644
--- a/src/Nodes/NodeUtility.php
+++ b/src/Nodes/NodeUtility.php
@@ -3,14 +3,12 @@
namespace andreskrey\Readability\Nodes;
/**
- * Class NodeUtility
- * @package andreskrey\Readability
+ * Class NodeUtility.
*/
class NodeUtility
{
-
/**
- * Collection of regexps to check the node usability
+ * Collection of regexps to check the node usability.
*
* @var array
*/
@@ -32,12 +30,11 @@ class NodeUtility
'onlyWhitespace' => '/\x{00A0}|\s+/u'
];
-
/**
- *
- * Imported from the Element class on league\html-to-markdown
+ * Imported from the Element class on league\html-to-markdown.
*
* @param $node
+ *
* @return DOMElement
*/
public static function nextElement($node)
@@ -52,13 +49,13 @@ class NodeUtility
return $next;
}
-
/**
* Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
* element with the new tag name and importing it to the main DOMDocument.
*
* @param string $value
* @param bool $importAttributes
+ *
* @return DOMNode
*/
public static function setNodeTag($node, $value, $importAttributes = false)
@@ -68,7 +65,6 @@ class NodeUtility
$children = $node->childNodes;
/** @var $children \DOMNodeList $i */
-
for ($i = 0; $i < $children->length; $i++) {
$import = $new->importNode($children->item($i), true);
$new->firstChild->appendChild($import);
@@ -118,7 +114,6 @@ class NodeUtility
}
}
-
/**
* Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
* for parents.
diff --git a/src/Readability.php b/src/Readability.php
index fd6e66b..60a314a 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -9,7 +9,7 @@ use andreskrey\Readability\Nodes\DOMText;
use andreskrey\Readability\Nodes\NodeUtility;
/**
- * Class Readability
+ * Class Readability.
*/
class Readability
{
@@ -21,49 +21,49 @@ class Readability
protected $dom;
/**
- * Title of the article
+ * Title of the article.
*
* @var string|null
*/
protected $title = null;
/**
- * HTML content article
+ * HTML content article.
*
* @var string|null
*/
protected $content = null;
/**
- * Excerpt of the article
+ * Excerpt of the article.
*
* @var string|null
*/
protected $excerpt = null;
/**
- * Main image of the article
+ * Main image of the article.
*
* @var string|null
*/
protected $image = null;
/**
- * Author of the article. Extracted from the byline tags and other social media properties
+ * Author of the article. Extracted from the byline tags and other social media properties.
*
* @var string|null
*/
protected $author = null;
/**
- * Direction of the text
+ * Direction of the text.
*
* @var string|null
*/
protected $direction = null;
/**
- * Configuration object
+ * Configuration object.
*
* @var Configuration
*/
@@ -149,7 +149,7 @@ class Readability
}
/**
- * Main parse function
+ * Main parse function.
*
* @param $html
* @throws ParseException
@@ -214,7 +214,7 @@ class Readability
}
/**
- * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties
+ * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
*/
private function getMetadata()
{
@@ -311,7 +311,6 @@ class Readability
return $result;
}
-
/**
* Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
* find a correct image.
@@ -432,7 +431,6 @@ class Readability
return $curTitle;
}
-
private function toAbsoluteURI($uri)
{
list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());
@@ -480,7 +478,6 @@ class Readability
return [$pathBase, $scheme, $prePath];
}
-
/**
* Gets nodes from the root element.
*
@@ -593,14 +590,14 @@ class Readability
/*
* Check if the byline is already set
*/
- if (isset($this->metadata['byline'])) {
+ if ($this->getAuthor()) {
return false;
}
$rel = $node->getAttribute('rel');
if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
- $this->metadata['byline'] = trim($node->getTextContent());
+ $this->setAuthor(trim($node->getTextContent()));
return true;
}
@@ -626,7 +623,6 @@ class Readability
return false;
}
-
/**
* Removes all the scripts of the html.
*
@@ -720,7 +716,6 @@ class Readability
}
}
-
/**
* Assign scores to each node. This function will rate each node and return a DOMElement object for each one.
*
@@ -816,7 +811,6 @@ class Readability
}
$topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
- $neededToCreateTopCandidate = false;
$parentOfTopCandidate = null;
/*
@@ -922,7 +916,7 @@ class Readability
$hasContent = false;
- /** @var Readability $sibling */
+ /** @var DOMElement $sibling */
foreach ($siblings as $sibling) {
$append = false;
@@ -1088,56 +1082,55 @@ class Readability
/** @var DOMElement $table */
$role = $table->getAttribute('role');
if ($role === 'presentation') {
- $table->readabilityDataTable = false;
+ $table->setReadabilityDataTable(false);
continue;
}
$datatable = $table->getAttribute('datatable');
if ($datatable == '0') {
- $table->readabilityDataTable = false;
+ $table->setReadabilityDataTable(false);
continue;
}
$summary = $table->getAttribute('summary');
if ($summary) {
- $table->readabilityDataTable = true;
+ $table->setReadabilityDataTable(true);
continue;
}
$caption = $table->getElementsByTagName('caption');
if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
- $table->readabilityDataTable = true;
+ $table->setReadabilityDataTable(true);
continue;
}
// If the table has a descendant with any of these tags, consider a data table:
foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
- $table->readabilityDataTable = true;
+ $table->setReadabilityDataTable(true);
continue 2;
}
}
// Nested tables indicate a layout table:
if ($table->getElementsByTagName('table')->length > 0) {
- $table->readabilityDataTable = false;
+ $table->setReadabilityDataTable(false);
continue;
}
$sizeInfo = $table->getRowAndColumnCount();
if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
- $table->readabilityDataTable = true;
+ $table->setReadabilityDataTable(true);
continue;
}
// Now just go by size entirely:
- $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10;
+ $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
}
}
-
/**
* Remove the style attribute on every e and under.
* TODO: To be moved to Readability.
*
- * @param $node \DOMDocument|\DOMNode
+ * @param $node DOMDocument|DOMNode
**/
public function _cleanStyles($node)
{
@@ -1213,7 +1206,6 @@ class Readability
$totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) {
- // TODO must be done via readability
$paragraph->parentNode->removeChild($paragraph);
}
}
@@ -1247,7 +1239,7 @@ class Readability
$node = $DOMNodeList->item($length - 1 - $i);
// First check if we're in a data table, in which case don't remove us.
- if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) {
+ if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) {
continue;
}
@@ -1268,7 +1260,6 @@ class Readability
* ominous signs, remove the element.
*/
- // TODO Horrible hack, must be removed once this function is inside Readability
$p = $node->getElementsByTagName('p')->length;
$img = $node->getElementsByTagName('img')->length;
$li = $node->getElementsByTagName('li')->length - 100;
@@ -1402,7 +1393,6 @@ class Readability
return $article;
}
-
/**
* @return null|string
*/
@@ -1420,7 +1410,7 @@ class Readability
}
/**
- * @param null $title
+ * @param string $title
*/
protected function setTitle($title)
{
@@ -1436,7 +1426,7 @@ class Readability
}
/**
- * @param null $content
+ * @param string $content
*/
protected function setContent($content)
{
@@ -1468,7 +1458,7 @@ class Readability
}
/**
- * @param null $image
+ * @param string $image
*/
protected function setImage($image)
{
@@ -1484,7 +1474,7 @@ class Readability
}
/**
- * @param null $author
+ * @param string $author
*/
protected function setAuthor($author)
{
diff --git a/test/ReadabilityTest.php b/test/ReadabilityTest.php
index a577b3d..9d29ba5 100644
--- a/test/ReadabilityTest.php
+++ b/test/ReadabilityTest.php
@@ -2,7 +2,6 @@
namespace andreskrey\Readability\Test;
-
use andreskrey\Readability\Configuration;
use andreskrey\Readability\Readability;
@@ -24,7 +23,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
$configuration = new Configuration();
- foreach($options as $key => $value){
+ foreach ($options as $key => $value) {
$name = 'set' . $key;
$configuration->$name($value);
}
@@ -50,7 +49,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase
}
$configuration = new Configuration();
- foreach($options as $key => $value){
+ foreach ($options as $key => $value) {
$name = 'set' . $key;
$configuration->$name($value);
}