diff options
author | Andres Rey <[email protected]> | 2017-12-01 20:44:35 +0000 |
---|---|---|
committer | Andres Rey <[email protected]> | 2017-12-01 20:44:35 +0000 |
commit | ecebda06241ff34eac70b247c74175b162902582 (patch) | |
tree | c919efcf1b013cbd7f1221596328f93e0b54e367 | |
parent | 8a266f2cae5dce8d1fa39c40caac8400406898bb (diff) | |
parent | a2eded4e4df7e48a6f20d9e0449c52bf8003c471 (diff) |
Merge remote-tracking branch 'origin/v1.0' into v1.0
# Conflicts:
# src/NodeClass/DOMNode.php
# src/Readability.php
-rw-r--r-- | src/Configuration.php | 26 | ||||
-rw-r--r-- | src/NodeClass/DOMNode.php | 13 | ||||
-rw-r--r-- | src/Nodes/DOMDocument.php | 2 | ||||
-rw-r--r-- | src/Nodes/NodeTrait.php | 34 | ||||
-rw-r--r-- | src/Nodes/NodeUtility.php | 15 | ||||
-rw-r--r-- | src/Readability.php | 64 | ||||
-rw-r--r-- | test/ReadabilityTest.php | 5 |
7 files changed, 101 insertions, 58 deletions
diff --git a/src/Configuration.php b/src/Configuration.php index 6ce8b5b..1a405de 100644 --- a/src/Configuration.php +++ b/src/Configuration.php @@ -3,7 +3,7 @@ namespace andreskrey\Readability; /** - * Class Configuration + * Class Configuration. */ class Configuration { @@ -62,11 +62,13 @@ class Configuration /** * @param int $maxTopCandidates + * * @return $this */ public function setMaxTopCandidates($maxTopCandidates) { $this->maxTopCandidates = $maxTopCandidates; + return $this; } @@ -80,11 +82,13 @@ class Configuration /** * @param int $wordThreshold + * * @return $this */ public function setWordThreshold($wordThreshold) { $this->wordThreshold = $wordThreshold; + return $this; } @@ -98,11 +102,13 @@ class Configuration /** * @param bool $articleByLine + * * @return $this */ public function setArticleByLine($articleByLine) { $this->articleByLine = $articleByLine; + return $this; } @@ -116,11 +122,13 @@ class Configuration /** * @param bool $stripUnlikelyCandidates + * * @return $this */ public function setStripUnlikelyCandidates($stripUnlikelyCandidates) { $this->stripUnlikelyCandidates = $stripUnlikelyCandidates; + return $this; } @@ -134,11 +142,13 @@ class Configuration /** * @param bool $cleanConditionally + * * @return $this */ public function setCleanConditionally($cleanConditionally) { $this->cleanConditionally = $cleanConditionally; + return $this; } @@ -152,11 +162,13 @@ class Configuration /** * @param bool $weightClasses + * * @return $this */ public function setWeightClasses($weightClasses) { $this->weightClasses = $weightClasses; + return $this; } @@ -170,11 +182,13 @@ class Configuration /** * @param bool $removeReadabilityTags + * * @return $this */ public function setRemoveReadabilityTags($removeReadabilityTags) { $this->removeReadabilityTags = $removeReadabilityTags; + return $this; } @@ -188,11 +202,13 @@ class Configuration /** * @param bool $fixRelativeURLs + * * @return $this */ public function setFixRelativeURLs($fixRelativeURLs) { $this->fixRelativeURLs = $fixRelativeURLs; + return $this; } @@ -206,11 +222,13 @@ class Configuration /** * @param bool $substituteEntities + * * @return $this */ public function setSubstituteEntities($substituteEntities) { $this->substituteEntities = $substituteEntities; + return $this; } @@ -224,11 +242,13 @@ class Configuration /** * @param bool $normalizeEntities + * * @return $this */ public function setNormalizeEntities($normalizeEntities) { $this->normalizeEntities = $normalizeEntities; + return $this; } @@ -242,11 +262,13 @@ class Configuration /** * @param string $originalURL + * * @return $this */ public function setOriginalURL($originalURL) { $this->originalURL = $originalURL; + return $this; } @@ -260,11 +282,13 @@ class Configuration /** * @param bool $summonCthulhu + * * @return $this */ public function setSummonCthulhu($summonCthulhu) { $this->summonCthulhu = $summonCthulhu; + return $this; } diff --git a/src/NodeClass/DOMNode.php b/src/NodeClass/DOMNode.php new file mode 100644 index 0000000..eb5e93e --- /dev/null +++ b/src/NodeClass/DOMNode.php @@ -0,0 +1,13 @@ +<?php + +namespace andreskrey\Readability\NodeClass; + +/** + * Class DOMNode. + * + * @method getAttribute($attribute) + */ +class DOMNode extends \DOMNode +{ + use NodeClassTrait; +} diff --git a/src/Nodes/DOMDocument.php b/src/Nodes/DOMDocument.php index f954f7d..510cdf2 100644 --- a/src/Nodes/DOMDocument.php +++ b/src/Nodes/DOMDocument.php @@ -14,7 +14,7 @@ class DOMDocument extends \DOMDocument $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); $this->registerNodeClass('DOMComment', DOMComment::class); - $this->registerNodeClass('DOMDocument', DOMDocument::class); + $this->registerNodeClass('DOMDocument', self::class); $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); $this->registerNodeClass('DOMElement', DOMElement::class); diff --git a/src/Nodes/NodeTrait.php b/src/Nodes/NodeTrait.php index 3294612..5847178 100644 --- a/src/Nodes/NodeTrait.php +++ b/src/Nodes/NodeTrait.php @@ -5,20 +5,27 @@ namespace andreskrey\Readability\Nodes; trait NodeTrait { /** - * Content score of the node. Used to determine the value of the content + * Content score of the node. Used to determine the value of the content. * * @var int */ public $contentScore = 0; /** - * Flag for initialized status + * Flag for initialized status. * * @var bool */ private $initialized = false; /** + * Flag data tables. + * + * @var bool + */ + private $readabilityDataTable = false; + + /** * @var array */ private $divToPElements = [ @@ -36,7 +43,7 @@ trait NodeTrait ]; /** - * initialized getter + * initialized getter. * * @return bool */ @@ -46,11 +53,28 @@ trait NodeTrait } /** + * @return bool + */ + public function isReadabilityDataTable() + { + return $this->readabilityDataTable; + } + + /** + * @param bool $param + */ + public function setReadabilityDataTable($param) + { + $this->readabilityDataTable = $param; + } + + /** * Initializer. Calculates the current score of the node and returns a full Readability object. * * @ TODO: I don't like the weightClasses param. How can we get the config here? * * @param $weightClasses bool Weight classes? + * * @return static */ public function initializeNode($weightClasses) @@ -179,7 +203,6 @@ trait NodeTrait return $linkLength / $textLength; } - /** * Calculates the weight of the class/id of the current element. * @@ -281,11 +304,10 @@ trait NodeTrait return ['rows' => $rows, 'columns' => $columns]; } - /** * Creates a new node based on the text content of the original node. * - * @param $originalNode DOMElement + * @param $originalNode DOMNode * @param $tagName string * * @return DOMElement diff --git a/src/Nodes/NodeUtility.php b/src/Nodes/NodeUtility.php index f35e9c5..8938a49 100644 --- a/src/Nodes/NodeUtility.php +++ b/src/Nodes/NodeUtility.php @@ -3,14 +3,12 @@ namespace andreskrey\Readability\Nodes; /** - * Class NodeUtility - * @package andreskrey\Readability + * Class NodeUtility. */ class NodeUtility { - /** - * Collection of regexps to check the node usability + * Collection of regexps to check the node usability. * * @var array */ @@ -32,12 +30,11 @@ class NodeUtility 'onlyWhitespace' => '/\x{00A0}|\s+/u' ]; - /** - * - * Imported from the Element class on league\html-to-markdown + * Imported from the Element class on league\html-to-markdown. * * @param $node + * * @return DOMElement */ public static function nextElement($node) @@ -52,13 +49,13 @@ class NodeUtility return $next; } - /** * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new * element with the new tag name and importing it to the main DOMDocument. * * @param string $value * @param bool $importAttributes + * * @return DOMNode */ public static function setNodeTag($node, $value, $importAttributes = false) @@ -68,7 +65,6 @@ class NodeUtility $children = $node->childNodes; /** @var $children \DOMNodeList $i */ - for ($i = 0; $i < $children->length; $i++) { $import = $new->importNode($children->item($i), true); $new->firstChild->appendChild($import); @@ -118,7 +114,6 @@ class NodeUtility } } - /** * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally * for parents. diff --git a/src/Readability.php b/src/Readability.php index fd6e66b..60a314a 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -9,7 +9,7 @@ use andreskrey\Readability\Nodes\DOMText; use andreskrey\Readability\Nodes\NodeUtility; /** - * Class Readability + * Class Readability. */ class Readability { @@ -21,49 +21,49 @@ class Readability protected $dom; /** - * Title of the article + * Title of the article. * * @var string|null */ protected $title = null; /** - * HTML content article + * HTML content article. * * @var string|null */ protected $content = null; /** - * Excerpt of the article + * Excerpt of the article. * * @var string|null */ protected $excerpt = null; /** - * Main image of the article + * Main image of the article. * * @var string|null */ protected $image = null; /** - * Author of the article. Extracted from the byline tags and other social media properties + * Author of the article. Extracted from the byline tags and other social media properties. * * @var string|null */ protected $author = null; /** - * Direction of the text + * Direction of the text. * * @var string|null */ protected $direction = null; /** - * Configuration object + * Configuration object. * * @var Configuration */ @@ -149,7 +149,7 @@ class Readability } /** - * Main parse function + * Main parse function. * * @param $html * @throws ParseException @@ -214,7 +214,7 @@ class Readability } /** - * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties + * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties. */ private function getMetadata() { @@ -311,7 +311,6 @@ class Readability return $result; } - /** * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't * find a correct image. @@ -432,7 +431,6 @@ class Readability return $curTitle; } - private function toAbsoluteURI($uri) { list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL()); @@ -480,7 +478,6 @@ class Readability return [$pathBase, $scheme, $prePath]; } - /** * Gets nodes from the root element. * @@ -593,14 +590,14 @@ class Readability /* * Check if the byline is already set */ - if (isset($this->metadata['byline'])) { + if ($this->getAuthor()) { return false; } $rel = $node->getAttribute('rel'); if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { - $this->metadata['byline'] = trim($node->getTextContent()); + $this->setAuthor(trim($node->getTextContent())); return true; } @@ -626,7 +623,6 @@ class Readability return false; } - /** * Removes all the scripts of the html. * @@ -720,7 +716,6 @@ class Readability } } - /** * Assign scores to each node. This function will rate each node and return a DOMElement object for each one. * @@ -816,7 +811,6 @@ class Readability } $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null; - $neededToCreateTopCandidate = false; $parentOfTopCandidate = null; /* @@ -922,7 +916,7 @@ class Readability $hasContent = false; - /** @var Readability $sibling */ + /** @var DOMElement $sibling */ foreach ($siblings as $sibling) { $append = false; @@ -1088,56 +1082,55 @@ class Readability /** @var DOMElement $table */ $role = $table->getAttribute('role'); if ($role === 'presentation') { - $table->readabilityDataTable = false; + $table->setReadabilityDataTable(false); continue; } $datatable = $table->getAttribute('datatable'); if ($datatable == '0') { - $table->readabilityDataTable = false; + $table->setReadabilityDataTable(false); continue; } $summary = $table->getAttribute('summary'); if ($summary) { - $table->readabilityDataTable = true; + $table->setReadabilityDataTable(true); continue; } $caption = $table->getElementsByTagName('caption'); if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) { - $table->readabilityDataTable = true; + $table->setReadabilityDataTable(true); continue; } // If the table has a descendant with any of these tags, consider a data table: foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) { if ($table->getElementsByTagName($dataTableDescendants)->length > 0) { - $table->readabilityDataTable = true; + $table->setReadabilityDataTable(true); continue 2; } } // Nested tables indicate a layout table: if ($table->getElementsByTagName('table')->length > 0) { - $table->readabilityDataTable = false; + $table->setReadabilityDataTable(false); continue; } $sizeInfo = $table->getRowAndColumnCount(); if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) { - $table->readabilityDataTable = true; + $table->setReadabilityDataTable(true); continue; } // Now just go by size entirely: - $table->readabilityDataTable = $sizeInfo['rows'] * $sizeInfo['columns'] > 10; + $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10); } } - /** * Remove the style attribute on every e and under. * TODO: To be moved to Readability. * - * @param $node \DOMDocument|\DOMNode + * @param $node DOMDocument|DOMNode **/ public function _cleanStyles($node) { @@ -1213,7 +1206,6 @@ class Readability $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount; if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) { - // TODO must be done via readability $paragraph->parentNode->removeChild($paragraph); } } @@ -1247,7 +1239,7 @@ class Readability $node = $DOMNodeList->item($length - 1 - $i); // First check if we're in a data table, in which case don't remove us. - if ($node->hasAncestorTag($node, 'table', -1) && isset($node->readabilityDataTable)) { + if ($node->hasAncestorTag($node, 'table', -1) && $node->isReadabilityDataTable()) { continue; } @@ -1268,7 +1260,6 @@ class Readability * ominous signs, remove the element. */ - // TODO Horrible hack, must be removed once this function is inside Readability $p = $node->getElementsByTagName('p')->length; $img = $node->getElementsByTagName('img')->length; $li = $node->getElementsByTagName('li')->length - 100; @@ -1402,7 +1393,6 @@ class Readability return $article; } - /** * @return null|string */ @@ -1420,7 +1410,7 @@ class Readability } /** - * @param null $title + * @param string $title */ protected function setTitle($title) { @@ -1436,7 +1426,7 @@ class Readability } /** - * @param null $content + * @param string $content */ protected function setContent($content) { @@ -1468,7 +1458,7 @@ class Readability } /** - * @param null $image + * @param string $image */ protected function setImage($image) { @@ -1484,7 +1474,7 @@ class Readability } /** - * @param null $author + * @param string $author */ protected function setAuthor($author) { diff --git a/test/ReadabilityTest.php b/test/ReadabilityTest.php index a577b3d..9d29ba5 100644 --- a/test/ReadabilityTest.php +++ b/test/ReadabilityTest.php @@ -2,7 +2,6 @@ namespace andreskrey\Readability\Test; - use andreskrey\Readability\Configuration; use andreskrey\Readability\Readability; @@ -24,7 +23,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase $configuration = new Configuration(); - foreach($options as $key => $value){ + foreach ($options as $key => $value) { $name = 'set' . $key; $configuration->$name($value); } @@ -50,7 +49,7 @@ class ReadabilityTest extends \PHPUnit_Framework_TestCase } $configuration = new Configuration(); - foreach($options as $key => $value){ + foreach ($options as $key => $value) { $name = 'set' . $key; $configuration->$name($value); } |