summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-10-18 16:49:47 +0100
committerAndres Rey <[email protected]>2016-10-18 16:49:47 +0100
commit878243522a6ac71df1ca175ffdc5d7247b7f5e09 (patch)
tree42f855bb61e1db80bd7501c9210aedc69493ec9b
parent8e5abf7e184fdab04d588013a1b8ffc49f8c1ec4 (diff)
Lots of comments
-rw-r--r--src/DOMElement.php27
-rw-r--r--src/DOMElementInterface.php2
-rw-r--r--src/HTMLParser.php79
-rw-r--r--src/Readability.php27
-rw-r--r--src/ReadabilityInterface.php12
5 files changed, 139 insertions, 8 deletions
diff --git a/src/DOMElement.php b/src/DOMElement.php
index f5e38c9..728aa65 100644
--- a/src/DOMElement.php
+++ b/src/DOMElement.php
@@ -4,6 +4,15 @@ namespace andreskrey\Readability;
use League\HTMLToMarkdown\Element;
+/**
+ * Class DOMElement
+ *
+ * This is an extension of the original Element class from League\HTMLToMarkdown\Element.
+ * This class adds functions specific to Readability.php and overloads some of them to fit the purpose of this project.
+ *
+ * @package andreskrey\Readability
+ */
+
class DOMElement extends Element implements DOMElementInterface
{
/**
@@ -11,13 +20,20 @@ class DOMElement extends Element implements DOMElementInterface
*/
protected $node;
+ /**
+ * Constructor
+ *
+ * @param \DOMNode $node Selected element from DOMDocument
+ */
public function __construct(\DOMNode $node)
{
parent::__construct($node);
}
/**
- * @param string $value
+ * Checks for the tag name. Case insensitive.
+ *
+ * @param string $value Name to compare to the current tag
*
* @return bool
*/
@@ -32,6 +48,9 @@ class DOMElement extends Element implements DOMElementInterface
}
/**
+ * Checks if the current node has a single child and if that child is a P node.
+ * Useful to convert <div><p> nodes to a single <p> node and avoid confusing the scoring system, since a div that
+ * only wraps a p tag is, in practice, a paragraph.
*
* @return bool
*/
@@ -51,7 +70,9 @@ class DOMElement extends Element implements DOMElementInterface
}
/**
- * @param integer $maxLevel
+ * Get the ancestors of the current node.
+ *
+ * @param int $maxLevel Max amount of ancestors to get.
* @return array
*/
public function getNodeAncestors($maxLevel = 3)
@@ -74,7 +95,7 @@ class DOMElement extends Element implements DOMElementInterface
}
/**
- * Overloading the getParent function from League\html-to-markdown due to a bug when there are no more parents
+ * Overloading the getParent function from League\HTMLToMarkdown\Element due to a bug when there are no more parents
* on the selected element.
*
* @return DOMElementInterface|null
diff --git a/src/DOMElementInterface.php b/src/DOMElementInterface.php
index af9e6e1..0241b6a 100644
--- a/src/DOMElementInterface.php
+++ b/src/DOMElementInterface.php
@@ -14,13 +14,11 @@ interface DOMElementInterface extends ElementInterface
public function tagNameEqualsTo($value);
/**
- *
* @return bool
*/
public function hasSinglePNode();
/**
- *
* @return integer
*/
public function getNodeAncestors();
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index ca994b3..c4d102c 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -4,14 +4,37 @@ namespace andreskrey\Readability;
use DOMDocument;
+/**
+ * Class HTMLParser
+ *
+ * A helper class to parse HTML and get a Readability object.
+ *
+ */
class HTMLParser
{
-
+ /**
+ * @var DOMDocument
+ */
private $dom = null;
+ /**
+ * @var array
+ */
private $metadata = [];
+
+ /**
+ * @var array
+ */
private $title = [];
+
+ /**
+ * @var array
+ */
private $elementsToScore = [];
+
+ /**
+ * @var array
+ */
private $regexps = [
'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
@@ -26,12 +49,25 @@ class HTMLParser
'hasContent' => '/\S$/'
];
+ /**
+ * Constructor
+ *
+ */
public function __construct()
{
$this->dom = new DOMDocument('1.0', 'utf-8');
+
+ // Collect libxml errors internally so malformed HTML does not emit a flood of warnings
libxml_use_internal_errors(true);
}
+ /**
+ * Parse the html. This is the main entry point of the HTMLParser
+ *
+ * @param string $html Full html of the website, page, etc.
+ *
+ * @todo Decide what this method should return (TBD).
+ */
public function parse($html)
{
$this->loadHTML($html);
@@ -53,12 +89,21 @@ class HTMLParser
$this->rateNodes($this->elementsToScore);
}
+ /**
+ * @param string $html
+ */
private function loadHTML($html)
{
$this->dom->loadHTML($html);
- $this->dom->encoding = 'utf-8';
+ $this->dom->encoding = 'UTF-8';
}
+ /**
+ * Removes all the scripts of the html.
+ *
+ * @TODO is this really necessary? Readability.js uses it to chop any script that might interfere with their
+ * system. Is it necessary here?
+ */
private function removeScripts()
{
while ($script = $this->dom->getElementsByTagName('script')) {
@@ -70,13 +115,20 @@ class HTMLParser
}
}
+ /**
+ * Tries to guess relevant info from metadata of the html
+ *
+ * @return array Metadata info. May have title, excerpt and/or byline.
+ */
private function getMetadata()
{
$metadata = [];
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
+ /** @var DOMElement $meta */
$name = $meta->getAttribute('name');
$property = $meta->getAttribute('property');
+ // Select either name or property
$item = ($name ? $name : $property);
if ($item == 'og:title' || $item == 'twitter:title') {
@@ -95,6 +147,11 @@ class HTMLParser
return $metadata;
}
+ /**
+ * Returns the title of the html. Prioritizes the title from the metadata over the title tag.
+ *
+ * @return string|null
+ */
private function getTitle()
{
if (isset($this->metadata['title'])) {
@@ -109,10 +166,16 @@ class HTMLParser
return null;
}
+ /**
+ * Gets nodes from the root element.
+ *
+ * @param DOMElementInterface $node
+ */
private function getNodes(DOMElementInterface $node)
{
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
+ // Avoid elements that are unlikely to have any useful information.
if (
preg_match($this->regexps['unlikelyCandidates'], $matchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $matchString) &&
@@ -122,23 +185,28 @@ class HTMLParser
return;
}
+ // Loop over the element if it has children
if ($node->hasChildren()) {
foreach ($node->getChildren() as $child) {
$this->getNodes($child);
}
}
+ // Check for nodes that have only one P node as a child and convert them to a single P node
if ($node->hasSinglePNode()) {
$pNode = $node->getChildren();
$node = $pNode[0];
}
+ // If there's any info on the node, add it to the elements to score in the next step.
if (trim($node->getValue())) {
$this->elementsToScore[] = $node;
}
}
/**
+ * Assign scores to each node. This function will rate each node and return a Readability object for each one.
+ *
* @param array $nodes
*/
private function rateNodes($nodes)
@@ -146,12 +214,16 @@ class HTMLParser
$candidates = [];
foreach ($nodes as $node) {
+
+ // Discard nodes with less than 25 characters
if (strlen($node->getValue()) < 25) {
continue;
}
$ancestors = $node->getNodeAncestors();
- if ($ancestors < 3) {
+
+ // Exclude nodes with no ancestor
+ if ($ancestors === 0) {
continue;
}
@@ -164,6 +236,7 @@ class HTMLParser
// For every 100 characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(strlen($node->getValue()) / 100), 3);
+ // Initialize and score ancestors.
foreach ($ancestors as $ancestor) {
$readability = new Readability($ancestor);
$candidates[] = $readability->initializeNode();
diff --git a/src/Readability.php b/src/Readability.php
index a3c3cc2..a53d032 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -4,16 +4,27 @@ namespace andreskrey\Readability;
class Readability implements ReadabilityInterface
{
+ /**
+ * @var int
+ */
protected $score = 0;
+ /**
+ * @var DOMElement
+ */
protected $node;
+ /**
+ * @var array
+ */
private $regexps = [
'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
];
/**
+ * Constructor
+ *
* @param DOMElement $node
*/
public function __construct($node)
@@ -21,6 +32,11 @@ class Readability implements ReadabilityInterface
$this->node = $node;
}
+ /**
+ * Initializer. Calculates the current score of the node and returns a full Readability object.
+ *
+ * @return Readability
+ */
public function initializeNode()
{
switch ($this->node->getTagName()) {
@@ -61,6 +77,12 @@ class Readability implements ReadabilityInterface
return $this;
}
+ /**
+ * Calculates the weight of the class/id of the current element
+ *
+ * @todo check for flag that lets this function run or not
+ * @return int
+ */
public function getClassWeight()
{
// if(!Config::FLAG_WEIGHT_CLASSES) return 0;
@@ -94,6 +116,11 @@ class Readability implements ReadabilityInterface
return $weight;
}
+ /**
+ * Returns the current score of the Readability object.
+ *
+ * @return int
+ */
public function getScore()
{
return $this->score;
diff --git a/src/ReadabilityInterface.php b/src/ReadabilityInterface.php
index f5df055..cbb4cb2 100644
--- a/src/ReadabilityInterface.php
+++ b/src/ReadabilityInterface.php
@@ -4,12 +4,24 @@ namespace andreskrey\Readability;
interface ReadabilityInterface
{
+ /**
+ * @param DOMElement $node
+ */
public function __construct($node);
+ /**
+ * @return int
+ */
public function getScore();
+ /**
+ * @return Readability
+ */
public function initializeNode();
+ /**
+ * @return int
+ */
public function getClassWeight();
}