summaryrefslogtreecommitdiff
path: root/src/HTMLParser.php
diff options
context:
space:
mode:
authorAndres Rey <[email protected]>2016-10-18 16:49:47 +0100
committerAndres Rey <[email protected]>2016-10-18 16:49:47 +0100
commit878243522a6ac71df1ca175ffdc5d7247b7f5e09 (patch)
tree42f855bb61e1db80bd7501c9210aedc69493ec9b /src/HTMLParser.php
parent8e5abf7e184fdab04d588013a1b8ffc49f8c1ec4 (diff)
Lots of comments
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r--src/HTMLParser.php79
1 files changed, 76 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index ca994b3..c4d102c 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -4,14 +4,37 @@ namespace andreskrey\Readability;
use DOMDocument;
+/**
+ * Class HTMLParser
+ *
+ * A helper class to parse HTML and get a Readability object.
+ *
+ */
class HTMLParser
{
-
+ /**
+ * @var DOMDocument
+ */
private $dom = null;
+ /**
+ * @var array
+ */
private $metadata = [];
+
+ /**
+ * @var array
+ */
private $title = [];
+
+ /**
+ * @var array
+ */
private $elementsToScore = [];
+
+ /**
+ * @var array
+ */
private $regexps = [
'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
@@ -26,12 +49,25 @@ class HTMLParser
'hasContent' => '/\S$/'
];
+ /**
+ * Constructor
+ *
+ */
public function __construct()
{
$this->dom = new DOMDocument('1.0', 'utf-8');
+
+ // To avoid having a gazillion of errors on malformed HTMLs
libxml_use_internal_errors(true);
}
+ /**
+ * Parse the html. This is the main entry point of the HTMLParser
+ *
+ * @param string $html Full html of the website, page, etc.
+ *
+ * #return ? TBD
+ */
public function parse($html)
{
$this->loadHTML($html);
@@ -53,12 +89,21 @@ class HTMLParser
$this->rateNodes($this->elementsToScore);
}
+ /**
+ * @param string $html
+ */
private function loadHTML($html)
{
$this->dom->loadHTML($html);
- $this->dom->encoding = 'utf-8';
+ $this->dom->encoding = 'UTF-8';
}
+ /**
+ * Removes all the scripts of the html.
+ *
+ * @TODO is this really necessary? Readability.js uses it to chop any script that might interfere with their
+ * system. Is it necessary here?
+ */
private function removeScripts()
{
while ($script = $this->dom->getElementsByTagName('script')) {
@@ -70,13 +115,20 @@ class HTMLParser
}
}
+ /**
+ * Tries to guess relevant info from metadata of the html
+ *
+ * @return array Metadata info. May have title, excerpt and or byline.
+ */
private function getMetadata()
{
$metadata = [];
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
+ /** @var DOMElement $meta */
$name = $meta->getAttribute('name');
$property = $meta->getAttribute('property');
+ // Select either name or property
$item = ($name ? $name : $property);
if ($item == 'og:title' || $item == 'twitter:title') {
@@ -95,6 +147,11 @@ class HTMLParser
return $metadata;
}
+ /**
+ * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
+ *
+ * @return string|null
+ */
private function getTitle()
{
if (isset($this->metadata['title'])) {
@@ -109,10 +166,16 @@ class HTMLParser
return null;
}
+ /**
+ * Gets nodes from the root element.
+ *
+ * @param $node DOMElementInterface
+ */
private function getNodes(DOMElementInterface $node)
{
$matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');
+ // Avoid elements that are unlikely to have any useful information.
if (
preg_match($this->regexps['unlikelyCandidates'], $matchString) &&
!preg_match($this->regexps['okMaybeItsACandidate'], $matchString) &&
@@ -122,23 +185,28 @@ class HTMLParser
return;
}
+ // Loop over the element if it has children
if ($node->hasChildren()) {
foreach ($node->getChildren() as $child) {
$this->getNodes($child);
}
}
+ // Check for nodes that have only on P node as a child and convert them to a single P node
if ($node->hasSinglePNode()) {
$pNode = $node->getChildren();
$node = $pNode[0];
}
+ // If there's any info on the node, add it to the elements to score in the next step.
if (trim($node->getValue())) {
$this->elementsToScore[] = $node;
}
}
/**
+ * Assign scores to each node. This function will rate each node and return a Readability object for each one.
+ *
* @param array $nodes
*/
private function rateNodes($nodes)
@@ -146,12 +214,16 @@ class HTMLParser
$candidates = [];
foreach ($nodes as $node) {
+
+ // Discard nodes with less than 25 characters
if (strlen($node->getValue()) < 25) {
continue;
}
$ancestors = $node->getNodeAncestors();
- if ($ancestors < 3) {
+
+ // Exclude nodes with no ancestor
+ if ($ancestors === 0) {
continue;
}
@@ -164,6 +236,7 @@ class HTMLParser
// For every 100 characters in this paragraph, add another point. Up to 3 points.
$contentScore += min(floor(strlen($node->getValue()) / 100), 3);
+ // Initialize and score ancestors.
foreach ($ancestors as $ancestor) {
$readability = new Readability($ancestor);
$candidates[] = $readability->initializeNode();