diff options
author | Andres Rey <[email protected]> | 2016-10-18 16:49:47 +0100 |
---|---|---|
committer | Andres Rey <[email protected]> | 2016-10-18 16:49:47 +0100 |
commit | 878243522a6ac71df1ca175ffdc5d7247b7f5e09 (patch) | |
tree | 42f855bb61e1db80bd7501c9210aedc69493ec9b /src/HTMLParser.php | |
parent | 8e5abf7e184fdab04d588013a1b8ffc49f8c1ec4 (diff) |
Lots of comments
Diffstat (limited to 'src/HTMLParser.php')
-rw-r--r-- | src/HTMLParser.php | 79 |
1 files changed, 76 insertions, 3 deletions
diff --git a/src/HTMLParser.php b/src/HTMLParser.php index ca994b3..c4d102c 100644 --- a/src/HTMLParser.php +++ b/src/HTMLParser.php @@ -4,14 +4,37 @@ namespace andreskrey\Readability; use DOMDocument; +/** + * Class HTMLParser + * + * A helper class to parse HTML and get a Readability object. + * + */ class HTMLParser { - + /** + * @var DOMDocument + */ private $dom = null; + /** + * @var array + */ private $metadata = []; + + /** + * @var array + */ private $title = []; + + /** + * @var array + */ private $elementsToScore = []; + + /** + * @var array + */ private $regexps = [ 'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i', 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', @@ -26,12 +49,25 @@ class HTMLParser 'hasContent' => '/\S$/' ]; + /** + * Constructor + * + */ public function __construct() { $this->dom = new DOMDocument('1.0', 'utf-8'); + + // To avoid having a gazillion of errors on malformed HTMLs libxml_use_internal_errors(true); } + /** + * Parse the html. This is the main entry point of the HTMLParser + * + * @param string $html Full html of the website, page, etc. + * + * #return ? TBD + */ public function parse($html) { $this->loadHTML($html); @@ -53,12 +89,21 @@ class HTMLParser $this->rateNodes($this->elementsToScore); } + /** + * @param string $html + */ private function loadHTML($html) { $this->dom->loadHTML($html); - $this->dom->encoding = 'utf-8'; + $this->dom->encoding = 'UTF-8'; } + /** + * Removes all the scripts of the html. + * + * @TODO is this really necessary? Readability.js uses it to chop any script that might interfere with their + * system. Is it necessary here? + */ private function removeScripts() { while ($script = $this->dom->getElementsByTagName('script')) { @@ -70,13 +115,20 @@ class HTMLParser } } + /** + * Tries to guess relevant info from metadata of the html + * + * @return array Metadata info. May have title, excerpt and or byline. + */ private function getMetadata() { $metadata = []; foreach ($this->dom->getElementsByTagName('meta') as $meta) { + /** @var DOMElement $meta */ $name = $meta->getAttribute('name'); $property = $meta->getAttribute('property'); + // Select either name or property $item = ($name ? $name : $property); if ($item == 'og:title' || $item == 'twitter:title') { @@ -95,6 +147,11 @@ class HTMLParser return $metadata; } + /** + * Returns the title of the html. Prioritizes the title from the metadata against the title tag. + * + * @return string|null + */ private function getTitle() { if (isset($this->metadata['title'])) { @@ -109,10 +166,16 @@ class HTMLParser return null; } + /** + * Gets nodes from the root element. + * + * @param $node DOMElementInterface + */ private function getNodes(DOMElementInterface $node) { $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id'); + // Avoid elements that are unlikely to have any useful information. if ( preg_match($this->regexps['unlikelyCandidates'], $matchString) && !preg_match($this->regexps['okMaybeItsACandidate'], $matchString) && @@ -122,23 +185,28 @@ class HTMLParser return; } + // Loop over the element if it has children if ($node->hasChildren()) { foreach ($node->getChildren() as $child) { $this->getNodes($child); } } + // Check for nodes that have only on P node as a child and convert them to a single P node if ($node->hasSinglePNode()) { $pNode = $node->getChildren(); $node = $pNode[0]; } + // If there's any info on the node, add it to the elements to score in the next step. if (trim($node->getValue())) { $this->elementsToScore[] = $node; } } /** + * Assign scores to each node. This function will rate each node and return a Readability object for each one. + * * @param array $nodes */ private function rateNodes($nodes) @@ -146,12 +214,16 @@ class HTMLParser $candidates = []; foreach ($nodes as $node) { + + // Discard nodes with less than 25 characters if (strlen($node->getValue()) < 25) { continue; } $ancestors = $node->getNodeAncestors(); - if ($ancestors < 3) { + + // Exclude nodes with no ancestor + if ($ancestors === 0) { continue; } @@ -164,6 +236,7 @@ class HTMLParser // For every 100 characters in this paragraph, add another point. Up to 3 points. $contentScore += min(floor(strlen($node->getValue()) / 100), 3); + // Initialize and score ancestors. foreach ($ancestors as $ancestor) { $readability = new Readability($ancestor); $candidates[] = $readability->initializeNode(); |