From 03ef55b961173152fc8ffc1c83bc96c74ffb08b7 Mon Sep 17 00:00:00 2001
From: Andres Rey
Date: Tue, 18 Oct 2016 15:52:57 +0100
Subject: Fixed a nasty bug when searching for a missing parent on an element. Expanded the Readability object with class and element scoring.

---
Review notes (free-form area, not part of the commit message or the diff):

 * Readability::getClassWeight() now reads the `id` attribute in its
   "Look for a special ID" branch. As submitted, that branch read `class`
   a second time, so a matching class name was weighted twice (+/-50
   instead of +/-25) and element IDs were never scored at all.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Hunk line counts were checked against the @@ headers
   and the diffstat, but the exact indentation and the position of a few
   blank context lines are a best-effort reconstruction; if the context
   does not match, try `git apply --ignore-whitespace`.
 * The post-image blob id on the src/Readability.php index line still
   refers to the content as originally committed.
 * Not touched here (unchanged context): the `replaceFonts` pattern in
   HTMLParser ends in `/gi`; `g` is not a PCRE modifier in PHP and should
   be reviewed separately.

 src/DOMElement.php           | 23 +++++++----
 src/HTMLParser.php           |  7 ++--
 src/Readability.php          | 90 +++++++++++++++++++++++++++++++++++++++++++-
 src/ReadabilityInterface.php |  7 ++++
 4 files changed, 115 insertions(+), 12 deletions(-)

diff --git a/src/DOMElement.php b/src/DOMElement.php
index 4e93dcf..f5e38c9 100644
--- a/src/DOMElement.php
+++ b/src/DOMElement.php
@@ -11,11 +11,6 @@ class DOMElement extends Element implements DOMElementInterface
      */
     protected $node;
 
-    /**
-     * @var DOMElementInterface|null
-     */
-    private $nextCached;
-
     public function __construct(\DOMNode $node)
     {
         parent::__construct($node);
@@ -65,8 +60,9 @@ class DOMElement extends Element implements DOMElementInterface
         $level = 0;
 
         $node = $this;
-        while ($node->getParent()) {
-            $ancestors[] = new static($this->node);
+
+        while ($node && $node->getParent()) {
+            $ancestors[] = new static($node->node);
             $level++;
             if ($level >= $maxLevel) {
                 break;
@@ -76,4 +72,17 @@ class DOMElement extends Element implements DOMElementInterface
 
         return $ancestors;
     }
+
+    /**
+     * Overloading the getParent function from League\html-to-markdown due to a bug when there are no more parents
+     * on the selected element.
+     *
+     * @return DOMElementInterface|null
+     */
+    public function getParent()
+    {
+        $node = $this->node->parentNode;
+        return ($node) ? new static($node) : null;
+    }
+
 }
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6859579..ca994b3 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -15,8 +15,6 @@ class HTMLParser
     private $regexps = [
         'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
         'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
-        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
         'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
         'byline' => '/byline|author|dateline|writtenby|p-author/i',
         'replaceFonts' => '/<(\/?)font[^>]*>/gi',
@@ -141,7 +139,7 @@ class HTMLParser
     }
 
     /**
-     * @param DOMElement $nodes
+     * @param array $nodes
      */
     private function rateNodes($nodes)
     {
@@ -167,7 +165,8 @@ class HTMLParser
             $contentScore += min(floor(strlen($node->getValue()) / 100), 3);
 
             foreach ($ancestors as $ancestor) {
-                $tes = $ancestor->node->getTagName();
+                $readability = new Readability($ancestor);
+                $candidates[] = $readability->initializeNode();
             }
         }
 
diff --git a/src/Readability.php b/src/Readability.php
index d4c2181..a3c3cc2 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -4,7 +4,95 @@ namespace andreskrey\Readability;
 
 class Readability implements ReadabilityInterface
 {
-    private $score = 0;
+    protected $score = 0;
+
+    protected $node;
+
+    private $regexps = [
+        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+    ];
+
+    /**
+     * @param DOMElement $node
+     */
+    public function __construct($node)
+    {
+        $this->node = $node;
+    }
+
+    public function initializeNode()
+    {
+        switch ($this->node->getTagName()) {
+            case 'div':
+                $this->score += 5;
+                break;
+
+            case 'pre':
+            case 'td':
+            case 'blockquote':
+                $this->score += 3;
+                break;
+
+            case 'address':
+            case 'ol':
+            case 'ul':
+            case 'dl':
+            case 'dd':
+            case 'dt':
+            case 'li':
+            case 'form':
+                $this->score -= 3;
+                break;
+
+            case 'h1':
+            case 'h2':
+            case 'h3':
+            case 'h4':
+            case 'h5':
+            case 'h6':
+            case 'th':
+                $this->score -= 5;
+                break;
+        }
+
+        $this->score += $this->getClassWeight();
+
+        return $this;
+    }
+
+    public function getClassWeight()
+    {
+        // if(!Config::FLAG_WEIGHT_CLASSES) return 0;
+
+        $weight = 0;
+
+        // Look for a special classname
+        $class = $this->node->getAttribute('class');
+        if (trim($class)) {
+            if (preg_match($this->regexps['negative'], $class)) {
+                $weight -= 25;
+            }
+
+            if (preg_match($this->regexps['positive'], $class)) {
+                $weight += 25;
+            }
+        }
+
+        // Look for a special ID
+        $id = $this->node->getAttribute('id');
+        if (trim($id)) {
+            if (preg_match($this->regexps['negative'], $id)) {
+                $weight -= 25;
+            }
+
+            if (preg_match($this->regexps['positive'], $id)) {
+                $weight += 25;
+            }
+        }
+
+        return $weight;
+    }
 
     public function getScore()
     {
diff --git a/src/ReadabilityInterface.php b/src/ReadabilityInterface.php
index 8e659a3..f5df055 100644
--- a/src/ReadabilityInterface.php
+++ b/src/ReadabilityInterface.php
@@ -4,5 +4,12 @@ namespace andreskrey\Readability;
 
 interface ReadabilityInterface
 {
+    public function __construct($node);
+
     public function getScore();
+
+    public function initializeNode();
+
+    public function getClassWeight();
+
 }
-- 
cgit v1.2.3