summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2016-10-18 15:52:57 +0100
committer Andres Rey <[email protected]> 2016-10-18 15:52:57 +0100
commit 03ef55b961173152fc8ffc1c83bc96c74ffb08b7 (patch)
tree 766c7d13a8ab1dcc59bdfd126951726ed237cc04
parent a92863189af61a1f3c86d42815ee04b5e2353cc9 (diff)
Fixed a nasty bug when searching for a missing parent on an element. Expanded the Readability object with class and element scoring.
-rw-r--r-- src/DOMElement.php 23
-rw-r--r-- src/HTMLParser.php 7
-rw-r--r-- src/Readability.php 90
-rw-r--r-- src/ReadabilityInterface.php 7
4 files changed, 115 insertions, 12 deletions
diff --git a/src/DOMElement.php b/src/DOMElement.php
index 4e93dcf..f5e38c9 100644
--- a/src/DOMElement.php
+++ b/src/DOMElement.php
@@ -11,11 +11,6 @@ class DOMElement extends Element implements DOMElementInterface
*/
protected $node;
- /**
- * @var DOMElementInterface|null
- */
- private $nextCached;
-
public function __construct(\DOMNode $node)
{
parent::__construct($node);
@@ -65,8 +60,9 @@ class DOMElement extends Element implements DOMElementInterface
$level = 0;
$node = $this;
- while ($node->getParent()) {
- $ancestors[] = new static($this->node);
+
+ while ($node && $node->getParent()) {
+ $ancestors[] = new static($node->node);
$level++;
if ($level >= $maxLevel) {
break;
@@ -76,4 +72,17 @@ class DOMElement extends Element implements DOMElementInterface
return $ancestors;
}
+
+ /**
+  * Wraps the parent of the underlying DOM node in a new element of this class.
+  *
+  * Overrides the getParent function from League\html-to-markdown, which (per the original
+  * note on this method) has a bug when the selected element has no more parents.
+  *
+  * @return DOMElementInterface|null The wrapped parent, or null when the node has no parent
+  */
+ public function getParent()
+ {
+     $parentNode = $this->node->parentNode;
+
+     // No parent left to wrap: signal the end of the chain with null.
+     if (!$parentNode) {
+         return null;
+     }
+
+     return new static($parentNode);
+ }
+
}
diff --git a/src/HTMLParser.php b/src/HTMLParser.php
index 6859579..ca994b3 100644
--- a/src/HTMLParser.php
+++ b/src/HTMLParser.php
@@ -15,8 +15,6 @@ class HTMLParser
private $regexps = [
'unlikelyCandidates' => '/banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i',
'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
- 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
- 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
'byline' => '/byline|author|dateline|writtenby|p-author/i',
'replaceFonts' => '/<(\/?)font[^>]*>/gi',
@@ -141,7 +139,7 @@ class HTMLParser
}
/**
- * @param DOMElement $nodes
+ * @param array $nodes
*/
private function rateNodes($nodes)
{
@@ -167,7 +165,8 @@ class HTMLParser
$contentScore += min(floor(strlen($node->getValue()) / 100), 3);
foreach ($ancestors as $ancestor) {
- $tes = $ancestor->node->getTagName();
+ $readability = new Readability($ancestor);
+ $candidates[] = $readability->initializeNode();
}
}
diff --git a/src/Readability.php b/src/Readability.php
index d4c2181..a3c3cc2 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -4,7 +4,95 @@ namespace andreskrey\Readability;
class Readability implements ReadabilityInterface
{
- private $score = 0;
+ protected $score = 0;
+
+ protected $node;
+
+ private $regexps = [
+ 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+ 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+ ];
+
+ /**
+ * Stores the node that initializeNode() and getClassWeight() read the tag name and
+ * attributes from.
+ *
+ * @param DOMElement $node
+ */
+ public function __construct($node)
+ {
+ $this->node = $node;
+ }
+
+ /**
+  * Adds the base score for the node's tag name plus its class weight to the score.
+  *
+  * @return $this
+  */
+ public function initializeNode()
+ {
+     // Score contributed by the tag itself; tags that are not listed contribute nothing.
+     $tagScores = [
+         'div'        => 5,
+
+         'pre'        => 3,
+         'td'         => 3,
+         'blockquote' => 3,
+
+         'address'    => -3,
+         'ol'         => -3,
+         'ul'         => -3,
+         'dl'         => -3,
+         'dd'         => -3,
+         'dt'         => -3,
+         'li'         => -3,
+         'form'       => -3,
+
+         'h1'         => -5,
+         'h2'         => -5,
+         'h3'         => -5,
+         'h4'         => -5,
+         'h5'         => -5,
+         'h6'         => -5,
+         'th'         => -5,
+     ];
+
+     $tagName = $this->node->getTagName();
+     if (isset($tagScores[$tagName])) {
+         $this->score += $tagScores[$tagName];
+     }
+
+     $this->score += $this->getClassWeight();
+
+     return $this;
+ }
+
+ /**
+  * Weighs the node's "class" and "id" attributes against the positive and negative patterns.
+  *
+  * Each attribute is evaluated on its own: a match against the negative pattern subtracts 25,
+  * a match against the positive pattern adds 25, and a value that matches both cancels out.
+  * Attributes that are empty after trimming are ignored.
+  *
+  * @return int Sum of the adjustments for both attributes
+  */
+ public function getClassWeight()
+ {
+     // if(!Config::FLAG_WEIGHT_CLASSES) return 0;
+
+     $weight = 0;
+
+     // The same rule applies to both attributes. The previous code read 'class' a second
+     // time where the id was intended, so the id was never scored.
+     foreach (['class', 'id'] as $attribute) {
+         $value = $this->node->getAttribute($attribute);
+         if (!trim($value)) {
+             continue;
+         }
+
+         // Match against the untrimmed value: the negative pattern contains
+         // space-delimited alternatives such as ' hid '.
+         if (preg_match($this->regexps['negative'], $value)) {
+             $weight -= 25;
+         }
+
+         if (preg_match($this->regexps['positive'], $value)) {
+             $weight += 25;
+         }
+     }
+
+     return $weight;
+ }
public function getScore()
{
diff --git a/src/ReadabilityInterface.php b/src/ReadabilityInterface.php
index 8e659a3..f5df055 100644
--- a/src/ReadabilityInterface.php
+++ b/src/ReadabilityInterface.php
@@ -4,5 +4,12 @@ namespace andreskrey\Readability;
interface ReadabilityInterface
{
+ /**
+ * Receives the node to be scored (the Readability class documents it as a DOMElement).
+ */
+ public function __construct($node);
+
+ /**
+ * NOTE(review): presumably returns the score accumulated so far; the implementing body
+ * is not visible in this diff, so confirm against Readability::getScore().
+ */
public function getScore();
+
+ /**
+ * In the Readability class this adjusts the score by tag name and class weight and
+ * returns the object itself.
+ */
+ public function initializeNode();
+
+ /**
+ * In the Readability class this returns a weight derived from matching the node's
+ * attributes against its positive and negative patterns.
+ */
+ public function getClassWeight();
+
}