summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Andres Rey <[email protected]> 2017-11-30 19:25:18 +0000
committer Andres Rey <[email protected]> 2017-11-30 19:25:18 +0000
commit 820f3d27e924c5eed6eb1fc3327ee8d1a11d8373 (patch)
tree 8328d266983ca07e43d14f9f128253d044902fe4 /src
parent 13d3910781c599a5aea74ef4fa9d7ca4815cde69 (diff)
Move regexps array to NodeUtility
Diffstat (limited to 'src')
-rw-r--r-- src/NodeClass/NodeClassTrait.php 18
-rw-r--r-- src/NodeUtility.php 23
-rw-r--r-- src/Readability.php 32
3 files changed, 34 insertions, 39 deletions
diff --git a/src/NodeClass/NodeClassTrait.php b/src/NodeClass/NodeClassTrait.php
index c117691..4ab68d8 100644
--- a/src/NodeClass/NodeClassTrait.php
+++ b/src/NodeClass/NodeClassTrait.php
@@ -22,16 +22,6 @@ trait NodeClassTrait
private $initialized = false;
/**
- * Collection of regexps to check the node usability
- *
- * @var array
- */
- private $regexps = [
- 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
- 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
- ];
-
- /**
* initialized getter
*
* @return bool
@@ -188,11 +178,11 @@ trait NodeClassTrait
// Look for a special classname
$class = $this->getAttribute('class');
if (trim($class)) {
- if (preg_match($this->regexps['negative'], $class)) {
+ if (preg_match(NodeUtility::$regexps['negative'], $class)) {
$weight -= 25;
}
- if (preg_match($this->regexps['positive'], $class)) {
+ if (preg_match(NodeUtility::$regexps['positive'], $class)) {
$weight += 25;
}
}
@@ -200,11 +190,11 @@ trait NodeClassTrait
// Look for a special ID
$id = $this->getAttribute('id');
if (trim($id)) {
- if (preg_match($this->regexps['negative'], $id)) {
+ if (preg_match(NodeUtility::$regexps['negative'], $id)) {
$weight -= 25;
}
- if (preg_match($this->regexps['positive'], $id)) {
+ if (preg_match(NodeUtility::$regexps['positive'], $id)) {
$weight += 25;
}
}
diff --git a/src/NodeUtility.php b/src/NodeUtility.php
index 4621f21..1743aba 100644
--- a/src/NodeUtility.php
+++ b/src/NodeUtility.php
@@ -29,6 +29,29 @@ class NodeUtility
'select',
];
+ /**
+ * Collection of regexps to check the node usability
+ *
+ * @var array
+ */
+ public static $regexps = [
+ 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
+ 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+ 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
+ 'byline' => '/byline|author|dateline|writtenby|p-author/i',
+ 'replaceFonts' => '/<(\/?)font[^>]*>/gi',
+ 'normalize' => '/\s{2,}/',
+ 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
+ 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
+ 'prevLink' => '/(prev|earl|old|new|<|«)/i',
+ 'whitespace' => '/^\s*$/',
+ 'hasContent' => '/\S$/',
+ 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
+ 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
+ // \x{00A0} is the unicode version of &nbsp;
+ 'onlyWhitespace' => '/\x{00A0}|\s+/u'
+ ];
+
/**
*
diff --git a/src/Readability.php b/src/Readability.php
index 4b1309c..0684b7b 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -47,24 +47,6 @@ class Readability
*/
private $configuration;
- /**
- * @var array
- */
- private $regexps = [
- 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
- 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
- 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
- 'byline' => '/byline|author|dateline|writtenby|p-author/i',
- 'replaceFonts' => '/<(\/?)font[^>]*>/gi',
- 'normalize' => '/\s{2,}/',
- 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i',
- 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
- 'prevLink' => '/(prev|earl|old|new|<|«)/i',
- 'whitespace' => '/^\s*$/',
- 'hasContent' => '/\S$/',
- // \x{00A0} is the unicode version of &nbsp;
- 'onlyWhitespace' => '/\x{00A0}|\s+/u'
- ];
private $defaultTagsToScore = [
'section',
'h2',
@@ -529,8 +511,8 @@ class Readability
// Remove unlikely candidates
if ($stripUnlikelyCandidates) {
if (
- preg_match($this->regexps['unlikelyCandidates'], $matchString) &&
- !preg_match($this->regexps['okMaybeItsACandidate'], $matchString) &&
+ preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
+ !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
$node->nodeName !== 'body' &&
$node->nodeName !== 'a'
) {
@@ -610,7 +592,7 @@ class Readability
$rel = $node->getAttribute('rel');
- if ($rel === 'author' || preg_match($this->regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
+ if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) {
$this->metadata['byline'] = trim($node->getTextContent());
return true;
@@ -1223,7 +1205,7 @@ class Readability
$iframeCount = $paragraph->getElementsByTagName('iframe')->length;
$totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;
- if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) {
+ if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) {
// TODO must be done via readability
$paragraph->parentNode->removeChild($paragraph);
}
@@ -1289,7 +1271,7 @@ class Readability
$embeds = $node->getElementsByTagName('embed');
foreach ($embeds as $embedNode) {
- if (preg_match($this->regexps['videos'], $embedNode->C14N())) {
+ if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
$embedCount++;
}
}
@@ -1342,12 +1324,12 @@ class Readability
$attributeValues = implode('|', $attributeValues);
// First, check the elements attributes to see if any of them contain youtube or vimeo
- if (preg_match($this->regexps['videos'], $attributeValues)) {
+ if (preg_match(NodeUtility::$regexps['videos'], $attributeValues)) {
continue;
}
// Then check the elements inside this element for the same.
- if (preg_match($this->regexps['videos'], $item->C14N())) {
+ if (preg_match(NodeUtility::$regexps['videos'], $item->C14N())) {
continue;
}
}