diff options
-rw-r--r-- | src/NodeClass/NodeClassTrait.php | 18 | ++++--------------
-rw-r--r-- | src/NodeUtility.php | 23 | +++++++++++++++++++++++
-rw-r--r-- | src/Readability.php | 32 | +++++++-------------------------
3 files changed, 34 insertions, 39 deletions
diff --git a/src/NodeClass/NodeClassTrait.php b/src/NodeClass/NodeClassTrait.php index c117691..4ab68d8 100644 --- a/src/NodeClass/NodeClassTrait.php +++ b/src/NodeClass/NodeClassTrait.php @@ -22,16 +22,6 @@ trait NodeClassTrait private $initialized = false; /** - * Collection of regexps to check the node usability - * - * @var array - */ - private $regexps = [ - 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', - 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', - ]; - - /** * initialized getter * * @return bool @@ -188,11 +178,11 @@ trait NodeClassTrait // Look for a special classname $class = $this->getAttribute('class'); if (trim($class)) { - if (preg_match($this->regexps['negative'], $class)) { + if (preg_match(NodeUtility::$regexps['negative'], $class)) { $weight -= 25; } - if (preg_match($this->regexps['positive'], $class)) { + if (preg_match(NodeUtility::$regexps['positive'], $class)) { $weight += 25; } } @@ -200,11 +190,11 @@ trait NodeClassTrait // Look for a special ID $id = $this->getAttribute('id'); if (trim($id)) { - if (preg_match($this->regexps['negative'], $id)) { + if (preg_match(NodeUtility::$regexps['negative'], $id)) { $weight -= 25; } - if (preg_match($this->regexps['positive'], $id)) { + if (preg_match(NodeUtility::$regexps['positive'], $id)) { $weight += 25; } } diff --git a/src/NodeUtility.php b/src/NodeUtility.php index 4621f21..1743aba 100644 --- a/src/NodeUtility.php +++ b/src/NodeUtility.php @@ -29,6 +29,29 @@ class NodeUtility 'select', ]; + /** + * Collection of regexps to check the node usability + * + * @var array + */ + public static $regexps = [ + 'unlikelyCandidates' => 
'/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', + 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', + 'byline' => '/byline|author|dateline|writtenby|p-author/i', + 'replaceFonts' => '/<(\/?)font[^>]*>/gi', + 'normalize' => '/\s{2,}/', + 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', + 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', + 'prevLink' => '/(prev|earl|old|new|<|«)/i', + 'whitespace' => '/^\s*$/', + 'hasContent' => '/\S$/', + 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', + 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', + // \x{00A0} is the unicode version of &nbsp; + 'onlyWhitespace' => '/\x{00A0}|\s+/u' + ]; + /** * diff --git a/src/Readability.php b/src/Readability.php index 4b1309c..0684b7b 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -47,24 +47,6 @@ class Readability */ private $configuration; - /** - * @var array - */ - private $regexps = [ - 'unlikelyCandidates' => '/banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', - 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', - 'byline' => '/byline|author|dateline|writtenby|p-author/i', - 
'replaceFonts' => '/<(\/?)font[^>]*>/gi', - 'normalize' => '/\s{2,}/', - 'videos' => '/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i', - 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', - 'prevLink' => '/(prev|earl|old|new|<|«)/i', - 'whitespace' => '/^\s*$/', - 'hasContent' => '/\S$/', - // \x{00A0} is the unicode version of &nbsp; - 'onlyWhitespace' => '/\x{00A0}|\s+/u' - ]; private $defaultTagsToScore = [ 'section', 'h2', @@ -529,8 +511,8 @@ class Readability // Remove unlikely candidates if ($stripUnlikelyCandidates) { if ( - preg_match($this->regexps['unlikelyCandidates'], $matchString) && - !preg_match($this->regexps['okMaybeItsACandidate'], $matchString) && + preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) && + !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) && $node->nodeName !== 'body' && $node->nodeName !== 'a' ) { @@ -610,7 +592,7 @@ class Readability $rel = $node->getAttribute('rel'); - if ($rel === 'author' || preg_match($this->regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { + if ($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString) && $this->isValidByline($node->getTextContent())) { $this->metadata['byline'] = trim($node->getTextContent()); return true; @@ -1223,7 +1205,7 @@ class Readability $iframeCount = $paragraph->getElementsByTagName('iframe')->length; $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount; - if ($totalCount === 0 && !preg_replace($this->regexps['onlyWhitespace'], '', $paragraph->textContent)) { + if ($totalCount === 0 && !preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent)) { // TODO must be done via readability $paragraph->parentNode->removeChild($paragraph); } @@ -1289,7 +1271,7 @@ class Readability $embeds = $node->getElementsByTagName('embed'); foreach ($embeds as $embedNode) { - if (preg_match($this->regexps['videos'], 
$embedNode->C14N())) { + if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) { $embedCount++; } } @@ -1342,12 +1324,12 @@ class Readability $attributeValues = implode('|', $attributeValues); // First, check the elements attributes to see if any of them contain youtube or vimeo - if (preg_match($this->regexps['videos'], $attributeValues)) { + if (preg_match(NodeUtility::$regexps['videos'], $attributeValues)) { continue; } // Then check the elements inside this element for the same. - if (preg_match($this->regexps['videos'], $item->C14N())) { + if (preg_match(NodeUtility::$regexps['videos'], $item->C14N())) { continue; } } |