From b8c1d622a77226b14fb307cfe3e0f4cea9e4268a Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Sat, 16 Jul 2022 16:30:46 +0300 Subject: add missing files for forked idiorm --- .../readability.php/src/Nodes/DOM/DOMAttr.php | 4 +- .../src/Nodes/DOM/DOMCdataSection.php | 4 +- .../src/Nodes/DOM/DOMCharacterData.php | 4 +- .../readability.php/src/Nodes/DOM/DOMComment.php | 4 +- .../readability.php/src/Nodes/DOM/DOMDocument.php | 4 +- .../src/Nodes/DOM/DOMDocumentFragment.php | 4 +- .../src/Nodes/DOM/DOMDocumentType.php | 4 +- .../readability.php/src/Nodes/DOM/DOMElement.php | 40 ++++++++++++- .../readability.php/src/Nodes/DOM/DOMEntity.php | 4 +- .../src/Nodes/DOM/DOMEntityReference.php | 4 +- .../readability.php/src/Nodes/DOM/DOMNode.php | 4 +- .../readability.php/src/Nodes/DOM/DOMNodeList.php | 2 +- .../readability.php/src/Nodes/DOM/DOMNotation.php | 4 +- .../src/Nodes/DOM/DOMProcessingInstruction.php | 4 +- .../readability.php/src/Nodes/DOM/DOMText.php | 4 +- .../readability.php/src/Nodes/NodeTrait.php | 69 ++++++++-------------- .../readability.php/src/Nodes/NodeUtility.php | 36 +++++++---- 17 files changed, 113 insertions(+), 86 deletions(-) (limited to 'plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes') diff --git a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php index 91729f3b1..1bdf395e7 100644 --- a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php +++ b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMAttr.php @@ -1,8 +1,8 @@ childNodes as $node) { + if ($node->nodeType === XML_ELEMENT_NODE) { + $newList->add($node); + } + } + return $newList; + } + + /** + * Returns the Element immediately prior to the specified one in its parent's children list, or null if the specified element is the first one in the list. + * + * @see https://wiki.php.net/rfc/dom_living_standard_api + * @return DOMElement|null + */ + public function previousElementSibling() + { + $previous = $this->previousSibling; + while ($previous) { + if ($previous->nodeType === XML_ELEMENT_NODE) { + return $previous; + } + $previous = $previous->previousSibling; + } + return null; + } } diff --git a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php index 8493e7319..751b59c48 100644 --- a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php +++ b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/DOM/DOMEntity.php @@ -1,8 +1,8 @@ attributes)) { @@ -187,6 +186,7 @@ trait NodeTrait * * @see getAttribute */ + #[\ReturnTypeWillChange] public function hasAttribute($attributeName) { if (!is_null($this->attributes)) { @@ -240,19 +240,21 @@ trait NodeTrait */ public function getLinkDensity() { - $linkLength = 0; $textLength = mb_strlen($this->getTextContent(true)); - - if (!$textLength) { + if ($textLength === 0) { return 0; } + $linkLength = 0; + $links = $this->getAllLinks(); if ($links) { /** @var DOMElement $link */ foreach ($links as $link) { - $linkLength += mb_strlen($link->getTextContent(true)); + $href = $link->getAttribute('href'); + $coefficient = ($href && preg_match(NodeUtility::$regexps['hashUrl'], $href)) ? 0.3 : 1; + $linkLength += mb_strlen($link->getTextContent(true)) * $coefficient; } } @@ -282,7 +284,7 @@ trait NodeTrait // Look for a special ID $id = $this->getAttribute('id'); - if (trim($id)) { + if (trim($id) !== '') { if (preg_match(NodeUtility::$regexps['negative'], $id)) { $weight -= 25; } @@ -302,40 +304,16 @@ trait NodeTrait * * @return string */ - public function getTextContent($normalize = false) + public function getTextContent($normalize = true) { - $nodeValue = $this->nodeValue; + $nodeValue = trim($this->textContent); if ($normalize) { - $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); + $nodeValue = preg_replace(NodeUtility::$regexps['normalize'], ' ', $nodeValue); } return $nodeValue; } - /** - * Returns the children of the current node. - * - * @param bool $filterEmptyDOMText Filter empty DOMText nodes? - * - * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0 - * - * @return array - */ - public function getChildren($filterEmptyDOMText = false) - { - @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED); - - $ret = iterator_to_array($this->childNodes); - if ($filterEmptyDOMText) { - // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number - $ret = array_values(array_filter($ret, function ($node) { - return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); - })); - } - - return $ret; - } - /** * Return an array indicating how many rows and columns this table has. * @@ -374,7 +352,7 @@ trait NodeTrait */ public function createNode($originalNode, $tagName) { - $text = $originalNode->getTextContent(); + $text = $originalNode->getTextContent(false); $newNode = $originalNode->ownerDocument->createElement($tagName, $text); return $newNode; @@ -433,7 +411,7 @@ trait NodeTrait } /* @var DOMNode $child */ - return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())); + return !($child->nodeType === XML_TEXT_NODE && preg_match(NodeUtility::$regexps['hasContent'], $child->textContent)); }); } @@ -508,13 +486,14 @@ trait NodeTrait * In the original JS project they check if the node has the style display=none, which unfortunately * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". * - * Might be a good idea to check for classes or other attributes like 'aria-hidden' - * * @return bool */ public function isProbablyVisible() { - return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); + return !preg_match('/display:( )?none/i', $this->getAttribute('style')) && + !$this->hasAttribute('hidden') && + //check for "fallback-image" so that wikimedia math images are displayed + (!$this->hasAttribute('aria-hidden') || $this->getAttribute('aria-hidden') !== 'true' || ($this->hasAttribute('class') && strpos($this->getAttribute('class'), 'fallback-image') !== false)); } /** diff --git a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php index cbf78bae0..56de70517 100644 --- a/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php +++ b/plugins/af_readability/vendor/fivefilters/readability.php/src/Nodes/NodeUtility.php @@ -1,11 +1,11 @@ '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', + 'unlikelyCandidates' => '/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', + 'okMaybeItsACandidate' => '/and|article|body|column|content|main|shadow/i', 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', 'byline' => '/byline|author|dateline|writtenby|p-author/i', - 'replaceFonts' => '/<(\/?)font[^>]*>/gi', + 'replaceFonts' => '/<(\/?)font[^>]*>/i', 'normalize' => '/\s{2,}/', 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', + 'shareElements' => '/(\b|_)(share|sharedaddy)(\b|_)/i', 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', 'prevLink' => '/(prev|earl|old|new|<|«)/i', + 'tokenize' => '/\W+/', 'whitespace' => '/^\s*$/', 'hasContent' => '/\S$/', 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', - 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', + 'negative' => '/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', // \x{00A0} is the unicode version of   - 'onlyWhitespace' => '/\x{00A0}|\s+/u' + 'onlyWhitespace' => '/\x{00A0}|\s+/u', + 'hashUrl' => '/^#.+/', + 'srcsetUrl' => '/(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/', + 'b64DataUrl' => '/^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i', + // See: https://schema.org/Article + 'jsonLdArticleTypes' => '/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/' + ]; /** + * Finds the next node, starting from the given node, and ignoring + * whitespace in between. If the given node is an element, the same node is + * returned. + * * Imported from the Element class on league\html-to-markdown. * * @param $node * - * @return DOMElement + * @return DOMNode */ - public static function nextElement($node) + public static function nextNode($node) { $next = $node; while ($next -- cgit v1.2.3