From fe31e18fd0928ee2efa71973568f0a2e800d259a Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Sat, 10 Jun 2023 14:04:38 +0300 Subject: simplify things a bit --- init.php | 175 ++++++++++++++++++--------------------------------------------- 1 file changed, 50 insertions(+), 125 deletions(-) (limited to 'init.php') diff --git a/init.php b/init.php index f6ba6e3..3191bcb 100644 --- a/init.php +++ b/init.php @@ -5,7 +5,7 @@ class Af_Lemmy extends Plugin { private $host; /** @var array */ - private $domain_blacklist = [ "github.com" ]; + private $lemmy_domains = ["lemmy.ml", "beehaw.org", "feddit.de"]; /** @var array> */ private $generated_enclosures = []; @@ -27,9 +27,15 @@ class Af_Lemmy extends Plugin { $host->add_hook($host::HOOK_PRE_SUBSCRIBE, $this); } + // TODO: maybe support other federated instances like beehaw? whitelist? function hook_pre_subscribe(&$url, $auth_login, $auth_pass) { - // TODO + $origin_domain = parse_url($url, PHP_URL_HOST); + + if (in_array($origin_domain, $this->lemmy_domains)) { + + + } return false; } @@ -52,15 +58,12 @@ class Af_Lemmy extends Plugin { $matches = []; - /* skip links going back to origin (and any other blacklisted stuff, including /u/user profiles, except for /pictrs) */ - if (!$found && - strpos($entry_href, "/pictrs/") === FALSE && - (strpos($entry_href, "/u/") !== FALSE || $this->is_blacklisted($entry_href, [$origin_domain]))) { - Debug::log("BODY: URL $entry_href is blacklisted, skipping", Debug::LOG_EXTENDED); + if (!$found && !$this->can_embed($entry_href)) { + Debug::log("BODY: not allowed to embed URL: $entry_href, skipping", Debug::LOG_EXTENDED); continue; } - Debug::log("BODY: processing URL: " . $entry_href, Debug::LOG_VERBOSE); + Debug::log("BODY: processing URL: $entry_href", Debug::LOG_VERBOSE); if (!$found && preg_match("/^https?:\/\/twitter.com\/(.*?)\/status\/(.*)/", $entry_href, $matches)) { Debug::log("handling as twitter: " . $matches[1] . " " . $matches[2], Debug::LOG_VERBOSE); @@ -128,7 +131,7 @@ class Af_Lemmy extends Plugin { $this->handle_as_video($doc, $entry, $source_stream, $poster_url); - $found = true; + $found = 1; } $matches = array(); @@ -142,10 +145,10 @@ class Af_Lemmy extends Plugin { /* push generated video URL to enclosures so that youtube embed plugins would deal with it later (if enabled) */ $this->generated_enclosures[] = [$video_url, "text/html", null, null, '', '']; - $found = true; + $found = 1; } - if (!$found && (preg_match("/\.(jpe?g|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) || + if (!$found && (preg_match("/\.(jpe?g|webp|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) || /* mb_strpos($entry_href, "i.reddituploads.com") !== false || */ mb_strpos($this->get_content_type($entry_href), "image/") !== false)) { @@ -155,69 +158,24 @@ class Af_Lemmy extends Plugin { $found = 1; } - // imgur via link rel="image_src" href="..." - if (!$found && preg_match("/imgur/", $entry_href)) { + // let's try meta properties + if (!$found) { - Debug::log("handling as imgur page/whatever", Debug::LOG_VERBOSE); + Debug::log("probing content-type...", Debug::LOG_EXTENDED); $content_type = $this->get_content_type($entry_href); - if ($content_type && strpos($content_type, "text/html") !== false) { - - $content = UrlHelper::fetch(["url" => $entry_href, - "http_accept" => "text/*"]); - - if ($content) { - $cdoc = new DOMDocument(); - - if (@$cdoc->loadHTML($content)) { - $cxpath = new DOMXPath($cdoc); - - /** @var ?DOMElement $rel_image */ - $rel_image = $cxpath->query("//link[@rel='image_src']")->item(0); + Debug::log("got content-type: $content_type", Debug::LOG_VERBOSE); - if ($rel_image) { - - $img = $doc->createElement('img'); - $img->setAttribute("src", $rel_image->getAttribute("href")); - - $br = $doc->createElement('br'); - $entry->parentNode->insertBefore($img, $entry); - $entry->parentNode->insertBefore($br, $entry); - - $found = true; - } - } - } - - } else { - Debug::log("skipping imgur $entry_href because of content type: $content_type", Debug::LOG_VERBOSE); - } - } - - // wtf is this even - if (!$found && preg_match("/^https?:\/\/gyazo\.com\/([^\.\/]+$)/", $entry_href, $matches)) { - $img_id = $matches[1]; - - Debug::log("handling as gyazo: $img_id", Debug::LOG_VERBOSE); - - $img = $doc->createElement('img'); - $img->setAttribute("src", "https://i.gyazo.com/$img_id.jpg"); - - $br = $doc->createElement('br'); - $entry->parentNode->insertBefore($img, $entry); - $entry->parentNode->insertBefore($br, $entry); - - $found = true; - } + if ($content_type && strpos($content_type, "image/") !== FALSE) { + Debug::log("Handling as a picture based on content-type", Debug::LOG_VERBOSE); - // let's try meta properties - if (!$found) { - Debug::log("looking for meta og:image", Debug::LOG_VERBOSE); + $this->handle_as_image($doc, $entry, $entry_href, $entry_href); + $found = 1; - $content_type = $this->get_content_type($entry_href); + } else if ($content_type && strpos($content_type, "text/html") !== FALSE) { - if ($content_type && strpos($content_type, "text/html") !== false) { + Debug::log("looking for meta og:image/video...", Debug::LOG_VERBOSE); $content = UrlHelper::fetch(["url" => $entry_href, "http_accept" => "text/*"]); @@ -247,7 +205,7 @@ class Af_Lemmy extends Plugin { } $this->handle_as_video($doc, $entry, $source_stream, $poster_url); - $found = true; + $found = 1; } } else if ($og_image) { @@ -262,13 +220,26 @@ class Af_Lemmy extends Plugin { $entry->parentNode->insertBefore($img, $entry); $entry->parentNode->insertBefore($br, $entry); - $found = true; + $found = 1; } } } } + + Debug::log("trying to extract full content using readability...", Debug::LOG_VERBOSE); + + $this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT, + function ($result) use (&$article, &$found) { + if ($result && mb_strlen($result) >= 128) { + $article["content"] .= "
" . $result; + $found = true; + return true; + } + }, + $entry_href); + } else { - Debug::log("BODY: skipping $entry_href because of content type: $content_type", Debug::LOG_VERBOSE); + Debug::log("BODY: skipping because of content type: $content_type", Debug::LOG_VERBOSE); } } } @@ -278,25 +249,20 @@ class Af_Lemmy extends Plugin { function hook_article_filter($article) { - if (preg_match("/lemmy.ml\/post\/[0-9]{1,}$/", $article['link']) !== false && !empty($article["content"])) { + $origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST); + + if (in_array($origin_domain, $this->lemmy_domains) && preg_match("/\/post\/[0-9]{1,}$/", $article['link']) && !empty($article["content"])) { $doc = new DOMDocument(); if (@$doc->loadHTML($article["content"])) { $xpath = new DOMXPath($doc); - if ($this->is_blacklisted($article['link'])) - return $article; - - $origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST); - $found = $this->inline_stuff($article, $doc, $xpath, $origin_domain); $node = $doc->getElementsByTagName('body')->item(0); if ($node && $found) { $article["content"] = $doc->saveHTML($node); $article["enclosures"] = $this->generated_enclosures; - } else { - $article = $this->readability($article, $article['link'], $doc, $xpath); } } } @@ -369,57 +335,16 @@ class Af_Lemmy extends Plugin { return $this->get_header($url, CURLINFO_CONTENT_TYPE, $useragent); } - /** - * @param array $article - * @param string $url - * @param DOMDocument $doc - * @param DOMXPath $xpath - * @param bool $debug - * @return array - * @throws PDOException - */ - private function readability(array $article, string $url, DOMDocument $doc, DOMXpath $xpath, bool $debug = false) : array { + private function can_embed(string $url) : bool { + if (strpos($url, "/pictrs/") !== FALSE) + return true; - if (function_exists("curl_init") && !preg_match("/post\/[0-9]{1,}$/", $url) && mb_strlen(strip_tags($article["content"])) <= 200) { + if (preg_match("#/u/|/c/|/post/#", $url)) + return false; - /* link may lead to a huge video file or whatever, we need to check content type before trying to - parse it which p much requires curl */ + $origin_domain = parse_url($url, PHP_URL_HOST); - $useragent_compat = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"; - $content_type = $this->get_content_type($url, $useragent_compat); - - if ($content_type && strpos($content_type, "text/html") !== false) { - - $this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT, - function ($result) use (&$article) { - if ($result) { - $article["content"] .= "
" . $result; - return true; - } - }, - $url); - } - } - - return $article; - } - - /** - * @param string $src - * @param array $also_blacklist - * @return bool - */ - private function is_blacklisted(string $src, array $also_blacklist = []) : bool { - $src_domain = parse_url($src, PHP_URL_HOST); - - if ($src_domain) - foreach ([...$this->domain_blacklist, ...$also_blacklist] as $domain) { - if (strstr($src_domain, $domain) !== false) { - return true; - } - } - - return false; + return !in_array($origin_domain, $this->lemmy_domains); } function api_version() { -- cgit v1.2.3