simplify things a bit

author: Andrew Dolgov <[email protected]> 2023-06-10 14:04:38 +0300
committer: Andrew Dolgov <[email protected]> 2023-06-10 14:04:38 +0300
commit: fe31e18fd0928ee2efa71973568f0a2e800d259a (patch)
tree: a33f4e216558247d87574dcde024172a82f727ee
parent: 63f4ea5f5be426810077acf49967a140053c266d (diff)
1 files changed, 50 insertions, 125 deletions
diff --git a/init.php b/init.php
index f6ba6e3..3191bcb 100644
--- a/init.php
+++ b/init.php
@@ -5,7 +5,7 @@ class Af_Lemmy extends Plugin {
 	private $host;
 
 	/** @var array<string> */
-	private $domain_blacklist = [ "github.com" ];
+	private $lemmy_domains = ["lemmy.ml", "beehaw.org", "feddit.de"];
 
 	/** @var array<int, array<int, string|null>> */
 	private $generated_enclosures = [];
@@ -27,9 +27,15 @@ class Af_Lemmy extends Plugin {
 		$host->add_hook($host::HOOK_PRE_SUBSCRIBE, $this);
 	}
 
+	// TODO: maybe support other federated instances like beehaw? whitelist?
 	function hook_pre_subscribe(&$url, $auth_login, $auth_pass) {
 
-		// TODO
+		$origin_domain = parse_url($url, PHP_URL_HOST);
+
+		if (in_array($origin_domain, $this->lemmy_domains)) {
+
+
+		}
 
 		return false;
 	}
@@ -52,15 +58,12 @@ class Af_Lemmy extends Plugin {
 
 			$matches = [];
 
-			/* skip links going back to origin (and any other blacklisted stuff, including /u/user profiles, except for /pictrs) */
-			if (!$found &&
-					strpos($entry_href, "/pictrs/") === FALSE &&
-					(strpos($entry_href, "/u/") !== FALSE || $this->is_blacklisted($entry_href, [$origin_domain]))) {
-				Debug::log("BODY: URL $entry_href is blacklisted, skipping", Debug::LOG_EXTENDED);
+			if (!$found &&	!$this->can_embed($entry_href)) {
+				Debug::log("BODY: not allowed to embed URL: $entry_href, skipping", Debug::LOG_EXTENDED);
 				continue;
 			}
 
-			Debug::log("BODY: processing URL: " . $entry_href, Debug::LOG_VERBOSE);
+			Debug::log("BODY: processing URL: $entry_href", Debug::LOG_VERBOSE);
 
 			if (!$found && preg_match("/^https?:\/\/twitter.com\/(.*?)\/status\/(.*)/", $entry_href, $matches)) {
 				Debug::log("handling as twitter: " . $matches[1] . " " . $matches[2], Debug::LOG_VERBOSE);
@@ -128,7 +131,7 @@ class Af_Lemmy extends Plugin {
 
 				$this->handle_as_video($doc, $entry, $source_stream, $poster_url);
 
-				$found = true;
+				$found = 1;
 			}
 
 			$matches = array();
@@ -142,10 +145,10 @@ class Af_Lemmy extends Plugin {
 				/* push generated video URL to enclosures so that youtube embed plugins would deal with it later (if enabled) */
 				$this->generated_enclosures[] = [$video_url, "text/html", null, null, '', ''];
 
-				$found = true;
+				$found = 1;
 			}
 
-			if (!$found && (preg_match("/\.(jpe?g|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) ||
+			if (!$found && (preg_match("/\.(jpe?g|webp|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) ||
 				/* mb_strpos($entry_href, "i.reddituploads.com") !== false || */
 				mb_strpos($this->get_content_type($entry_href), "image/") !== false)) {
 
@@ -155,69 +158,24 @@ class Af_Lemmy extends Plugin {
 				$found = 1;
 			}
 
-			// imgur via link rel="image_src" href="..."
-			if (!$found && preg_match("/imgur/", $entry_href)) {
+			// let's try meta properties
+			if (!$found) {
 
-				Debug::log("handling as imgur page/whatever", Debug::LOG_VERBOSE);
+				Debug::log("probing content-type...", Debug::LOG_EXTENDED);
 
 				$content_type = $this->get_content_type($entry_href);
 
-				if ($content_type && strpos($content_type, "text/html") !== false) {
-
-					$content = UrlHelper::fetch(["url" => $entry_href,
-						"http_accept" => "text/*"]);
-
-					if ($content) {
-						$cdoc = new DOMDocument();
-
-						if (@$cdoc->loadHTML($content)) {
-							$cxpath = new DOMXPath($cdoc);
-
-							/** @var ?DOMElement $rel_image */
-							$rel_image = $cxpath->query("//link[@rel='image_src']")->item(0);
+				Debug::log("got content-type: $content_type", Debug::LOG_VERBOSE);
 
-							if ($rel_image) {
-
-								$img = $doc->createElement('img');
-								$img->setAttribute("src", $rel_image->getAttribute("href"));
-
-								$br = $doc->createElement('br');
-								$entry->parentNode->insertBefore($img, $entry);
-								$entry->parentNode->insertBefore($br, $entry);
-
-								$found = true;
-							}
-						}
-					}
-
-				} else {
-					Debug::log("skipping imgur $entry_href because of content type: $content_type", Debug::LOG_VERBOSE);
-				}
-			}
-
-			// wtf is this even
-			if (!$found && preg_match("/^https?:\/\/gyazo\.com\/([^\.\/]+$)/", $entry_href, $matches)) {
-				$img_id = $matches[1];
-
-				Debug::log("handling as gyazo: $img_id", Debug::LOG_VERBOSE);
-
-				$img = $doc->createElement('img');
-				$img->setAttribute("src", "https://i.gyazo.com/$img_id.jpg");
-
-				$br = $doc->createElement('br');
-				$entry->parentNode->insertBefore($img, $entry);
-				$entry->parentNode->insertBefore($br, $entry);
-
-				$found = true;
-			}
+				if ($content_type && strpos($content_type, "image/") !== FALSE) {
+					Debug::log("Handling as a picture based on content-type", Debug::LOG_VERBOSE);
 
-			// let's try meta properties
-			if (!$found) {
-				Debug::log("looking for meta og:image", Debug::LOG_VERBOSE);
+					$this->handle_as_image($doc, $entry, $entry_href, $entry_href);
+					$found = 1;
 
-				$content_type = $this->get_content_type($entry_href);
+				} else if ($content_type && strpos($content_type, "text/html") !== FALSE) {
 
-				if ($content_type && strpos($content_type, "text/html") !== false) {
+					Debug::log("looking for meta og:image/video...", Debug::LOG_VERBOSE);
 
 					$content = UrlHelper::fetch(["url" => $entry_href,
 						"http_accept" => "text/*"]);
@@ -247,7 +205,7 @@ class Af_Lemmy extends Plugin {
 									}
 
 									$this->handle_as_video($doc, $entry, $source_stream, $poster_url);
-									$found = true;
+									$found = 1;
 								}
 
 							} else if ($og_image) {
@@ -262,13 +220,26 @@ class Af_Lemmy extends Plugin {
 									$entry->parentNode->insertBefore($img, $entry);
 									$entry->parentNode->insertBefore($br, $entry);
 
-									$found = true;
+									$found = 1;
 								}
 							}
 						}
 					}
+
+					Debug::log("trying to extract full content using readability...", Debug::LOG_VERBOSE);
+
+					$this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT,
+						function ($result) use (&$article, &$found) {
+							if ($result && mb_strlen($result) >= 128) {
+								$article["content"] .= "<hr/>" . $result;
+								$found = true;
+								return true;
+							}
+						},
+						$entry_href);
+
 				} else {
-					Debug::log("BODY: skipping $entry_href because of content type: $content_type", Debug::LOG_VERBOSE);
+					Debug::log("BODY: skipping because of content type: $content_type", Debug::LOG_VERBOSE);
 				}
 			}
 		}
@@ -278,25 +249,20 @@ class Af_Lemmy extends Plugin {
 
 	function hook_article_filter($article) {
 
-		if (preg_match("/lemmy.ml\/post\/[0-9]{1,}$/", $article['link']) !== false && !empty($article["content"])) {
+		$origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST);
+
+		if (in_array($origin_domain, $this->lemmy_domains) && preg_match("/\/post\/[0-9]{1,}$/", $article['link']) && !empty($article["content"])) {
 			$doc = new DOMDocument();
 
 			if (@$doc->loadHTML($article["content"])) {
 				$xpath = new DOMXPath($doc);
 
-				if ($this->is_blacklisted($article['link']))
-					return $article;
-
-				$origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST);
-
 				$found = $this->inline_stuff($article, $doc, $xpath, $origin_domain);
 				$node = $doc->getElementsByTagName('body')->item(0);
 
 				if ($node && $found) {
 					$article["content"] = $doc->saveHTML($node);
 					$article["enclosures"] = $this->generated_enclosures;
-				} else {
-					$article = $this->readability($article, $article['link'], $doc, $xpath);
 				}
 			}
 		}
@@ -369,57 +335,16 @@ class Af_Lemmy extends Plugin {
 		return $this->get_header($url, CURLINFO_CONTENT_TYPE, $useragent);
 	}
 
-	/**
-	 * @param array<string,mixed> $article
-	 * @param string $url
-	 * @param DOMDocument $doc
-	 * @param DOMXPath $xpath
-	 * @param bool $debug
-	 * @return array<string,mixed>
-	 * @throws PDOException
-	 */
-	private function readability(array $article, string $url, DOMDocument $doc, DOMXpath $xpath, bool $debug = false) : array {
+	private function can_embed(string $url) : bool {
+		if (strpos($url, "/pictrs/") !== FALSE)
+			return true;
 
-		if (function_exists("curl_init") && !preg_match("/post\/[0-9]{1,}$/", $url) && mb_strlen(strip_tags($article["content"])) <= 200) {
+		if (preg_match("#/u/|/c/|/post/#", $url))
+			return false;
 
-			/* link may lead to a huge video file or whatever, we need to check content type before trying to
-			parse it which p much requires curl */
+		$origin_domain = parse_url($url, PHP_URL_HOST);
 
-			$useragent_compat = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)";
-			$content_type = $this->get_content_type($url, $useragent_compat);
-
-			if ($content_type && strpos($content_type, "text/html") !== false) {
-
-				$this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT,
-					function ($result) use (&$article) {
-						if ($result) {
-							$article["content"] .= "<hr/>" . $result;
-							return true;
-						}
-					},
-					$url);
-			}
-		}
-
-		return $article;
-	}
-
-	/**
-	 * @param string $src
-	 * @param array<string> $also_blacklist
-	 * @return bool
-	 */
-	private function is_blacklisted(string $src, array $also_blacklist = []) : bool {
-		$src_domain = parse_url($src, PHP_URL_HOST);
-
-		if ($src_domain)
-		foreach ([...$this->domain_blacklist, ...$also_blacklist] as $domain) {
-				if (strstr($src_domain, $domain) !== false) {
-					return true;
-				}
-			}
-
-		return false;
+		return !in_array($origin_domain, $this->lemmy_domains);
 	}
 
 	function api_version() {
author	Andrew Dolgov <[email protected]>	2023-06-10 14:04:38 +0300
committer	Andrew Dolgov <[email protected]>	2023-06-10 14:04:38 +0300
commit	fe31e18fd0928ee2efa71973568f0a2e800d259a (patch)
tree	a33f4e216558247d87574dcde024172a82f727ee
parent	63f4ea5f5be426810077acf49967a140053c266d (diff)