summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andrew Dolgov <[email protected]> 2023-06-10 14:04:38 +0300
committer Andrew Dolgov <[email protected]> 2023-06-10 14:04:38 +0300
commit fe31e18fd0928ee2efa71973568f0a2e800d259a (patch)
tree a33f4e216558247d87574dcde024172a82f727ee
parent 63f4ea5f5be426810077acf49967a140053c266d (diff)
simplify things a bit
-rw-r--r-- init.php 175
1 files changed, 50 insertions, 125 deletions
diff --git a/init.php b/init.php
index f6ba6e3..3191bcb 100644
--- a/init.php
+++ b/init.php
@@ -5,7 +5,7 @@ class Af_Lemmy extends Plugin {
private $host;
/** @var array<string> */
- private $domain_blacklist = [ "github.com" ];
+ private $lemmy_domains = ["lemmy.ml", "beehaw.org", "feddit.de"];
/** @var array<int, array<int, string|null>> */
private $generated_enclosures = [];
@@ -27,9 +27,15 @@ class Af_Lemmy extends Plugin {
$host->add_hook($host::HOOK_PRE_SUBSCRIBE, $this);
}
+ // TODO: maybe support other federated instances like beehaw? whitelist?
function hook_pre_subscribe(&$url, $auth_login, $auth_pass) {
- // TODO
+ $origin_domain = parse_url($url, PHP_URL_HOST);
+
+ if (in_array($origin_domain, $this->lemmy_domains)) {
+
+
+ }
return false;
}
@@ -52,15 +58,12 @@ class Af_Lemmy extends Plugin {
$matches = [];
- /* skip links going back to origin (and any other blacklisted stuff, including /u/user profiles, except for /pictrs) */
- if (!$found &&
- strpos($entry_href, "/pictrs/") === FALSE &&
- (strpos($entry_href, "/u/") !== FALSE || $this->is_blacklisted($entry_href, [$origin_domain]))) {
- Debug::log("BODY: URL $entry_href is blacklisted, skipping", Debug::LOG_EXTENDED);
+ if (!$found && !$this->can_embed($entry_href)) {
+ Debug::log("BODY: not allowed to embed URL: $entry_href, skipping", Debug::LOG_EXTENDED);
continue;
}
- Debug::log("BODY: processing URL: " . $entry_href, Debug::LOG_VERBOSE);
+ Debug::log("BODY: processing URL: $entry_href", Debug::LOG_VERBOSE);
if (!$found && preg_match("/^https?:\/\/twitter.com\/(.*?)\/status\/(.*)/", $entry_href, $matches)) {
Debug::log("handling as twitter: " . $matches[1] . " " . $matches[2], Debug::LOG_VERBOSE);
@@ -128,7 +131,7 @@ class Af_Lemmy extends Plugin {
$this->handle_as_video($doc, $entry, $source_stream, $poster_url);
- $found = true;
+ $found = 1;
}
$matches = array();
@@ -142,10 +145,10 @@ class Af_Lemmy extends Plugin {
/* push generated video URL to enclosures so that youtube embed plugins would deal with it later (if enabled) */
$this->generated_enclosures[] = [$video_url, "text/html", null, null, '', ''];
- $found = true;
+ $found = 1;
}
- if (!$found && (preg_match("/\.(jpe?g|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) ||
+ if (!$found && (preg_match("/\.(jpe?g|webp|gif|png)(\?[0-9][0-9]*)?[$\?]?/i", $entry_href) ||
/* mb_strpos($entry_href, "i.reddituploads.com") !== false || */
mb_strpos($this->get_content_type($entry_href), "image/") !== false)) {
@@ -155,69 +158,24 @@ class Af_Lemmy extends Plugin {
$found = 1;
}
- // imgur via link rel="image_src" href="..."
- if (!$found && preg_match("/imgur/", $entry_href)) {
+ // let's try meta properties
+ if (!$found) {
- Debug::log("handling as imgur page/whatever", Debug::LOG_VERBOSE);
+ Debug::log("probing content-type...", Debug::LOG_EXTENDED);
$content_type = $this->get_content_type($entry_href);
- if ($content_type && strpos($content_type, "text/html") !== false) {
-
- $content = UrlHelper::fetch(["url" => $entry_href,
- "http_accept" => "text/*"]);
-
- if ($content) {
- $cdoc = new DOMDocument();
-
- if (@$cdoc->loadHTML($content)) {
- $cxpath = new DOMXPath($cdoc);
-
- /** @var ?DOMElement $rel_image */
- $rel_image = $cxpath->query("//link[@rel='image_src']")->item(0);
+ Debug::log("got content-type: $content_type", Debug::LOG_VERBOSE);
- if ($rel_image) {
-
- $img = $doc->createElement('img');
- $img->setAttribute("src", $rel_image->getAttribute("href"));
-
- $br = $doc->createElement('br');
- $entry->parentNode->insertBefore($img, $entry);
- $entry->parentNode->insertBefore($br, $entry);
-
- $found = true;
- }
- }
- }
-
- } else {
- Debug::log("skipping imgur $entry_href because of content type: $content_type", Debug::LOG_VERBOSE);
- }
- }
-
- // wtf is this even
- if (!$found && preg_match("/^https?:\/\/gyazo\.com\/([^\.\/]+$)/", $entry_href, $matches)) {
- $img_id = $matches[1];
-
- Debug::log("handling as gyazo: $img_id", Debug::LOG_VERBOSE);
-
- $img = $doc->createElement('img');
- $img->setAttribute("src", "https://i.gyazo.com/$img_id.jpg");
-
- $br = $doc->createElement('br');
- $entry->parentNode->insertBefore($img, $entry);
- $entry->parentNode->insertBefore($br, $entry);
-
- $found = true;
- }
+ if ($content_type && strpos($content_type, "image/") !== FALSE) {
+ Debug::log("Handling as a picture based on content-type", Debug::LOG_VERBOSE);
- // let's try meta properties
- if (!$found) {
- Debug::log("looking for meta og:image", Debug::LOG_VERBOSE);
+ $this->handle_as_image($doc, $entry, $entry_href, $entry_href);
+ $found = 1;
- $content_type = $this->get_content_type($entry_href);
+ } else if ($content_type && strpos($content_type, "text/html") !== FALSE) {
- if ($content_type && strpos($content_type, "text/html") !== false) {
+ Debug::log("looking for meta og:image/video...", Debug::LOG_VERBOSE);
$content = UrlHelper::fetch(["url" => $entry_href,
"http_accept" => "text/*"]);
@@ -247,7 +205,7 @@ class Af_Lemmy extends Plugin {
}
$this->handle_as_video($doc, $entry, $source_stream, $poster_url);
- $found = true;
+ $found = 1;
}
} else if ($og_image) {
@@ -262,13 +220,26 @@ class Af_Lemmy extends Plugin {
$entry->parentNode->insertBefore($img, $entry);
$entry->parentNode->insertBefore($br, $entry);
- $found = true;
+ $found = 1;
}
}
}
}
+
+ Debug::log("trying to extract full content using readability...", Debug::LOG_VERBOSE);
+
+ $this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT,
+ function ($result) use (&$article, &$found) {
+ if ($result && mb_strlen($result) >= 128) {
+ $article["content"] .= "<hr/>" . $result;
+ $found = true;
+ return true;
+ }
+ },
+ $entry_href);
+
} else {
- Debug::log("BODY: skipping $entry_href because of content type: $content_type", Debug::LOG_VERBOSE);
+ Debug::log("BODY: skipping because of content type: $content_type", Debug::LOG_VERBOSE);
}
}
}
@@ -278,25 +249,20 @@ class Af_Lemmy extends Plugin {
function hook_article_filter($article) {
- if (preg_match("/lemmy.ml\/post\/[0-9]{1,}$/", $article['link']) !== false && !empty($article["content"])) {
+ $origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST);
+
+ if (in_array($origin_domain, $this->lemmy_domains) && preg_match("/\/post\/[0-9]{1,}$/", $article['link']) && !empty($article["content"])) {
$doc = new DOMDocument();
if (@$doc->loadHTML($article["content"])) {
$xpath = new DOMXPath($doc);
- if ($this->is_blacklisted($article['link']))
- return $article;
-
- $origin_domain = parse_url($article["feed"]["site_url"] ?? '', PHP_URL_HOST);
-
$found = $this->inline_stuff($article, $doc, $xpath, $origin_domain);
$node = $doc->getElementsByTagName('body')->item(0);
if ($node && $found) {
$article["content"] = $doc->saveHTML($node);
$article["enclosures"] = $this->generated_enclosures;
- } else {
- $article = $this->readability($article, $article['link'], $doc, $xpath);
}
}
}
@@ -369,57 +335,16 @@ class Af_Lemmy extends Plugin {
return $this->get_header($url, CURLINFO_CONTENT_TYPE, $useragent);
}
- /**
- * @param array<string,mixed> $article
- * @param string $url
- * @param DOMDocument $doc
- * @param DOMXPath $xpath
- * @param bool $debug
- * @return array<string,mixed>
- * @throws PDOException
- */
- private function readability(array $article, string $url, DOMDocument $doc, DOMXpath $xpath, bool $debug = false) : array {
+ private function can_embed(string $url) : bool {
+ if (strpos($url, "/pictrs/") !== FALSE)
+ return true;
- if (function_exists("curl_init") && !preg_match("/post\/[0-9]{1,}$/", $url) && mb_strlen(strip_tags($article["content"])) <= 200) {
+ if (preg_match("#/u/|/c/|/post/#", $url))
+ return false;
- /* link may lead to a huge video file or whatever, we need to check content type before trying to
- parse it which p much requires curl */
+ $origin_domain = parse_url($url, PHP_URL_HOST);
- $useragent_compat = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)";
- $content_type = $this->get_content_type($url, $useragent_compat);
-
- if ($content_type && strpos($content_type, "text/html") !== false) {
-
- $this->host->run_hooks_callback(PluginHost::HOOK_GET_FULL_TEXT,
- function ($result) use (&$article) {
- if ($result) {
- $article["content"] .= "<hr/>" . $result;
- return true;
- }
- },
- $url);
- }
- }
-
- return $article;
- }
-
- /**
- * @param string $src
- * @param array<string> $also_blacklist
- * @return bool
- */
- private function is_blacklisted(string $src, array $also_blacklist = []) : bool {
- $src_domain = parse_url($src, PHP_URL_HOST);
-
- if ($src_domain)
- foreach ([...$this->domain_blacklist, ...$also_blacklist] as $domain) {
- if (strstr($src_domain, $domain) !== false) {
- return true;
- }
- }
-
- return false;
+ return !in_array($origin_domain, $this->lemmy_domains);
}
function api_version() {