init.php 18 KB


  1. <?php
  2. use Jenssegers\ImageHash\Implementations\PerceptualHash;
  3. use Jenssegers\ImageHash\ImageHash;
  4. class Af_Img_Phash extends Plugin {
  5. /* @var PluginHost $host */
  6. private $host;
  7. private $default_domains_list = "imgur.com reddituploads.com pbs.twimg.com .redd.it i.sli.mg media.tumblr.com redditmedia.com kek.gg gfycat.com";
  8. private $default_similarity = 5;
  9. private $data_max_age = 240; // days
  10. /* @var DiskCache $cache */
  11. private $cache;
  /**
   * Describes this plugin to the plugin host.
   *
   * @return array version number, description, author, a boolean flag and a
   *               documentation URL, in that order
   */
  function about() {
    $version = 1.0;
    $description = "Filter duplicate images using perceptual hashing (requires GD)";
    $author = "fox";
    $flag = false; // NOTE(review): meaning of this flag is defined by the plugin host
    $more_info_url = "https://git.tt-rss.org/fox/ttrss-perceptual-image-hash/wiki";

    return array($version, $description, $author, $flag, $more_info_url);
  }
  /**
   * Returns the contents of the init.js file that sits next to this source file.
   *
   * @return string|false the file contents, or false if file_get_contents() fails
   */
  function get_js() {
    $script_path = __DIR__ . "/init.js";

    return file_get_contents($script_path);
  }
  /**
   * Returns the contents of the init.css file that sits next to this source file.
   *
   * @return string|false the file contents, or false if file_get_contents() fails
   */
  function get_css() {
    $stylesheet_path = __DIR__ . "/init.css";

    return file_get_contents($stylesheet_path);
  }
  /**
   * Stores the global plugin settings submitted by the preferences form
   * (similarity threshold, domain list, "enable globally" flag) and echoes a
   * confirmation message.
   *
   * Reads $_POST["similarity"], $_POST["domains_list"] and
   * $_POST["phash_enable_globally"].
   */
  function save() {
    // A field may be absent from the request (an unchecked checkbox is normally
    // not submitted at all); read it as null instead of triggering an
    // undefined-index notice.
    $similarity = isset($_POST["similarity"]) ? (int) $_POST["similarity"] : 0;
    $domains_list = isset($_POST["domains_list"]) ? $_POST["domains_list"] : null;
    $enable_globally = checkbox_to_sql_bool(
      isset($_POST["phash_enable_globally"]) ? $_POST["phash_enable_globally"] : null);

    // negative hamming distances make no sense; clamp to zero
    $similarity = max($similarity, 0);

    $this->host->set($this, "similarity", $similarity);
    $this->host->set($this, "enable_globally", $enable_globally);
    $this->host->set($this, "domains_list", $domains_list);

    // The preferences form passes this response text to Notify.info(); the
    // domain list is user-supplied, so escape it in case it is rendered as HTML.
    echo $this->T_sprintf("Data saved (%s, %s, %d)",
      $similarity, htmlspecialchars((string) $domains_list), $enable_globally);
  }
  /**
   * Plugin entry point: remembers the host, opens the "images" disk cache and
   * registers every hook this plugin implements.
   *
   * @param PluginHost $host plugin host this plugin is attached to
   */
  function init($host) {
    $this->host = $host;
    $this->cache = new DiskCache("images");

    // [hook, priority]; a null priority means "call add_hook() without one".
    // Registration order is kept exactly as listed.
    $registrations = array(
      array($host::HOOK_ARTICLE_FILTER, 100),
      array($host::HOOK_PREFS_TAB, null),
      array($host::HOOK_PREFS_EDIT_FEED, null),
      array($host::HOOK_PREFS_SAVE_FEED, null),
      array($host::HOOK_HOUSE_KEEPING, null),
      array($host::HOOK_RENDER_ARTICLE, 100),
      array($host::HOOK_RENDER_ARTICLE_CDM, 100),
      array($host::HOOK_RENDER_ARTICLE_API, 100),
      array($host::HOOK_ARTICLE_IMAGE, 100),
    );

    foreach ($registrations as $registration) {
      $hook = $registration[0];
      $priority = $registration[1];

      if ($priority === null) {
        $host->add_hook($hook, $this);
      } else {
        $host->add_hook($hook, $this, $priority);
      }
    }
  }
  /**
   * Renders the plugin's accordion pane on the "prefFeeds" preferences tab:
   * a database capability notice (PostgreSQL only), the global settings form
   * and the list of feeds the filter is currently enabled for.
   *
   * Side effect: the stored "enabled_feeds" list is pruned through
   * filter_unknown_feeds() and written back on every render.
   *
   * @param string $args name of the preferences tab being rendered
   */
  function hook_prefs_tab($args) {
    if ($args != "prefFeeds") return;

    print "<div dojoType='dijit.layout.AccordionPane'
      title=\"<i class='material-icons'>photo</i> ".$this->__( 'Filter similar images (af_img_phash)')."\">";

    if (DB_TYPE == "pgsql") {
      if (true === IMG_HASH_SQL_FUNCTION) {
        print_error("Using SQL implementation of bit_count; UI performance may not be as responsive as installing extension 'https://github.com/sldab/count-bits'. See README.txt");
      } else {
        // Initialised up front so that a throwing query still leaves a defined value.
        $res = false;

        try {
          $res = $this->pdo->query("select 'unique_1bits'::regproc");
        } catch (PDOException $e) {
          // function lookup failed; reported just below
        }

        if (!$res || !$res->fetch()) {
          print_error("Required function from count_bits extension not found.");
        }
      }
    }

    $similarity = (int) $this->host->get($this, "similarity", $this->default_similarity);
    $domains_list = $this->host->get($this, "domains_list", $this->default_domains_list);
    $enable_globally = $this->host->get($this, "enable_globally");

    // user-entered text goes into the textarea body, so it must be escaped
    $domains_list_escaped = htmlspecialchars((string) $domains_list);

    print "<form dojoType='dijit.form.Form'>";

    print "<script type='dojo/method' event='onSubmit' args='evt'>
      evt.preventDefault();
      if (this.validate()) {
        console.log(dojo.objectToQuery(this.getValues()));
        new Ajax.Request('backend.php', {
          parameters: dojo.objectToQuery(this.getValues()),
          onComplete: function(transport) {
            Notify.info(transport.responseText);
          }
        });
        //this.reset();
      }
      </script>";

    print_hidden("op", "pluginhandler");
    print_hidden("method", "save");
    print_hidden("plugin", "af_img_phash");

    print "<h2>" . $this->__( "Global settings") . "</h2>";

    print "<fieldset>";
    print "<label>".$this->__( "Limit to domains (space-separated):")."</label>";
    print "<textarea dojoType='dijit.form.SimpleTextarea' style='height: 100px; width: 500px; display: block'
      required='1' name='domains_list'>$domains_list_escaped</textarea>";
    print "</fieldset><fieldset>";

    print "<label>".$this->__( "Maximum hamming distance:")."</label>";
    print "<input dojoType='dijit.form.NumberSpinner'
      placeholder='5' required='1' name='similarity' id='phash_img_similarity' value='$similarity'>";
    print "<div dojoType='dijit.Tooltip' connectId='phash_img_similarity' position='below'>" .
      $this->__( "Lower hamming distance value indicates images being more similar.") . "</div>";
    print "</fieldset><fieldset class='narrow'>";

    print "<label class='checkbox'>";
    print_checkbox("phash_enable_globally", $enable_globally);
    print " " . $this->__( "Enable for all feeds");
    print "</label>";
    print "</fieldset>";

    // (a stray closing table tag used to be printed here; no table is ever opened)

    print_button("submit", $this->__( "Save"), "class='alt-primary'");
    print "</form>";

    $enabled_feeds = $this->host->get($this, "enabled_feeds");
    // is_array(), not array(): array($x) builds a non-empty array and is always truthy
    if (!is_array($enabled_feeds)) $enabled_feeds = array();

    $enabled_feeds = $this->filter_unknown_feeds($enabled_feeds);
    $this->host->set($this, "enabled_feeds", $enabled_feeds);

    if (count($enabled_feeds) > 0) {
      print "<h3>" . __("Currently enabled for (click to edit):") . "</h3>";
      print "<ul class='panel panel-scrollable list list-unstyled'>";

      foreach ($enabled_feeds as $f) {
        // the id is embedded in inline JavaScript, so force it to an integer
        $feed_id = (int) $f;

        print "<li><i class='material-icons'>rss_feed</i> <a href='#' onclick=\"CommonDialogs.editFeed($feed_id)\">".
          Feeds::getFeedTitle($f) . "</a></li>";
      }

      print "</ul>";
    }

    print "</div>";
  }
  /**
   * Prints the "Similar images" section of the feed editor, with a checkbox
   * that is pre-checked when the feed is in the stored "enabled_feeds" list.
   *
   * @param mixed $feed_id id of the feed being edited
   */
  function hook_prefs_edit_feed($feed_id) {
    print "<header>".$this->__( "Similar images")."</header>";
    print "<section>";

    $enabled_feeds = $this->host->get($this, "enabled_feeds");
    // is_array(), not array(): array($x) is always truthy, so the fallback never
    // ran and array_search() could be handed a non-array
    if (!is_array($enabled_feeds)) $enabled_feeds = array();

    $is_enabled = array_search($feed_id, $enabled_feeds) !== FALSE;
    $checked = $is_enabled ? "checked" : "";

    print "<fieldset>";
    print "<label class='checkbox'><input dojoType='dijit.form.CheckBox' type='checkbox' id='phash_similarity_enabled'
      name='phash_similarity_enabled' $checked> ".$this->__( 'Filter similar images')."</label>";
    print "</fieldset>";

    print "</section>";
  }
  /**
   * Adds the feed to, or removes it from, the stored "enabled_feeds" list
   * according to the "phash_similarity_enabled" checkbox of the feed editor.
   *
   * @param mixed $feed_id id of the feed being saved
   */
  function hook_prefs_save_feed($feed_id) {
    $enabled_feeds = $this->host->get($this, "enabled_feeds");
    if (!is_array($enabled_feeds)) $enabled_feeds = array();

    // An unchecked checkbox may be missing from the request entirely; treat
    // that as "off" without raising an undefined-index notice.
    $enable = checkbox_to_sql_bool(
      isset($_POST["phash_similarity_enabled"]) ? $_POST["phash_similarity_enabled"] : null);

    $key = array_search($feed_id, $enabled_feeds);

    if ($enable && $key === FALSE) {
      array_push($enabled_feeds, $feed_id);
    } else if (!$enable && $key !== FALSE) {
      unset($enabled_feeds[$key]);
    }

    $this->host->set($this, "enabled_feeds", $enabled_feeds);
  }
  /**
   * Replaces a duplicate image/video element with a paragraph containing a
   * link to the media and, outside API mode, a "(similar)" link that triggers
   * Plugins.Af_Img_Phash.showSimilar().
   *
   * For elements with a "src" attribute that URL is used for both the link and
   * the similarity lookup. For elements with a "poster" attribute the poster
   * is used for the lookup and the first <source> child supplies the link.
   * Nothing happens when either URL cannot be determined.
   *
   * @param DOMDocument $doc      document owning $elem
   * @param DOMElement  $elem     element to replace
   * @param bool        $api_mode when true, the "(similar)" link is omitted
   */
  private function rewrite_duplicate($doc, $elem, $api_mode = false) {
    // Both start out defined: neither branch below may apply, and a <video>
    // may have no <source> child.
    $uri = null;
    $check_uri = null;

    if ($elem->hasAttribute("src")) {
      $uri = $this->absolutize_url($elem->getAttribute("src"));
      $check_uri = $uri;
    } else if ($elem->hasAttribute("poster")) {
      $check_uri = $this->absolutize_url($elem->getAttribute("poster"));

      $video_source = $elem->getElementsByTagName("source")->item(0);
      if ($video_source) {
        // protocol-relative source URLs get the same treatment as src/poster
        $uri = $this->absolutize_url($video_source->getAttribute("src"));
      }
    }

    if (!$check_uri || !$uri || !$elem->parentNode) return;

    $p = $doc->createElement('p');

    $a = $doc->createElement("a");
    $a->setAttribute("href", $uri);
    $a->setAttribute("target", "_blank");
    $a->appendChild(new DOMText(truncate_middle($uri, 48, "...")));
    $p->appendChild($a);

    if (!$api_mode) {
      $b = $doc->createElement("a");
      $b->setAttribute("href", "#");
      $b->setAttribute("onclick", "Plugins.Af_Img_Phash.showSimilar(this)");
      // $check_uri was already absolutized above
      $b->setAttribute("data-check-url", $check_uri);
      $b->appendChild(new DOMText("(similar)"));

      $p->appendChild(new DOMText(" "));
      $p->appendChild($b);
    }

    $elem->parentNode->replaceChild($p, $elem);
  }
  /**
   * Article filter hook: for every <img src> / <video poster> in the article
   * whose host matches the configured domain list, downloads the image (using
   * the disk cache when it is writable), computes its perceptual hash and
   * records URL, article guid, owner and hash in ttrss_plugin_img_phash_urls.
   * URLs already recorded for the owner are skipped.
   *
   * The article itself is never modified here.
   *
   * @param array $article article data; reads "feed"/"id", "owner_uid",
   *                       "guid_hashed", "content" and "link"
   * @return array the unchanged article
   */
  function hook_article_filter($article) {
    $enable_globally = $this->host->get($this, "enable_globally");

    $domains_list = $this->host->get($this, "domains_list");
    if (!$domains_list) $domains_list = $this->default_domains_list;

    // The list is edited in a textarea, so entries may be separated by newlines
    // or runs of spaces; split on any whitespace and drop empty entries.
    $domains_list = preg_split('/\s+/', (string) $domains_list, -1, PREG_SPLIT_NO_EMPTY);

    if (!$enable_globally) {
      $enabled_feeds = $this->host->get($this, "enabled_feeds");

      if (!is_array($enabled_feeds)) return $article;
      if (array_search($article["feed"]["id"], $enabled_feeds) === FALSE) return $article;
    }

    // DOMDocument::loadHTML() throws on an empty string on PHP 8, which the
    // error-suppression operator does not catch.
    if (!isset($article["content"]) || $article["content"] === "") return $article;

    $owner_uid = $article["owner_uid"];
    $article_guid = $article["guid_hashed"];

    $doc = new DOMDocument();
    if (!@$doc->loadHTML($article["content"])) return $article;

    $xpath = new DOMXPath($doc);
    $imgs = $xpath->query("//img[@src]|//video[@poster]");

    // created on first use and then reused for every image of this article
    $lookup_sth = null;
    $insert_sth = null;
    $hasher = null;

    foreach ($imgs as $img) {
      $src = $img->tagName == "video" ? $img->getAttribute("poster") : $img->getAttribute("src");
      $src = $this->absolutize_url(rewrite_relative_url($article["link"], $src));

      if (!$this->check_src_domain($src, $domains_list)) continue;

      _debug("phash: checking $src");

      if (!$lookup_sth) {
        $lookup_sth = $this->pdo->prepare("SELECT id FROM ttrss_plugin_img_phash_urls WHERE
          owner_uid = ? AND url = ? LIMIT 1");
      }

      $lookup_sth->execute([$owner_uid, $src]);

      if ($lookup_sth->fetch()) {
        _debug("phash: url already stored, not processing");
        continue;
      }

      _debug("phash: downloading and calculating hash...");

      if ($this->cache->isWritable()) {
        $cached_file = sha1($src);

        if (!$this->cache->exists($cached_file)) {
          $data = fetch_file_contents(array("url" => $src, "max_size" => MAX_CACHE_FILE_SIZE));

          if ($data) {
            $this->cache->put($cached_file, $data);
          }
        } else {
          _debug("phash: reading from local cache: $cached_file");
          $data = $this->cache->get($cached_file);
        }
      } else {
        _debug("phash: cache directory is not writable");
        $data = fetch_file_contents(array("url" => $src, "max_size" => MAX_CACHE_FILE_SIZE));
      }

      if (!$data) {
        _debug("phash: unable to fetch: $src");
        continue;
      }

      $data_resource = @imagecreatefromstring($data);

      if (!$data_resource) {
        _debug("phash: unable to load image: $src");
        continue;
      }

      if (!$hasher) $hasher = new ImageHash(new PerceptualHash());

      $hash = $hasher->hash($data_resource);
      _debug("phash: calculated perceptual hash: $hash");

      if (!$hash) continue;

      // NOTE(review): base_convert() is documented to lose precision on large
      // numbers; the conversion is left untouched so that new rows stay
      // comparable with hashes already stored.
      $hash = base_convert($hash, 16, 10);

      if (PHP_INT_SIZE > 4) {
        // drop leading bits until the value no longer exceeds PHP_INT_MAX
        while ($hash > PHP_INT_MAX) {
          $bitstring = base_convert($hash, 10, 2);
          $bitstring = substr($bitstring, 1);
          $hash = base_convert($bitstring, 2, 10);
        }
      }

      if (!$insert_sth) {
        $insert_sth = $this->pdo->prepare("INSERT INTO
          ttrss_plugin_img_phash_urls (url, article_guid, owner_uid, phash) VALUES
          (?, ?, ?, ?)");
      }

      $insert_sth->execute([$src, $article_guid, $owner_uid, $hash]);
    }

    return $article;
  }
  /**
   * Reports the API version of this plugin.
   *
   * @return int always 2
   */
  function api_version() {
    $version = 2;

    return $version;
  }
  /**
   * Returns only those feed ids that exist in ttrss_feeds for the user of the
   * current session ($_SESSION['uid']), in their original order and re-indexed
   * from zero.
   *
   * @param mixed $enabled_feeds list of feed ids; anything that is not an
   *                             array is treated as an empty list
   * @return array feed ids that were found
   */
  private function filter_unknown_feeds($enabled_feeds) {
    $known = array();

    if (!is_array($enabled_feeds) || count($enabled_feeds) == 0) return $known;

    // prepared once and re-executed per feed instead of re-preparing in the loop
    $sth = $this->pdo->prepare("SELECT id FROM ttrss_feeds WHERE id = ? AND owner_uid = ?");
    $owner_uid = $_SESSION['uid'];

    foreach ($enabled_feeds as $feed) {
      $sth->execute([$feed, $owner_uid]);

      if ($sth->fetch()) {
        array_push($known, $feed);
      }
    }

    return $known;
  }
  /**
   * Article rendering hook; delegates to hook_render_article_cdm() with its
   * default (non-API) mode.
   *
   * @param array $article article being rendered
   * @return array whatever hook_render_article_cdm() returns
   */
  function hook_render_article($article) {
    $rendered = $this->hook_render_article_cdm($article);

    return $rendered;
  }
  /**
   * API rendering hook: picks the article out of $row ("headline" when set,
   * otherwise "article") and runs it through hook_render_article_cdm() in
   * API mode.
   *
   * @param array $row hook payload holding a "headline" or an "article" entry
   * @return array whatever hook_render_article_cdm() returns
   */
  function hook_render_article_api($row) {
    if (isset($row['headline'])) {
      $article = $row['headline'];
    } else {
      $article = $row['article'];
    }

    return $this->hook_render_article_cdm($article, true);
  }
  /**
   * Article image hook: runs the content through hook_render_article_cdm()
   * (non-API mode) and returns the resulting content as the third element of
   * a three-element array whose first two elements are empty strings.
   *
   * @param mixed  $enclosures not used here
   * @param string $content    article content
   * @param string $site_url   not used here
   * @return array ["", "", processed content]
   */
  function hook_article_image($enclosures, $content, $site_url) {
    $wrapped = array("content" => $content);
    $processed = $this->hook_render_article_cdm($wrapped, false);

    return array("", "", $processed["content"]);
  }
  /**
   * Rendering hook: replaces images/video posters that duplicate media seen
   * in another article with a link (see rewrite_duplicate()).
   *
   * An element from a configured domain is treated as a duplicate when
   *  - its URL is recorded for the session user under a different article
   *    guid, or
   *  - the oldest recorded hash within the configured hamming distance (and
   *    within the data_max_age window) belongs to a different article guid.
   *
   * On PostgreSQL without the SQL fallback (IMG_HASH_SQL_FUNCTION), the
   * article is returned untouched when the unique_1bits function is missing.
   *
   * @param array $article  article data; reads "content", "guid" and "link"
   * @param bool  $api_mode forwarded to rewrite_relative_url() and rewrite_duplicate()
   * @return array the article, with "content" rewritten if anything was replaced
   */
  function hook_render_article_cdm($article, $api_mode = false) {
    if (DB_TYPE == "pgsql" && true !== IMG_HASH_SQL_FUNCTION) {
      // defined up front so that a throwing query still leaves a usable value
      $res = false;

      try {
        $res = $this->pdo->query("select 'unique_1bits'::regproc");
      } catch (PDOException $e) {
        // treated as "function not available" below
      }

      if (!$res || !$res->fetch()) return $article;
    }

    // DOMDocument::loadHTML() throws on an empty string on PHP 8, which the
    // error-suppression operator does not catch.
    if (!isset($article["content"]) || $article["content"] === "") return $article;

    $owner_uid = $_SESSION["uid"];

    // hook_article_image() passes an article holding only "content"
    $article_guid = isset($article["guid"]) ? $article["guid"] : null;
    $article_link = isset($article["link"]) ? $article["link"] : null;

    $domains_list = $this->host->get($this, "domains_list", $this->default_domains_list);
    // entries may be separated by newlines or repeated spaces; drop empty ones
    $domains_list = preg_split('/\s+/', (string) $domains_list, -1, PREG_SPLIT_NO_EMPTY);

    $similarity = (int) $this->host->get($this, "similarity", $this->default_similarity);

    $doc = new DOMDocument();
    if (!@$doc->loadHTML($article["content"])) return $article;

    $xpath = new DOMXPath($doc);
    $imgs = $xpath->query("//img[@src]|//video[@poster]");

    $need_saving = false;

    // created on first use and then reused for every element
    $url_dupe_sth = null;
    $phash_sth = null;

    foreach ($imgs as $img) {
      $src = $img->tagName == "video" ? $img->getAttribute("poster") : $img->getAttribute("src");
      $src = $this->absolutize_url(rewrite_relative_url($article_link, $src, $api_mode));

      if (!$this->check_src_domain($src, $domains_list)) continue;

      // check for URL duplicates first
      if (!$url_dupe_sth) {
        $url_dupe_sth = $this->pdo->prepare("SELECT id FROM ttrss_plugin_img_phash_urls WHERE
          owner_uid = ? AND
          url = ? AND
          article_guid != ? LIMIT 1");
      }

      $url_dupe_sth->execute([$owner_uid, $src, $article_guid]);

      if ($url_dupe_sth->fetch()) {
        $need_saving = true;
        $this->rewrite_duplicate($doc, $img, $api_mode);
        continue;
      }

      // check using perceptual hash duplicates
      if (!$phash_sth) {
        $phash_sth = $this->pdo->prepare("SELECT phash FROM ttrss_plugin_img_phash_urls WHERE
          owner_uid = ? AND
          url = ? LIMIT 1");
      }

      $phash_sth->execute([$owner_uid, $src]);
      $phash_row = $phash_sth->fetch();

      if (!$phash_row) continue;

      $phash = $phash_row['phash'];

      // the hash is embedded in the SQL by bitcount_func(), so this statement
      // cannot be shared between elements
      $similar_sth = $this->pdo->prepare("SELECT article_guid FROM ttrss_plugin_img_phash_urls WHERE
        owner_uid = ? AND
        created_at >= ".$this->interval_days($this->data_max_age)." AND
        ".$this->bitcount_func($phash)." <= ? ORDER BY created_at LIMIT 1");
      $similar_sth->execute([$owner_uid, $similarity]);

      $similar_row = $similar_sth->fetch();

      if ($similar_row && $similar_row['article_guid'] != $article_guid) {
        $need_saving = true;
        $this->rewrite_duplicate($doc, $img, $api_mode);
      }
    }

    if ($need_saving) $article["content"] = $doc->saveXML();

    return $article;
  }
  /**
   * Housekeeping hook: deletes rows of ttrss_plugin_img_phash_urls whose
   * created_at is older than the data_max_age window (in days).
   */
  function hook_house_keeping() {
    $cutoff = $this->interval_days($this->data_max_age);

    $this->pdo->query("DELETE FROM ttrss_plugin_img_phash_urls
      WHERE created_at < ".$cutoff);
  }
  /**
   * Tells whether the host part of $src contains any of the given domain
   * fragments (substring match, case-insensitive since host names are not
   * case-sensitive).
   *
   * @param string $src          URL to test
   * @param array  $domains_list domain fragments, e.g. "imgur.com" or ".redd.it"
   * @return bool true when at least one non-empty fragment occurs in the host
   */
  private function check_src_domain($src, $domains_list) {
    $src_domain = parse_url($src, PHP_URL_HOST);

    // parse_url() yields null/false when there is no (valid) host
    if (!is_string($src_domain) || $src_domain === "") return false;
    if (!is_array($domains_list)) return false;

    foreach ($domains_list as $domain) {
      $domain = trim((string) $domain);

      // An empty entry (e.g. from a doubled separator) must never match: on
      // PHP 8 an empty needle is found in every string.
      if ($domain === "") continue;

      if (stripos($src_domain, $domain) !== false) {
        return true;
      }
    }

    return false;
  }
  /**
   * Builds an HTML snippet describing the article with the given guid for the
   * given user: "<title> in <feed link> (<local date>)". Falls back to
   * "N/A (<guid>)" when no matching entry exists.
   *
   * @param string $article_guid guid to look up in ttrss_entries
   * @param int    $owner_uid    owner of the user entry
   * @return string HTML fragment
   */
  private function guid_to_article_title($article_guid, $owner_uid) {
    $sth = $this->pdo->prepare("SELECT feed_id, title, updated
      FROM ttrss_entries, ttrss_user_entries
      WHERE ref_id = id AND
      guid = ? AND
      owner_uid = ?");
    $sth->execute([$article_guid, $owner_uid]);

    // The guid ends up inside a single-quoted attribute and in plain markup,
    // so quotes and angle brackets must be neutralised.
    $guid_escaped = htmlspecialchars((string) $article_guid, ENT_QUOTES);

    $row = $sth->fetch();

    if (!$row) {
      return "N/A ($guid_escaped)";
    }

    $article_title = $row["title"];
    $feed_id = $row["feed_id"];
    $updated = $row["updated"];

    // the id is embedded in inline JavaScript; force it to an integer literal
    $feed_id_js = (int) $feed_id;

    return $this->T_sprintf("%s in %s (%s)",
      "<span title='$guid_escaped'>$article_title</span>",
      "<a href='#' onclick='viewfeed({feed: $feed_id_js})'>" . Feeds::getFeedTitle($feed_id) . "</a>",
      make_local_datetime($updated, true));
  }
  /**
   * Prints the body of the "similar images" dialog for the image URL passed
   * in $_REQUEST["param"]: the image itself, its stored perceptual hash, the
   * article the oldest matching hash belongs to, and up to 30 recorded images
   * within the configured hamming distance.
   *
   * All lookups are restricted to rows owned by the session user.
   */
  function showsimilar() {
    $url = (isset($_REQUEST["param"]) && is_string($_REQUEST["param"])) ? $_REQUEST["param"] : "";
    $url_htmlescaped = htmlspecialchars($url);
    // truncate first, escape second, so that no HTML entity gets cut in half
    $url_label = htmlspecialchars(truncate_middle($url, 48));

    $owner_uid = $_SESSION["uid"];
    $similarity = (int) $this->host->get($this, "similarity", $this->default_similarity);

    print "<section class='narrow'>";
    print "<img class='trgm-related-thumb pull-right' src=\"$url_htmlescaped\">";
    print "<fieldset><h2><a target='_blank' href=\"$url_htmlescaped\">".$url_label."</a></h2></fieldset>";

    $sth = $this->pdo->prepare("SELECT phash FROM ttrss_plugin_img_phash_urls WHERE
      owner_uid = ? AND
      url = ? LIMIT 1");
    $sth->execute([$owner_uid, $url]);

    $phash_row = $sth->fetch();
    $origin_row = false;
    $phash = null;

    if ($phash_row) {
      $phash = $phash_row['phash'];

      // oldest recorded image within the distance threshold
      $sth = $this->pdo->prepare("SELECT article_guid, ".SUBSTRING_FOR_DATE."(created_at,1,19) AS created_at FROM ttrss_plugin_img_phash_urls WHERE
        owner_uid = ? AND
        created_at >= ".$this->interval_days($this->data_max_age)." AND
        ".$this->bitcount_func($phash)." <= ? ORDER BY created_at LIMIT 1");
      $sth->execute([$owner_uid, $similarity]);

      $origin_row = $sth->fetch();
    }

    if ($origin_row) {
      $article_guid = $origin_row['article_guid'];
      $article_title = $this->guid_to_article_title($article_guid, $owner_uid);
      $created_at = $origin_row['created_at'];

      print "<fieldset class='narrow'><label class='inline'>".$this->__( "Perceptual hash:")."</label>".
        base_convert($phash, 10, 16) . "</fieldset>";

      print "<fieldset class='narrow'><label class='inline'>".$this->__( "Belongs to:")."</label>
        $article_title</fieldset>";

      print "<fieldset class='narrow'><label class='inline'>".$this->__( "Registered:")."</label>
        $created_at</fieldset>";

      // Restricted to the current user: without the owner filter this listing
      // exposed image URLs recorded for other users.
      $sth = $this->pdo->prepare("SELECT url, article_guid, ".$this->bitcount_func($phash)." AS distance
        FROM ttrss_plugin_img_phash_urls WHERE
        owner_uid = ? AND
        ".$this->bitcount_func($phash)." <= ?
        ORDER BY distance LIMIT 30");
      $sth->execute([$owner_uid, $similarity]);

      print "<ul class='panel panel-scrollable list list-unstyled'>";

      while ($line = $sth->fetch()) {
        $rel_url = htmlspecialchars($line["url"]);
        $rel_url_label = htmlspecialchars(truncate_middle($line["url"], 48));
        $distance = $line["distance"];
        $rel_article_guid = $line["article_guid"];
        $rel_article_title = $this->guid_to_article_title($rel_article_guid, $owner_uid);
        $is_original = ($rel_article_guid == $article_guid);

        print "<li>";
        print "<div><a target='_blank' href=\"$rel_url\">".$rel_url_label."</a> ".
          "(" . $this->T_sprintf("Distance: %d", $distance) . ")";

        if ($is_original) print " <strong>(".$this->__( "Original").")</strong>";

        print "<br/>$rel_article_title";
        print "<br/><img class='trgm-related-thumb' src=\"$rel_url\"></div>";
        print "</li>";
      }

      print "</ul>";
    } else {
      // either the URL is unknown or nothing falls inside the threshold/window
      print "<div class='text-error'>" . $this->__( "No information found for this URL.") . "</div>";
    }

    print "</section>";

    print "<footer class='text-center'>
      <button dojoType='dijit.form.Button' onclick=\"dijit.byId('phashSimilarDlg').hide()\">"
      .$this->__( 'Close this window')."</button>
      </footer>";
  }
  /**
   * Turns a protocol-relative URL ("//host/path") into an https URL; any other
   * value is returned unchanged.
   *
   * @param string $src URL to normalise
   * @return string the URL, prefixed with "https:" if it started with "//"
   */
  private function absolutize_url($src) {
    $is_protocol_relative = (strpos($src, "//") === 0);

    return $is_protocol_relative ? "https:" . $src : $src;
  }
  /**
   * Returns an SQL expression for "now minus $days days" in the dialect
   * selected by DB_TYPE ("pgsql" or anything else).
   *
   * @param int $days number of days to go back
   * @return string SQL fragment, with a trailing space
   */
  private function interval_days($days) {
    if (DB_TYPE != "pgsql") {
      return "DATE_SUB(NOW(), INTERVAL $days DAY) ";
    }

    return "NOW() - INTERVAL '$days days' ";
  }
  /**
   * Returns an SQL expression that compares $phash with the "phash" column,
   * in the dialect selected by DB_TYPE. On PostgreSQL the expression uses
   * bit_count() when IMG_HASH_SQL_FUNCTION is true and unique_1bits()
   * otherwise.
   *
   * @param string|int $phash hash value, embedded verbatim in the SQL
   * @return string SQL fragment
   */
  private function bitcount_func($phash) {
    if (DB_TYPE != "pgsql") {
      return "bit_count('$phash' ^ phash)";
    }

    if (true === IMG_HASH_SQL_FUNCTION) {
      return "bit_count('$phash' # phash)";
    }

    return "unique_1bits('$phash', phash)";
  }
  456. }
  457. ?>