Browse Source

tag-related fixes

1. move tag sanitization to feedparser common item class
2. enforce length limit on tags when parsing
3. support multiple tags passed via a single dc:subject (or similar) element by parsing its value as a comma-separated list
4. sort resulting tag list to prevent different order between feed updates
5. remove some duplicate code related to tag validation
6. allow + symbol in tags
Andrew Dolgov 10 months ago
parent
commit
304d3a0b88
5 changed files with 52 additions and 83 deletions
  1. 2 34
      classes/article.php
  2. 4 4
      classes/feeditem/atom.php
  3. 31 0
      classes/feeditem/common.php
  4. 4 4
      classes/feeditem/rss.php
  5. 11 41
      classes/rssutils.php

+ 2 - 34
classes/article.php

@@ -305,19 +305,9 @@ class Article extends Handler_Protected {
 				post_int_id = ? AND owner_uid = ?");
 			$sth->execute([$int_id, $_SESSION['uid']]);
 
-			foreach ($tags as $tag) {
-				$tag = Article::sanitize_tag($tag);
-
-				if (!Article::tag_is_valid($tag)) {
-					continue;
-				}
-
-				if (preg_match("/^[0-9]*$/", $tag)) {
-					continue;
-				}
-
-				//					print "<!-- $id : $int_id : $tag -->";
+			$tags = FeedItem_Common::normalize_categories($tags);
 
+			foreach ($tags as $tag) {
 				if ($tag != '') {
 					$sth = $this->pdo->prepare("INSERT INTO ttrss_tags
 								(post_int_id, owner_uid, tag_name)
@@ -331,7 +321,6 @@ class Article extends Handler_Protected {
 
 			/* update tag cache */
 
-			sort($tags_to_cache);
 			$tags_str = join(",", $tags_to_cache);
 
 			$sth = $this->pdo->prepare("UPDATE ttrss_user_entries
@@ -802,27 +791,6 @@ class Article extends Handler_Protected {
 		return $rv;
 	}
 
-	static function sanitize_tag($tag) {
-		$tag = trim($tag);
-
-		$tag = mb_strtolower($tag, 'utf-8');
-
-		$tag = preg_replace('/[,\'\"\+\>\<]/', "", $tag);
-
-		if (DB_TYPE == "mysql") {
-			$tag = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $tag);
-		}
-
-		return $tag;
-	}
-
-	static function tag_is_valid($tag) {
-		if (!$tag || is_numeric($tag) || mb_strlen($tag) > 250)
-			return false;
-
-		return true;
-	}
-
 	static function get_article_image($enclosures, $content, $site_url) {
 
 		$article_image = "";

+ 4 - 4
classes/feeditem/atom.php

@@ -103,20 +103,20 @@ class FeedItem_Atom extends FeedItem_Common {
 
 	function get_categories() {
 		$categories = $this->elem->getElementsByTagName("category");
-		$cats = array();
+		$cats = [];
 
 		foreach ($categories as $cat) {
 			if ($cat->hasAttribute("term"))
-				array_push($cats, trim($cat->getAttribute("term")));
+				array_push($cats, $cat->getAttribute("term"));
 		}
 
 		$categories = $this->xpath->query("dc:subject", $this->elem);
 
 		foreach ($categories as $cat) {
-			array_push($cats, clean(trim($cat->nodeValue)));
+			array_push($cats, $cat->nodeValue);
 		}
 
-		return $cats;
+		return $this->normalize_categories($cats);
 	}
 
 	function get_enclosures() {

+ 31 - 0
classes/feeditem/common.php

@@ -162,4 +162,35 @@ abstract class FeedItem_Common extends FeedItem {
 		}
 	}
 
+	/**
+	 * Turns raw category/tag strings into a sorted, duplicate-free list of tags.
+	 *
+	 * Every input value is split on commas, so one element may yield several
+	 * tags. Each piece is lowercased, trimmed, passed through clean() (project
+	 * helper, presumably strips markup -- confirm), stripped of commas and quote
+	 * characters, prefixed with "t:" when it is numeric and cut to at most 250
+	 * characters. Pieces that end up empty are dropped.
+	 *
+	 * @param array $cats raw category strings as found in the feed or entered by the user
+	 * @return array sequentially indexed list of normalized tags, sorted as strings
+	 */
+	static function normalize_categories($cats) {
+
+		$tmp = [];
+
+		// a single element may carry several comma-separated tags
+		foreach ($cats as $rawcat) {
+			$tmp = array_merge($tmp, explode(",", (string)$rawcat));
+		}
+
+		$tmp = array_map(function($srccat) {
+			// explicit encoding: do not depend on the configured mbstring default
+			$cat = clean(trim(mb_strtolower($srccat, 'utf-8')));
+
+			// strip quotes before the numeric check so that a quoted number
+			// (e.g. "123") cannot slip through as a bare numeric tag
+			$cat = trim(preg_replace('/[,\'\"]/', "", $cat));
+
+			// we don't support numeric tags
+			if (is_numeric($cat))
+				$cat = 't:' . $cat;
+
+			// replace characters outside the BMP with U+FFFD on MySQL
+			// (presumably because 3-byte utf8 columns cannot store them -- confirm)
+			if (DB_TYPE == "mysql") {
+				$cat = preg_replace('/[\x{10000}-\x{10FFFF}]/u', "\xEF\xBF\xBD", $cat);
+			}
+
+			if (mb_strlen($cat) > 250)
+				$cat = mb_substr($cat, 0, 250);
+
+			return $cat;
+		}, $tmp);
+
+		// "a,,b" or "a, " produce empty pieces; callers insert the result as-is
+		$tmp = array_filter($tmp, function($cat) {
+			return (string)$cat !== '';
+		});
+
+		$tmp = array_unique($tmp);
+
+		// sort() reindexes, so the result is a plain list in a stable order
+		sort($tmp, SORT_STRING);
+
+		return $tmp;
+	}
 }

+ 4 - 4
classes/feeditem/rss.php

@@ -97,19 +97,19 @@ class FeedItem_RSS extends FeedItem_Common {
 
 	function get_categories() {
 		$categories = $this->elem->getElementsByTagName("category");
-		$cats = array();
+		$cats = [];
 
 		foreach ($categories as $cat) {
-			array_push($cats, trim($cat->nodeValue));
+			array_push($cats, $cat->nodeValue);
 		}
 
 		$categories = $this->xpath->query("dc:subject", $this->elem);
 
 		foreach ($categories as $cat) {
-			array_push($cats, clean(trim($cat->nodeValue)));
+			array_push($cats, $cat->nodeValue);
 		}
 
-		return $cats;
+		return $this->normalize_categories($cats);
 	}
 
 	function get_enclosures() {

+ 11 - 41
classes/rssutils.php

@@ -626,28 +626,8 @@ class RSSUtils {
 				Debug::log("author $entry_author", Debug::$LOG_VERBOSE);
 				Debug::log("looking for tags...", Debug::$LOG_VERBOSE);
 
-				// parse <category> entries into tags
-
-				$additional_tags = array();
-
-				$additional_tags_src = $item->get_categories();
-
-				if (is_array($additional_tags_src)) {
-					foreach ($additional_tags_src as $tobj) {
-						array_push($additional_tags, $tobj);
-					}
-				}
-
-				$entry_tags = array_unique($additional_tags);
-
-				for ($i = 0; $i < count($entry_tags); $i++) {
-					$entry_tags[$i] = mb_strtolower($entry_tags[$i], 'utf-8');
-
-					// we don't support numeric tags, let's prefix them
-					if (is_numeric($entry_tags[$i])) $entry_tags[$i] = 't:' . $entry_tags[$i];
-				}
-
-				Debug::log("tags found: " . join(",", $entry_tags), Debug::$LOG_VERBOSE);
+				$entry_tags = $item->get_categories();
+				Debug::log("tags found: " . join(", ", $entry_tags), Debug::$LOG_VERBOSE);
 
 				Debug::log("done collecting data.", Debug::$LOG_VERBOSE);
 
@@ -1107,9 +1087,7 @@ class RSSUtils {
 						$manual_tags = trim_array(explode(",", $f["param"]));
 
 						foreach ($manual_tags as $tag) {
-							if (Article::tag_is_valid($tag)) {
-								array_push($entry_tags, $tag);
-							}
+							array_push($entry_tags, $tag);
 						}
 					}
 				}
@@ -1122,19 +1100,17 @@ class RSSUtils {
 				$filtered_tags = array();
 				$tags_to_cache = array();
 
-				if ($entry_tags && is_array($entry_tags)) {
-					foreach ($entry_tags as $tag) {
-						if (array_search($tag, $boring_tags) === false) {
-							array_push($filtered_tags, $tag);
-						}
+				foreach ($entry_tags as $tag) {
+					if (array_search($tag, $boring_tags) === false) {
+						array_push($filtered_tags, $tag);
 					}
 				}
 
 				$filtered_tags = array_unique($filtered_tags);
 
-				if (Debug::get_loglevel() >= Debug::$LOG_EXTENDED) {
-					Debug::log("filtered article tags:", Debug::$LOG_VERBOSE);
-					print_r($filtered_tags);
+				if (Debug::get_loglevel() >= Debug::$LOG_VERBOSE) {
+					Debug::log("filtered tags: " . implode(", ", $filtered_tags), Debug::$LOG_VERBOSE);
+
 				}
 
 				// Save article tags in the database
@@ -1149,12 +1125,9 @@ class RSSUtils {
 									(owner_uid,tag_name,post_int_id)
 									VALUES (?, ?, ?)");
 
-					foreach ($filtered_tags as $tag) {
-
-						$tag = Article::sanitize_tag($tag);
-
-						if (!Article::tag_is_valid($tag)) continue;
+					$filtered_tags = FeedItem_Common::normalize_categories($filtered_tags);
 
+					foreach ($filtered_tags as $tag) {
 						$tsth->execute([$tag, $entry_int_id, $owner_uid]);
 
 						if (!$tsth->fetch()) {
@@ -1165,9 +1138,6 @@ class RSSUtils {
 					}
 
 					/* update the cache */
-
-					$tags_to_cache = array_unique($tags_to_cache);
-
 					$tags_str = join(",", $tags_to_cache);
 
 					$tsth = $pdo->prepare("UPDATE ttrss_user_entries