Browse Source

get_feeds_from_html: remove XML preamble hack
move several related helper functions to Feeds class

Andrew Dolgov 1 year ago
parent
commit
6d746453c7
4 changed files with 88 additions and 115 deletions
  1. 87 4
      classes/feeds.php
  2. 0 12
      classes/handler/public.php
  3. 1 1
      classes/pref/feeds.php
  4. 0 98
      include/functions.php

+ 87 - 4
classes/feeds.php

@@ -1135,9 +1135,9 @@ class Feeds extends Handler_Protected {
 
 		$pdo = Db::pdo();
 
-		$url = fix_url($url);
+		$url = Feeds::fix_url($url);
 
-		if (!$url || !validate_feed_url($url)) return array("code" => 2);
+		if (!$url || !Feeds::validate_feed_url($url)) return array("code" => 2);
 
 		$contents = @fetch_file_contents($url, false, $auth_login, $auth_pass);
 
@@ -1153,8 +1153,8 @@ class Feeds extends Handler_Protected {
 			return array("code" => 5, "message" => $fetch_last_error);
 		}
 
-		if (mb_strpos($fetch_last_content_type, "html") !== FALSE && is_html($contents)) {
-			$feedUrls = get_feeds_from_html($url, $contents);
+		if (mb_strpos($fetch_last_content_type, "html") !== FALSE && Feeds::is_html($contents)) {
+			$feedUrls = Feeds::get_feeds_from_html($url, $contents);
 
 			if (count($feedUrls) == 0) {
 				return array("code" => 3);
@@ -1923,5 +1923,88 @@ class Feeds extends Handler_Protected {
         return $colormap[$sum];
 	}
 
+	static function get_feeds_from_html($url, $content) { // returns an array mapping each discovered feed URL to its title
+		$url     = Feeds::fix_url($url);
+		$baseUrl = substr($url, 0, strrpos($url, '/') + 1); // everything up to and including the last slash
+
+		libxml_use_internal_errors(true); // keep libxml parse errors internal instead of emitting PHP warnings
+		$feedUrls = [];
+
+		$doc = new DOMDocument();
+		if ($doc->loadHTML($content)) { // when loadHTML() reports failure the empty array is returned
+			$xpath = new DOMXPath($doc); // query below: head link elements with rel="alternate" and an rss/atom type, or rel="feed"
+			$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
+				'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
+
+			foreach ($entries as $entry) {
+				if ($entry->hasAttribute('href')) { // links without an href are ignored
+					$title = $entry->getAttribute('title');
+					if ($title == '') { // no title given: fall back to the type attribute
+						$title = $entry->getAttribute('type');
+					}
+					$feedUrl = rewrite_relative_url( // helper defined elsewhere; presumably resolves href against $baseUrl
+						$baseUrl, $entry->getAttribute('href')
+					);
+					$feedUrls[$feedUrl] = $title; // keyed by URL, so a repeated URL keeps the last title seen
+				}
+			}
+		}
+		return $feedUrls;
+	}
+
+	static function is_html($content) { // sniffs only the first 8192 bytes for "<html" or "DOCTYPE html", case-insensitively
+		return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
+	}
+
+	static function validate_feed_url($url) { // true only when the URL scheme is exactly http, https or feed
+		$parts = parse_url($url); // NOTE(review): 'scheme' is read below without an existence check; schemeless or malformed input has no such key
+
+		return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https');
+	}
+
+	/**
+	 * Fixes incomplete URLs by prepending "http://" (or "https:" for
+	 * scheme-relative "//host" input). Also replaces feed: with http:, and
+	 * appends a trailing slash if the url is a domain name only.
+	 *
+	 * @param string $url Possibly incomplete URL
+	 *
+	 * @return string Fixed URL, or an empty string if the result is exactly "http:///".
+	 */
+	static function fix_url($url) {
+
+		// support schema-less urls
+		if (strpos($url, '//') === 0) {
+			$url = 'https:' . $url;
+		}
+
+		if (strpos($url, '://') === false) {
+			$url = 'http://' . $url;
+		} else if (substr($url, 0, 5) == 'feed:') {
+			$url = 'http:' . substr($url, 5);
+		}
+
+		//append slash if the URL has no slash after the scheme separator
+		// "http://www.example" -> "http://www.example/"
+		if (strpos($url, '/', strpos($url, ':') + 3) === false) {
+			$url .= '/';
+		}
+
+		//convert IDNA hostname to punycode if possible
+		if (function_exists("idn_to_ascii")) {
+			$parts = parse_url($url);
+			if (mb_detect_encoding($parts['host']) != 'ASCII') // only hosts not detected as plain ASCII are converted
+			{
+				$parts['host'] = idn_to_ascii($parts['host']);
+				$url = build_url($parts); // NOTE(review): build_url() reassembles only scheme, host and path
+			}
+		}
+
+		if ($url != "http:///") // "http:///" means no host at all, e.g. from an empty input
+			return $url;
+		else
+			return '';
+	}
+
 }
 

+ 0 - 12
classes/handler/public.php

@@ -728,18 +728,6 @@ class Handler_Public extends Handler {
 		}
 	}
 
-	/* function subtest() {
-		header("Content-type: text/plain; charset=utf-8");
-
-		$url = clean($_REQUEST["url"]);
-
-		print "$url\n\n";
-
-
-		print_r(get_feeds_from_html($url, fetch_file_contents($url)));
-
-	} */
-
 	function subscribe() {
 		if (SINGLE_USER_MODE) {
 			login_sequence();

+ 1 - 1
classes/pref/feeds.php

@@ -1708,7 +1708,7 @@ class Pref_Feeds extends Handler_Protected {
 		foreach ($feeds as $feed) {
 			$feed = trim($feed);
 
-			if (validate_feed_url($feed)) {
+			if (Feeds::validate_feed_url($feed)) {
 
 				$this->pdo->beginTransaction();
 

+ 0 - 98
include/functions.php

@@ -2017,68 +2017,6 @@
 		return false;
 	}
 
-	/**
-	 * Fixes incomplete URLs by prepending "http://".
-	 * Also replaces feed:// with http://, and
-	 * prepends a trailing slash if the url is a domain name only.
-	 *
-	 * @param string $url Possibly incomplete URL
-	 *
-	 * @return string Fixed URL.
-	 */
-	function fix_url($url) {
-
-		// support schema-less urls
-		if (strpos($url, '//') === 0) {
-			$url = 'https:' . $url;
-		}
-
-		if (strpos($url, '://') === false) {
-			$url = 'http://' . $url;
-		} else if (substr($url, 0, 5) == 'feed:') {
-			$url = 'http:' . substr($url, 5);
-		}
-
-		//prepend slash if the URL has no slash in it
-		// "http://www.example" -> "http://www.example/"
-		if (strpos($url, '/', strpos($url, ':') + 3) === false) {
-			$url .= '/';
-		}
-
-		//convert IDNA hostname to punycode if possible
-		if (function_exists("idn_to_ascii")) {
-			$parts = parse_url($url);
-			if (mb_detect_encoding($parts['host']) != 'ASCII')
-			{
-				$parts['host'] = idn_to_ascii($parts['host']);
-				$url = build_url($parts);
-			}
-		}
-
-		if ($url != "http:///")
-			return $url;
-		else
-			return '';
-	}
-
-	function validate_feed_url($url) {
-		$parts = parse_url($url);
-
-		return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https');
-
-	}
-
-	/* function save_email_address($email) {
-		// FIXME: implement persistent storage of emails
-
-		if (!$_SESSION['stored_emails'])
-			$_SESSION['stored_emails'] = array();
-
-		if (!in_array($email, $_SESSION['stored_emails']))
-			array_push($_SESSION['stored_emails'], $email);
-	} */
-
-
 	function get_feed_access_key($feed_id, $is_cat, $owner_uid = false) {
 
 		if (!$owner_uid) $owner_uid = $_SESSION["uid"];
@@ -2107,42 +2045,6 @@
 		}
 	}
 
-	function get_feeds_from_html($url, $content)
-	{
-		$url     = fix_url($url);
-		$baseUrl = substr($url, 0, strrpos($url, '/') + 1);
-
-		libxml_use_internal_errors(true);
-
-		$doc = new DOMDocument();
-		$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
-		$xpath = new DOMXPath($doc);
-		$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
-			'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
-		$feedUrls = array();
-		foreach ($entries as $entry) {
-			if ($entry->hasAttribute('href')) {
-				$title = $entry->getAttribute('title');
-				if ($title == '') {
-					$title = $entry->getAttribute('type');
-				}
-				$feedUrl = rewrite_relative_url(
-					$baseUrl, $entry->getAttribute('href')
-				);
-				$feedUrls[$feedUrl] = $title;
-			}
-		}
-		return $feedUrls;
-	}
-
-	function is_html($content) {
-		return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
-	}
-
-	function url_is_html($url, $login = false, $pass = false) {
-		return is_html(fetch_file_contents($url, false, $login, $pass));
-	}
-
 	function build_url($parts) { // reassembles only scheme, host and path; other parse_url() components are not included
 		return $parts['scheme'] . "://" . $parts['host'] . $parts['path'];
 	}