diff options
author | Andrew Dolgov <[email protected]> | 2019-06-20 07:51:48 +0300 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2019-06-20 07:51:48 +0300 |
commit | 6d746453c75e2c482458f687585ef436a28a9888 (patch) | |
tree | 4e56e86c2751856fd635c86c69bac1e124beb687 | |
parent | 270b39a33738ae45f1cab2d7a7dd518af10221ca (diff) |
get_feeds_from_html: remove XML preamble hack
move several related helper functions to Feeds class
-rwxr-xr-x | classes/feeds.php | 91 | ||||
-rwxr-xr-x | classes/handler/public.php | 12 | ||||
-rwxr-xr-x | classes/pref/feeds.php | 2 | ||||
-rw-r--r-- | include/functions.php | 98 |
4 files changed, 88 insertions, 115 deletions
diff --git a/classes/feeds.php b/classes/feeds.php index 92bbd113b..2714f4596 100755 --- a/classes/feeds.php +++ b/classes/feeds.php @@ -1135,9 +1135,9 @@ class Feeds extends Handler_Protected { $pdo = Db::pdo(); - $url = fix_url($url); + $url = Feeds::fix_url($url); - if (!$url || !validate_feed_url($url)) return array("code" => 2); + if (!$url || !Feeds::validate_feed_url($url)) return array("code" => 2); $contents = @fetch_file_contents($url, false, $auth_login, $auth_pass); @@ -1153,8 +1153,8 @@ class Feeds extends Handler_Protected { return array("code" => 5, "message" => $fetch_last_error); } - if (mb_strpos($fetch_last_content_type, "html") !== FALSE && is_html($contents)) { - $feedUrls = get_feeds_from_html($url, $contents); + if (mb_strpos($fetch_last_content_type, "html") !== FALSE && Feeds::is_html($contents)) { + $feedUrls = Feeds::get_feeds_from_html($url, $contents); if (count($feedUrls) == 0) { return array("code" => 3); @@ -1923,5 +1923,88 @@ class Feeds extends Handler_Protected { return $colormap[$sum]; } + static function get_feeds_from_html($url, $content) { + $url = Feeds::fix_url($url); + $baseUrl = substr($url, 0, strrpos($url, '/') + 1); + + libxml_use_internal_errors(true); + $feedUrls = []; + + $doc = new DOMDocument(); + if ($doc->loadHTML($content)) { + $xpath = new DOMXPath($doc); + $entries = $xpath->query('/html/head/link[@rel="alternate" and '. + '(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]'); + + foreach ($entries as $entry) { + if ($entry->hasAttribute('href')) { + $title = $entry->getAttribute('title'); + if ($title == '') { + $title = $entry->getAttribute('type'); + } + $feedUrl = rewrite_relative_url( + $baseUrl, $entry->getAttribute('href') + ); + $feedUrls[$feedUrl] = $title; + } + } + } + return $feedUrls; + } + + static function is_html($content) { + return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0; + } + + static function validate_feed_url($url) { + $parts = parse_url($url); + + return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https'); + } + + /** + * Fixes incomplete URLs by prepending "http://". + * Also replaces feed:// with http://, and + * prepends a trailing slash if the url is a domain name only. + * + * @param string $url Possibly incomplete URL + * + * @return string Fixed URL. + */ + static function fix_url($url) { + + // support schema-less urls + if (strpos($url, '//') === 0) { + $url = 'https:' . $url; + } + + if (strpos($url, '://') === false) { + $url = 'http://' . $url; + } else if (substr($url, 0, 5) == 'feed:') { + $url = 'http:' . substr($url, 5); + } + + //prepend slash if the URL has no slash in it + // "http://www.example" -> "http://www.example/" + if (strpos($url, '/', strpos($url, ':') + 3) === false) { + $url .= '/'; + } + + //convert IDNA hostname to punycode if possible + if (function_exists("idn_to_ascii")) { + $parts = parse_url($url); + if (mb_detect_encoding($parts['host']) != 'ASCII') + { + $parts['host'] = idn_to_ascii($parts['host']); + $url = build_url($parts); + } + } + + if ($url != "http:///") + return $url; + else + return ''; + } + } diff --git a/classes/handler/public.php b/classes/handler/public.php index 318cecd72..e9a3abc53 100755 --- a/classes/handler/public.php +++ b/classes/handler/public.php @@ -728,18 +728,6 @@ class Handler_Public extends Handler { } } - /* function subtest() { - header("Content-type: text/plain; charset=utf-8"); - - $url = clean($_REQUEST["url"]); - - print "$url\n\n"; - - - print_r(get_feeds_from_html($url, fetch_file_contents($url))); - - } */ - function subscribe() { if (SINGLE_USER_MODE) { login_sequence(); diff --git a/classes/pref/feeds.php b/classes/pref/feeds.php index 6cbf15a58..f94f45430 100755 --- a/classes/pref/feeds.php +++ b/classes/pref/feeds.php @@ -1708,7 +1708,7 @@ class Pref_Feeds extends Handler_Protected { foreach ($feeds as $feed) { $feed = trim($feed); - if (validate_feed_url($feed)) { + if (Feeds::validate_feed_url($feed)) { $this->pdo->beginTransaction(); diff --git a/include/functions.php b/include/functions.php index 6dc9990e8..fe4ca1ecb 100644 --- a/include/functions.php +++ b/include/functions.php @@ -2017,68 +2017,6 @@ return false; } - /** - * Fixes incomplete URLs by prepending "http://". - * Also replaces feed:// with http://, and - * prepends a trailing slash if the url is a domain name only. - * - * @param string $url Possibly incomplete URL - * - * @return string Fixed URL. - */ - function fix_url($url) { - - // support schema-less urls - if (strpos($url, '//') === 0) { - $url = 'https:' . $url; - } - - if (strpos($url, '://') === false) { - $url = 'http://' . $url; - } else if (substr($url, 0, 5) == 'feed:') { - $url = 'http:' . substr($url, 5); - } - - //prepend slash if the URL has no slash in it - // "http://www.example" -> "http://www.example/" - if (strpos($url, '/', strpos($url, ':') + 3) === false) { - $url .= '/'; - } - - //convert IDNA hostname to punycode if possible - if (function_exists("idn_to_ascii")) { - $parts = parse_url($url); - if (mb_detect_encoding($parts['host']) != 'ASCII') - { - $parts['host'] = idn_to_ascii($parts['host']); - $url = build_url($parts); - } - } - - if ($url != "http:///") - return $url; - else - return ''; - } - - function validate_feed_url($url) { - $parts = parse_url($url); - - return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https'); - - } - - /* function save_email_address($email) { - // FIXME: implement persistent storage of emails - - if (!$_SESSION['stored_emails']) - $_SESSION['stored_emails'] = array(); - - if (!in_array($email, $_SESSION['stored_emails'])) - array_push($_SESSION['stored_emails'], $email); - } */ - - function get_feed_access_key($feed_id, $is_cat, $owner_uid = false) { if (!$owner_uid) $owner_uid = $_SESSION["uid"]; @@ -2107,42 +2045,6 @@ } } - function get_feeds_from_html($url, $content) - { - $url = fix_url($url); - $baseUrl = substr($url, 0, strrpos($url, '/') + 1); - - libxml_use_internal_errors(true); - - $doc = new DOMDocument(); - $doc->loadHTML('<?xml encoding="UTF-8">' . $content); - $xpath = new DOMXPath($doc); - $entries = $xpath->query('/html/head/link[@rel="alternate" and '. - '(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]'); - $feedUrls = array(); - foreach ($entries as $entry) { - if ($entry->hasAttribute('href')) { - $title = $entry->getAttribute('title'); - if ($title == '') { - $title = $entry->getAttribute('type'); - } - $feedUrl = rewrite_relative_url( - $baseUrl, $entry->getAttribute('href') - ); - $feedUrls[$feedUrl] = $title; - } - } - return $feedUrls; - } - - function is_html($content) { - return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0; - } - - function url_is_html($url, $login = false, $pass = false) { - return is_html(fetch_file_contents($url, false, $login, $pass)); - } - function build_url($parts) { return $parts['scheme'] . "://" . $parts['host'] . $parts['path']; } |