diff options
author | Andrew Dolgov <[email protected]> | 2013-07-31 10:30:14 +0400 |
---|---|---|
committer | Andrew Dolgov <[email protected]> | 2013-07-31 10:30:17 +0400 |
commit | 6b4617970f2c25ac852daab873f5374d611d5b7e (patch) | |
tree | e7026b5c9879b7489454626aef00e3c2f04e06db /include | |
parent | f035e6dc822e3b8766d55689abff26a6bf52d404 (diff) |
Add Text_LanguageDetect to guess article language for better hyphenation
(bump schema to 122)
Diffstat (limited to 'include')
-rw-r--r-- | include/functions.php | 9 | ||||
-rw-r--r-- | include/rssfuncs.php | 16 |
2 files changed, 22 insertions, 3 deletions
diff --git a/include/functions.php b/include/functions.php index cef0ea9eb..9d14aae7a 100644 --- a/include/functions.php +++ b/include/functions.php @@ -1,6 +1,6 @@ <?php define('EXPECTED_CONFIG_VERSION', 26); - define('SCHEMA_VERSION', 121); + define('SCHEMA_VERSION', 122); define('LABEL_BASE_INDEX', -1024); define('PLUGIN_FEED_BASE_INDEX', -128); @@ -87,6 +87,7 @@ require_once "lib/accept-to-gettext.php"; require_once "lib/gettext/gettext.inc"; + require_once "lib/languagedetect/LanguageDetect.php"; function startup_gettext() { @@ -2650,6 +2651,7 @@ comments, int_id, uuid, + lang, hide_images, unread,feed_id,marked,published,link,last_read,orig_feed_id, last_marked, last_published, @@ -2692,6 +2694,7 @@ "tag_cache," . "label_cache," . "link," . + "lang," . "uuid," . "last_read," . "(SELECT hide_images FROM ttrss_feeds WHERE id = feed_id) AS hide_images," . @@ -3118,7 +3121,7 @@ ccache_update($feed_id, $owner_uid); } - $result = db_query("SELECT id,title,link,content,feed_id,comments,int_id, + $result = db_query("SELECT id,title,link,content,feed_id,comments,int_id,lang, ".SUBSTRING_FOR_DATE."(updated,1,16) as updated, (SELECT site_url FROM ttrss_feeds WHERE id = feed_id) as site_url, (SELECT hide_images FROM ttrss_feeds WHERE id = feed_id) as hide_images, @@ -3290,7 +3293,7 @@ } $rv['content'] .= "</div>"; - $rv['content'] .= "<div class=\"postContent\" lang=\"en\">"; + $rv['content'] .= "<div class=\"postContent\" lang=\"".$line['lang']."\">"; $rv['content'] .= $line["content"]; $rv['content'] .= format_article_enclosures($id, diff --git a/include/rssfuncs.php b/include/rssfuncs.php index cfb0e7a46..756ecbfc1 100644 --- a/include/rssfuncs.php +++ b/include/rssfuncs.php @@ -354,6 +354,11 @@ $rss->init(); } + require_once "lib/languagedetect/LanguageDetect.php"; + + $lang = new Text_LanguageDetect(); + $lang->setNameMode(2); + // print_r($rss); $feed = db_escape_string($feed); @@ -565,6 +570,15 @@ print "\n"; } + $entry_language = $lang->detect($entry_content, 1); 
+ + if (count($entry_language) > 0) { + $entry_language = array_keys($entry_language); + $entry_language = db_escape_string($entry_language[0]); + + _debug("detected language: $entry_language", $debug_enabled); + } + $entry_comments = $item->get_comments_url(); $entry_author = $item->get_author(); @@ -677,6 +691,7 @@ comments, num_comments, plugin_data, + lang, author) VALUES ('$entry_title', @@ -691,6 +706,7 @@ '$entry_comments', '$num_comments', '$entry_plugin_data', + '$entry_language', '$entry_author')"); $article_labels = array(); |