/** Categorize a document.
 *
 * Computes a score for the document in every category returned by the
 * storage object and hands the raw scores to _rescale().
 *
 * @see _rescale()
 * @param string $document text to categorize
 * @return array keys = category ids, values = rescaled scores
 */
function categorize($document) {
    $scores = array();
    $categories = $this->nbs->getCategories();
    $tokens = $this->_getTokens($document);

    // Sum of the word counts of all categories and the number of categories;
    // their ratio is the per-token scaling factor applied below.
    $total_words = 0;
    $ncat = 0;

    foreach ($categories as $data) {
        $total_words += $data['word_count'];
        $ncat++;
    }

    foreach ($categories as $category => $data) {
        // Start from the category probability supplied by the storage.
        $scores[$category] = $data['probability'];

        // Small probability used for a word that is known to the storage
        // but has no count in this category.
        if ($data['word_count'] > 0) {
            $small_proba = 1.0 / ($data['word_count'] * 2);
        } else {
            $small_proba = 0;
        }

        foreach ($tokens as $token => $count) {
            // Tokens the storage has never seen do not affect the score.
            if (!$this->nbs->wordExists($token)) {
                continue;
            }

            $word = $this->nbs->getWord($token, $category);

            if ($word['count']) {
                $proba = $word['count'] / $data['word_count'];
            } else {
                $proba = $small_proba;
            }

            // pow($total_words / $ncat, $count) is here to avoid underflow.
            $scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
        }
    }

    return $this->_rescale($scores);
}

/** Train against a document.
 *
 * Records every token of the content for the given category and saves the
 * document as a reference. A document id that already has a reference is
 * left untouched. After a set of trainings updateProbabilities() must be run.
 *
 * @see updateProbabilities()
 * @see untrain()
 * @param string $doc_id document id, must be unique
 * @param string $category_id id of the category the document belongs to
 * @param string $content content of the document
 * @return bool true when the document was trained, false when it was already known
 */
function train($doc_id, $category_id, $content) {
    // If this doc_id is already trained, do not train it again.
    if ($this->nbs->getReference($doc_id, false)) {
        return false;
    }

    $tokens = $this->_getTokens($content);

    foreach ($tokens as $token => $count) {
        $this->nbs->updateWord($token, $count, $category_id);
    }

    $this->nbs->saveReference($doc_id, $category_id, $content);

    return true;
}

/** Untrain a document.
 *
 * Removes one document from the references: the token counts of its stored
 * content are removed from its stored category, then the reference itself
 * is removed.
 *
 * @see updateProbabilities()
 * @see train()
 * @param string $doc_id document id, must be unique
 * @return bool true when a reference with content was found and removed
 */
function untrain($doc_id) {
    $ref = $this->nbs->getReference($doc_id);

    if (!isset($ref['content'])) {
        return false;
    }

    $tokens = $this->_getTokens($ref['content']);

    foreach ($tokens as $token => $count) {
        $this->nbs->removeWord($token, $count, $ref['category_id']);
    }

    $this->nbs->removeReference($doc_id);

    return true;
}
/** Rescale the results between 0 and 1.
 *
 * Each score is replaced by exp(score - max), where max is the largest
 * score but never less than 0.0 (the initial value), and the resulting
 * vector is divided by its Euclidean norm. Keys and their order are kept.
 *
 * NOTE(review): the exp() step treats the incoming scores as log values,
 * while categorize() passes plain products - confirm this is intended.
 *
 * @author Ken Williams, ken@mathforum.org
 * @see categorize()
 * @param array $scores scores (keys => category, values => scores)
 * @return array normalized scores (keys => category, values => scores)
 */
function _rescale($scores) {
    // Largest score, floored at 0.0.
    $max = 0.0;

    foreach ($scores as $score) {
        if ($score >= $max) {
            $max = $score;
        }
    }

    // Shift by the maximum, exponentiate, and accumulate the squared values.
    $total = 0.0;

    foreach ($scores as $cat => $score) {
        $scores[$cat] = (float) exp($score - $max);
        $total += (float) pow($scores[$cat], 2);
    }

    $total = (float) sqrt($total);

    // If every exponential underflowed to zero there is nothing to
    // normalize; dividing would raise DivisionByZeroError on PHP 8.
    if ($total > 0) {
        foreach ($scores as $cat => $score) {
            $scores[$cat] = (float) $scores[$cat] / $total;
        }
    }

    return $scores;
}

/** Update the probabilities of the categories and the word counts.
 *
 * This function must be run after a set of training. All the work is
 * delegated to the storage object.
 *
 * @see train()
 * @see untrain()
 * @return bool success, as reported by the storage object
 */
function updateProbabilities() {
    // this function is really only database manipulation
    // that is why all is done in the NaiveBayesianStorage
    return $this->nbs->updateProbabilities();
}
/** Default list of tokens that are never counted.
 *
 * The words come from
 * https://en.wikipedia.org/wiki/Most_common_words_in_English
 *
 * @return array ignore list
 */
function getIgnoreList() {
    $common_words = array(
        'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I',
        'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at',
        'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she',
        'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what',
        'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me',
        'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
        'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other',
        'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also',
        'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way',
        'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us',
        'read', 'more'
    );

    return $common_words;
}
/** Get the tokens from a string.
 *
 * The string is split on parentheses, comma, colon, dot, semicolon and
 * whitespace. A token is dropped when it is empty, contains "&", is
 * shorter than min_token_length or longer than max_token_length, consists
 * only of digits, or is in the ignore list.
 *
 * @author James Seng. [http://james.seng.cc/] (based on his perl version)
 *
 * @param string $string the string to get the tokens from
 * @return array keys = tokens, values = number of occurrences
 */
function _getTokens($string) {
    $tokens = array();
    //$string = $this->_cleanString($string);

    // Load the default ignore list only when none has been set yet.
    if (empty($this->ignore_list)) {
        $this->ignore_list = $this->getIgnoreList();
    }

    $rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);

    // remove some tokens
    foreach ($rawtokens as $token) {
        $token = trim($token);

        if ('' == $token) {
            continue;
        }

        if (mb_strpos($token, "&") !== FALSE) {
            continue;
        }

        $length = mb_strlen($token);

        if ($length < $this->min_token_length || $length > $this->max_token_length) {
            continue;
        }

        if (preg_match('/^[0-9]+$/', $token)) {
            continue;
        }

        if (in_array($token, $this->ignore_list, true)) {
            continue;
        }

        // Start the counter explicitly instead of incrementing an undefined key.
        $tokens[$token] = isset($tokens[$token]) ? $tokens[$token] + 1 : 1;
    }

    return $tokens;
}

/** Clean a string from the diacritics.
 *
 * Maps single bytes 192-255 (accented letters as listed below) to their
 * unaccented ASCII letter and lowercases the result. The mapping works
 * byte by byte.
 * NOTE(review): presumably meant for single-byte Latin-1 text; multi-byte
 * UTF-8 input would be altered byte-wise - confirm before enabling the
 * commented-out call in _getTokens().
 *
 * @author Antoine Bajolet [phpdig_at_toiletoine.net]
 * @author SPIP [http://uzine.net/spip/]
 *
 * @param string $string string with accents
 * @return string clean string
 */
function _cleanString($string) {
    $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
            /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
            /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
            /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
            /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
            /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
            /* Cc */ chr(199) . chr(231) .
            /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
            /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
            /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
            /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
            /* yNn */ chr(255) . chr(209) . chr(241);

    return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
}