1 files changed, 273 insertions, 0 deletions
diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php
new file mode 100644
index 000000000..1c2ef463b
--- /dev/null
+++ b/plugins/af_sort_bayes/lib/class.naivebayesian.php
@@ -0,0 +1,273 @@
+<?php
+	/*
+	 ***** BEGIN LICENSE BLOCK *****
+	 This file is part of PHP Naive Bayesian Filter.
+
+	 The Initial Developer of the Original Code is
+	 Loic d'Anterroches [loic_at_xhtml.net].
+	 Portions created by the Initial Developer are Copyright (C) 2003
+	 the Initial Developer. All Rights Reserved.
+
+	 Contributor(s):
+	 See the source
+
+	 PHP Naive Bayesian Filter is free software; you can redistribute it
+	 and/or modify it under the terms of the GNU General Public License as
+	 published by the Free Software Foundation; either version 2 of
+	 the License, or (at your option) any later version.
+
+	 PHP Naive Bayesian Filter is distributed in the hope that it will
+	 be useful, but WITHOUT ANY WARRANTY; without even the implied
+	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+	 See the GNU General Public License for more details.
+
+	 You should have received a copy of the GNU General Public License
+	 along with Foobar; if not, write to the Free Software
+	 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+	 Alternatively, the contents of this file may be used under the terms of
+	 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+	 in which case the provisions of the LGPL are applicable instead
+	 of those above.
+
+	 ***** END LICENSE BLOCK *****
+	 */
+
+	class NaiveBayesian {
+		/** min token length for it to be taken into consideration */
+		var $min_token_length = 3;
+		/** max token length for it to be taken into consideration */
+		var $max_token_length = 15;
+		/** list of tokens to ignore, loaded from getIgnoreList() when left empty
+		 @see getIgnoreList()
+		 */
+		var $ignore_list = array();
+		/** storage object
+		 @see class NaiveBayesianStorage
+		 */
+		var $nbs = null;
+
+		/** @param object storage object, kept in $this->nbs */
+		function __construct($nbs) {
+			$this->nbs = $nbs;
+		}
+
+		/** PHP 4 style constructor name, kept for callers that invoke it directly;
+		 PHP 8 no longer treats it as a constructor, hence __construct().
+		 @return bool always true
+		 @param object storage object
+		 */
+		function NaiveBayesian($nbs) {
+			$this->nbs = $nbs;
+			return true;
+		}
+
+		/** categorize a document.
+		 Get list of categories in which the document can be categorized
+		 with a score for each category.
+
+		 @return array keys = category ids, values = scores
+		 @param string document
+		 */
+		function categorize($document) {
+			$scores = array();
+			$categories = $this->nbs->getCategories();
+			$tokens = $this->_getTokens($document);
+
+			// totals over all categories, needed for the anti-underflow factor
+			$total_words = 0;
+			$ncat = 0;
+
+			foreach ($categories as $data) {
+				$total_words += $data['word_count'];
+				$ncat++;
+			}
+			$avg_words = ($ncat > 0) ? $total_words / $ncat : 0;
+
+			foreach ($categories as $category => $data) {
+				$scores[$category] = $data['probability'];
+				$has_words = $data['word_count'] > 0;
+				// small probability for a word not in the category; 0 for a
+				// category without words, which used to divide by zero
+				$small_proba = $has_words ? 1.0 / ($data['word_count'] * 2) : 0.0;
+
+				foreach ($tokens as $token => $count) {
+					if (!$this->nbs->wordExists($token)) {
+						continue;
+					}
+					$word = $this->nbs->getWord($token, $category);
+					$in_category = $has_words && !empty($word['count']);
+					$proba = $in_category ? $word['count'] / $data['word_count'] : $small_proba;
+					// pow($avg_words, $count) is here to avoid underflow.
+					$scores[$category] *= pow($proba, $count) * pow($avg_words, $count);
+				}
+			}
+
+			return $this->_rescale($scores);
+		}
+
+		/** training against a document.
+		 Set a document as being in a specific category. The document becomes a reference
+		 and is saved in the table of references. After a set of training is done
+		 the updateProbabilities() function must be run.
+
+		 @see updateProbabilities()
+		 @see untrain()
+		 @return bool success, false when the document id is already a reference
+		 @param string document id, must be unique
+		 @param string category_id the category id in which the document should be
+		 @param string content of the document
+		 */
+		function train($doc_id, $category_id, $content) {
+			// a doc_id that is already trained is not trained again
+			if ($this->nbs->getReference($doc_id)) {
+				return false;
+			}
+
+			foreach ($this->_getTokens($content) as $token => $count) {
+				$this->nbs->updateWord($token, $count, $category_id);
+			}
+
+			$this->nbs->saveReference($doc_id, $category_id, $content);
+
+			return true;
+		}
+
+		/** untraining of a document.
+		 To remove just one document from the references.
+
+		 @see updateProbabilities()
+		 @see train()
+		 @return bool success, false when the document id is not a reference
+		 @param string document id, must be unique
+		 */
+		function untrain($doc_id) {
+			$ref = $this->nbs->getReference($doc_id);
+			// same test as in train(): no stored reference, nothing to undo
+			if (!$ref) {
+				return false;
+			}
+
+			foreach ($this->_getTokens($ref['content']) as $token => $count) {
+				$this->nbs->removeWord($token, $count, $ref['category_id']);
+			}
+
+			$this->nbs->removeReference($doc_id);
+
+			return true;
+		}
+
+		/** rescale the results between 0 and 1.
+		 Every score becomes exp(score - max), then all of them are divided
+		 by their euclidean norm.
+
+		 @author Ken Williams, [email protected]
+		 @see categorize()
+		 @return array normalized scores (keys => category, values => scores)
+		 @param array scores (keys => category, values => scores)
+		 */
+		function _rescale($scores) {
+			$total = 0.0;
+			$max = 0.0;
+
+			foreach ($scores as $score) {
+				if ($score >= $max)
+					$max = $score;
+			}
+
+			// $max is never below a score, so each exp() result is at most 1
+			foreach ($scores as $cat => $score) {
+				$scores[$cat] = (float) exp($score - $max);
+				$total += (float) pow($scores[$cat], 2);
+			}
+
+			$total = (float) sqrt($total);
+			// all exp() results can underflow to 0 (scores far below 0); do not divide by that
+			if ($total > 0) {
+				foreach ($scores as $cat => $score) {
+					$scores[$cat] = (float) $score / $total;
+				}
+			}
+
+			return $scores;
+		}
+
+		/** update the probabilities of the categories and word count.
+		 This function must be run after a set of training
+
+		 @see train()
+		 @see untrain()
+		 @return bool success
+		 */
+		function updateProbabilities() {
+			// this function is really only database manipulation
+			// that is why all is done in the NaiveBayesianStorage
+			return $this->nbs->updateProbabilities();
+		}
+
+		/** Get the list of token to ignore.
+		 @return array ignore list
+		 */
+		function getIgnoreList() {
+			return array('the', 'that', 'you', 'for', 'and');
+		}
+
+		/** get the tokens from a string
+
+		 @author James Seng. [http://james.seng.cc/] (based on his perl version)
+
+		 @return array tokens (keys => token, values => number of occurrences)
+		 @param  string the string to get the tokens from
+		 */
+		function _getTokens($string) {
+			$tokens = array();
+			$string = $this->_cleanString($string);
+
+			// only load the default list when no list has been set
+			if (empty($this->ignore_list)) {
+				$this->ignore_list = $this->getIgnoreList();
+			}
+
+			// split() was removed in PHP 7; preg_split() takes the same class
+			$rawtokens = preg_split('/[^-_A-Za-z0-9]+/', $string);
+
+			foreach ($rawtokens as $token) {
+				$token = trim($token);
+				$length = strlen($token);
+				// drop empty, too short, too long, digit-only and ignored tokens
+				if ('' === $token || $length < $this->min_token_length || $length > $this->max_token_length || preg_match('/^[0-9]+$/', $token) || in_array($token, $this->ignore_list, true)) {
+					continue;
+				}
+				$tokens[$token] = isset($tokens[$token]) ? $tokens[$token] + 1 : 1;
+			}
+
+			return $tokens;
+		}
+
+		/** clean a string from the diacritics
+		 The table holds single bytes (192 to 255) and strtr() works byte by byte; NOTE(review):
+		 looks made for a single-byte charset such as ISO-8859-1 - confirm before feeding it UTF-8.
+
+		 @author Antoine Bajolet [phpdig_at_toiletoine.net]
+		 @author SPIP [http://uzine.net/spip/]
+
+		 @return string clean string
+		 @param  string string with accents
+		 */
+		function _cleanString($string) {
+			$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
+				/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
+				/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
+				/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
+				/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
+				/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
+				/* Cc */ chr(199) . chr(231) .
+				/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
+				/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
+				/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
+				/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
+				/* yNn */ chr(255) . chr(209) . chr(241);
+			return strtolower(strtr((string) $string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
+		}
+
+	}