af_sort_bayes: move to -attic repo, not really suitable for production use

of any kind
author: Andrew Dolgov <[email protected]> 2015-11-25 21:07:24 +0300
committer: Andrew Dolgov <[email protected]> 2015-11-25 21:07:24 +0300
commit: 74a752879bcad54daa62994f9eae42fe4afdd299 (patch)
tree: 2168cc5bcb8d9bd0f5522d2bee732f0cc872ed55 /plugins/af_sort_bayes/lib/class.naivebayesian.php
parent: f0ebb41b278a3ac9fdc276a9c13d3b4387f721dc (diff)
1 files changed, 0 insertions, 297 deletions
diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php
deleted file mode 100644
index 4a4ffa7eb..000000000
--- a/plugins/af_sort_bayes/lib/class.naivebayesian.php
+++ /dev/null
@@ -1,297 +0,0 @@
-<?php
-	/*
-	 ***** BEGIN LICENSE BLOCK *****
-	 This file is part of PHP Naive Bayesian Filter.
-
-	 The Initial Developer of the Original Code is
-	 Loic d'Anterroches [loic_at_xhtml.net].
-	 Portions created by the Initial Developer are Copyright (C) 2003
-	 the Initial Developer. All Rights Reserved.
-
-	 Contributor(s):
-	 See the source
-
-	 PHP Naive Bayesian Filter is free software; you can redistribute it
-	 and/or modify it under the terms of the GNU General Public License as
-	 published by the Free Software Foundation; either version 2 of
-	 the License, or (at your option) any later version.
-
-	 PHP Naive Bayesian Filter is distributed in the hope that it will
-	 be useful, but WITHOUT ANY WARRANTY; without even the implied
-	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-	 See the GNU General Public License for more details.
-
-	 You should have received a copy of the GNU General Public License
-	 along with Foobar; if not, write to the Free Software
-	 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-	 Alternatively, the contents of this file may be used under the terms of
-	 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
-	 in which case the provisions of the LGPL are applicable instead
-	 of those above.
-
-	 ***** END LICENSE BLOCK *****
-	 */
-
-	class NaiveBayesian {
-		/** min token length for it to be taken into consideration */
-		var $min_token_length = 3;
-		/** max token length for it to be taken into consideration */
-		var $max_token_length = 15;
-		/** list of token to ignore
-		 @see getIgnoreList()
-		 */
-		var $ignore_list = array();
-		/** storage object
-		 @see class NaiveBayesianStorage
-		 */
-		var $nbs = null;
-
-		function NaiveBayesian($nbs) {
-			$this->nbs = $nbs;
-
-			return true;
-		}
-
-		/** categorize a document.
-		 Get list of categories in which the document can be categorized
-		 with a score for each category.
-
-		 @return array keys = category ids, values = scores
-		 @param string document
-		 */
-		function categorize($document) {
-			$scores = array();
-			$categories = $this->nbs->getCategories();
-			$tokens = $this->_getTokens($document);
-
-			// calculate the score in each category
-			$total_words = 0;
-			$ncat = 0;
-
-			while (list($category, $data) = each($categories)) {
-				$total_words += $data['word_count'];
-				$ncat++;
-			}
-
-			reset($categories);
-
-			while (list($category, $data) = each($categories)) {
-				$scores[$category] = $data['probability'];
-				// small probability for a word not in the category
-				// maybe putting 1.0 as a 'no effect' word can also be good
-
-				if ($data['word_count'] > 0)
-					$small_proba = 1.0 / ($data['word_count'] * 2);
-				else
-					$small_proba = 0;
-
-				reset($tokens);
-
-				while (list($token, $count) = each($tokens)) {
-
-					if ($this->nbs->wordExists($token)) {
-						$word = $this->nbs->getWord($token, $category);
-
-						if ($word['count']) {
-							$proba = $word['count'] / $data['word_count'];
-						}
-						else {
-							$proba = $small_proba;
-						}
-
-						$scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
-						// pow($total_words/$ncat, $count) is here to avoid underflow.
-
-					}
-				}
-			}
-
-			return $this->_rescale($scores);
-		}
-
-		/** training against a document.
-		 Set a document as being in a specific category. The document becomes a reference
-		 and is saved in the table of references. After a set of training is done
-		 the updateProbabilities() function must be run.
-
-		 @see updateProbabilities()
-		 @see untrain()
-		 @return bool success
-		 @param string document id, must be unique
-		 @param string category_id the category id in which the document should be
-		 @param string content of the document
-		 */
-		function train($doc_id, $category_id, $content) {
-			$ret = false;
-
-
-			// if this doc_id already trained, no trained
-			if (!$this->nbs->getReference($doc_id, false)) {
-
-				$tokens = $this->_getTokens($content);
-
-				while (list($token, $count) = each($tokens)) {
-					$this->nbs->updateWord($token, $count, $category_id);
-				}
-
-				$this->nbs->saveReference($doc_id, $category_id, $content);
-
-				$ret = true;
-			}
-			else {
-				$ret = false;
-			}
-
-			return $ret;
-		}
-
-		/** untraining of a document.
-		 To remove just one document from the references.
-
-		 @see updateProbabilities()
-		 @see untrain()
-		 @return bool success
-		 @param string document id, must be unique
-		 */
-		function untrain($doc_id) {
-			$ref = $this->nbs->getReference($doc_id);
-
-			if (isset($ref['content'])) {
-
-				$tokens = $this->_getTokens($ref['content']);
-
-				while (list($token, $count) = each($tokens)) {
-					$this->nbs->removeWord($token, $count, $ref['category_id']);
-				}
-
-				$this->nbs->removeReference($doc_id);
-
-				return true;
-			} else {
-				return false;
-			}
-		}
-
-		/** rescale the results between 0 and 1.
-
-		 @author Ken Williams, [email protected]
-		 @see categorize()
-		 @return array normalized scores (keys => category, values => scores)
-		 @param array scores (keys => category, values => scores)
-		 */
-		function _rescale($scores) {
-			// Scale everything back to a reasonable area in
-			// logspace (near zero), un-loggify, and normalize
-			$total = 0.0;
-			$max = 0.0;
-			reset($scores);
-
-			while (list($cat, $score) = each($scores)) {
-				if ($score >= $max)
-					$max = $score;
-			}
-
-			reset($scores);
-			while (list($cat, $score) = each($scores)) {
-				$scores[$cat] = (float) exp($score - $max);
-				$total += (float) pow($scores[$cat], 2);
-			}
-
-			$total = (float) sqrt($total);
-
-			reset($scores);
-			while (list($cat, $score) = each($scores)) {
-				$scores[$cat] = (float) $scores[$cat] / $total;
-			}
-			reset($scores);
-
-			return $scores;
-		}
-
-		/** update the probabilities of the categories and word count.
-		 This function must be run after a set of training
-
-		 @see train()
-		 @see untrain()
-		 @return bool sucess
-		 */
-		function updateProbabilities() {
-			// this function is really only database manipulation
-			// that is why all is done in the NaiveBayesianStorage
-			return $this->nbs->updateProbabilities();
-		}
-
-		/** Get the list of token to ignore.
-		 @return array ignore list
-		 */
-		function getIgnoreList() {
-			//return array('the', 'that', 'you', 'for', 'and');
-
-			// https://en.wikipedia.org/wiki/Most_common_words_in_English
-			return array('the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it', 'for', 'not', 'on', 'with',
-				'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her',
-				'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up',
-				'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time',
-				'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could',
-				'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think',
-				'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even',
-				'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us', 'read', 'more');
-
-		}
-
-		/** get the tokens from a string
-
-		 @author James Seng. [http://james.seng.cc/] (based on his perl version)
-
-		 @return array tokens
-		 @param  string the string to get the tokens from
-		 */
-		function _getTokens($string) {
-			$rawtokens = array();
-			$tokens = array();
-			//$string = $this->_cleanString($string);
-
-			if (count(0 >= $this->ignore_list)) {
-				$this->ignore_list = $this->getIgnoreList();
-			}
-
-			$rawtokens = preg_split("/[\(\),:\.;\t\r\n ]/", $string, -1, PREG_SPLIT_NO_EMPTY);
-
-			// remove some tokens
-			while (list(, $token) = each($rawtokens)) {
-				$token = trim($token);
-				if (!(('' == $token) || (mb_strpos($token, "&") !== FALSE) || (mb_strlen($token) < $this->min_token_length) || (mb_strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
-					$tokens[$token]++;
-				}
-			}
-
-			return $tokens;
-		}
-
-		/** clean a string from the diacritics
-
-		 @author Antoine Bajolet [phpdig_at_toiletoine.net]
-		 @author SPIP [http://uzine.net/spip/]
-
-		 @return string clean string
-		 @param  string string with accents
-		 */
-		function _cleanString($string) {
-			$diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
-				/* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
-				/* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
-				/* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
-				/* E */ chr(200) . chr(201) . chr(202) . chr(203) .
-				/* e */ chr(232) . chr(233) . chr(234) . chr(235) .
-				/* Cc */ chr(199) . chr(231) .
-				/* I */ chr(204) . chr(205) . chr(206) . chr(207) .
-				/* i */ chr(236) . chr(237) . chr(238) . chr(239) .
-				/* U */ chr(217) . chr(218) . chr(219) . chr(220) .
-				/* u */ chr(249) . chr(250) . chr(251) . chr(252) .
-				/* yNn */ chr(255) . chr(209) . chr(241);
-
-			return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
-		}
-
-	}
author	Andrew Dolgov <[email protected]>	2015-11-25 21:07:24 +0300
committer	Andrew Dolgov <[email protected]>	2015-11-25 21:07:24 +0300
commit	74a752879bcad54daa62994f9eae42fe4afdd299 (patch)
tree	2168cc5bcb8d9bd0f5522d2bee732f0cc872ed55 /plugins/af_sort_bayes/lib/class.naivebayesian.php
parent	f0ebb41b278a3ac9fdc276a9c13d3b4387f721dc (diff)