summaryrefslogtreecommitdiff
path: root/plugins/af_sort_bayes/lib/class.naivebayesian.php
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/af_sort_bayes/lib/class.naivebayesian.php')
-rw-r--r--plugins/af_sort_bayes/lib/class.naivebayesian.php273
1 files changed, 273 insertions, 0 deletions
diff --git a/plugins/af_sort_bayes/lib/class.naivebayesian.php b/plugins/af_sort_bayes/lib/class.naivebayesian.php
new file mode 100644
index 000000000..1c2ef463b
--- /dev/null
+++ b/plugins/af_sort_bayes/lib/class.naivebayesian.php
@@ -0,0 +1,273 @@
+<?php
+ /*
+ ***** BEGIN LICENSE BLOCK *****
+ This file is part of PHP Naive Bayesian Filter.
+
+ The Initial Developer of the Original Code is
+ Loic d'Anterroches [loic_at_xhtml.net].
+ Portions created by the Initial Developer are Copyright (C) 2003
+ the Initial Developer. All Rights Reserved.
+
+ Contributor(s):
+ See the source
+
+ PHP Naive Bayesian Filter is free software; you can redistribute it
+ and/or modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ PHP Naive Bayesian Filter is distributed in the hope that it will
+ be useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Foobar; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Alternatively, the contents of this file may be used under the terms of
+ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ in which case the provisions of the LGPL are applicable instead
+ of those above.
+
+ ***** END LICENSE BLOCK *****
+ */
+
+ class NaiveBayesian {
+ /** min token length for it to be taken into consideration */
+ var $min_token_length = 3;
+ /** max token length for it to be taken into consideration */
+ var $max_token_length = 15;
+ /** list of token to ignore
+ @see getIgnoreList()
+ */
+ var $ignore_list = array();
+ /** storage object
+ @see class NaiveBayesianStorage
+ */
+ var $nbs = null;
+
+ function NaiveBayesian($nbs) {
+ $this->nbs = $nbs;
+
+ return true;
+ }
+
+ /** categorize a document.
+ Get list of categories in which the document can be categorized
+ with a score for each category.
+
+ @return array keys = category ids, values = scores
+ @param string document
+ */
+ function categorize($document) {
+ $scores = array();
+ $categories = $this->nbs->getCategories();
+ $tokens = $this->_getTokens($document);
+
+ // calculate the score in each category
+ $total_words = 0;
+ $ncat = 0;
+
+ while (list($category, $data) = each($categories)) {
+ $total_words += $data['word_count'];
+ $ncat++;
+ }
+
+ reset($categories);
+
+ while (list($category, $data) = each($categories)) {
+ $scores[$category] = $data['probability'];
+ // small probability for a word not in the category
+ // maybe putting 1.0 as a 'no effect' word can also be good
+ $small_proba = 1.0 / ($data['word_count'] * 2);
+
+ reset($tokens);
+
+ while (list($token, $count) = each($tokens)) {
+ if ($this->nbs->wordExists($token)) {
+ $word = $this->nbs->getWord($token, $category);
+
+ if ($word['count']) {
+ $proba = $word['count'] / $data['word_count'];
+ }
+ else {
+ $proba = $small_proba;
+ }
+
+ $scores[$category] *= pow($proba, $count) * pow($total_words / $ncat, $count);
+ // pow($total_words/$ncat, $count) is here to avoid underflow.
+
+ }
+ }
+ }
+
+ return $this->_rescale($scores);
+ }
+
+ /** training against a document.
+ Set a document as being in a specific category. The document becomes a reference
+ and is saved in the table of references. After a set of training is done
+ the updateProbabilities() function must be run.
+
+ @see updateProbabilities()
+ @see untrain()
+ @return bool success
+ @param string document id, must be unique
+ @param string category_id the category id in which the document should be
+ @param string content of the document
+ */
+ function train($doc_id, $category_id, $content) {
+ $ret = false;
+
+ // if this doc_id already trained, no trained
+ if (!$this->nbs->getReference($doc_id)) {
+ $tokens = $this->_getTokens($content);
+
+ while (list($token, $count) = each($tokens)) {
+ $this->nbs->updateWord($token, $count, $category_id);
+ }
+
+ $this->nbs->saveReference($doc_id, $category_id, $content);
+
+ $ret = true;
+ }
+ else {
+ $ret = false;
+ }
+
+ return $ret;
+ }
+
+ /** untraining of a document.
+ To remove just one document from the references.
+
+ @see updateProbabilities()
+ @see untrain()
+ @return bool success
+ @param string document id, must be unique
+ */
+ function untrain($doc_id) {
+ $ref = $this->nbs->getReference($doc_id);
+ $tokens = $this->_getTokens($ref['content']);
+
+ while (list($token, $count) = each($tokens)) {
+ $this->nbs->removeWord($token, $count, $ref['category_id']);
+ }
+
+ $this->nbs->removeReference($doc_id);
+
+ return true;
+ }
+
+ /** rescale the results between 0 and 1.
+
+ @author Ken Williams, [email protected]
+ @see categorize()
+ @return array normalized scores (keys => category, values => scores)
+ @param array scores (keys => category, values => scores)
+ */
+ function _rescale($scores) {
+ // Scale everything back to a reasonable area in
+ // logspace (near zero), un-loggify, and normalize
+ $total = 0.0;
+ $max = 0.0;
+ reset($scores);
+
+ while (list($cat, $score) = each($scores)) {
+ if ($score >= $max)
+ $max = $score;
+ }
+
+ reset($scores);
+ while (list($cat, $score) = each($scores)) {
+ $scores[$cat] = (float) exp($score - $max);
+ $total += (float) pow($scores[$cat], 2);
+ }
+
+ $total = (float) sqrt($total);
+
+ reset($scores);
+ while (list($cat, $score) = each($scores)) {
+ $scores[$cat] = (float) $scores[$cat] / $total;
+ }
+ reset($scores);
+
+ return $scores;
+ }
+
+ /** update the probabilities of the categories and word count.
+ This function must be run after a set of training
+
+ @see train()
+ @see untrain()
+ @return bool sucess
+ */
+ function updateProbabilities() {
+ // this function is really only database manipulation
+ // that is why all is done in the NaiveBayesianStorage
+ return $this->nbs->updateProbabilities();
+ }
+
+ /** Get the list of token to ignore.
+ @return array ignore list
+ */
+ function getIgnoreList() {
+ return array('the', 'that', 'you', 'for', 'and');
+ }
+
+ /** get the tokens from a string
+
+ @author James Seng. [http://james.seng.cc/] (based on his perl version)
+
+ @return array tokens
+ @param string the string to get the tokens from
+ */
+ function _getTokens($string) {
+ $rawtokens = array();
+ $tokens = array();
+ $string = $this->_cleanString($string);
+
+ if (count(0 >= $this->ignore_list)) {
+ $this->ignore_list = $this->getIgnoreList();
+ }
+
+ $rawtokens = split("[^-_A-Za-z0-9]+", $string);
+
+ // remove some tokens
+ while (list(, $token) = each($rawtokens)) {
+ $token = trim($token);
+ if (!(('' == $token) || (strlen($token) < $this->min_token_length) || (strlen($token) > $this->max_token_length) || (preg_match('/^[0-9]+$/', $token)) || (in_array($token, $this->ignore_list)))) {
+ $tokens[$token]++;
+ }
+ }
+
+ return $tokens;
+ }
+
+ /** clean a string from the diacritics
+
+ @author Antoine Bajolet [phpdig_at_toiletoine.net]
+ @author SPIP [http://uzine.net/spip/]
+
+ @return string clean string
+ @param string string with accents
+ */
+ function _cleanString($string) {
+ $diac = /* A */ chr(192) . chr(193) . chr(194) . chr(195) . chr(196) . chr(197) .
+ /* a */ chr(224) . chr(225) . chr(226) . chr(227) . chr(228) . chr(229) .
+ /* O */ chr(210) . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) .
+ /* o */ chr(242) . chr(243) . chr(244) . chr(245) . chr(246) . chr(248) .
+ /* E */ chr(200) . chr(201) . chr(202) . chr(203) .
+ /* e */ chr(232) . chr(233) . chr(234) . chr(235) .
+ /* Cc */ chr(199) . chr(231) .
+ /* I */ chr(204) . chr(205) . chr(206) . chr(207) .
+ /* i */ chr(236) . chr(237) . chr(238) . chr(239) .
+ /* U */ chr(217) . chr(218) . chr(219) . chr(220) .
+ /* u */ chr(249) . chr(250) . chr(251) . chr(252) .
+ /* yNn */ chr(255) . chr(209) . chr(241);
+
+ return strtolower(strtr($string, $diac, 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn'));
+ }
+
+ }