summaryrefslogtreecommitdiff
path: root/plugins/af_sort_bayes/lib/class.naivebayesianstorage.php
blob: 99db1fc79f0f2f70939a926b766963d781388ad2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
<?php
	/*
	 ***** BEGIN LICENSE BLOCK *****
	 This file is part of PHP Naive Bayesian Filter.

	 The Initial Developer of the Original Code is
	 Loic d'Anterroches [loic_at_xhtml.net].
	 Portions created by the Initial Developer are Copyright (C) 2003
	 the Initial Developer. All Rights Reserved.

	 Contributor(s):

	 PHP Naive Bayesian Filter is free software; you can redistribute it
	 and/or modify it under the terms of the GNU General Public License as
	 published by the Free Software Foundation; either version 2 of
	 the License, or (at your option) any later version.

	 PHP Naive Bayesian Filter is distributed in the hope that it will
	 be useful, but WITHOUT ANY WARRANTY; without even the implied
	 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	 See the GNU General Public License for more details.

	 You should have received a copy of the GNU General Public License
	 along with PHP Naive Bayesian Filter; if not, write to the Free Software
	 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	 Alternatively, the contents of this file may be used under the terms of
	 the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
	 in which case the provisions of the LGPL are applicable instead
	 of those above.

	 ***** END LICENSE BLOCK *****
	 */

	/** Access to the storage of the data for the filter.

	 To avoid depending on any particular database, this class handles all
	 access to the data storage. You can provide your own class as long as
	 all the methods are available. The current one relies on the SQL
	 connection returned by Db::get().

	 Methods:
	 - array getCategories()
	 - bool  wordExists(string $word)
	 - array getWord(string $word, string $categoryid)

	 */
	class NaiveBayesianStorage {
		/** Database handle returned by Db::get(); every query goes through it. */
		public $con = null;

		/** Owner (user) id used to scope the queries issued by this instance. */
		public $owner_uid = null;

		/** Maximum length of the text returned by getReference(). */
		public $max_document_length = 3000; // classifier can't rescale output for very long strings apparently

		/** create a storage bound to one owner.

		 @param int owner id used in the owner_uid conditions of the queries
		 */
		public function __construct($owner_uid) {
			$this->con = Db::get();
			$this->owner_uid = $owner_uid;
		}

		/** PHP4-style constructor name, kept only for backward compatibility.

		 PHP 8 no longer treats a method named after its class as a constructor,
		 so the real initialisation lives in __construct(). It is declared after
		 __construct() on purpose so that __construct() is always the constructor.

		 @deprecated use "new NaiveBayesianStorage($owner_uid)"
		 @return bool always true
		 @param int owner id
		 */
		public function NaiveBayesianStorage($owner_uid) {
			$this->__construct($owner_uid);

			return true;
		}

		/** owner id as an integer, safe to concatenate into SQL.

		 @return int
		 */
		private function ownerUid() {
			return (int)$this->owner_uid;
		}

		/** get the list of categories with basic data.

		 @return array key = category ids, values = array(keys = 'probability', 'category', 'word_count')
		 */
		public function getCategories() {
			$categories = array();
			$rs = $this->con->query('SELECT * FROM ttrss_plugin_af_sort_bayes_categories WHERE owner_uid = ' . $this->ownerUid());

			while ($line = $this->con->fetch_assoc($rs)) {
				$categories[$line['id']] = array(
					'probability' => $line['probability'],
					'category' => $line['category'],
					'word_count' => $line['word_count']
				);
			}

			return $categories;
		}

		/** look up the id of one of the owner's categories by its name.

		 @return mixed category id, or false when no such category exists
		 @param string category name
		 */
		public function getCategoryByName($category) {
			$rs = $this->con->query("SELECT id FROM ttrss_plugin_af_sort_bayes_categories WHERE category = '" .
				$this->con->escape_string($category) . "' AND owner_uid = " . $this->ownerUid());

			if ($this->con->num_rows($rs) != 0) {
				return $this->con->fetch_result($rs, 0, "id");
			}

			return false;
		}

		/** look up the name of one of the owner's categories by its id.

		 @return mixed category name, or false when no such category exists
		 @param int category id
		 */
		public function getCategoryById($category_id) {
			$rs = $this->con->query("SELECT category FROM ttrss_plugin_af_sort_bayes_categories WHERE id = " .
				(int)$category_id . " AND owner_uid = " . $this->ownerUid());

			if ($this->con->num_rows($rs) != 0) {
				return $this->con->fetch_result($rs, 0, "category");
			}

			return false;
		}

		/** see if the word is an already learnt word (in any category of the owner).

		 @return bool
		 @param string word
		 */
		public function wordExists($word) {
			$rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word = '" .
				$this->con->escape_string($word) . "' AND owner_uid = " . $this->ownerUid());

			return $this->con->num_rows($rs) != 0;
		}

		/** get details of a word in a category.

		 The lookup is restricted to rows of the current owner, like wordExists().

		 @return array ('count' => count), count is 0 when the word is not stored
		 @param  string word
		 @param  string category id
		 */
		public function getWord($word, $category_id) {
			$details = array();

			$rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE word = '" .
				$this->con->escape_string($word) . "' AND category_id = " . (int)$category_id .
				" AND owner_uid = " . $this->ownerUid());

			if ($this->con->num_rows($rs) == 0) {
				$details['count'] = 0;
			} else {
				$details['count'] = $this->con->fetch_result($rs, 0, "count");
			}

			return $details;
		}

		/** update a word in a category.
		 If the word is new in this category it is added, else only the count is updated.

		 @return mixed result of the underlying query() call
		 @param string word
		 @param int    count
		 @param string category id
		 */
		public function updateWord($word, $count, $category_id) {
			$oldword = $this->getWord($word, $category_id);
			$quoted_word = "'" . $this->con->escape_string($word) . "'";

			if (0 == $oldword['count']) {
				return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_wordfreqs (word, category_id, count, owner_uid) VALUES (" .
					$quoted_word . ", " .
					(int)$category_id . ", " .
					(int)$count . ", " .
					$this->ownerUid() . ")");
			}

			// category id is numeric everywhere else in this class, so cast it
			// instead of passing it through as a quoted string
			return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count + " . (int)$count .
				" WHERE category_id = " . (int)$category_id .
				" AND word = " . $quoted_word .
				" AND owner_uid = " . $this->ownerUid());
		}

		/** remove a word from a category.
		 The row is deleted when the stored count would drop to zero or below,
		 otherwise the count is decreased.

		 @return mixed result of the underlying query() call
		 @param string word
		 @param int  count
		 @param string category id
		 */
		public function removeWord($word, $count, $category_id) {
			$oldword = $this->getWord($word, $category_id);

			$where = " WHERE category_id = " . (int)$category_id .
				" AND word = '" . $this->con->escape_string($word) . "'" .
				" AND owner_uid = " . $this->ownerUid();

			if (0 != $oldword['count'] && 0 >= ($oldword['count'] - $count)) {
				return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_wordfreqs" . $where);
			}

			// when the word is not stored this UPDATE simply matches no row
			return $this->con->query("UPDATE ttrss_plugin_af_sort_bayes_wordfreqs SET count = count - " .
				(int)$count . $where);
		}

		/** update the probabilities of the categories and word count.
		 This function must be run after a set of training

		 @return bool always true
		 */
		public function updateProbabilities() {
			$owner = $this->ownerUid();

			// total number of words learnt by this owner, all categories together
			$rs = $this->con->query("SELECT SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_wordfreqs WHERE owner_uid = " . $owner);

			$total_words = $this->con->fetch_result($rs, 0, "total");

			if ($total_words == 0) {
				$this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count=0, probability=0 WHERE owner_uid = " . $owner);
				return true;
			}

			// LEFT JOIN keeps categories without any word; their SUM is NULL
			// and becomes 0 through the (int) cast below
			$rs = $this->con->query("SELECT tc.id AS category_id, SUM(count) AS total FROM ttrss_plugin_af_sort_bayes_categories AS tc
				LEFT JOIN ttrss_plugin_af_sort_bayes_wordfreqs AS tw ON (tc.id = tw.category_id) WHERE tc.owner_uid = " . $owner . " GROUP BY tc.id");

			while ($line = $this->con->fetch_assoc($rs)) {
				$word_count = (int)$line['total'];

				// %F is not locale aware: a plain float-to-string conversion may
				// use a decimal comma before PHP 8 and break the statement
				$proba = sprintf('%.14F', $word_count / $total_words);

				$this->con->query("UPDATE ttrss_plugin_af_sort_bayes_categories SET word_count = " . $word_count .
					", probability = " . $proba . " WHERE id = " . (int)$line['category_id']);
			}

			return true;
		}

		/** save a reference in the database.
		 Only the document id, the category id and the owner are stored;
		 $content is accepted but not written anywhere by this method.

		 @return mixed result of the underlying query() call
		 @param  string reference id, must be unique
		 @param  string category id
		 @param  string content of the reference (unused)
		 */
		public function saveReference($doc_id, $category_id, $content) {
			return $this->con->query("INSERT INTO ttrss_plugin_af_sort_bayes_references (document_id, category_id, owner_uid) VALUES
				('" . $this->con->escape_string($doc_id) . "', '" .
					(int)$category_id . "', " .
					$this->ownerUid() . ")");
		}

		/** get a reference from the database.

		 When $include_content is true and ttrss_entries holds a row whose guid
		 equals the document id, 'content' is the lowercased title followed by
		 the tag-stripped entry content, cut to max_document_length.

		 @return array  empty when the reference is unknown, else
		                reference(category_id => ..., id => ..., document_id => ... [, content => ...])
		 @param  string id
		 @param  bool   also load the text of the referenced entry
		 */
		public function getReference($doc_id, $include_content = true)
		{
			$ref = array();
			$rs = $this->con->query("SELECT * FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" .
				$this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->ownerUid());

			if ($this->con->num_rows($rs) == 0) {
				return $ref;
			}

			$ref['category_id'] = $this->con->fetch_result($rs, 0, 'category_id');
			$ref['id'] = $this->con->fetch_result($rs, 0, 'id');
			$ref['document_id'] = $this->con->fetch_result($rs, 0, 'document_id');

			if ($include_content) {
				$rs = $this->con->query("SELECT content, title FROM ttrss_entries WHERE guid = '" .
					$this->con->escape_string($ref['document_id']) . "'");

				if ($this->con->num_rows($rs) != 0) {
					$text = $this->con->fetch_result($rs, 0, 'title') . ' ' .
						strip_tags($this->con->fetch_result($rs, 0, 'content'));

					$ref['content'] = mb_substr(mb_strtolower($text), 0, $this->max_document_length);
				}
			}

			return $ref;
		}

		/** remove a reference from the database

		 @return mixed result of the underlying query() call
		 @param  string reference id
		 */
		public function removeReference($doc_id) {
			return $this->con->query("DELETE FROM ttrss_plugin_af_sort_bayes_references WHERE document_id='" .
				$this->con->escape_string($doc_id) . "' AND owner_uid = " . $this->ownerUid());
		}

	}