-
Notifications
You must be signed in to change notification settings - Fork 95
/
NaiveBayesianClassifier.m
227 lines (183 loc) · 10.8 KB
/
NaiveBayesianClassifier.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
(*
Implementation of naive Bayesian classifier generation in Mathematica
Copyright (C) 2013 Anton Antonov
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Written by Anton Antonov,
7320 Colbury Ave,
Windermere, Florida, USA.
*)
(*
Mathematica is (C) Copyright 1988-2013 Wolfram Research, Inc.
Protected by copyright law and international treaties.
Unauthorized reproduction or distribution subject to severe civil
and criminal penalties.
Mathematica is a registered trademark of Wolfram Research, Inc.
*)
(* Version 0.8 *)
(* This package contains definitions for generation of naive Bayesian classifiers. *)
(* I am not sure that this is the best design of the functionality and signatures. It is on my TODO list to review the design and write functions that are better at handling unexpected arguments. Another TODO task is the ability to specify a different distribution approximation range for each variable. Right now a single range is applied to all variables -- see the range argument in the function definitions below. *)
(* Open the package context. The usage messages below are attached to the
   package's public symbols before the private implementation section starts. *)
BeginPackage["NaiveBayesianClassifier`"]
(* Builds the probability function for a single label. *)
MakeBayesianClassifier::usage = "MakeBayesianClassifier[dataArg_?ArrayQ, labelArg_, rangeArg : (Automatic | {_?NumberQ, _?NumberQ}), nbins_Integer] makes probability function of the label labelArg by approximating the variable distributions using rangeArg and nbins."
(* Builds one probability function per label found in the data. *)
MakeBayesianClassifiers::usage = "MakeBayesianClassifiers[dataArg_?ArrayQ, nbins_Integer] makes Bayesian functions for each label in the argument dataArg by approximating the variable distributions using nbins number of bins."
(* Two-label classification of a single record using threshold parameters. *)
NBCClassify::usage = "NBCClassify[{cf_,cfLabel_}, {ncf_,ncfLabel_}, thA_?NumberQ, thNA_?NumberQ, x_?VectorQ, inds:(All|{_Integer..})] applies two piece-wise functions, cf and ncf, derived from data with two labels {cfLabel,ncfLabel} to x[[inds]]. The values cf[x[[inds]]] and ncf[x[[inds]]] are for the probabilities to get cfLabel and ncfLabel respectively. The tuning parameters thA and thNA are used to decide is the overall classification result -- see the function definition."
(* Accuracy summary of NBCClassify over test data. *)
NBCClassificationStatistics::usage = "NBCClassificationStatistics[{cf_,cfLabel_}, {ncf_,ncfLabel_}, thA_?NumberQ, thNA_?NumberQ, testData_?ArrayQ, inds:(All|{_Integer..})] computes statistics for the performance of a naive Bayesian classifier made of cf and ncf over test data. The function NBCClassify is used internally. This function is superseded by NBCClassificationSuccess, which has a different signature."
(* Per-label and overall success fractions for an arbitrary classification function. *)
NBCClassificationSuccess::usage = "NBCClassificationSuccess[classFunc, testDataArray, lbls] finds the classification success using classFunc over the test data testDataArray for each classification label in lbls. If the last argument, lbls, is omitted then Union[testDataArray[[All,-1]]] is taken as the set of labels. The returned result is a set of rules {{_,True|False}->_?NumberQ ..}. The rules {_,True}->_ are for the fractions of correct guesses; the rules {_,False}->_ are for the fractions of incorrect guesses. The rules {_,All}->_ are for the classification success fractions using all records of testDataArray."
(* Same as NBCClassificationSuccess, but returning integer counts instead of fractions. *)
NBCClassificationSuccessCounts::usage = "NBCClassificationSuccessCounts[classFunc, testDataArray, lbls] finds number of successful classifications using classFunc over the test data testDataArray for each classification label in lbls. If the last argument, lbls, is omitted then Union[testDataArray[[All,-1]]] is taken as the set of labels. The returned result is a set of rules {{_,True|False}->_Integer ..}. The rules {_,True}->_ are for the number of correct guesses; the rules {_,False}->_ are for the number of incorrect guesses. The rules {_,All}->_ are for the number of successful classifications using all records of testDataArray."
(* Everything between Begin and End is defined in the package's private context. *)
Begin["`Private`"]
(* Numeric variables: returns a pure function of x giving the ratio
   P(X=x|C[i])/P(X=x), estimated from histograms.
   data    -- values of the variable for the records of one class;
   allData -- values of the variable for all records;
   range   -- Automatic (use {Min[data], Max[data]}) or an explicit {min, max};
   nbins   -- approximate number of bins requested from FindDivisions.
   Bins are half-open, [left, right). Outside all bins, or where no record of
   allData falls into the bin, the returned function gives 0. *)
Clear[MakeBayesianFunction]
MakeBayesianFunction[data : {_?NumberQ ..}, allData : {_?NumberQ ..}, range : (Automatic | {_?NumberQ, _?NumberQ}), nbins_Integer] :=
  Module[{edges, classFreqs, totalFreqs},
    (* Bin boundaries shared by the class histogram and the overall histogram. *)
    edges = FindDivisions[If[range === Automatic, {Min[data], Max[data]}, range], nbins];
    (* P(X=x|C[i]) per bin. *)
    classFreqs = N[BinCounts[data, {edges}]/Length[data]];
    (* P(X=x) per bin. *)
    totalFreqs = N[BinCounts[allData, {edges}]/Length[allData]];
    (* The slot # is deliberately left unbound by the named-argument Function,
       so that it becomes the argument of the pure function built by With. *)
    With[{f =
        Piecewise[
          MapThread[
            Function[{bin, c, ac}, {If[ac > 0, c/ac, 0], First[bin] <= # < Last[bin]}],
            {Partition[edges, 2, 1], classFreqs, totalFreqs}
          ]
        ]},
      f &]
  ];
(* String-valued variables need no binning, so any extra arguments (the range
   and the number of bins passed by MakeBayesianClassifier) are ignored and the
   call is forwarded to the two-argument definition.
   The extra arguments are matched with __ (one or more) rather than ___ so that
   this rule can never match the two-argument call it forwards to, whatever the
   order in which the definitions are tried. *)
MakeBayesianFunction[data : {_String ..}, allData : {_String ..}, dummy__] :=
  MakeBayesianFunction[data, allData];
(* Categorical (string) variables: returns a pure function of x giving the ratio
   P(X=x|C[i])/P(X=x), estimated from relative frequencies.
   data    -- values of the variable for the records of one class;
   allData -- values of the variable for all records.
   The returned function gives 0 for any value that does not occur in data. *)
MakeBayesianFunction[data : {_String ..}, allData : {_String ..}] :=
  Module[{classValues, classFreq, totalFreq},
    (* Distinct values seen in the class, in order of first appearance. *)
    classValues = Tally[data][[All, 1]];
    (* P(X=x|C[i]) as replacement rules; unseen values map to 0. *)
    classFreq = Append[Map[First[#] -> Last[#]/Length[data] &, Tally[data]], _ -> 0];
    (* P(X=x) as replacement rules; unseen values map to 0. *)
    totalFreq = Append[Map[First[#] -> Last[#]/Length[allData] &, Tally[allData]], _ -> 0];
    (* The slot # inside the named-argument Function stays unbound and becomes
       the argument of the pure function built by With. *)
    With[{f =
        Piecewise[
          Map[
            Function[{v}, {If[(v /. totalFreq) > 0, N[(v /. classFreq)/(v /. totalFreq)], 0], # == v}],
            classValues
          ]
        ]},
      f &]
  ];
(* Builds the naive Bayesian probability function for one label.
   For every variable (every column of data except the last) the ratio
   P(X=x|C[i])/P(X=x) is obtained from MakeBayesianFunction; the product of
   these ratios is multiplied by the class prior P(C[i]), following
   P(C[i]|X=x) = P(X=x|C[i]) P(C[i]) / P(X=x).
   The last column of data holds the labels, and label is one of them.
   The result is a pure function to be applied to a vector shaped like
   data[[1, 1 ;; -2]], i.e. a record without its label. *)
Clear[MakeBayesianClassifier]
MakeBayesianClassifier[data_?ArrayQ, label_, range : (Automatic | {_?NumberQ, _?NumberQ}), nbins_Integer] :=
  Module[{labelRows, columnFuncs, prior},
    (* Records that carry the requested label. *)
    labelRows = Select[data, Last[#] == label &];
    (* One ratio function per variable column; the label column is dropped by Most. *)
    columnFuncs =
      MapThread[
        MakeBayesianFunction[#1, #2, range, nbins] &,
        {Most[Transpose[labelRows]], Most[Transpose[data]]}];
    (* P(C[i]): fraction of the records that have this label. *)
    prior = N[Length[labelRows]/Length[data]];
    With[{fs = columnFuncs, factor = prior},
      factor*Apply[Times, MapThread[#1[#2] &, {fs, #}]] &]
  ];
(* Builds a naive Bayesian probability function for every label in data.
   data  -- array whose last column holds the labels;
   nbins -- number of bins passed on to MakeBayesianClassifier, which is called
            with range Automatic for each label.
   Returns a list of rules label -> probabilityFunction, with the labels in
   the sorted order produced by Union. *)
Clear[MakeBayesianClassifiers]
MakeBayesianClassifiers[data_?ArrayQ, nbins_Integer] :=
  Module[{labels},
    labels = Union[data[[All, -1]]];
    Thread[labels -> Map[Function[{l}, MakeBayesianClassifier[data, l, Automatic, nbins]], labels]]
  ];
(* NBC classification of a single record for data with two labels.
   The short form assumes the labels {True, False}: cf is the probability
   function for True and ncf the one for False. The long form takes explicit
   {function, label} pairs.
   To use functions made with MakeBayesianClassifier(s) over data with more
   than two labels, other classification functions have to be written.
   thA and thNA are threshold parameters, x is a data record, and inds selects
   the elements of x to which cf and ncf are applied, i.e. x[[inds]].
   Decision order:
     1. cf value >= thA, or 1 - ncf value >= thNA  -> cfLabel
     2. ncf value > 0.5                            -> ncfLabel
     3. cf value > ncf value                       -> cfLabel
     4. otherwise                                  -> ncfLabel *)
Clear[NBCClassify]
NBCClassify[cf_, ncf_, thA_?NumberQ, thNA_?NumberQ, x_?VectorQ, inds:(All|{_Integer..}):All] :=
  NBCClassify[{cf, True}, {ncf, False}, thA, thNA, x, inds];
NBCClassify[{cf_,cfLabel_}, {ncf_,ncfLabel_}, thA_?NumberQ, thNA_?NumberQ, x_?VectorQ, inds:(All|{_Integer..}):All] :=
  With[{pFor = cf[x[[inds]]], pAgainst = ncf[x[[inds]]]},
    Which[
      pFor >= thA || (1 - pAgainst) >= thNA, cfLabel,
      pAgainst > 0.5, ncfLabel,
      pFor > pAgainst, cfLabel,
      True, ncfLabel
    ]
  ];
(* Accuracy of the two-label classifier NBCClassify over test data.
   The short form assumes the labels {True, False}; the long form takes explicit
   {function, label} pairs. thA, thNA and inds are passed on to NBCClassify
   unchanged. The last column of testData holds the true labels.
   Returns a list of {description, fraction} pairs:
     {"all records", f}, {Row[{cfLabel, " records"}], f}, {Row[{ncfLabel, " records"}], f}
   where f is the fraction of correctly classified records in that group.
   Every record is classified exactly once and the per-label fractions are
   read off that single pass. This assumes cf and ncf are deterministic.
   A group without records yields Indeterminate (the value of 0/0) without
   emitting division-by-zero messages. *)
Clear[NBCClassificationStatistics]
NBCClassificationStatistics[cf_, ncf_, thA_?NumberQ, thNA_?NumberQ, testData_?ArrayQ, inds:(All|{_Integer..}):All] :=
  NBCClassificationStatistics[{cf,True}, {ncf,False}, thA, thNA, testData, inds];
NBCClassificationStatistics[{cf_,cfLabel_}, {ncf_,ncfLabel_}, thA_?NumberQ, thNA_?NumberQ, testData_?ArrayQ, inds:(All|{_Integer..}):All] :=
  Module[{actual, hits, fraction},
    (* True labels of the test records. *)
    actual = testData[[All, -1]];
    (* hits[[k]] tells whether record k was classified correctly. *)
    hits =
      MapThread[Equal,
        {NBCClassify[{cf,cfLabel}, {ncf,ncfLabel}, thA, thNA, #, inds] & /@ testData, actual}];
    (* Fraction of True entries; guarded against an empty group. *)
    fraction =
      Function[{flags},
        If[flags === {}, Indeterminate, Count[flags, True]/Length[flags]]];
    Transpose[{
      {"all records", Row[{cfLabel, " records"}], Row[{ncfLabel, " records"}]},
      N@{
        fraction[hits],
        (* TrueQ keeps the selector strictly Boolean, matching Select's
           "criterion yields True" rule for labels compared with ==. *)
        fraction[Pick[hits, TrueQ[# == cfLabel] & /@ actual]],
        fraction[Pick[hits, TrueQ[# == ncfLabel] & /@ actual]]}}]
  ];
(* One message text shared by both public functions. It is issued by
   NBCClassificationSuccessCountsInternal when a requested label does not occur
   among the labels of the data array: `1` is filled with the requested label
   and `2` with the labels found in the data. *)
NBCClassificationSuccess::nlbl =
NBCClassificationSuccessCounts::nlbl =
"The specified label `1` is not one of the data array labels `2`.";
(* Counts correct and incorrect guesses of classFunc over a labeled data array.
   classFunc -- function applied to each record without its label, Most[record];
   dataArr   -- matrix whose last column holds the true labels;
   labels    -- labels for which per-label counts are requested;
   mHead     -- symbol under which the nlbl message is issued when a requested
                label does not occur in dataArr; such a label gets the counts 0, 0.
   Returns the flat list of rules
     {{lbl, True} -> nCorrect, {lbl, False} -> nIncorrect, ...,
      {All, True} -> nCorrect, {All, False} -> nIncorrect}
   with the per-label rules in the order of labels.
   classFunc is applied exactly once per record; per-label counts are derived
   from that single pass. This assumes classFunc is deterministic. *)
NBCClassificationSuccessCountsInternal[classFunc_, dataArr_?MatrixQ, labels_, mHead_] :=
  Module[{actual, dataLabels, hits, countBoth, perLabel},
    actual = dataArr[[All, -1]];
    dataLabels = Union[actual];
    (* hits[[k]] is True/False according to whether record k was guessed right. *)
    hits = MapThread[Equal, {classFunc[Most[#]] & /@ dataArr, actual}];
    countBoth = Function[{flags}, {Count[flags, True], Count[flags, False]}];
    perLabel =
      Table[
        If[! MemberQ[dataLabels, lbl],
          Message[mHead::nlbl, lbl, dataLabels];
          {0, 0},
          (*ELSE*)
          (* TrueQ keeps the selector strictly Boolean, matching Select's
             "criterion yields True" rule for labels compared with ==. *)
          countBoth[Pick[hits, TrueQ[# == lbl] & /@ actual]]
        ], {lbl, labels}];
    Join[
      Flatten[
        MapThread[{{#1, True} -> #2[[1]], {#1, False} -> #2[[2]]} &, {labels, perLabel}],
        1],
      {{All, True} -> Count[hits, True], {All, False} -> Count[hits, False]}]
  ];
(* Public entry points for the success counts.
   Without an explicit label list, the distinct values of the last column of
   dataArr are used as labels. The counting itself is done by
   NBCClassificationSuccessCountsInternal, with messages issued under
   NBCClassificationSuccessCounts. *)
NBCClassificationSuccessCounts[classFunc_, dataArr_?MatrixQ] :=
  With[{dataLabels = Union[dataArr[[All, -1]]]},
    NBCClassificationSuccessCounts[classFunc, dataArr, dataLabels]
  ];
NBCClassificationSuccessCounts[classFunc_, dataArr_?MatrixQ, labels_?VectorQ] :=
  NBCClassificationSuccessCountsInternal[classFunc, dataArr, labels, NBCClassificationSuccessCounts]
(* Public entry points for the success fractions.
   Without an explicit label list, the distinct values of the last column of
   dataArr are used as labels.
   The counts from NBCClassificationSuccessCountsInternal (messages issued
   under NBCClassificationSuccess) are divided by the size of the group they
   refer to: the number of records carrying the label, or the number of all
   records for the All rules.
   Returns rules of the form {label | All, True | False} -> fraction. *)
NBCClassificationSuccess[classFunc_, dataArr_?MatrixQ] :=
  NBCClassificationSuccess[classFunc, dataArr, Union[dataArr[[All, -1]]] ];
NBCClassificationSuccess[classFunc_, dataArr_?MatrixQ, labels_?VectorQ] :=
  Block[{countRules, sizeRules},
    countRules =
      NBCClassificationSuccessCountsInternal[classFunc, dataArr, labels, NBCClassificationSuccess];
    (* Group sizes: label -> number of records with that label, All -> number of records.
       sizeRules is localized here so it no longer leaks into the private context. *)
    sizeRules =
      Map[# -> If[# === All, Length[dataArr], Count[dataArr[[All, -1]], #]] &, Union[countRules[[All, 1, 1]]]];
    Map[#[[1]] -> #[[2]]/N[#[[1, 1]] /. sizeRules] &, countRules]
  ];
(* Close the private context opened by Begin. *)
End[]
(* Close the package context opened by BeginPackage. *)
EndPackage[]