-
Notifications
You must be signed in to change notification settings - Fork 0
/
Naive_Bayes_Classifier.py
161 lines (125 loc) · 5.27 KB
/
Naive_Bayes_Classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import helpers
from Types import Iris_Data_Sample, Iris_Dataset, Matrix_Of_Strings, Summary, Summary_By_Class
from typing import Set, Dict
import numpy as np
class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier for Iris-style datasets.

    Workflow: load a dataset (loadDatasetFromFile / loadDataset /
    loadDatasetFromArr), call train() to build per-class Gaussian
    summaries, then predict() new samples and computeLoss() to score.
    """

    def __init__(self):
        # Training samples; each sample exposes .features and .category.
        self.dataset: Iris_Dataset = []
        self.noOfFeatures = 0
        self.noOfSamples = 0
        # Per-class mean/stddev summaries, populated by train().
        self.summaryByClass: Summary_By_Class = dict()

    def _parseSimpleMetadata(self):
        """Cache sample and feature counts for the loaded dataset."""
        self.noOfSamples = len(self.dataset)
        # An empty dataset has no sample to inspect for a feature count.
        self.noOfFeatures = len(self.dataset[0].features) if self.dataset else 0

    def loadDatasetFromFile(self, filename):
        """Load the dataset from a JSON file of data samples.

        I/O and parsing errors propagate to the caller unchanged
        (the previous try/raise wrapper was a no-op and is removed).
        """
        with open(filename) as fp:
            dataset = json.load(fp)
        self.dataset = helpers.convertJsonDatasetToIrisDataset(dataset)
        self._parseSimpleMetadata()

    def loadDataset(self, dataset: Iris_Dataset):
        """Assign the dataset to be an array of pre-loaded samples."""
        self.dataset = dataset
        self._parseSimpleMetadata()

    def loadDatasetFromArr(self, data: Matrix_Of_Strings):
        """ Load the dataset from an array(array(strings))
        Something like this
        [
            ["1.0", "3.2", "6.9", "4.20"],
            ["1.0", "3.2", "6.9", "4.20"],
            ["1.0", "3.2", "6.9", "4.20"],
            <-- Features --> <- Category ->
        ]
        """
        [_noOfFeatures, _noOfSamples,
            dataset] = helpers.convertArrayToDatasamples(data)
        self.dataset = dataset
        self._parseSimpleMetadata()

    def allCategories(self, data: Iris_Dataset):
        """Return the distinct categories present in `data` (order arbitrary)."""
        # Set comprehension avoids shadowing the method name with a local.
        categories: Set[str] = {sample.category for sample in data}
        return list(categories)

    @staticmethod
    def describeData(dataset: Iris_Dataset):
        """Summarize a dataset: sample/feature counts plus per-feature
        mean and standard deviation (population std, via np.std)."""
        noOfSamples = len(dataset)
        noOfFeatures = len(dataset[0].features) if dataset else 0
        summary: Summary = Summary(
            noOfSamples=noOfSamples,
            noOfFeatures=noOfFeatures,
            mean=[],
            stddev=[]
        )
        for i in range(noOfFeatures):
            featureValues = [sample.features[i] for sample in dataset]
            summary.mean.append(np.mean(featureValues))
            summary.stddev.append(np.std(featureValues))
        return summary

    @staticmethod
    def separateByClass(dataset: Iris_Dataset):
        """Group samples by their category into a dict of lists."""
        separatedByClass: Dict[str, Iris_Dataset] = dict()
        for sample in dataset:
            separatedByClass.setdefault(sample.category, []).append(sample)
        return separatedByClass

    @staticmethod
    def describeByClass(dataset: Iris_Dataset):
        """Build a per-category Summary (mean/stddev per feature)."""
        separatedByClass = NaiveBayesClassifier.separateByClass(dataset)
        summaryByClass: Summary_By_Class = dict()
        for category, samples in separatedByClass.items():
            summaryByClass[category] = NaiveBayesClassifier.describeData(samples)
        return summaryByClass

    def train(self):
        """Fit the model: summarize the loaded dataset per class."""
        self.summaryByClass = NaiveBayesClassifier.describeByClass(
            self.dataset)

    def computeClassProbabilities(self, newSample: Iris_Data_Sample):
        """Return {category: unnormalized posterior} for `newSample`.

        Bayes theorem: Posterior = (likelihood * prior) / evidence,
        i.e. P(class|X) = (P(X|class) * P(class)) / P(X).
        The evidence P(X) is constant across classes, so it is kept at 1
        and the returned values are unnormalized posteriors.
        """
        probabilities: Dict[str, float] = dict()
        for category, summary in self.summaryByClass.items():
            # Prior: fraction of training samples belonging to this class.
            priorProbability = summary.noOfSamples / self.noOfSamples
            evidence = 1
            likelihood = 1
            # Naive independence assumption: multiply per-feature Gaussians.
            for i in range(summary.noOfFeatures):
                m = summary.mean[i]      # mean of ith feature of this class
                s = summary.stddev[i]    # stddev of ith feature of this class
                x = newSample.features[i]  # ith feature of given sample
                likelihood *= helpers.gaussianPdf(x, m, s)
            # P(class=category | newSample), up to the constant evidence.
            probabilities[category] = (likelihood * priorProbability) / evidence
        return probabilities

    def predict(self, newSample: Iris_Data_Sample):
        """Return (most likely category, all class probabilities)."""
        probabilities = self.computeClassProbabilities(newSample)
        return helpers.customArgmax(probabilities), probabilities

    def _computeLoss(self, dataset: Iris_Dataset) -> float:
        """Percentage (0-100) of samples in `dataset` that are misclassified."""
        noOfSamples = len(dataset)
        if noOfSamples == 0:
            # No samples -> no misclassifications; avoids ZeroDivisionError.
            return 0.0
        incorrect = sum(
            1 for sample in dataset if self.predict(sample)[0] != sample.category)
        return incorrect * 100 / noOfSamples

    def computeLoss(self):
        """Misclassification percentage on the training dataset."""
        return self._computeLoss(self.dataset)