\documentclass{article}
\usepackage{amsthm}
\usepackage{color, colortbl}
\usepackage[table]{xcolor}
\usepackage{amssymb,latexsym}
\usepackage{fancyhdr,amssymb}
\usepackage{url}
\usepackage[]{algorithm2e}
\usepackage{mathtools}
\usepackage[shortlabels]{enumitem}
\usepackage{graphicx}
\usepackage{hyperref}
\hypersetup{
colorlinks=true,
citecolor=blue,
filecolor=black,
linkcolor=blue,
urlcolor=blue
}
\setlength\extrarowheight{6pt}
\title{Naive Bayes Classifier Model}
\author{Gabriel Lapointe}
\begin{document}
\section{Naive Bayes Classification}
Suppose that we work with a dataset of $n$ observations (rows) and $m$ output classes, where each observation is a text we want to classify.
\subsection{Definitions and Notations}
Let $T = \{x_1,x_2,\ldots,x_n\}$ be the multiset of texts, where every text $x_i$ is defined by a multiset of words $\{w_{i,1}, w_{i,2}, \ldots, w_{i,k_i}\}$. Note that the position of the texts in $T$ does not matter.
We note
\begin{equation} \label{eq:ExplainedVariableMatrix}
Y =\begin{bmatrix}
y_{1,1} & y_{1,2} & \ldots & y_{1,m} \\
y_{2,1} & y_{2,2} & \ldots & y_{2,m} \\
\vdots & \vdots & \ddots & \vdots \\
y_{n,1} & y_{n,2} & \ldots & y_{n,m}
\end{bmatrix}
\end{equation}
the matrix of binary output values (explained variables) $y_{i,j} \in \{0,1\}$ for $1 \leq i \leq n$ and $1 \leq j \leq m$.
Since the values $y_{i,j}$ are unknown at prediction time, the goal is to estimate $Y$. We note
\begin{equation} \label{eq:EstimatorMatrix}
\widehat{Y} =\begin{bmatrix}
\widehat{y}_{1,1} & \widehat{y}_{1,2} & \ldots & \widehat{y}_{1,m} \\
\widehat{y}_{2,1} & \widehat{y}_{2,2} & \ldots & \widehat{y}_{2,m} \\
\vdots & \vdots & \ddots & \vdots \\
\widehat{y}_{n,1} & \widehat{y}_{n,2} & \ldots & \widehat{y}_{n,m}
\end{bmatrix}
\end{equation}
the estimator matrix of $Y$, where $\widehat{y}_{i,j} \in [0,1]$ because each entry is a probability.
Between the estimated and the true values, there is generally a bias that we note
\begin{equation} \label{eq:BiasMatrix}
\epsilon =\begin{bmatrix}
\epsilon_{1,1} & \epsilon_{1,2} & \ldots & \epsilon_{1,m} \\
\epsilon_{2,1} & \epsilon_{2,2} & \ldots & \epsilon_{2,m} \\
\vdots & \vdots & \ddots & \vdots \\
\epsilon_{n,1} & \epsilon_{n,2} & \ldots & \epsilon_{n,m}
\end{bmatrix}
\end{equation}
where $\epsilon_{i,j} \in [-1, 1]$ because the bias may be negative or positive. If $y_{i,j} = 1$ and the model estimates $\widehat{y}_{i,j} = 0.97$, then the bias is positive because $\epsilon_{i,j} = 1 - 0.97 = 0.03$. However, if $y_{i,j} = 0$ and $\widehat{y}_{i,j} = 0.12$, then the bias is negative because $\epsilon_{i,j} = 0 - 0.12 = -0.12$.
We deduce the matrix equation
\begin{equation}
Y = \widehat{Y} + \epsilon
\end{equation}
where the operator $+$ is the element-wise matrix addition.
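For instance, here is a minimal Python sketch of this element-wise decomposition, computing $\epsilon = Y - \widehat{Y}$ entry by entry (plain lists of lists stand in for the matrices; the names are ours, not part of the model):
\begin{verbatim}
def bias(Y, Y_hat):
    """Element-wise bias matrix: eps[i][j] = Y[i][j] - Y_hat[i][j]."""
    return [[y - y_hat for y, y_hat in zip(row, row_hat)]
            for row, row_hat in zip(Y, Y_hat)]

# One text, two classes, as in the example above:
# bias([[1, 0]], [[0.97, 0.12]])  ->  [[0.03, -0.12]]  (up to float rounding)
\end{verbatim}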
Let $h : T \longrightarrow \mathbb{M}_{n \times m}([0,1])$ be a model defined by $h(x) = \widehat{Y}$, where the notation $\mathbb{M}_{n \times m}([0,1])$ means the set of $n \times m$ matrices for which each element is a real number in $[0,1]$.
The goal is to find a model $h$ such that the bias $\epsilon$ is minimized when $h$ is applied to $x$. Obtaining $\epsilon = \mathbf{0}_{n \times m}$ means that the model $h$ predicts perfectly how the texts will be classified.
\subsection{Theoretical Problem}
Let $C = \{c_1,c_2,\ldots, c_m\}$ be the set of all output class labels. We define the following random variables:
\begin{itemize}
\item $c \in C$ representing an output class label;
\item $x \in T$ representing a text.
\end{itemize}
For a given text $x_i$, by Bayes' theorem, we have
\begin{equation}
\mathbb{P}(c = c_j | x = x_i) = \frac{\mathbb{P}(x = x_i | c = c_j)\mathbb{P}(c = c_j)}{\mathbb{P}(x = x_i)}.
\end{equation}
The goal of the Naive Bayes classification is to find the output class label that maximizes the probability of the class $c_j$ given the observed text $x_i$. In other terms, this means that
\begin{equation} \label{eq_CMax}
c_{max} = \arg\max_{c \in C} \mathbb{P}(c = c_j | x = x_i) = \arg\max_{c \in C} \frac{\mathbb{P}(x = x_i | c = c_j)\mathbb{P}(c = c_j)}{\mathbb{P}(x = x_i)}.
\end{equation}
We make two assumptions that simplify equation \eqref{eq_CMax}:
\begin{enumerate}
\item The denominator $\mathbb{P}(x = x_i)$ is the same for all output class labels, so it does not affect the $\arg\max$, which is taken over $c$.
\item Two texts $x_i, x_j \in T$ with $i \neq j$ are conditionally independent given the class. This implies that $\mathbb{P}(x = x_i | c)$ is independent of $\mathbb{P}(x = x_j | c)$.
\end{enumerate}
Applying the first assumption gives
\begin{equation}
c_{max} = \arg\max_{c \in C} \mathbb{P}(x | c)\mathbb{P}(c)
\end{equation}
which can be written equivalently using the definition of $T$ as
\begin{equation} \label{eq:ElementX_cmax}
c_{max} = \arg\max_{c \in C} \mathbb{P}(x = x_1, x = x_2, \ldots, x = x_n | c)\mathbb{P}(c).
\end{equation}
Now, applying the second assumption in \eqref{eq:ElementX_cmax} gives
\begin{equation}
c_{max} = \arg\max_{c \in C} \mathbb{P}(c) \prod\limits_{i = 1}^n \mathbb{P}(x = x_i | c).
\end{equation}
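A remark on implementation: a product of many probabilities in $[0,1]$ quickly underflows in floating-point arithmetic, so one usually maximizes the sum of logarithms instead; since $\log$ is strictly increasing, the $\arg\max$ is unchanged. Here is a minimal Python sketch of this selection step, assuming the prior and the conditional factors have already been estimated (all names below are illustrative):
\begin{verbatim}
import math

def predict(prior, factors_by_class):
    """Return the class label maximizing
    log P(c) + sum_i log P(x_i | c)."""
    def log_score(c):
        return math.log(prior[c]) + sum(
            math.log(p) for p in factors_by_class[c])
    return max(prior, key=log_score)

# Hypothetical two-class call with already-estimated factors:
# predict({"toxic": 0.6, "non toxic": 0.4},
#         {"toxic": [0.2, 0.15], "non toxic": [0.04, 0.07]})
\end{verbatim}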
\subsection{Bag of Words Model}
Let $W$ be the set of words contained in $T$. Take $x_i = (w_{i,1}, w_{i,2}, \ldots, w_{i,k_i}) \in T$, a text containing $k_i$ words, where each word $w_{i,j} \in W$. We assume that a text cannot be empty, meaning that $k_i \geq 1$ for every $i$.
We want to use the maximum likelihood estimator $\widehat{P}$, built from the frequency of a word $w_{i,j}$, where $1 \leq j \leq k_i$, among the $n$ texts, given the output class label $c_l \in C$. The estimator $\widehat{P}$ estimates the likelihood function $P(x_1,x_2,\ldots,x_n ; c_l) = \prod\limits_{i=1}^n \mathbb{P}(x = x_i | c = c_l)$.
To calculate that frequency, we take the ratio between the number of occurrences of the word $w_{i,j}$ among the texts whose output class label is $c_l$ and the total number of words in those texts.
Let $f : W \times C \longrightarrow \mathbb{N}$ be the function defined by $f(w_{i,j}, c_l) = z$, where $z$ is the number of occurrences of the word $w_{i,j}$ among all texts classified with the output class label $c_l$.
Therefore, each factor of the likelihood is estimated by the smoothed relative frequency
\begin{equation} \label{eq:LikelihoodEstimatorWords}
\widehat{P}(w_{i,j} \in x_i | c) = \frac{f(w_{i,j}, c) + 1}{\sum\limits_{w \in W} f(w, c) + |W|}.
\end{equation}
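As an illustration, here is a minimal Python sketch of the count function $f$ and of the estimator \eqref{eq:LikelihoodEstimatorWords}, assuming the training texts are given as lists of words paired with their class labels (the names \texttt{word\_counts} and \texttt{likelihood} are ours, not part of the model):
\begin{verbatim}
from collections import Counter

def word_counts(texts, labels):
    """f(w, c): occurrences of word w across all texts labeled c."""
    counts = {}
    for words, c in zip(texts, labels):
        counts.setdefault(c, Counter()).update(words)
    return counts

def likelihood(word, c, counts, vocab):
    """Laplace-smoothed estimate:
    (f(w, c) + 1) / (sum of f(w', c) over w' in W  +  |W|)."""
    class_counts = counts[c]
    return (class_counts[word] + 1) / (
        sum(class_counts.values()) + len(vocab))
\end{verbatim}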
We also need the maximum likelihood estimator of $P(c = c_l) = \mathbb{P}(c = c_l)$ which is defined as the ratio between the number of texts classified as $c_l$ and the number of texts $n$. Let $T_c = \{x_i \in T : x_i \mapsto c_l\}$ be the set of all texts $x_i$ classified as $c_l$.
We note $|T_c|$ the cardinality of $T_c$. The estimator is defined as
\begin{equation}
\widehat{P}(c = c_l) = \frac{|T_c|}{n}.
\end{equation}
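Continuing the same illustrative sketch, the prior estimate is a plain ratio of counts:
\begin{verbatim}
def prior(c, labels):
    """Estimate P(c = c_l) = |T_c| / n from the training labels."""
    return sum(1 for label in labels if label == c) / len(labels)
\end{verbatim}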
The reason behind the Laplace smoothing, adding 1 to each count $f(w, c)$ in the numerator (and hence $|W|$ to the denominator), is to handle the case where $f(w_{i,j}, c) = 0$. If a word $w_{i,j}$ is never found in the texts with output class label $c_l$, then without smoothing $\widehat{P}(w_{i,j} \in x_i | c) = 0$, and a single such word drives the whole product to zero:
\begin{equation}
\widehat{P}(c) \prod\limits_{j=1}^{k_i} \widehat{P}(w_{i,j} \in x_i | c) = 0.
\end{equation}
\subsection{Example}
In this example, we want to classify texts as toxic or non toxic. Suppose that we have the training dataset shown in Table~\ref{tbl:TrainDataset}, where the texts have already been cleaned.
\begin{table}[!htb]
\caption{Training Dataset} \label{tbl:TrainDataset}
\centering
\begin{tabular}{|>{\raggedright}m{50mm} | m{15mm} |} \hline
\multicolumn{1}{|>{\centering}m{50mm}|}{\cellcolor{black!30}\textbf{Text}}
& \multicolumn{1}{|>{\centering}m{15mm}|}{\cellcolor{black!30}\textbf{Is Toxic}} \\ \hline
fuck fuck fuck shit shit & 1 \\ \hline
explanation natural processing language matter & 0 \\ \hline
hell fuck die mother fuck shit & 1 \\ \hline
block pollution environment climate natural mother & 0 \\ \hline
mother fuck stupid piece shit & 1 \\ \hline
\end{tabular}
\end{table}
We have to predict whether the text \textit{shit language yourself hell fuck shit} is toxic or not.
From Table~\ref{tbl:TrainDataset}, together with the text to classify, we set $T = \{x_1, x_2, x_3, x_4, x_5, x_t\}$ as
\begin{align*}
x_1 &= \{fuck, fuck, fuck, shit, shit\} \\
x_2 &= \{explanation, natural, processing, language, matter\} \\
x_3 &= \{hell, fuck, die, mother, fuck, shit\} \\
x_4 &= \{block, pollution, environment, climate, natural, mother\} \\
x_5 &= \{mother, fuck, stupid, piece, shit\} \\
x_t &= \{shit, language, yourself, hell, fuck, shit\}.
\end{align*}
where $x_t$ is the text to classify. The output classes of the five training texts are $(1,0,1,0,1)$.
Let $W = \{fuck, shit, explanation, natural, processing, language, matter, hell,$ $ die, mother, block, pollution, environment, climate, stupid, piece, yourself\}$ be the set of words used in $T$. We have $|W| = 17$. Let $W_t$ be the multiset of words in texts classified as toxic and $W_n$ the multiset of words in texts classified as non toxic. Thus, we have $|W_t| = 16$ and $|W_n| = 11$.
Let $c \in C = \{\texttt{"toxic"}, \texttt{"non toxic"}\}$ be the random variable representing an output class label and $w \in W$ the random variable representing a word.
\begin{align*}
\widehat{P}(w = \texttt{"fuck"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{6 + 1}{16 + 17} = \frac{7}{33} = 0.2121 \\
\widehat{P}(w = \texttt{"shit"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{4 + 1}{16 + 17} = \frac{5}{33} = 0.1515 \\
\widehat{P}(w = \texttt{"hell"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{1 + 1}{16 + 17} = \frac{2}{33} = 0.0606 \\
\widehat{P}(w = \texttt{"die"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{1 + 1}{16 + 17} = \frac{2}{33} = 0.0606 \\
\widehat{P}(w = \texttt{"mother"} | \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{2 + 1}{16 + 17} = \frac{3}{33} = 0.0909 \\
\widehat{P}(w = \texttt{"stupid"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{1 + 1}{16 + 17} = \frac{2}{33} = 0.0606 \\
\widehat{P}(w = \texttt{"piece"} | c = \texttt{"toxic"}) &= \frac{f(w, c)+1}{|W_t| + |W|} = \frac{1 + 1}{16 + 17} = \frac{2}{33} = 0.0606 \\
\widehat{P}(w = \texttt{"mother"} | c = \texttt{"non toxic"}) &= \frac{f(w, c)+1}{|W_n| + |W|} = \frac{1 + 1}{11 + 17} = \frac{2}{28} = 0.0714 \\
\widehat{P}(w = \texttt{"natural"} | c = \texttt{"non toxic"}) &= \frac{f(w, c)+1}{|W_n| + |W|} = \frac{2 + 1}{11 + 17} = \frac{3}{28} = 0.1071
\end{align*}
Note that for a word $w \in \{explanation, processing, language, matter, block, $ $pollution, environment, climate\}$, the probability is $\widehat{P}(w | c = \texttt{"non toxic"}) = 0.0714$ and $\widehat{P}(w | c = \texttt{"toxic"}) = 0.0303$.
Since the dataset contains 3 texts classified as toxic and 2 texts classified as non toxic, we have
\begin{align*}
\widehat{P}(c = \texttt{"toxic"}) &= \frac{3}{5} = 0.6 \\
\widehat{P}(c = \texttt{"non toxic"}) &= \frac{2}{5} = 0.4.
\end{align*}
Let's predict in which output class label $x_t$ is classified. We use equation \eqref{eq:LikelihoodEstimatorWords} and drop the common denominator $\mathbb{P}(x = x_t)$, which does not depend on $c$, so the two scores below are unnormalized:
\begin{align*}
\widehat{P}(c = \texttt{"toxic"} | x = x_t) &\propto 0.6 \times 0.1515^2 \times 0.0303 \times 0.0303 \times 0.0606 \times 0.2121 \\ &= 0.000000163 \\
\widehat{P}(c = \texttt{"non toxic"} | x = x_t) &\propto 0.4 \times 0.0357^2 \times 0.0714 \times 0.0357 \times 0.0357 \times 0.0357 \\ &= 0.000000002.
\end{align*}
Since the first score is larger, it follows that $x_t$ is classified as a toxic text.
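As a check on the arithmetic, the following Python sketch reproduces this example end to end with the \texttt{word\_counts}, \texttt{likelihood} and \texttt{prior} functions sketched earlier; small deviations from the values above come from their four-decimal rounding:
\begin{verbatim}
texts = [
    "fuck fuck fuck shit shit".split(),
    "explanation natural processing language matter".split(),
    "hell fuck die mother fuck shit".split(),
    "block pollution environment climate natural mother".split(),
    "mother fuck stupid piece shit".split(),
]
labels = [1, 0, 1, 0, 1]          # 1 = toxic, 0 = non toxic
x_t = "shit language yourself hell fuck shit".split()

# W includes the words of the text to classify, so |W| = 17.
vocab = {w for words in texts for w in words} | set(x_t)

counts = word_counts(texts, labels)
for c in (1, 0):
    score = prior(c, labels)
    for w in x_t:
        score *= likelihood(w, c, counts, vocab)
    print(c, score)               # 1: ~1.6e-07, 0: ~1.7e-09
\end{verbatim}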
\end{document}