\chapter{Maximum a posteriori (MAP) estimate}
\section{MLE as a special case of MAP}
MAP estimation usually comes up in a Bayesian setting because, as the name suggests, it maximizes the posterior distribution, not only the likelihood \cite{mle-map}.
Recall Bayes' theorem:
\begin{equation}
p(\theta \mid data) = \frac{p(data \mid \theta) \cdot p(\theta)}{p(data)}
\end{equation}
Definition (note that the evidence $p(data)$ does not depend on $\theta$, so it can be dropped from the maximization):
\begin{equation}
\begin{split}
\hat{\theta}_{\text{MAP}} &= \arg\max_{\theta}\, p(\theta \mid data) \\
&= \arg\max_{\theta} \frac{p(data \mid \theta)\, p(\theta)}{p(data)} \\
&= \arg\max_{\theta}\, p(data \mid \theta)\, p(\theta) \\
&= \arg\max_{\theta} \log\big(p(data \mid \theta)\, p(\theta)\big) \\
&= \arg\max_{\theta} \big[ \log p(data \mid \theta) + \log p(\theta) \big]
\end{split}
\end{equation}
The objective is the log-likelihood (the quantity that MLE maximizes) plus the log-prior, so MAP and MLE coincide exactly when the log-prior is constant in $\theta$, as the next section shows.
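As a concrete sanity check (a standard textbook example, not part of the cited derivation): suppose we observe $k$ heads in $n$ coin flips and place a $Beta(\alpha, \beta)$ prior on the heads probability $\theta$. Dropping constants, the log-posterior is $(k + \alpha - 1)\log\theta + (n - k + \beta - 1)\log(1-\theta)$, and setting its derivative to zero gives
\begin{equation}
\hat{\theta}_{\text{MAP}} = \frac{k + \alpha - 1}{n + \alpha + \beta - 2}.
\end{equation}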
\section{Uniform distribution priors}
If the prior is uniform, i.e.\ $p(\theta) = \frac{1}{N}$ for every admissible value of $\theta$, then the log-prior is a constant and drops out of the maximization. This shows that MLE is a special case of MAP with a uniform prior \cite{mle-map}.
\begin{equation}
\begin{split}
\hat{\theta}_{\text{MAP}} &= \arg\max_{\theta}\, p(data \mid \theta)\, p(\theta) \\
&= \arg\max_{\theta} \big[ \log p(data \mid \theta) + \log p(\theta) \big] \\
&= \arg\max_{\theta} \big[ \log p(data \mid \theta) + \text{constant} \big] \\
&= \arg\max_{\theta} \log p(data \mid \theta) \\
&= \hat{\theta}_{\text{MLE}}
\end{split}
\end{equation}
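Continuing the coin-flip example: the uniform prior on $[0,1]$ is $Beta(1,1)$, i.e.\ $\alpha = \beta = 1$, and the MAP formula above reduces to
\begin{equation}
\hat{\theta}_{\text{MAP}} = \frac{k + 1 - 1}{n + 1 + 1 - 2} = \frac{k}{n} = \hat{\theta}_{\text{MLE}},
\end{equation}
matching the general argument.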
\section{Normal distribution priors or L2 regularization}
Now consider a Normal prior on each coefficient. Recall the Normal density:
\begin{equation}
Normal(x \mid \mu,\sigma) = \frac{1}{\sigma \sqrt{2\pi}}\, e^{-\frac{(x - \mu)^2}{2\sigma^2}}
\end{equation}
With a linear model, Gaussian noise on the responses, and independent $Normal(0, \tau)$ priors on each coefficient $\beta_j$, MAP estimation is exactly L2 regularization (ridge regression):
\begin{equation}
\begin{split}
\hat{\beta}_{\text{MAP}} & = \arg\max_{\beta} \Big[ \log \prod_{i=1}^{n} \frac{1}{\sigma\sqrt{2\pi}}e^{-\frac{(y_i- (\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2}{2\sigma^2}} + \log \prod_{j=0}^{p} \frac{1}{\tau\sqrt{2\pi}}e^{-\frac{\beta_j^2}{2\tau^2}} \Big] \\
& = \arg\max_{\beta} \Big[- \sum_{i=1}^{n} {\frac{(y_i- (\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2}{2\sigma^2}} - \sum_{j=0}^{p} {\frac{\beta_j^2}{2\tau^2}} \Big]\\
& = \arg\min_{\beta} \frac{1}{2\sigma^2} \big[ \sum_{i=1}^{n} (y_i-(\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2 + \frac{\sigma^2}{\tau^2} \sum_{j=0}^{p} \beta_j^2 \big] \\
& = \arg\min_{\beta} \big[ \sum_{i=1}^{n} (y_i-(\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2 + \lambda \sum_{j=0}^{p} \beta_j^2 \big]
\end{split}
\end{equation}
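Reading off the last line, the regularization strength is $\lambda = \sigma^2 / \tau^2$: the tighter the prior (smaller $\tau$), the stronger the shrinkage. As a brief aside (assuming, as in the sums over $j = 0, \dots, p$ above, that the intercept is penalized along with the other coefficients), writing the model in matrix form with design matrix $X$ gives the familiar ridge closed form
\begin{equation}
\hat{\beta}_{\text{MAP}} = (X^{\top} X + \lambda I)^{-1} X^{\top} y.
\end{equation}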
\section{Laplace priors or L1 regularization}
Placing a Laplace prior on each coefficient instead yields L1 regularization. Recall the Laplace density:
\begin{equation}
Laplace(x \mid \mu, b) = \frac{1}{2b} e^{-\frac{|x-\mu|}{b}}
\end{equation}
With independent $Laplace(0, b)$ priors on the coefficients, MAP estimation is exactly L1 regularization (the lasso):
\begin{equation}
\begin{split}
\hat{\beta}_{\text{MAP}} & = \arg\max_{\beta} \Big[ \log \prod_{i=1}^{n} \frac{1}{\sigma\sqrt{2\pi}}e^{-\frac{(y_i- (\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2}{2\sigma^2}} + \log \prod_{j=0}^{p} \frac{1}{2b}e^{-\frac{|\beta_j|}{b}} \Big] \\
&= \arg\max_{\beta} \Big[- \sum_{i=1}^{n} {\frac{(y_i- (\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2}{2\sigma^2}} - \sum_{j=0}^{p} {\frac{|\beta_j|}{b}} \Big]\\
&= \arg\min_{\beta} \frac{1}{2\sigma^2} \big[ \sum_{i=1}^{n} (y_i-(\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2 + \frac{2\sigma^2}{b} \sum_{j=0}^{p} |\beta_j| \big] \\
&= \arg\min_{\beta} \big[ \sum_{i=1}^{n} (y_i-(\beta_0 + \beta_1 x_{i,1} + \dots + \beta_p x_{i,p}))^2 + \lambda \sum_{j=0}^{p} |\beta_j| \big]
\end{split}
\end{equation}
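Here $\lambda = 2\sigma^2 / b$. Unlike the L2 case, this objective has no closed-form solution in general. As a sketch of the standard special case (not part of the derivation above): for an orthonormal design, $X^{\top}X = I$, the objective separates coordinate-wise and the minimizer is the soft-thresholded least-squares estimate
\begin{equation}
\hat{\beta}_j = \operatorname{sign}\big(\hat{\beta}_j^{\,\text{OLS}}\big)\, \max\Big( \big|\hat{\beta}_j^{\,\text{OLS}}\big| - \tfrac{\lambda}{2},\; 0 \Big),
\end{equation}
which is why the L1 penalty sets some coefficients exactly to zero.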