import numpy as np


# Softmax activation function
def softmax(z):
    # Shift each column so its highest value is 0.
    # This stabilizes the computation, since exp can overflow for large inputs.
    z_norm = z - np.max(z, axis=0, keepdims=True)
    exp = np.exp(z_norm)
    return exp / np.sum(exp, axis=0, keepdims=True)
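
# A quick usage example (commented out, matching the demos further down):
# the outputs are probabilities, so they should sum to 1.
# x = np.array([1.0, 2.0, 3.0])
# print(softmax(x))        # [0.09003057 0.24472847 0.66524096]
# print(softmax(x).sum())  # 1.0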

# Softmax gradient (Jacobian) using an explicit double loop
def softmax_grad_simple(s):
    # Input s is the softmax value of the original input x; its shape is (n,),
    # e.g. s = np.array([0.3, 0.7]) for x = np.array([0, 1]).
    # Build the n x n Jacobian matrix:
    # dS_i/dz_j = S_i * (1 - S_i) if i == j, else -S_i * S_j.
    n = len(s)
    jacobian_m = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                jacobian_m[i, j] = s[i] * (1 - s[i])
            else:
                jacobian_m[i, j] = -s[i] * s[j]
    return jacobian_m

# Vectorized version of the softmax gradient
def softmax_grad(s):
    # s is the softmax output (probabilities), not the raw input z
    s = s.reshape(-1, 1)
    return np.diagflat(s) - np.dot(s, s.T)
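
# The loop and vectorized versions should agree; a quick commented check:
# s = softmax(np.array([1.0, 2.0, 3.0]))
# assert np.allclose(softmax_grad_simple(s), softmax_grad(s))
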
# Implementation found here: https://github.com/eliben/deep-learning-samples/blob/master/softmax/softmax.py
def softmax_gradient_eliben(z):
"""Computes the gradient of the softmax function.
z: (T, 1) array of input values where the gradient is computed. T is the
number of output classes.
Returns D (T, T) the Jacobian matrix of softmax(z) at the given z. D[i, j]
is DjSi - the partial derivative of Si w.r.t. input j.
"""
Sz = softmax(z)
# -SjSi can be computed using an outer product between Sz and itself. Then
# we add back Si for the i=j cases by adding a diagonal matrix with the
# values of Si on its diagonal.
D = -np.outer(Sz, Sz) + np.diag(Sz.flatten())
return D

# Alternative softmax gradient solution:
# https://stackoverflow.com/questions/45949141/compute-a-jacobian-matrix-from-scratch-in-python
def softmax_grad_v3(probs):
    # Broadcasting form of the same Jacobian: J[i, j] = probs[i] * (delta_ij - probs[j])
    n_elements = probs.shape[0]
    jacobian = probs[:, np.newaxis] * (np.eye(n_elements) - probs[np.newaxis, :])
    return jacobian

# Finds the softmax gradient for m training examples
def softmax_grad_mexamples(z):
    # z is (n_class, n_m), one example per column;
    # returns the Jacobians stacked as (n_class, n_class, n_m)
    n_class, n_m = z.shape
    s_grad = np.empty((n_class, n_class, n_m))
    for i in range(n_m):
        # softmax_grad expects softmax outputs, so activate each column first
        s_grad[:, :, i] = softmax_grad(softmax(z[:, i]))
    return s_grad

# Finds the softmax gradient for m training examples - transposed version for clarity
def softmax_grad_mexamples_transpose(z):
    # z is an (n_m, n_class) matrix - one example per row;
    # returns the Jacobians stacked as (n_m, n_class, n_class)
    n_m, n_class = z.shape
    s_grad = np.empty((n_m, n_class, n_class))
    for i in range(n_m):
        # Activate the row before taking the gradient
        s_grad[i] = softmax_grad(softmax(z[i]))
    return s_grad
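
# A loop-free alternative (a sketch added here, not part of the original file):
# the same stack of Jacobians can be built in one shot with einsum, using
# J[i, j, k] = delta_ij * s[i, k] - s[i, k] * s[j, k] for example k.
def softmax_grad_mexamples_vec(z):
    s = softmax(z)                                          # (n_class, n_m)
    diag = np.einsum('ij,ik->ijk', np.eye(z.shape[0]), s)   # delta_ij * s_i
    outer = np.einsum('ik,jk->ijk', s, s)                   # s_i * s_j
    return diag - outer                                     # (n_class, n_class, n_m)
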
# x = np.array([1, 2, 3, 4])
#
# sm = softmax(x)
# print(sm)
#
# sm_g_j = softmax_grad(sm)
# print(sm_g_j)
#
#
# sm_g_j = softmax_gradient_eliben(x)
# print(sm_g_j)
z = np.array([[1, 2, 3, 4], [2, 2, 2, 2], [0, 1, 0, 1]]).T
print('Grad m (transposed version)')
sgm2 = softmax_grad_mexamples_transpose(z.T).T
print(sgm2)
print('Grad m')
sgm = softmax_grad_mexamples(z)
print(sgm)
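
# Sanity check (added here as a sketch): the single-example Jacobian variants
# and the vectorized m-example version should all agree.
x_check = np.array([1.0, 2.0, 3.0, 4.0])
s_check = softmax(x_check)
assert np.allclose(softmax_grad_simple(s_check), softmax_grad(s_check))
assert np.allclose(softmax_grad_v3(s_check), softmax_grad(s_check))
assert np.allclose(softmax_gradient_eliben(x_check), softmax_grad(s_check))
assert np.allclose(softmax_grad_mexamples_vec(z), softmax_grad_mexamples(z))
print('All softmax gradient implementations agree')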

# dcost_step = binary_cross_entropy_d(Y_train, a3)
# a3_d = sigmoid_d(a3)
# dz3_step = a3_d * dcost_step

# Disabled sketch of backprop through a softmax output layer; it depends on
# names (Y, a3, w3, b3, ...) defined in the full training script.
'''
cost_d = categorical_cross_entropy_d(Y, a3)
a3_d = softmax_d_m(a3)
print('A3', a3.shape)
print(cost_d.shape)
print(a3_d.shape)
cost_d_r = cost_d.reshape((cost_d.shape[0], 1, cost_d.shape[1]))
dz3_step = np.einsum('ijk,jyk->iyk', a3_d, cost_d_r)
dz3_step_r = dz3_step.reshape((dz3_step.shape[0], dz3_step.shape[2]))
dz3_test = np.einsum('ijk,jk->ik', a3_d, cost_d)
da2, dw3, db3 = linear_d(dz3_step_r, w3, a2, b3)
'''
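
# A self-contained illustration (hypothetical shapes and data, added for
# clarity) of the einsum contraction used above: 'ijk,jk->ik' applies each
# example's (n_class, n_class) Jacobian to its (n_class,) cost-gradient column.
rng = np.random.default_rng(0)
z3 = rng.standard_normal((4, 3))             # stand-in pre-activations
a3_d = softmax_grad_mexamples(z3)            # (4, 4, 3) stack of Jacobians
cost_d = rng.standard_normal((4, 3))         # stand-in for d(cost)/d(a3)
dz3 = np.einsum('ijk,jk->ik', a3_d, cost_d)  # per-example Jacobian-vector products
print(dz3.shape)                             # (4, 3)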