 from dask import delayed, persist, compute, set_options
 import functools
 import numpy as np
+import numpy.linalg as LA
 import dask.array as da
+import dask.dataframe as dd
 from scipy.optimize import fmin_l_bfgs_b
 
 
@@ -138,6 +140,67 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     return beta
 
 
+def _choose_step_sgd(initial, k):
+    return initial / (k + 1)
+
+
+@normalize
+def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
+        initial_step=1.0, **kwargs):
+    """Stochastic Gradient Descent.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+    y : array-like, shape (n_samples,)
+    max_iter : int, float
+        maximum number of iterations to attempt before declaring
+        failure to converge
+    tol : float
+        Maximum allowed change from prior iteration required to
+        declare convergence
+    batch_size : int
+        The batch size used to approximate the gradient. Larger batch sizes
+        will approximate the gradient better.
+    initial_step : float
+        Initial step size used in the optimization. The step size decays like
+        initial_step/(1 + iter_count).
+    family : Family
+
+    Returns
+    -------
+    beta : array-like, shape (n_features,)
+    """
+    gradient = family.gradient
+    n, p = X.shape
+    if np.isnan(n):
+        raise ValueError('SGD needs shape information to allow indexing. '
+                         'This is possible by passing in a computed array '
+                         '(`X.compute()` or `X.values.compute()`) and then '
+                         'using `dask.array.from_array`.')
+
+    beta = np.zeros(p)
+
+    iter_count = 0
+    converged = False
+
+    while not converged:
+        beta_old = beta.copy()
+        iter_count += 1
+
+        i = np.random.choice(n, size=(batch_size,))
+        Xbeta = dot(X[i], beta)
+
+        grad = gradient(Xbeta, X[i], y[i]).compute()
+
+        beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
+
+        rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
+        converged = (rel_error < tol) or (iter_count > max_iter)
+
+    return beta
+
+
 @normalize
 def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     """Newtons Method for Logistic Regression.
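
To make the new solver's requirements concrete, here is a usage sketch (not part of the diff above). It assumes the patched module is dask_glm's algorithms module, which this excerpt does not name, and the data sizes, chunk sizes, and solver arguments are purely illustrative. Because sgd samples rows by position, X and y are built with dask.array.from_array from in-memory NumPy arrays so their shapes are known, as the ValueError above advises.

import numpy as np
import dask.array as da
# Assumption: the module being patched is dask_glm.algorithms (not named in this excerpt).
from dask_glm.algorithms import sgd

rng = np.random.RandomState(0)
X_np = rng.normal(size=(1000, 5))                # 1000 samples, 5 features
beta_true = rng.normal(size=5)
y_np = (X_np.dot(beta_true) > 0).astype(float)   # 0/1 labels for the default Logistic family

# Known shape and chunks, so sgd can index random mini-batches of rows.
X = da.from_array(X_np, chunks=(100, 5))
y = da.from_array(y_np, chunks=(100,))

beta_hat = sgd(X, y, batch_size=64, initial_step=1.0, max_iter=1000, tol=1e-2)
print(beta_hat)   # estimated coefficients, shape (n_features,)
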
@@ -430,5 +493,6 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     'gradient_descent': gradient_descent,
     'newton': newton,
     'lbfgs': lbfgs,
-    'proximal_grad': proximal_grad
+    'proximal_grad': proximal_grad,
+    'sgd': sgd
 }
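
The second hunk only registers 'sgd' in the module's mapping from solver names to solver functions; the dict's variable name lies outside this excerpt. Below is a minimal sketch of how such name-based dispatch is typically used; `solvers` and `fit` are hypothetical stand-ins, and the import again assumes the patched module is dask_glm.algorithms.

# Sketch only: `solvers` and `fit` are illustrative names, not the module's own.
from dask_glm.algorithms import gradient_descent, newton, lbfgs, proximal_grad, sgd

solvers = {
    'gradient_descent': gradient_descent,
    'newton': newton,
    'lbfgs': lbfgs,
    'proximal_grad': proximal_grad,
    'sgd': sgd,
}

def fit(X, y, solver='sgd', **kwargs):
    # Look the requested solver up by name and delegate to it.
    return solvers[solver](X, y, **kwargs)
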