
Commit 876897f

Scott Sievert committed
ENH: Basic implementation of SGD
BUG: dataframe size info not exact and indexing needed
squash Getting rid of ASGD
1 parent 64e01eb commit 876897f

2 files changed: +69 -4 lines


dask_glm/algorithms.py

Lines changed: 65 additions & 1 deletion
@@ -6,7 +6,9 @@
 from dask import delayed, persist, compute, set_options
 import functools
 import numpy as np
+import numpy.linalg as LA
 import dask.array as da
+import dask.dataframe as dd
 from scipy.optimize import fmin_l_bfgs_b
 
 
@@ -138,6 +140,67 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     return beta
 
 
+def _choose_step_sgd(initial, k):
+    return initial / (k + 1)
+
+
+@normalize
+def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
+        initial_step=1.0, **kwargs):
+    """Stochastic Gradient Descent.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+    y : array-like, shape (n_samples,)
+    max_iter : int, float
+        maximum number of iterations to attempt before declaring
+        failure to converge
+    tol : float
+        Maximum allowed change from prior iteration required to
+        declare convergence
+    batch_size : int
+        The batch size used to approximate the gradient. Larger batch sizes
+        will approximate the gradient better.
+    initial_step : float
+        Initial step size used in the optimization. The step size decays like
+        initial_step/(1 + iter_count).
+    family : Family
+
+    Returns
+    -------
+    beta : array-like, shape (n_features,)
+    """
+    gradient = family.gradient
+    n, p = X.shape
+    if np.isnan(n):
+        raise ValueError('SGD needs shape information to allow indexing. '
+                         'Possible by passing a computed array in (`X.compute()` '
+                         'or `X.values.compute()`), then doing using '
+                         '`dask.array.from_array ')
+
+    beta = np.zeros(p)
+
+    iter_count = 0
+    converged = False
+
+    while not converged:
+        beta_old = beta.copy()
+        iter_count += 1
+
+        i = np.random.choice(n, size=(batch_size,))
+        Xbeta = dot(X[i], beta)
+
+        grad = gradient(Xbeta, X[i], y[i]).compute()
+
+        beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
+
+        rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
+        converged = (rel_error < tol) or (iter_count > max_iter)
+
+    return beta
+
+
 @normalize
 def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     """Newtons Method for Logistic Regression.
@@ -430,5 +493,6 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     'gradient_descent': gradient_descent,
     'newton': newton,
     'lbfgs': lbfgs,
-    'proximal_grad': proximal_grad
+    'proximal_grad': proximal_grad,
+    'sgd': sgd
 }
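
The ValueError above points at the main constraint of this first version: X.shape must be fully known so that rows can be sampled by index. A minimal usage sketch under that constraint (the data, chunk sizes, and variable names below are illustrative, not part of this commit):

import numpy as np
import dask.array as da
from dask_glm import algorithms

rng = np.random.RandomState(0)
X_np = rng.normal(size=(1000, 5))                 # concrete array, so the shape is exact
beta_true = rng.normal(size=5)
y_np = (X_np.dot(beta_true) > 0).astype(float)    # 0/1 labels for the default Logistic family

X = da.from_array(X_np, chunks=(250, 5))          # known chunks, so X.shape has no NaN
y = da.from_array(y_np, chunks=250)

beta = algorithms.sgd(X, y, batch_size=64, initial_step=1.0)

Each iteration samples batch_size rows, evaluates the family gradient on that mini-batch, and steps with the decayed size initial_step / (1 + iter_count); the loop stops once the relative change in beta falls below tol or iter_count exceeds max_iter.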

dask_glm/estimators.py

Lines changed: 4 additions & 3 deletions
@@ -32,7 +32,7 @@ def family(self):
 
     def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',
                  max_iter=100, tol=1e-4, lamduh=1.0, rho=1,
-                 over_relax=1, abstol=1e-4, reltol=1e-2):
+                 over_relax=1, abstol=1e-4, reltol=1e-2, **kwargs):
         self.fit_intercept = fit_intercept
         self.solver = solver
         self.regularizer = regularizer
@@ -61,9 +61,10 @@ def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',
 
         self._fit_kwargs = {k: getattr(self, k) for k in fit_kwargs}
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, **kwargs):
         X_ = self._maybe_add_intercept(X)
-        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)
+        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs,
+                                                      **kwargs)
 
         if self.fit_intercept:
             self.coef_ = self._coef[:-1]
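
With the **kwargs hooks in __init__ and fit, solver-specific options the estimator does not store itself (such as batch_size or initial_step for the new solver) can be passed at fit time and forwarded straight to algorithms._solvers[self.solver]. A hedged sketch of that path, assuming the package's LogisticRegression estimator accepts solver='sgd' without extra validation, and reusing X and y from the sketch above:

from dask_glm.estimators import LogisticRegression

est = LogisticRegression(solver='sgd', max_iter=500, tol=1e-2)
est.fit(X, y, batch_size=128, initial_step=0.5)   # extra kwargs reach algorithms.sgd
print(est.coef_, est.intercept_)

Note that merging **self._fit_kwargs with **kwargs in a single call requires Python 3.5+, and a key present in both raises a TypeError, so fit-time kwargs should only carry options the estimator does not already manage.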

0 commit comments