@@ -145,7 +145,7 @@ def _choose_step_sgd(initial, k):
 
 @normalize
 def sgd(X, y, max_iter=1e3, tol=1e-8, family=Logistic, batch_size=64,
-        initial_step=10.0, n=None, **kwargs):
+        initial_step=1.0, **kwargs):
     """Stochastic Gradient Descent.
 
     Parameters
@@ -164,34 +164,33 @@ def sgd(X, y, max_iter=1e3, tol=1e-8, family=Logistic, batch_size=64,
     initial_step : float
         Initial step size used in the optimization. The step size decays like
         initial_step/(1 + iter_count).
-    n : int
-        The number of examples, or the first dimension of the matrix X. This argument will only be used if X.shape[1] is NaN.
     family : Family
 
     Returns
     -------
     beta : array-like, shape (n_features,)
     """
-    gradient, hessian = family.gradient, family.hessian
-    n_examples, p = X.shape
-    if not np.isnan(n_examples):
-        n = n_examples
-    if n is None:
-        raise ValueError('Pass number of examples in with kwarg `n`')
-    beta = np.zeros(p)  # always init to zeros?
+    gradient = family.gradient
+    n, p = X.shape
+    if np.isnan(n):
+        raise ValueError('SGD needs shape information to allow indexing. '
+                         'Pass a computed array in (`X.compute()` or '
+                         '`X.values.compute()`), then wrap it with '
+                         '`dask.array.from_array`.')
+
+    beta = np.zeros(p)
 
     iter_count = 0
     converged = False
 
     while not converged:
-        beta_old = beta
+        beta_old = beta.copy()
         iter_count += 1
 
         i = np.random.choice(n, size=(batch_size,))
         Xbeta = dot(X[i], beta)
 
-        grad = gradient(Xbeta, X[i], y[i])
-        (grad,) = compute((grad,))
+        grad = gradient(Xbeta, X[i], y[i]).compute()
 
         beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
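
For reference, a minimal NumPy-only sketch of the update rule this loop implements, assuming `_choose_step_sgd(initial, k)` decays the step as `initial / (1 + k)` (as the docstring states) and using a logistic-loss gradient. `logistic_grad` and `sgd_numpy` are illustrative names, not part of dask-glm:

```python
import numpy as np


def _choose_step_sgd(initial, k):
    # Step size decays like initial / (1 + k), per the docstring above.
    return initial / (1 + k)


def logistic_grad(Xbeta, X, y):
    # Gradient of the logistic loss: X.T @ (sigmoid(X @ beta) - y)
    p = 1.0 / (1.0 + np.exp(-Xbeta))
    return X.T.dot(p - y)


def sgd_numpy(X, y, max_iter=1e3, tol=1e-8, batch_size=64, initial_step=1.0):
    n, p = X.shape
    beta = np.zeros(p)
    for iter_count in range(1, int(max_iter) + 1):
        beta_old = beta.copy()
        i = np.random.choice(n, size=(batch_size,))
        grad = logistic_grad(X[i].dot(beta), X[i], y[i])
        beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
        if np.linalg.norm(beta - beta_old) < tol:
            break
    return beta


# Example: recover a known coefficient vector from synthetic data.
rng = np.random.RandomState(0)
X = rng.randn(1000, 3)
true_beta = np.array([1.0, -2.0, 0.5])
y = (rng.rand(1000) < 1.0 / (1.0 + np.exp(-X.dot(true_beta)))).astype(float)
print(sgd_numpy(X, y))
```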
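And a hedged usage sketch of the revised `sgd`: since `n` now comes straight from `X.shape`, the inputs need concrete shape information, which the new ValueError message suggests obtaining via `X.compute()` plus `dask.array.from_array`. The import paths below (`dask_glm.algorithms`, `dask_glm.families`) are assumptions; this diff does not show where `sgd` and `Logistic` live:

```python
import numpy as np
import dask.array as da

# Hypothetical import locations; the diff does not show the module paths.
from dask_glm.algorithms import sgd
from dask_glm.families import Logistic

# Build inputs with known chunk sizes so X.shape does not contain NaN,
# which is what the new ValueError message asks for.
X_np = np.random.randn(1000, 5)
y_np = (np.random.rand(1000) > 0.5).astype(float)

X = da.from_array(X_np, chunks=(100, 5))
y = da.from_array(y_np, chunks=(100,))

beta = sgd(X, y, family=Logistic, batch_size=64, initial_step=1.0)
print(beta)
```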