
Commit a38407a

speedup and optimizations for SparseLinear
1 parent 2340b9c commit a38407a

2 files changed: +226, -60 lines

SparseLinear.lua (19 additions, 18 deletions)
@@ -4,11 +4,16 @@ function SparseLinear:__init(inputSize, outputSize)
    parent.__init(self)
 
    self.weightDecay = 0
-   self.weight = torch.Tensor(outputSize, inputSize)
-   self.bias = torch.Tensor(outputSize)
-   self.gradWeight = torch.Tensor(outputSize, inputSize)
-   self.gradBias = torch.Tensor(outputSize)
-   self.lastInput = torch.Tensor()
+   self.weight = torch.Tensor(outputSize, inputSize):zero()
+   self.bias = torch.Tensor(outputSize):zero()
+   self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
+   self.gradBias = torch.Tensor(outputSize):zero()
+   self.lastInput = nil
+
+   if torch.getnumthreads() > 1 and outputSize >= 128 then
+      self.shardBuffer = torch.Tensor(outputSize, torch.getnumthreads())
+   end
+
    -- state
    self.gradInput:resize(inputSize)
    self.output:resize(outputSize)
@@ -20,7 +25,7 @@ function SparseLinear:reset(stdv)
    if stdv then
       stdv = stdv * math.sqrt(3)
    else
-      stdv = 1./math.sqrt(self.weight:size(1))
+      stdv = 1./math.sqrt(self.weight:size(2))
    end
    if nn.oldSeed then
       for i=1,self.weight:size(1) do
@@ -40,22 +45,18 @@ function SparseLinear:updateOutput(input)
 end
 
 function SparseLinear:accGradParameters(input, gradOutput, scale)
+   if not self.lastInput then
+      self.lastInput = input:clone()
+   else
+      self.lastInput:resizeAs(input):copy(input)
+   end
+
    return input.nn.SparseLinear_accGradParameters(self, input, gradOutput, scale)
 end
 
 function SparseLinear:updateGradInput(input, gradOutput)
    if self.gradInput then
-      self.gradInput:resize(input:size())
-      self.gradInput:copy(input)
-      local numNonzero = self.gradInput:size(1)
-      for e=1,numNonzero do
-         local g = 0
-         local i = self.gradInput[{e,1}]
-         for j=1,self.output:size(1) do
-            g = g + self.weight[{j,i}] * gradOutput[j]
-         end
-         self.gradInput[{e,2}] = g
-      end
+      input.nn.SparseLinear_updateGradInput(self, input, gradOutput)
       return self.gradInput
    end
-end
+end
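
For context, a minimal usage sketch of the module this file implements (sizes and values here are made up; assumes the torch and nn packages are installed). SparseLinear consumes a sparse vector encoded as an nnz x 2 tensor of {index, value} rows with 1-based indices, and backward now routes through the C updateGradInput added in this commit:

require 'nn'

local inputSize, outputSize = 10000, 256
local module = nn.SparseLinear(inputSize, outputSize)

-- a sparse input with three non-zero entries, one {index, value} row each
local input = torch.Tensor{{1, 0.5}, {42, -1.2}, {9999, 2.0}}

local output = module:forward(input)                  -- dense outputSize-dim vector
local gradOutput = torch.randn(outputSize)
local gradInput = module:backward(input, gradOutput)  -- nnz x 2, same indices

With outputSize >= 128 and more than one torch thread, this also exercises the new sharded forward path below.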

generic/SparseLinear.c (207 additions, 42 deletions)
@@ -2,34 +2,91 @@
 #define TH_GENERIC_FILE "generic/SparseLinear.c"
 #else
 
+static int nn_(checkInput)(THTensor* t) {
+  return t->nDimension == 2 && t->size[1] == 2;
+}
+
+static int nn_(checkSize2D)(THTensor* t, long size0, long size1) {
+  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static int nn_(checkSize1D)(THTensor* t, long size0) {
+  return t->nDimension == 1 && t->size[0] == size0;
+}
+
 static int nn_(SparseLinear_updateOutput)(lua_State *L)
 {
   long i;
   THTensor * input = luaT_checkudata(L, 2, torch_Tensor);
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);
-  long dim = weight->size[1]; /* number of weights.. */
+
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+  luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong");
+  luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+
+  lua_getfield(L, 1, "shardBuffer");
+  if (!lua_isnil(L, -1)) {
+    THTensor *buffer =
+        luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor);
+    long num_shards = buffer->size[1];
+    luaL_argcheck(L,
+                  buffer->nDimension == 2 && buffer->size[0] == outDim &&
+                      num_shards > 0,
+                  1,
+                  "shardBuffer size wrong");
+
+    THTensor_(zero)(buffer);
+#pragma omp parallel for private(i) schedule(static) num_threads(num_shards)
+    for (i = 0; i < input->size[0]; i++) {
+      int shardId = omp_get_thread_num();
+      long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+                      THTensor_(get2d)(input, i, 1),
+                      THTensor_(data)(weight) + offset * weight->stride[1],
+                      weight->stride[0],
+                      THTensor_(data)(buffer) + shardId * buffer->stride[1],
+                      buffer->stride[0]);
+      } else {
+        luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+      }
+    }
+
+    THTensor_(sum)(output, buffer, 1);
+    THTensor_(cadd)(output, bias, 1.0, output);
+
+    lua_getfield(L, 1, "output");
+    return 1;
+  }
 
   THTensor_(copy)(output, bias);
   for(i = 0; i < input->size[0]; i++)
   {
     long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
       real val = THTensor_(get2d)(input, i, 1);
-      THBlas_(axpy)(output->size[0],
-                    val,
+      THBlas_(axpy)(output->size[0],
+                    val,
                     THTensor_(data)(weight)+offset*weight->stride[1],
-                    weight->stride[0],
-                    THTensor_(data)(output),
+                    weight->stride[0],
+                    THTensor_(data)(output),
                     output->stride[0]);
     }
     else {
-      printf("\nupdateOutput: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. updateOutput: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
+
+  lua_getfield(L, 1, "output");
   return 1;
 }
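
The shardBuffer branch above avoids concurrent writes to output: each OpenMP thread accumulates its share of the non-zeros into its own column of the outDim x num_shards buffer, and the columns are summed into output at the end. A rough Lua rendering of that reduction (the function name and the round-robin shard assignment are illustrative; in the C code the shard id is omp_get_thread_num() and the loop runs in parallel):

local function shardedForward(weight, bias, input, numShards)
   local outDim = weight:size(1)
   local buffer = torch.zeros(outDim, numShards)
   for e = 1, input:size(1) do
      local shard = ((e - 1) % numShards) + 1   -- stand-in for omp_get_thread_num()
      local idx, val = input[e][1], input[e][2]
      -- buffer[{{}, shard}] += val * weight[{{}, idx}]
      buffer:select(2, shard):add(val, weight:select(2, idx))
   end
   -- reduce the per-shard partials, then add the bias
   return torch.sum(buffer, 2):squeeze(2):add(bias)
end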

@@ -42,39 +99,47 @@ static int nn_(SparseLinear_accGradParameters)(lua_State *L)
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
   THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
-  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
   real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay");
-  long dim = gradWeight->size[1]; /* number of weights.. */
 
-  for(i = 0; i < input->size[0]; i++)
+  long nnz = input->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+#pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+  for(i = 0; i < nnz; i++)
   {
     long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
 
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
       real val = scale*THTensor_(get2d)(input, i, 1);
-
-      THBlas_(axpy)(gradOutput->size[0],
-                    val,
-                    THTensor_(data)(gradOutput),
-                    gradOutput->stride[0],
-                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
+
+      THBlas_(axpy)(outDim,
+                    val,
+                    THTensor_(data)(gradOutput),
+                    gradOutput->stride[0],
+                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
                     gradWeight->stride[0]);
     }
     else {
-      printf("\naccGradParameters: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. accGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
-
-  THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
-
+
+  THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
   if(weightDecay != 0)
     THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
-
-  THTensor_(resizeAs)(lastInput, input);
-  THTensor_(copy)(lastInput, input);
-
+
   return 0;
 }
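
Per non-zero entry, the loop above performs a scaled column update. Written in torch operations (a reference sketch of the math, not the C implementation, with a made-up helper name):

local function accGradSketch(gradWeight, gradBias, input, gradOutput, scale)
   for e = 1, input:size(1) do
      local idx, val = input[e][1], input[e][2]
      -- gradWeight[{{}, idx}] += scale * val * gradOutput
      gradWeight:select(2, idx):add(scale * val, gradOutput)
   end
   -- gradBias += scale * gradOutput, once per call
   gradBias:add(scale, gradOutput)
end

Note that the OpenMP loop updates one gradWeight column per entry, so it is race-free only as long as the non-zero indices within one input are distinct.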

@@ -85,37 +150,137 @@ int nn_(SparseLinear_updateParameters)(lua_State *L)
   THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
-  THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor);
-  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_Tensor);
-
-  long dim = weight->size[1]; /* number of weights.. */
+  THTensor * gradWeight = luaT_getfieldcheckudata(
+      L, 1, "gradWeight", torch_Tensor);
+  THTensor * lastInput = luaT_getfieldcheckudata(
+      L, 1, "lastInput", torch_Tensor);
+
+  long nnz = lastInput->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkSize2D)(gradWeight, outDim, inDim), 1, "gradWeight size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
   THTensor_(cadd)(bias, bias, -learningRate, gradBias);
-
-  for(i = 0; i < lastInput->size[0]; i++)
+
+#pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+  for(i = 0; i < nnz; i++)
   {
     long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
-
-    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
+
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
     {
-      THBlas_(axpy)(bias->size[0],
-                    -learningRate,
-                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[1],
-                    gradWeight->stride[0],
-                    THTensor_(data)(weight)+offset*weight->stride[1],
+      real* pGradWeight =
+          THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+      THBlas_(axpy)(outDim,
+                    -learningRate,
+                    pGradWeight,
+                    gradWeight->stride[0],
+                    THTensor_(data)(weight)+offset*weight->stride[1],
                     weight->stride[0]);
     }
     else {
-      printf("\nupdateParameters: %ld not between 1 and %ld\n", offset+1, dim);
-      luaL_error(L, "index out of bound");
+      luaL_error(L, "index out of bound. updateParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
+    }
+  }
+  return 0;
+}
+
+int nn_(SparseLinear_zeroGradParameters)(lua_State *L)
+{
+  long i;
+  THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor);
+  THTensor * gradWeight = luaT_getfieldcheckudata(
+      L, 1, "gradWeight", torch_Tensor);
+  THTensor * lastInput = luaT_getfieldcheckudata(
+      L, 1, "lastInput", torch_Tensor);
+
+  long nnz = lastInput->size[0];
+  long outDim = gradWeight->size[0];
+  long inDim = gradWeight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradBias, outDim), 1, "gradBias size wrong");
+
+  THTensor_(zero)(gradBias);
+#pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 50000)
+  for(i = 0; i < nnz; i++)
+  {
+    long offset = (long)(THTensor_(get2d)(lastInput, i, 0)) - 1;
+
+    if(offset >= 0 && offset < inDim) /* make sure indices are in bounds.. */
+    {
+      real* pGradWeight =
+          THTensor_(data)(gradWeight)+offset*gradWeight->stride[1];
+      if(gradWeight->stride[0] == 1) {
+        THVector_(fill)(pGradWeight, 0, outDim);
+      } else {
+        long j;
+        for(j = 0; j < outDim; ++j) {
+          pGradWeight[j * gradWeight->stride[0]] = 0;
+        }
+      }
+    }
+    else {
+      luaL_error(L, "index out of bound. zeroGradParameters: \
+%ld not between 1 and %ld", offset + 1, inDim);
     }
   }
   return 0;
 }
 
+static int nn_(SparseLinear_updateGradInput)(lua_State *L) {
+  THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
+  THTensor *gradInput =
+      luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor);
+  THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor);
+
+  long i;
+  long nnz = input->size[0];
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  luaL_argcheck(
+      L, nn_(checkInput)(input), 2, "input must be an nnz x 2 tensor");
+  luaL_argcheck(
+      L, nn_(checkSize1D)(gradOutput, outDim), 3, "gradOutput size wrong");
+
+  THTensor_(resize2d)(gradInput, input->size[0], input->size[1]);
+
+#pragma omp parallel for private(i) schedule(static) if(outDim * nnz > 100000)
+  for (i = 0; i < nnz; ++i) {
+    long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1;
+    THTensor_(set2d)(gradInput, i, 0, offset + 1);
+
+    if (offset >= 0 && offset < inDim) {
+      real val =
+          THBlas_(dot)(outDim,
+                       THTensor_(data)(gradOutput),
+                       gradOutput->stride[0],
+                       THTensor_(data)(weight) + offset * weight->stride[1],
+                       weight->stride[0]);
+      THTensor_(set2d)(gradInput, i, 1, val);
+    } else {
+      luaL_error(L, "index out of bound. updateGradInput: \
+%ld not between 1 and %ld", offset + 1, inDim);
+    }
+  }
+  return 0;
+}
+
 static const struct luaL_Reg nn_(SparseLinear__) [] = {
   {"SparseLinear_updateOutput", nn_(SparseLinear_updateOutput)},
   {"SparseLinear_accGradParameters", nn_(SparseLinear_accGradParameters)},
   {"SparseLinear_updateParameters", nn_(SparseLinear_updateParameters)},
+  {"SparseLinear_zeroGradParameters", nn_(SparseLinear_zeroGradParameters)},
+  {"SparseLinear_updateGradInput", nn_(SparseLinear_updateGradInput)},
   {NULL, NULL}
 };
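
Finally, the new updateGradInput replaces the old per-element Lua loop with one BLAS dot per non-zero: the gradient with respect to each stored value is the dot product of the matching weight column with gradOutput. The same math in torch operations (sketch only; the helper name is made up):

local function gradInputSketch(weight, input, gradOutput)
   local gradInput = input:clone()          -- keep the nnz x 2 {index, value} layout
   for e = 1, input:size(1) do
      local idx = input[e][1]
      -- d(loss)/d(value_e) = weight[{{}, idx}] . gradOutput
      gradInput[e][2] = weight:select(2, idx):dot(gradOutput)
   end
   return gradInput
end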
