
Commit 0a68b83

Merge pull request #35 from ROCmSoftwarePlatform/rocrand
Rocrand
2 parents 2bab3f0 + 117f540 · commit 0a68b83

10 files changed: +147 -468 lines

include/caffe/internal_thread.hpp

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,7 @@ namespace caffe {
  */
 class InternalThread {
  public:
-  InternalThread() : thread_() {}
+  InternalThread() : device_(0), thread_() {}
   virtual ~InternalThread();
 
   /**
@@ -34,6 +34,8 @@ class InternalThread {
   bool is_started() const;
 
 protected:
+  int device_;
+
   /* Implement this method in your subclass
      with the code you want your thread to run. */
   virtual void InternalThreadEntry() {}
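
The net effect of this header change: each InternalThread now carries a device_ member (default 0) naming the GPU it should run on, chosen by whoever owns the thread rather than queried from the runtime at start time. A minimal sketch of how a subclass could use the protected member; the subclass and its names are hypothetical and not part of this commit:

// Illustrative only -- not code from this commit. Because device_ is
// protected, a derived class can pick its GPU before the thread starts.
#include "caffe/internal_thread.hpp"

class PrefetchWorker : public caffe::InternalThread {  // hypothetical subclass
 public:
  explicit PrefetchWorker(int gpu_id) { device_ = gpu_id; }

 protected:
  void InternalThreadEntry() override {
    // Runs on the spawned thread, already bound to device_
    // (see the internal_thread.cpp change below).
  }
};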

include/caffe/layers/cudnn_conv_layer.hpp

Lines changed: 10 additions & 27 deletions
@@ -29,8 +29,8 @@ namespace caffe {
 template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  public:
-  explicit CuDNNConvolutionLayer(const LayerParameter& param)
-      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  explicit CuDNNConvolutionLayer(const LayerParameter& param);
+
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -43,49 +43,32 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-
   bool handles_setup_;
 
 #ifdef USE_MIOPEN
-  miopenHandle_t* handle_;
-  hipStream_t* stream_;
 
   // algorithms for forward and backwards convolutions
-  miopenConvFwdAlgorithm_t* fwd_algo_;
-  miopenConvBwdWeightsAlgorithm_t* bwd_weight_algo_;
-  miopenConvBwdDataAlgorithm_t* bwd_data_algo_;
+  vector<miopenConvFwdAlgorithm_t> fwd_algo_;
+  vector<miopenConvBwdWeightsAlgorithm_t> bwd_weight_algo_;
+  vector<miopenConvBwdDataAlgorithm_t> bwd_data_algo_;
 
   vector<miopenTensorDescriptor_t> bottom_descs_, top_descs_;
   miopenTensorDescriptor_t bias_desc_;
   miopenTensorDescriptor_t filter_desc_;
   vector<miopenConvolutionDescriptor_t> conv_descs_;
 
   int N_, C_, W_, H_;
-#endif
-
-#ifdef USE_CUDNN
-  cudnnHandle_t* handle_;
-  cudaStream_t* stream_;
-
-  // algorithms for forward and backwards convolutions
-  cudnnConvolutionFwdAlgo_t *fwd_algo_;
-  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
-  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
-
-  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
-  cudnnTensorDescriptor_t bias_desc_;
-  cudnnFilterDescriptor_t filter_desc_;
-  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  miopenHandle_t handle_;
 #endif
 
   int bottom_offset_, top_offset_, bias_offset_;
 
-  size_t *workspace_fwd_sizes_;
-  size_t *workspace_bwd_data_sizes_;
-  size_t *workspace_bwd_filter_sizes_;
+  vector<size_t> workspace_fwd_sizes_;
+  vector<size_t> workspace_bwd_filter_sizes_;
+  vector<size_t> workspace_bwd_data_sizes_;
   size_t workspaceSizeInBytes; // size of underlying storage
   void *workspaceData; // underlying storage
-  void **workspace; // aliases into workspaceData
+  vector<void*> workspace; // aliases into workspaceData
 };
 #endif
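
Besides replacing the inline constructor with an out-of-line one, this header drops the dead USE_CUDNN branch, keeps a single miopenHandle_t instead of per-group handle and stream arrays, and turns the per-bottom-blob algorithm and workspace bookkeeping from raw new[]-managed arrays into vectors. A rough standalone sketch of why the vector form is easier to keep consistent (illustration only, not the actual CuDNNConvolutionLayer code):

// Illustration only: vectors resize to however many bottom blobs the layer
// sees, with no manual new[]/delete[] pairs to keep in sync.
#include <cstddef>
#include <vector>

struct ConvScratch {
  std::vector<std::size_t> fwd_sizes, bwd_filter_sizes, bwd_data_sizes;
  std::vector<void*> workspace;  // aliases into one shared allocation

  void Resize(std::size_t num_bottom_blobs) {
    fwd_sizes.assign(num_bottom_blobs, 0);
    bwd_filter_sizes.assign(num_bottom_blobs, 0);
    bwd_data_sizes.assign(num_bottom_blobs, 0);
    workspace.assign(num_bottom_blobs, nullptr);
  }
};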

src/caffe/internal_thread.cpp

Lines changed: 3 additions & 5 deletions
@@ -20,18 +20,15 @@ bool InternalThread::must_stop() {
 
 void InternalThread::StartInternalThread() {
   CHECK(!is_started()) << "Threads should persist and not be restarted.";
+  LOG(INFO) << "Starting internal thread on device " << device_;
 
-  int device = 0;
-#ifndef CPU_ONLY
-  HIP_CHECK(hipGetDevice(&device));
-#endif
   Caffe::Brew mode = Caffe::mode();
   int rand_seed = caffe_rng_rand();
   int solver_count = Caffe::solver_count();
   bool root_solver = Caffe::root_solver();
 
   try {
-    thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode,
+    thread_.reset(new boost::thread(&InternalThread::entry, this, device_, mode,
           rand_seed, solver_count, root_solver));
   } catch (std::exception& e) {
     LOG(FATAL) << "Thread exception: " << e.what();
@@ -40,6 +37,7 @@ void InternalThread::StartInternalThread() {
 
 void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
     int solver_count, bool root_solver) {
+  LOG(INFO) << "Started internal thread on device " << device;
 #ifndef CPU_ONLY
   HIP_CHECK(hipSetDevice(device));
 #endif
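
StartInternalThread no longer asks the runtime which device is current; it logs and forwards the stored device_, and entry() pins the spawned thread with hipSetDevice() (in HIP, as in CUDA, the current device is per-thread state). The same pattern as a minimal standalone sketch, not the Caffe code:

// Minimal sketch: the owner picks the device id, and the worker thread must
// call hipSetDevice() itself because the current device is per-thread state.
#include <hip/hip_runtime.h>

#include <iostream>
#include <thread>

static void worker(int device) {
  if (hipSetDevice(device) != hipSuccess) {
    std::cerr << "hipSetDevice(" << device << ") failed" << std::endl;
    return;
  }
  // ... launch kernels or stream work on `device` here ...
}

int main() {
  int device = 0;  // analogous to InternalThread::device_
  std::thread t(worker, device);
  t.join();
  return 0;
}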

src/caffe/layers/base_data_layer.cpp

Lines changed: 3 additions & 5 deletions
@@ -75,10 +75,7 @@ void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
 #ifndef CPU_ONLY
-  hipStream_t stream;
-  if (Caffe::mode() == Caffe::GPU) {
-    HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-  }
+  hipStream_t stream = nullptr;
 #endif
 
   try {
@@ -98,7 +95,8 @@ void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
   }
 #ifndef CPU_ONLY
   if (Caffe::mode() == Caffe::GPU) {
-    HIP_CHECK(hipStreamDestroy(stream));
+    if (stream != nullptr)
+      HIP_CHECK(hipStreamDestroy(stream));
   }
 #endif
 }
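
The prefetch thread no longer creates its own non-blocking stream up front: stream starts out as nullptr, and teardown only destroys a stream that was actually created, so the exit path is safe whether or not GPU mode ever made one. The guard in isolation, as a sketch rather than the layer code:

// Sketch of the null-guarded stream lifetime (not the layer code).
#include <hip/hip_runtime.h>

void run_with_optional_stream(bool need_stream) {
  hipStream_t stream = nullptr;
  if (need_stream) {
    // Creation may be skipped entirely, e.g. in CPU-only mode.
    (void)hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
  }

  // ... work that may or may not use the stream ...

  if (stream != nullptr) {
    (void)hipStreamDestroy(stream);  // only tear down what was created
  }
}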
