diff --git a/README.md b/README.md index 5001fb6..66a99b3 100644 --- a/README.md +++ b/README.md @@ -480,10 +480,10 @@ The recurrent op kernels are only run on NVIDIA hardware. | Hidden Units | Batch Size | TimeSteps | Recurrent Type | Application | Total Time (ms) | Fwd TeraFLOPS | Processor | | -------------- | ------------ | ----------- | ---------------- | --------------------- | ------------ | --------------- | --------------- | -| 1760 | 16 | 50 | Vanilla | Speech Recognition | 6.75 | 1.46 | Tesla V100 FP32 | -| 2560 | 32 | 50 | Vanilla | Speech Recognition | 11.48 | 3.43 | Tesla V100 Mixed Precision | -| 1024 | 128 | 25 | LSTM | Machine Translation | 6.46 | 12.41 | Tesla V100 Mixed Precision | -| 2816 | 32 | 1500 | GRU | Speech Recognition | 591.02 | 10.45 | Tesla V100 Mixed Precision | +| 1760 | 16 | 50 | Vanilla | Speech Recognition | 8.21 | 1.19 | Tesla V100 Mixed Precision | +| 2560 | 32 | 50 | Vanilla | Speech Recognition | 10.50 | 4.08 | Tesla V100 Mixed Precision | +| 1024 | 128 | 25 | LSTM | Machine Translation | 5.56 | 10.91 | Tesla V100 Mixed Precision | +| 2816 | 32 | 1500 | GRU | Speech Recognition | 380.04 | 11.85 | Tesla V100 Mixed Precision | ### All-Reduce Results diff --git a/code/nvidia/rnn_bench.cu b/code/nvidia/rnn_bench.cu index 30a7ca1..694ce68 100644 --- a/code/nvidia/rnn_bench.cu +++ b/code/nvidia/rnn_bench.cu @@ -116,6 +116,8 @@ class cudnnRNN { size_t train_size_; Tensor weights_; + Tensor dW_; + Tensor workspace_; Tensor trainspace_; @@ -169,7 +171,6 @@ class cudnnRNN { weights_ = rand(std::vector{static_cast(weight_size_ / sizeof(T)), 1}, curand_gen); - std::vector dim = {weights_.size(), 1, 1}; wDesc_ = FilterDescriptorNd(CUDNN_TENSOR_NCHW, dim); @@ -179,6 +180,8 @@ class cudnnRNN { xDescArray_.ptr(), &workspace_size_) ); + dW_ = zeros(std::vector{static_cast(weight_size_ / sizeof(T)), 1}); + workspace_ = zeros(std::vector{static_cast(workspace_size_ / sizeof(float)), 1}); CHECK_CUDNN_ERROR( cudnnGetRNNTrainingReserveSize(cudnn_handle, @@ -243,14 +246,33 @@ class cudnnRNN { (void *)trainspace_.begin(), train_size_) ); } + + void backward_params(Tensor x, Tensor hx, Tensor y) { + CHECK_CUDNN_ERROR(cudnnRNNBackwardWeights(cudnn_handle, + rnn_desc_.desc(), + time_steps_, + xDescArray_.ptr(), + (void *)x.begin(), + hx_desc_.desc(), + (void *)hx.begin(), + yDescArray_.ptr(), + (void *)y.begin(), + (void *)workspace_.begin(), + workspace_size_, + wDesc_.desc(), + (void *)dW_.begin(), + (void *)trainspace_.begin(), + train_size_) ); + } + }; template -std::tuple time_rnn(int hidden_size, - int batch_size, - int time_steps, - const std::string& type, - int inference) { +std::tuple time_rnn(int hidden_size, + int batch_size, + int time_steps, + const std::string& type, + int inference) { cudnnRNN rnn(hidden_size, batch_size, time_steps, type); @@ -285,7 +307,8 @@ std::tuple time_rnn(int hidden_size, auto end = std::chrono::steady_clock::now(); auto forward_time = std::chrono::duration(end - start).count() / numRepeats; - int backward_time = 0; + int bwd_data_time = 0; + int bwd_params_time = 0; if (!inference) { //Warm up @@ -303,12 +326,31 @@ std::tuple time_rnn(int hidden_size, cudaDeviceSynchronize(); end = std::chrono::steady_clock::now(); - backward_time = std::chrono::duration(end - start).count() / numRepeats; + bwd_data_time = std::chrono::duration(end - start).count() / numRepeats; + + /* Backward wrt params */ + //Warm up + rnn.backward_params(x, hx, y); + + cudaDeviceSynchronize(); + + start = std::chrono::steady_clock::now(); + + for (int i = 0; i < numRepeats; ++i) { + rnn.backward_params(x, hx, y); + } + + cudaDeviceSynchronize(); + + end = std::chrono::steady_clock::now(); + bwd_params_time = std::chrono::duration(end - start).count() / numRepeats; + } return std::make_tuple(static_cast(forward_time), - static_cast(backward_time)); + static_cast(bwd_data_time), + static_cast(bwd_params_time)); } @@ -347,11 +389,14 @@ int main(int argc, char **argv) { } std::cout << std::setw(30) << "Times" << std::endl; - std::cout << std::setfill('-') << std::setw(88) << "-" << std::endl; + std::cout << std::setfill('-') << std::setw(115) << "-" << std::endl; std::cout << std::setfill(' '); - std::cout << " type hidden N timesteps precision fwd_time (usec) "; - if (!inference) - std::cout << "bwd_time (usec)"; + std::cout << " type hidden N timesteps precision fwd_time (usec) "; + if (!inference) { + std::cout << "bwd_inputs_time (usec)"; + std::cout << " bwd_params_time (usec)"; + } + std::cout << std::endl; for (const auto &problem : (inference ? inference_server_set : training_set)) { int hidden_state, batch_size, time_steps; @@ -363,7 +408,7 @@ int main(int argc, char **argv) { std::cout << std::setw(8) << batch_size; std::cout << std::setw(8) << time_steps; std::cout << std::setw(14) << precision; - int fwd_time, bwd_time; + int fwd_time, bwd_data_time, bwd_params_time; std::stringstream ss; ss << "Unsupported precision requested. Precision: " << precision << " Inference: " << inference; @@ -371,41 +416,46 @@ int main(int argc, char **argv) { #if CUDNN_MAJOR >= 6 if (inference) { if (precision == "float") { - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); } else if (precision == "half") { - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); } else if (precision == "int8") { - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); } else { throw std::runtime_error(ss.str()); } } else { if (precision == "float") { - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); } else if (precision == "half") { - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); } else { throw std::runtime_error(ss.str()); } @@ -413,16 +463,20 @@ int main(int argc, char **argv) { #else if (precision != "float") throw std::runtime_error(ss.str()); - std::tie(fwd_time, bwd_time) = time_rnn(hidden_state, - batch_size, - time_steps, - type, - inference); + + std::tie(fwd_time, bwd_data_time, bwd_params_time) = + time_rnn(hidden_state, + batch_size, + time_steps, + type, + inference); #endif std::cout << std::setw(18) << fwd_time; - if (!inference) - std::cout << std::setw(18) << bwd_time; + if (!inference) { + std::cout << std::setw(20) << bwd_data_time; + std::cout << std::setw(20) << bwd_params_time; + } std::cout << std::endl; } diff --git a/results/train/DeepBench_NV_1080Ti.xlsx b/results/train/DeepBench_NV_1080Ti.xlsx index 9bc99fc..29a4cc0 100644 Binary files a/results/train/DeepBench_NV_1080Ti.xlsx and b/results/train/DeepBench_NV_1080Ti.xlsx differ diff --git a/results/train/DeepBench_NV_M40.xlsx b/results/train/DeepBench_NV_M40.xlsx index d483a5e..3cd8d20 100644 Binary files a/results/train/DeepBench_NV_M40.xlsx and b/results/train/DeepBench_NV_M40.xlsx differ diff --git a/results/train/DeepBench_NV_P100.xlsx b/results/train/DeepBench_NV_P100.xlsx index 6aa1b71..b59aca2 100644 Binary files a/results/train/DeepBench_NV_P100.xlsx and b/results/train/DeepBench_NV_P100.xlsx differ diff --git a/results/train/DeepBench_NV_TitanX.xlsx b/results/train/DeepBench_NV_TitanX.xlsx index a91b1c2..cd5137e 100644 Binary files a/results/train/DeepBench_NV_TitanX.xlsx and b/results/train/DeepBench_NV_TitanX.xlsx differ diff --git a/results/train/DeepBench_NV_TitanX_Pascal.xlsx b/results/train/DeepBench_NV_TitanX_Pascal.xlsx index 6b0e11d..98ebd26 100644 Binary files a/results/train/DeepBench_NV_TitanX_Pascal.xlsx and b/results/train/DeepBench_NV_TitanX_Pascal.xlsx differ diff --git a/results/train/DeepBench_NV_TitanXp.xlsx b/results/train/DeepBench_NV_TitanXp.xlsx index 9bc3b7c..35f2e0a 100644 Binary files a/results/train/DeepBench_NV_TitanXp.xlsx and b/results/train/DeepBench_NV_TitanXp.xlsx differ diff --git a/results/train/DeepBench_NV_V100.xlsx b/results/train/DeepBench_NV_V100.xlsx index 8ea86ce..fb2a4ba 100644 Binary files a/results/train/DeepBench_NV_V100.xlsx and b/results/train/DeepBench_NV_V100.xlsx differ