Skip to content

Commit 352aaae

Browse files
committed
Speed-up batch normalization by avoiding branches in the inner loop
1 parent dc53ae0 commit 352aaae

File tree

1 file changed

+38
-9
lines changed

include/fdeep/layers/batch_normalization_layer.hpp

Lines changed: 38 additions & 9 deletions

Original file line number | Diff line number | Diff line change
@@ -118,26 +118,55 @@ class batch_normalization_layer : public layer
118118
}
119119

120120
tensor output(input.shape(), 0);
121+
121122
const auto denoms = fplus::transform([this](const auto& mv)
122123
{ return std::sqrt(mv + this->epsilon_); },
123124
moving_variance_);
124-
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
125-
{
126-
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
125+
126+
if (use_gamma && use_beta) {
127+
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
127128
{
128-
for (std::size_t z = 0; z < output.shape().depth_; ++z)
129+
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
129130
{
130-
if (use_gamma && use_beta) {
131+
for (std::size_t z = 0; z < output.shape().depth_; ++z)
132+
{
131133
apply_to_channel(apply_to_value_gamma_beta, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
132134
}
133-
else if (use_gamma) {
135+
}
136+
}
137+
}
138+
else if (use_gamma) {
139+
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
140+
{
141+
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
142+
{
143+
for (std::size_t z = 0; z < output.shape().depth_; ++z)
144+
{
134145
apply_to_channel(apply_to_value_gamma, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
135146
}
136-
else if (use_beta) {
147+
}
148+
}
149+
}
150+
else if (use_beta) {
151+
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
152+
{
153+
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
154+
{
155+
for (std::size_t z = 0; z < output.shape().depth_; ++z)
156+
{
137157
apply_to_channel(apply_to_value_beta, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
138158
}
139-
else {
140-
apply_to_channel(apply_to_value, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
159+
}
160+
}
161+
}
162+
else {
163+
for (std::size_t dim5 = 0; dim5 < output.shape().size_dim_5_; ++dim5)
164+
{
165+
for (std::size_t dim4 = 0; dim4 < output.shape().size_dim_4_; ++dim4)
166+
{
167+
for (std::size_t z = 0; z < output.shape().depth_; ++z)
168+
{
169+
apply_to_channel(apply_to_value, moving_mean_, beta_, gamma_, input, output, denoms[z], z, dim5, dim4);
141170
}
142171
}
143172
}

0 commit comments

Comments (0)