Commit 39f69fb

committed Oct 23, 2015
Merge pull request BVLC#3229 from cdoersch/batchnorm2
Yet another batch normalization PR
2 parents: 50a23b7 + a52ee65

11 files changed: +1139 −2 lines
 
@@ -0,0 +1,28 @@
# reduce learning rate after 120 epochs (60000 iters) by a factor of 10
# then another factor of 10 after 10 more epochs (5000 iters)

# The train/test net protocol buffer definition
net: "examples/cifar10/cifar10_full_sigmoid_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of CIFAR10, we have test batch size 1000 and 10 test iterations,
# covering the full 10,000 testing images.
test_iter: 10
# Carry out testing every 1000 training iterations.
test_interval: 1000
# The base learning rate, momentum and the weight decay of the network.
base_lr: 0.001
momentum: 0.9
#weight_decay: 0.004
# The learning rate policy
lr_policy: "step"
gamma: 1
stepsize: 5000
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 60000
# snapshot intermediate results
snapshot: 10000
snapshot_prefix: "examples/cifar10_full_sigmoid"
# solver mode: CPU or GPU
solver_mode: GPU
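
For readers new to Caffe solvers: the "step" policy scales the base rate by gamma once every stepsize iterations, i.e. lr = base_lr * gamma^floor(iter / stepsize). Below is a minimal C++ sketch (not part of this commit) that evaluates that schedule for the values above; note that with gamma: 1 the rate never actually changes, so the comments about reducing the learning rate only take effect if gamma is set below 1.

#include <cmath>
#include <cstdio>

// Sketch only: Caffe's "step" learning-rate policy evaluated for the solver
// settings above (base_lr 0.001, gamma 1, stepsize 5000).
int main() {
  const double base_lr = 0.001;
  const double gamma = 1.0;   // 1.0 means the rate stays constant
  const int stepsize = 5000;
  for (int iter = 0; iter <= 60000; iter += 5000) {
    // integer division gives floor(iter / stepsize)
    const double lr = base_lr * std::pow(gamma, iter / stepsize);
    std::printf("iter %6d  lr %g\n", iter, lr);
  }
  return 0;
}
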
@@ -0,0 +1,28 @@
# reduce learning rate after 120 epochs (60000 iters) by a factor of 10
# then another factor of 10 after 10 more epochs (5000 iters)

# The train/test net protocol buffer definition
net: "examples/cifar10/cifar10_full_sigmoid_train_test_bn.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of CIFAR10, we have test batch size 1000 and 10 test iterations,
# covering the full 10,000 testing images.
test_iter: 10
# Carry out testing every 1000 training iterations.
test_interval: 1000
# The base learning rate, momentum and the weight decay of the network.
base_lr: 0.001
momentum: 0.9
#weight_decay: 0.004
# The learning rate policy
lr_policy: "step"
gamma: 1
stepsize: 5000
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 60000
# snapshot intermediate results
snapshot: 10000
snapshot_prefix: "examples/cifar10_full_sigmoid_bn"
# solver mode: CPU or GPU
solver_mode: GPU

@@ -0,0 +1,212 @@
name: "CIFAR10_full"
layer {
  name: "cifar"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    mean_file: "examples/cifar10/mean.binaryproto"
  }
  data_param {
    source: "examples/cifar10/cifar10_train_lmdb"
    batch_size: 111
    backend: LMDB
  }
}
layer {
  name: "cifar"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    mean_file: "examples/cifar10/mean.binaryproto"
  }
  data_param {
    source: "examples/cifar10/cifar10_test_lmdb"
    batch_size: 1000
    backend: LMDB
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.0001
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}

layer {
  name: "Sigmoid1"
  type: "Sigmoid"
  bottom: "pool1"
  top: "Sigmoid1"
}

layer {
  name: "conv2"
  type: "Convolution"
  bottom: "Sigmoid1"
  top: "conv2"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
}

layer {
  name: "Sigmoid2"
  type: "Sigmoid"
  bottom: "conv2"
  top: "Sigmoid2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "Sigmoid2"
  top: "pool2"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 1
  }
}

layer {
  name: "Sigmoid3"
  type: "Sigmoid"
  bottom: "conv3"
  top: "Sigmoid3"
}

layer {
  name: "pool3"
  type: "Pooling"
  bottom: "Sigmoid3"
  top: "pool3"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}

layer {
  name: "ip1"
  type: "InnerProduct"
  bottom: "pool3"
  top: "ip1"
  param {
    lr_mult: 1
    decay_mult: 0
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 10
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip1"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip1"
  bottom: "label"
  top: "loss"
}

@@ -0,0 +1,240 @@
name: "CIFAR10_full"
layer {
  name: "cifar"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    mean_file: "examples/cifar10/mean.binaryproto"
  }
  data_param {
    source: "examples/cifar10/cifar10_train_lmdb"
    batch_size: 100
    backend: LMDB
  }
}
layer {
  name: "cifar"
  type: "Data"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    mean_file: "examples/cifar10/mean.binaryproto"
  }
  data_param {
    source: "examples/cifar10/cifar10_test_lmdb"
    batch_size: 1000
    backend: LMDB
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    bias_term: false
    weight_filler {
      type: "gaussian"
      std: 0.0001
    }
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}

layer {
  name: "bn1"
  type: "BatchNorm"
  bottom: "pool1"
  top: "bn1"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}

layer {
  name: "Sigmoid1"
  type: "Sigmoid"
  bottom: "bn1"
  top: "Sigmoid1"
}

layer {
  name: "conv2"
  type: "Convolution"
  bottom: "Sigmoid1"
  top: "conv2"
  param {
    lr_mult: 1
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    bias_term: false
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
  }
}

layer {
  name: "bn2"
  type: "BatchNorm"
  bottom: "conv2"
  top: "bn2"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}

layer {
  name: "Sigmoid2"
  type: "Sigmoid"
  bottom: "bn2"
  top: "Sigmoid2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "Sigmoid2"
  top: "pool2"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
  }
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    stride: 1
    bias_term: false
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
  }
}

layer {
  name: "bn3"
  type: "BatchNorm"
  bottom: "conv3"
  top: "bn3"
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
  param {
    lr_mult: 0
  }
}

layer {
  name: "Sigmoid3"
  type: "Sigmoid"
  bottom: "bn3"
  top: "Sigmoid3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "Sigmoid3"
  top: "pool3"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}

layer {
  name: "ip1"
  type: "InnerProduct"
  bottom: "pool3"
  top: "ip1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 1
    decay_mult: 0
  }
  inner_product_param {
    num_output: 10
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip1"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip1"
  bottom: "label"
  top: "loss"
}

@@ -0,0 +1,7 @@
#!/usr/bin/env sh

TOOLS=./build/tools

$TOOLS/caffe train \
    --solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt

@@ -0,0 +1,7 @@
#!/usr/bin/env sh

TOOLS=./build/tools

$TOOLS/caffe train \
    --solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt

‎include/caffe/common_layers.hpp

+67 −1
@@ -73,6 +73,73 @@ class ArgMaxLayer : public Layer<Dtype> {
   int axis_;
 };
 
+/**
+ * @brief Normalizes the input to have 0-mean and/or unit (1) variance across
+ *        the batch.
+ *
+ * This layer computes Batch Normalization as described in [1]. For
+ * each channel in the data (i.e. axis 1), it subtracts the mean and divides
+ * by the variance, where both statistics are computed across both spatial
+ * dimensions and across the different examples in the batch.
+ *
+ * By default, during training time, the network computes global mean/
+ * variance statistics via a running average, which is then used at test
+ * time to allow deterministic outputs for each input. You can manually
+ * toggle whether the network is accumulating or using the statistics via the
+ * use_global_stats option. IMPORTANT: for this feature to work, you MUST
+ * set the learning rate to zero for all three parameter blobs, i.e.,
+ * param {lr_mult: 0} three times in the layer definition.
+ *
+ * Note that the original paper also included a per-channel learned bias and
+ * scaling factor. It is possible (though a bit cumbersome) to implement
+ * this in caffe using a single-channel DummyDataLayer filled with zeros,
+ * followed by a Convolution layer with output the same size as the current.
+ * This produces a channel-specific value that can be added or multiplied by
+ * the BatchNorm layer's output.
+ *
+ * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
+ *     Training by Reducing Internal Covariate Shift." arXiv preprint
+ *     arXiv:1502.03167 (2015).
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class BatchNormLayer : public Layer<Dtype> {
+ public:
+  explicit BatchNormLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchNorm"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> mean_, variance_, temp_, x_norm_;
+  bool use_global_stats_;
+  Dtype moving_average_fraction_;
+  int channels_;
+  Dtype eps_;
+
+  // extra temporary variables are used to carry out sums/broadcasting
+  // using BLAS
+  Blob<Dtype> batch_sum_multiplier_;
+  Blob<Dtype> num_by_chans_;
+  Blob<Dtype> spatial_sum_multiplier_;
+};
+
 /**
  * @brief Index into the input blob along its first axis.
  *
@@ -141,7 +208,6 @@ class BatchReindexLayer : public Layer<Dtype> {
       const Dtype* ridx_data);
 };
 
-
 /**
  * @brief Takes at least two Blob%s and concatenates them along either the num
  * or channel dimension, outputting the result.
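
To make the class comment concrete: writing \mu_c and \sigma_c^2 for the mean and variance of channel c taken over the batch and both spatial dimensions, and \varepsilon for the eps field of BatchNormParameter, each input value x in that channel is mapped to

\[ \hat{x} = \frac{x - \mu_c}{\sqrt{\sigma_c^2 + \varepsilon}} \]

The learned per-channel scale and shift from the original paper are intentionally not part of this layer; the comment above describes a DummyData + Convolution workaround for them.
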

‎src/caffe/layers/batch_norm_layer.cpp

+236
@@ -0,0 +1,236 @@
#include <algorithm>
#include <vector>

#include "caffe/common_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void BatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  BatchNormParameter param = this->layer_param_.batch_norm_param();
  moving_average_fraction_ = param.moving_average_fraction();
  use_global_stats_ = this->phase_ == TEST;
  if (param.has_use_global_stats())
    use_global_stats_ = param.use_global_stats();
  if (bottom[0]->num_axes() == 1)
    channels_ = 1;
  else
    channels_ = bottom[0]->shape(1);
  eps_ = param.eps();
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    this->blobs_.resize(3);
    vector<int> sz;
    sz.push_back(channels_);
    this->blobs_[0].reset(new Blob<Dtype>(sz));
    this->blobs_[1].reset(new Blob<Dtype>(sz));
    sz[0] = 1;
    this->blobs_[2].reset(new Blob<Dtype>(sz));
    for (int i = 0; i < 3; ++i) {
      caffe_set(this->blobs_[i]->count(), Dtype(0),
          this->blobs_[i]->mutable_cpu_data());
    }
  }
}

template <typename Dtype>
void BatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  if (bottom[0]->num_axes() >= 1)
    CHECK_EQ(bottom[0]->shape(1), channels_);
  top[0]->ReshapeLike(*bottom[0]);

  vector<int> sz;
  sz.push_back(channels_);
  mean_.Reshape(sz);
  variance_.Reshape(sz);
  temp_.ReshapeLike(*bottom[0]);
  x_norm_.ReshapeLike(*bottom[0]);
  sz[0] = bottom[0]->shape(0);
  batch_sum_multiplier_.Reshape(sz);

  int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
  if (spatial_sum_multiplier_.num_axes() == 0 ||
      spatial_sum_multiplier_.shape(0) != spatial_dim) {
    sz[0] = spatial_dim;
    spatial_sum_multiplier_.Reshape(sz);
    Dtype* multiplier_data = spatial_sum_multiplier_.mutable_cpu_data();
    caffe_set(spatial_sum_multiplier_.count(), Dtype(1), multiplier_data);
  }

  int numbychans = channels_*bottom[0]->shape(0);
  if (num_by_chans_.num_axes() == 0 ||
      num_by_chans_.shape(0) != numbychans) {
    sz[0] = numbychans;
    num_by_chans_.Reshape(sz);
    caffe_set(batch_sum_multiplier_.count(), Dtype(1),
        batch_sum_multiplier_.mutable_cpu_data());
  }
}

template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  int num = bottom[0]->shape(0);
  int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);

  // elementwise square
  caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
      temp_.mutable_cpu_data());

  if (use_global_stats_) {
    // use the stored mean/variance estimates. TODO(cdoersch): allow an option
    // to use an unbiased variance estimate, like the paper does.
    const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
    caffe_cpu_scale(variance_.count(), scale_factor,
        this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
  } else {
    // computes variance using var(X) = E(X^2) - (EX)^2
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), bottom_data,
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data());
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        mean_.mutable_cpu_data());
    caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), temp_.cpu_data(),
        spatial_sum_multiplier_.cpu_data(), 0.,
        num_by_chans_.mutable_cpu_data());
    caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
        variance_.mutable_cpu_data());
    this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
    this->blobs_[2]->mutable_cpu_data()[0] += 1;
    caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
        moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
    Dtype m = Dtype(bottom[0]->count()/channels_);
    caffe_cpu_axpby(variance_.count(), m/(m-1), variance_.cpu_data(),
        moving_average_fraction_, this->blobs_[1]->mutable_cpu_data());
  }
  // elementwise square of mean
  caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
      temp_.mutable_cpu_data());

  caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
      variance_.mutable_cpu_data());  // variance

  // normalize variance
  caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
  caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
      variance_.mutable_cpu_data());

  // do mean and variance normalization
  if (bottom[0] != top[0]) {
    caffe_copy(bottom[0]->count(), bottom_data, top_data);
  }
  // subtract mean
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, -1, num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 1., top_data);
  // replicate variance to input size
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data());
  caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data);
  // TODO(cdoersch): The caching is only needed because later in-place layers
  // might clobber the data.  Can we skip this if they won't?
  caffe_copy(x_norm_.count(), top_data,
      x_norm_.mutable_cpu_data());
}

template <typename Dtype>
void BatchNormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  CHECK(!use_global_stats_);
  const Dtype* top_diff;
  if (bottom[0] != top[0]) {
    top_diff = top[0]->cpu_diff();
  } else {
    caffe_copy(x_norm_.count(), top[0]->cpu_diff(), x_norm_.mutable_cpu_diff());
    top_diff = x_norm_.cpu_diff();
  }
  const Dtype* top_data = x_norm_.cpu_data();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  int num = bottom[0]->shape()[0];
  int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);
  // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
  //
  // dE(Y)/dX =
  //   (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
  //     ./ sqrt(var(X) + eps)
  //
  // where \cdot and ./ are Hadamard product and elementwise division,
  // respectively, dE/dY is the top diff, and mean/var/sum are all computed
  // along all dimensions except the channels dimension.  In the above
  // equation, the operations allow for expansion (i.e. broadcast) along all
  // dimensions except the channels dimension where required.

  // sum(dE/dY \cdot Y)
  caffe_mul(temp_.count(), top_data, top_diff, bottom_diff);
  caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      bottom_diff, spatial_sum_multiplier_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
      mean_.mutable_cpu_data());

  // reshape (broadcast) the above
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 0., bottom_diff);

  // sum(dE/dY \cdot Y) \cdot Y
  caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff);

  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      top_diff, spatial_sum_multiplier_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
      mean_.mutable_cpu_data());
  // reshape (broadcast) the above to make
  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
      num_by_chans_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
      spatial_dim, 1, 1., num_by_chans_.cpu_data(),
      spatial_sum_multiplier_.cpu_data(), 1., bottom_diff);

  // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
  caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff,
      Dtype(-1. / (num * spatial_dim)), bottom_diff);

  // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
  // pass.
  caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
}

#ifdef CPU_ONLY
STUB_GPU(BatchNormLayer);
#endif

INSTANTIATE_CLASS(BatchNormLayer);
REGISTER_LAYER_CLASS(BatchNorm);
}  // namespace caffe
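
To see what the gemv/gemm calls in Forward_cpu amount to, here is a plain-loop sketch of the training-mode arithmetic: per-channel mean and variance via var(X) = E(X^2) - (EX)^2, the moving-average update that also accumulates a weight in the third blob, and the normalization itself. This is illustrative C++ only, not Caffe API; the N x C x H x W layout and all variable names are assumptions made for the example.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int N = 2, C = 3, H = 4, W = 4;
  const float eps = 1e-5f, lambda = 0.999f;  // eps and moving_average_fraction
  std::vector<float> x(N * C * H * W);
  for (size_t i = 0; i < x.size(); ++i)
    x[i] = 0.01f * static_cast<float>(i % 17) - 0.08f;  // arbitrary test data

  // running sums corresponding to blobs_[0], blobs_[1], blobs_[2][0]
  std::vector<float> run_mean(C, 0.f), run_var(C, 0.f);
  float run_scale = 0.f;

  const int S = H * W;                     // spatial_dim
  const float m = static_cast<float>(N * S);
  std::vector<float> mean(C), var(C);
  for (int c = 0; c < C; ++c) {
    float sum = 0.f, sumsq = 0.f;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < S; ++s) {
        const float v = x[(n * C + c) * S + s];
        sum += v;
        sumsq += v * v;
      }
    mean[c] = sum / m;
    var[c] = sumsq / m - mean[c] * mean[c];  // var(X) = E(X^2) - (EX)^2
  }

  // moving-average update: decay the old sums, add the new batch statistics
  run_scale = lambda * run_scale + 1.f;
  for (int c = 0; c < C; ++c) {
    run_mean[c] = lambda * run_mean[c] + mean[c];
    run_var[c] = lambda * run_var[c] + (m / (m - 1.f)) * var[c];  // unbiased
  }

  // normalize in place: y = (x - mean) / sqrt(var + eps)
  for (int c = 0; c < C; ++c) {
    const float inv_std = 1.f / std::sqrt(var[c] + eps);
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < S; ++s)
        x[(n * C + c) * S + s] = (x[(n * C + c) * S + s] - mean[c]) * inv_std;
  }

  std::printf("channel 0: batch mean %g, debiased running mean %g\n",
              mean[0], run_mean[0] / run_scale);
  return 0;
}

At test time the layer uses the accumulated sums divided by the accumulated weight (run_mean[c] / run_scale here), which is exactly what the 1 / blobs_[2] scale_factor does in the use_global_stats_ branch above.
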

‎src/caffe/layers/batch_norm_layer.cu

+167
@@ -0,0 +1,167 @@
#include <algorithm>
#include <vector>

#include "caffe/common_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  int num = bottom[0]->shape(0);
  int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));

  // elementwise square
  caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
      temp_.mutable_gpu_data());

  if (use_global_stats_) {
    // use the stored mean/variance estimates. TODO(cdoersch): allow an option
    // to use an unbiased variance estimate, like the paper does.
    const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
    caffe_gpu_scale(variance_.count(), scale_factor,
        this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data());
    caffe_gpu_scale(variance_.count(), scale_factor,
        this->blobs_[1]->gpu_data(), variance_.mutable_gpu_data());
  } else {
    // computes variance using var(X) = E(X^2) - (EX)^2
    caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), bottom_data,
        spatial_sum_multiplier_.gpu_data(), 0.,
        num_by_chans_.mutable_gpu_data());
    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
        mean_.mutable_gpu_data());
    caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
        1. / (num * spatial_dim), temp_.gpu_data(),
        spatial_sum_multiplier_.gpu_data(), 0.,
        num_by_chans_.mutable_gpu_data());
    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
        variance_.mutable_gpu_data());
    this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
    this->blobs_[2]->mutable_cpu_data()[0] += 1;
    caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(),
        moving_average_fraction_, this->blobs_[0]->mutable_gpu_data());
    Dtype m = Dtype(bottom[0]->count()/channels_);
    caffe_gpu_axpby(variance_.count(), m/(m-1), variance_.gpu_data(),
        moving_average_fraction_, this->blobs_[1]->mutable_gpu_data());
  }
  // elementwise square of mean
  caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
      temp_.mutable_gpu_data());

  caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
      variance_.mutable_gpu_data());  // variance

  // normalize variance
  caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
  caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
      variance_.mutable_gpu_data());

  // do mean and variance normalization
  if (bottom[0] != top[0]) {
    caffe_copy(bottom[0]->count(), bottom_data, top_data);
  }
  // subtract mean
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, -1, num_by_chans_.gpu_data(),
      spatial_sum_multiplier_.gpu_data(), 1., top_data);
  // replicate variance to input size
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.gpu_data(), variance_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.gpu_data(),
      spatial_sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data());
  caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data);
  // TODO(cdoersch): The caching is only needed because later in-place layers
  // might clobber the data.  Can we skip this if they won't?
  caffe_copy(x_norm_.count(), top_data,
      x_norm_.mutable_gpu_data());
}

template <typename Dtype>
void BatchNormLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  CHECK(!use_global_stats_);
  const Dtype* top_diff;
  if (bottom[0] != top[0]) {
    top_diff = top[0]->gpu_diff();
  } else {
    caffe_copy(x_norm_.count(), top[0]->gpu_diff(), x_norm_.mutable_gpu_diff());
    top_diff = x_norm_.gpu_diff();
  }
  const Dtype* top_data = x_norm_.gpu_data();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  int num = bottom[0]->shape()[0];
  int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));
  // if Y = (X-mean(X))/(sqrt(var(X)+eps)), then
  //
  // dE(Y)/dX =
  //   (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
  //     ./ sqrt(var(X) + eps)
  //
  // where \cdot and ./ are Hadamard product and elementwise division,
  // respectively, dE/dY is the top diff, and mean/var/sum are all computed
  // along all dimensions except the channels dimension.  In the above
  // equation, the operations allow for expansion (i.e. broadcast) along all
  // dimensions except the channels dimension where required.

  // sum(dE/dY \cdot Y)
  caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff);
  caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      bottom_diff, spatial_sum_multiplier_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
      mean_.mutable_gpu_data());

  // reshape (broadcast) the above
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
      spatial_dim, 1, 1., num_by_chans_.gpu_data(),
      spatial_sum_multiplier_.gpu_data(), 0., bottom_diff);

  // sum(dE/dY \cdot Y) \cdot Y
  caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff);

  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim, 1.,
      top_diff, spatial_sum_multiplier_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
      num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
      mean_.mutable_gpu_data());
  // reshape (broadcast) the above to make
  // sum(dE/dY)-sum(dE/dY \cdot Y) \cdot Y
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
      batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
      num_by_chans_.mutable_gpu_data());
  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num * channels_,
      spatial_dim, 1, 1., num_by_chans_.gpu_data(),
      spatial_sum_multiplier_.gpu_data(), 1., bottom_diff);

  // dE/dY - mean(dE/dY)-mean(dE/dY \cdot Y) \cdot Y
  caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff,
      Dtype(-1. / (num * spatial_dim)), bottom_diff);

  // note: temp_ still contains sqrt(var(X)+eps), computed during the forward
  // pass.
  caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
}

INSTANTIATE_LAYER_GPU_FUNCS(BatchNormLayer);

}  // namespace caffe
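
Written out, the gradient that both Backward_cpu and Backward_gpu implement (this is the formula from the code comments above, in LaTeX; the means are taken per channel over the batch and spatial dimensions, and \odot is the elementwise product):

\[ \frac{\partial E}{\partial X} =
   \frac{\dfrac{\partial E}{\partial Y}
         - \operatorname{mean}\!\left(\dfrac{\partial E}{\partial Y}\right)
         - \operatorname{mean}\!\left(\dfrac{\partial E}{\partial Y} \odot Y\right) \odot Y}
        {\sqrt{\operatorname{var}(X) + \varepsilon}} \]

where Y = (X - mean(X)) / sqrt(var(X) + eps) is the normalized output cached in x_norm_.
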

‎src/caffe/proto/caffe.proto

+14 −1
@@ -306,7 +306,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 139 (last added: tile_param)
+// LayerParameter next available layer-specific ID: 140 (last added: batch_norm_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -355,6 +355,7 @@ message LayerParameter {
   // The default for the engine is set by the ENGINE switch at compile-time.
   optional AccuracyParameter accuracy_param = 102;
   optional ArgMaxParameter argmax_param = 103;
+  optional BatchNormParameter batch_norm_param = 139;
   optional ConcatParameter concat_param = 104;
   optional ContrastiveLossParameter contrastive_loss_param = 105;
   optional ConvolutionParameter convolution_param = 106;
@@ -466,6 +467,18 @@ message ConcatParameter {
   optional uint32 concat_dim = 1 [default = 1];
 }
 
+message BatchNormParameter {
+  // If false, accumulate global mean/variance values via a moving average. If
+  // true, use those accumulated values instead of computing mean/variance
+  // across the batch.
+  optional bool use_global_stats = 1;
+  // How much does the moving average decay each iteration?
+  optional float moving_average_fraction = 2 [default = .999];
+  // Small value to add to the variance estimate so that we don't divide by
+  // zero.
+  optional float eps = 3 [default = 1e-5];
+}
+
 message ContrastiveLossParameter {
   // margin for dissimilar pair
   optional float margin = 1 [default = 1.0];
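
A note on how these fields relate to the blobs initialized in LayerSetUp (derived from the implementation above, stated here as a sketch): with lambda = moving_average_fraction, the first two blobs store geometrically decayed sums of the batch statistics and the third blob stores the matching sum of weights, so the test-time estimate after T training batches is

\[ \hat{\mu} = \frac{\sum_{t=1}^{T} \lambda^{T-t}\,\mu_t}{\sum_{t=1}^{T} \lambda^{T-t}} \]

and likewise for the variance. A larger moving_average_fraction therefore averages over more batches, and eps is added to the variance before the square root to avoid division by zero.
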

@@ -0,0 +1,133 @@
#include <algorithm>
#include <cstring>
#include <vector>

#include "gtest/gtest.h"

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/common_layers.hpp"
#include "caffe/filler.hpp"

#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

#define BATCH_SIZE 2
#define INPUT_DATA_SIZE 3

namespace caffe {

template <typename TypeParam>
class BatchNormLayerTest : public MultiDeviceTest<TypeParam> {
  typedef typename TypeParam::Dtype Dtype;
 protected:
  BatchNormLayerTest()
      : blob_bottom_(new Blob<Dtype>(5, 2, 3, 4)),
        blob_top_(new Blob<Dtype>()) {
    // fill the values
    FillerParameter filler_param;
    GaussianFiller<Dtype> filler(filler_param);
    filler.Fill(this->blob_bottom_);
    blob_bottom_vec_.push_back(blob_bottom_);
    blob_top_vec_.push_back(blob_top_);
  }
  virtual ~BatchNormLayerTest() { delete blob_bottom_; delete blob_top_; }
  Blob<Dtype>* const blob_bottom_;
  Blob<Dtype>* const blob_top_;
  vector<Blob<Dtype>*> blob_bottom_vec_;
  vector<Blob<Dtype>*> blob_top_vec_;
};

TYPED_TEST_CASE(BatchNormLayerTest, TestDtypesAndDevices);

TYPED_TEST(BatchNormLayerTest, TestForward) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;

  BatchNormLayer<Dtype> layer(layer_param);
  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);

  // Test mean
  int num = this->blob_bottom_->num();
  int channels = this->blob_bottom_->channels();
  int height = this->blob_bottom_->height();
  int width = this->blob_bottom_->width();

  for (int j = 0; j < channels; ++j) {
    Dtype sum = 0, var = 0;
    for (int i = 0; i < num; ++i) {
      for (int k = 0; k < height; ++k) {
        for (int l = 0; l < width; ++l) {
          Dtype data = this->blob_top_->data_at(i, j, k, l);
          sum += data;
          var += data * data;
        }
      }
    }
    sum /= height * width * num;
    var /= height * width * num;

    const Dtype kErrorBound = 0.001;
    // expect zero mean
    EXPECT_NEAR(0, sum, kErrorBound);
    // expect unit variance
    EXPECT_NEAR(1, var, kErrorBound);
  }
}

TYPED_TEST(BatchNormLayerTest, TestForwardInplace) {
  typedef typename TypeParam::Dtype Dtype;
  Blob<Dtype> blob_inplace(5, 2, 3, 4);
  vector<Blob<Dtype>*> blob_bottom_vec;
  vector<Blob<Dtype>*> blob_top_vec;
  LayerParameter layer_param;
  FillerParameter filler_param;
  GaussianFiller<Dtype> filler(filler_param);
  filler.Fill(&blob_inplace);
  blob_bottom_vec.push_back(&blob_inplace);
  blob_top_vec.push_back(&blob_inplace);

  BatchNormLayer<Dtype> layer(layer_param);
  layer.SetUp(blob_bottom_vec, blob_top_vec);
  layer.Forward(blob_bottom_vec, blob_top_vec);

  // Test mean
  int num = blob_inplace.num();
  int channels = blob_inplace.channels();
  int height = blob_inplace.height();
  int width = blob_inplace.width();

  for (int j = 0; j < channels; ++j) {
    Dtype sum = 0, var = 0;
    for (int i = 0; i < num; ++i) {
      for (int k = 0; k < height; ++k) {
        for (int l = 0; l < width; ++l) {
          Dtype data = blob_inplace.data_at(i, j, k, l);
          sum += data;
          var += data * data;
        }
      }
    }
    sum /= height * width * num;
    var /= height * width * num;

    const Dtype kErrorBound = 0.001;
    // expect zero mean
    EXPECT_NEAR(0, sum, kErrorBound);
    // expect unit variance
    EXPECT_NEAR(1, var, kErrorBound);
  }
}

TYPED_TEST(BatchNormLayerTest, TestGradient) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;

  BatchNormLayer<Dtype> layer(layer_param);
  GradientChecker<Dtype> checker(1e-2, 1e-4);
  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
      this->blob_top_vec_);
}

}  // namespace caffe
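
For context on TestGradient: assuming the usual GradientChecker(stepsize, threshold) signature, the checker perturbs each input element and compares the analytic gradient from Backward against a central finite-difference estimate, roughly

\[ \frac{\partial E}{\partial x_i} \approx \frac{E(x_i + h) - E(x_i - h)}{2h}, \qquad h = 10^{-2}, \]

accepting the layer if the two agree to within a tolerance proportional to the 10^{-4} threshold.
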
