Reputation: 201
I have tried several versions of batch normalization in TensorFlow, but none of them worked! The results were all incorrect when I set batch_size = 1 at inference time.
Version 1: directly use the official version in tensorflow.contrib
from tensorflow.contrib.layers.python.layers.layers import batch_norm
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 2: from How could I use Batch Normalization in TensorFlow?
def batch_norm_layer(x, train_phase, scope_bn='bn'):
    """Apply batch normalization, choosing train/inference mode at run time.

    Args:
        x: input tensor to normalize.
        train_phase: boolean tensor (e.g. a placeholder); True selects
            training behaviour, False selects inference behaviour.
        scope_bn: variable scope name for the layer's variables.

    Returns:
        The batch-normalized tensor.
    """
    # The previous implementation built a training graph and an inference
    # graph *outside* tf.cond and only selected between their outputs.
    # Tensors created outside the branch callables are evaluated no matter
    # which branch is taken, so with updates_collections=None the moving
    # averages were also updated at inference time. Passing the boolean
    # tensor straight to batch_norm lets the layer do the switching itself
    # and creates a single set of variables (no reuse juggling needed).
    return batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
                      updates_collections=None,
                      is_training=train_phase,
                      trainable=True,
                      scope=scope_bn)
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training is a placeholder that is fed True at training time and False at inference time.
version 3: from slim https://github.com/tensorflow/models/blob/master/inception/inception/slim/ops.py
def batch_norm_layer(inputs,
                     is_training=True,
                     scope='bn'):
    """Batch-normalize `inputs` over every axis except the last one.

    Args:
        inputs: tensor to normalize; statistics are computed per entry of
            the last dimension.
        is_training: Python bool. When true, batch statistics are used and
            the moving averages are updated; otherwise the stored moving
            averages are used.
        scope: variable scope name for the layer's variables.

    Returns:
        A tensor with the same static shape as `inputs`.
    """
    decay = 0.999
    epsilon = 0.001
    inputs_shape = inputs.get_shape()
    with tf.variable_scope(scope):
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # tf.zeros / tf.ones produce the initial values directly; calling
        # tf.zeros_initializer(shape) only works on releases where it is a
        # plain function rather than an initializer class.
        beta = tf.Variable(tf.zeros(params_shape),
                           name='beta',
                           trainable=True)
        gamma = tf.Variable(tf.ones(params_shape),
                            name='gamma',
                            trainable=True)
        moving_mean = tf.Variable(tf.zeros(params_shape),
                                  name='moving_mean',
                                  trainable=False)
        moving_variance = tf.Variable(tf.ones(params_shape),
                                      name='moving_variance',
                                      trainable=False)
        update_ops = []
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = tf.nn.moments(inputs, axis)
            update_ops = [
                moving_averages.assign_moving_average(
                    moving_mean, mean, decay),
                moving_averages.assign_moving_average(
                    moving_variance, variance, decay),
            ]
        else:
            # Just use the moving_mean and moving_variance.
            mean = moving_mean
            variance = moving_variance
        # Without this dependency nothing ever runs the assign ops, so the
        # moving averages would keep their initial 0/1 values and inference
        # would normalize with the wrong statistics.
        with tf.control_dependencies(update_ops):
            outputs = tf.nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)
        outputs.set_shape(inputs.get_shape())
        return outputs
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
version 4: like version3, but add tf.control_dependencies
def batch_norm_layer(inputs,
                     decay=0.999,
                     center=True,
                     scale=True,
                     epsilon=0.001,
                     moving_vars='moving_vars',
                     activation=None,
                     is_training=True,
                     trainable=True,
                     restore=True,
                     scope='bn',
                     reuse=None):
    """Batch-normalize `inputs`, tying moving-average updates to the output.

    Args:
        inputs: tensor to normalize over all axes but the last.
        decay: decay used for the moving averages.
        epsilon: small constant added to the variance.
        is_training: Python bool; selects batch statistics (and schedules
            the moving-average updates) or the stored moving averages.
        scope: name passed to the variable scope.
        reuse: forwarded to the variable scope.
        center, scale, moving_vars, activation, trainable, restore:
            accepted but not read by this implementation.

    Returns:
        The normalized tensor.
    """
    static_shape = inputs.get_shape()
    with tf.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse):
        reduce_axes = list(range(len(static_shape) - 1))
        channel_shape = static_shape[-1:]

        # Learned offset and scale.
        beta = tf.Variable(tf.zeros(channel_shape), name='beta')
        gamma = tf.Variable(tf.ones(channel_shape), name='gamma')

        # Running statistics; never touched by the optimizer.
        moving_mean = tf.Variable(
            tf.zeros(channel_shape), name='moving_mean', trainable=False)
        moving_variance = tf.Variable(
            tf.ones(channel_shape), name='moving_variance', trainable=False)

        if is_training:
            # Statistics of the current batch, plus the ops that fold them
            # into the running averages.
            batch_mean, batch_variance = tf.nn.moments(inputs, reduce_axes)
            mean_update = moving_averages.assign_moving_average(
                moving_mean, batch_mean, decay)
            variance_update = moving_averages.assign_moving_average(
                moving_variance, batch_variance, decay)
            stats = (batch_mean, batch_variance)
            pending_updates = [mean_update, variance_update]
        else:
            # Inference: rely on the stored running averages.
            stats = (moving_mean, moving_variance)
            pending_updates = []

        mean, variance = stats
        # Evaluating the output forces the pending updates (if any) to run.
        with tf.control_dependencies(pending_updates):
            return tf.nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
None of these 4 versions of batch normalization works correctly. So, how do I use batch normalization correctly?
Another strange phenomenon: if I replace batch_norm_layer with a no-op like this, the inference results are all the same.
def batch_norm_layer(inputs, is_training):
    """No-op stand-in for batch normalization.

    Hands `inputs` back untouched; `is_training` is accepted but not read.
    """
    passthrough = inputs
    return passthrough
Upvotes: 19
Views: 28284
Reputation: 5354
I have tested that the following simplified implementation of batch normalization gives the same result as tf.contrib.layers.batch_norm
as long as the setting is the same.
def initialize_batch_norm(scope, depth):
    """Create the batch-normalization variables under `scope`.

    Creates `gamma` (initialized to 1), `beta` (0), `moving_avg` (0) and
    `moving_var` (1); the two moving statistics are non-trainable. After
    creation the scope is switched to reuse mode.

    Args:
        scope: variable scope name under which the variables are created.
        depth: size of each variable, i.e. the number of features being
            normalized.
    """
    # NOTE(review): BatchNorm_layer looks these variables up under
    # `<scope>/BatchNorm`; confirm callers pass a matching scope here.
    with tf.variable_scope(scope) as bnscope:
        # `depth` replaces the undefined name `shape[-1]`, which raised a
        # NameError as soon as this function was called.
        tf.get_variable("gamma", depth, initializer=tf.constant_initializer(1.0))
        tf.get_variable("beta", depth, initializer=tf.constant_initializer(0.0))
        tf.get_variable("moving_avg", depth, initializer=tf.constant_initializer(0.0), trainable=False)
        tf.get_variable("moving_var", depth, initializer=tf.constant_initializer(1.0), trainable=False)
        bnscope.reuse_variables()
def BatchNorm_layer(x, scope, train, epsilon=0.001, decay=.99):
    """Perform batch normalization after a conv layer or a fc layer.

    The variables `gamma`, `beta`, `moving_avg` and `moving_var` must
    already exist under `<scope>/BatchNorm`; they are fetched in reuse mode.

    Args:
        x: input tensor; normalized over every axis except the last.
        scope: outer variable scope name that holds the `BatchNorm` scope.
        train: Python bool. When true, batch statistics are used and the
            moving averages are updated; otherwise the moving averages are
            used.
        epsilon: the variance epsilon - a small float number to avoid
            dividing by 0.
        decay: the moving average decay.

    Returns:
        The normalized tensor.
    """
    with tf.variable_scope(scope, reuse=True):
        with tf.variable_scope('BatchNorm', reuse=True):
            # gamma: a scale factor, beta: an offset
            gamma, beta = tf.get_variable("gamma"), tf.get_variable("beta")
            moving_avg, moving_var = tf.get_variable("moving_avg"), tf.get_variable("moving_var")
            shape = x.get_shape().as_list()
            control_inputs = []
            if train:
                # A concrete list of axes: a bare Python 3 `range` object is
                # not reliably accepted as the `axes` argument.
                avg, var = tf.nn.moments(x, list(range(len(shape) - 1)))
                update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
                update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
                control_inputs = [update_moving_avg, update_moving_var]
            else:
                avg = moving_avg
                var = moving_var
            # Evaluating the output forces the moving-average updates to run.
            with tf.control_dependencies(control_inputs):
                output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
    return output
The main tips with using the official implementation of batch normalization in tf.contrib.layers.batch_norm
are: (1) set is_training=True
for training time and is_training=False
for validation and testing time; (2) set updates_collections=None
to make sure that moving_variance
and moving_mean
are updated in place; (3) be aware and careful with the scope setting; (4) set decay
to be a smaller value (decay=0.9
or decay=0.99
) than default value (default is 0.999) if your dataset is small or your total training updates/steps are not that large.
Upvotes: 8
Reputation: 179
I found Zhongyu Kuang's code really useful, but I got stuck on how to dynamically switch between the train and test ops, i.e. how to move from a Python boolean is_training to a TensorFlow boolean placeholder is_training. I need this functionality to be able to test the network on the validation set during training.
Starting from his code and inspired by this, I wrote the following code:
def batch_norm(x, scope, is_training, epsilon=0.001, decay=0.99):
    """
    Build a batch normalization layer whose mode is picked at run time.

    Both the training and the inference variant of batch_norm_layer are
    constructed; tf.cond selects between them from the value of is_training.

    Args:
        x: input tensor
        scope: scope name
        is_training: boolean tensor or variable
        epsilon: epsilon parameter - see batch_norm_layer
        decay: decay parameter - see batch_norm_layer

    Returns:
        The output of the variant selected by is_training
    """
    assert isinstance(is_training, (ops.Tensor, variables.Variable))
    assert is_training.dtype == tf.bool

    def _training_branch():
        # Built first, so this call is the one that creates the variables.
        return batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=True, reuse=None)

    def _inference_branch():
        # Shares the variables created by the training branch.
        return batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=False, reuse=True)

    return tf.cond(is_training, _training_branch, _inference_branch)
def batch_norm_layer(x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None):
    """
    Performs a batch normalization layer

    Args:
        x: input tensor; normalized over every axis except the last
        scope: scope name
        is_training: python boolean value
        epsilon: the variance epsilon - a small float number to avoid dividing by 0
        decay: the moving average decay
        reuse: forwarded to tf.variable_scope; pass True to share variables
            that were already created under `scope`

    Returns:
        The ops of a batch normalization layer
    """
    with tf.variable_scope(scope, reuse=reuse):
        shape = x.get_shape().as_list()
        # gamma: a trainable scale factor
        gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0), trainable=True)
        # beta: a trainable shift value
        beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0), trainable=True)
        moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
        moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
        if is_training:
            # tf.nn.moments == Calculate the mean and the variance of the tensor x.
            # The axes are passed as a concrete list: a bare Python 3 `range`
            # object is not reliably accepted as the `axes` argument.
            avg, var = tf.nn.moments(x, list(range(len(shape) - 1)))
            update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
            update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
            control_inputs = [update_moving_avg, update_moving_var]
        else:
            avg = moving_avg
            var = moving_var
            control_inputs = []
        # Evaluating the output forces the moving-average updates (if any) to run.
        with tf.control_dependencies(control_inputs):
            output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
    return output
Then I use the batch_norm layer in this way:
# Fully connected layer followed by batch normalization and ReLU.
fc1_weights = tf.Variable(...)
# No bias is added after the matmul; batch_norm's beta supplies the offset.
fc1 = tf.matmul(x, fc1_weights)
# is_training is the boolean placeholder that selects train/test behaviour.
fc1 = batch_norm(fc1, 'fc1_bn', is_training=is_training)
fc1 = tf.nn.relu(fc1)
Where is_training is a boolean placeholder. Note that the bias addition is not needed because it is replaced by the beta parameter, as explained in the Batch Normalization paper.
During execution:
# Training phase: feeding True selects the batch-statistics branch, which
# also updates the moving averages.
# NOTE(review): only `loss` is fetched here; a real training step would
# presumably fetch an optimizer/train op as well - confirm.
sess.run(loss, feed_dict={x: bx, y: by, is_training: True})
# Testing phase: feeding False selects the branch that normalizes with the
# stored moving averages.
sess.run(loss, feed_dict={x: bx, y: by, is_training: False})
Upvotes: 2