Reputation: 311
I'm initializing my 2 sets of weights/bias using the same function
1st set:
W_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.1, 0.1), name='W_omega')
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='b_omega')
2nd set:
W = tf.Variable(tf.random_uniform([input_dim, output_dim], -0.1, 0.1), name='W_post_attn')
b = tf.Variable(tf.random_uniform([output_dim], -0.1, 0.1), name='b_post_attn')
But the histogram in Tensorboard shows that the 2nd set of bias is not uniformly distributed (binary distribution centered around +/-0.06, see top image below).
Any idea what might be causing this?
Adding a dummy code using MNIST data and running on jupyter notebook. Note that my orignial code is a binary classification while MNIST has 10 classes. It seems like the number of peaks in the output bias (at the final layer) is correlated to the number of output classes (see figure below).
from __future__ import division, print_function, unicode_literals
from functools import partial
import numpy as np
import os
import tensorflow as tf
def reset_graph(seed=42):
tf.reset_default_graph()
tf.set_random_seed(seed)
np.random.seed(seed)
path = '/your_folder/'
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/mnist/tmp/data/")
reset_graph()
def variable_summaries(var, name):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope(name):
mean = tf.reduce_mean(var)
tf.summary.scalar('mean', mean)
tf.summary.scalar('max', tf.reduce_max(var))
tf.summary.scalar('min', tf.reduce_min(var))
tf.summary.histogram('histogram', var)
n_inputs = 28*28 # MNIST
n_hidden1 = 200
n_outputs = 10
learning_rate = 0.01
n_epochs = 50
batch_size = 50
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=(None), name="y")
def attention(inputs, attention_size, name):
hidden_size = int(inputs.get_shape()[1])
# Trainable parameters
with tf.name_scope(name):
with tf.name_scope('Attention_variables'):
W_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.1, 0.1), name='W_omega')
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='b_omega')
u_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='u_omega')
variable_summaries(W_omega, 'W_omega')
variable_summaries(b_omega, 'b_omega')
with tf.name_scope('Attention_u_it'):
v = tf.tanh(tf.tensordot(inputs, W_omega, axes=[[1], [0]]) + b_omega, name='u_it')
with tf.name_scope('Attention_alpha_it'):
vu = tf.tensordot(v, u_omega, axes=[[1], [0]], name='u_it_u_w')
alphas = tf.nn.softmax(vu, name='alphas')
with tf.name_scope('Attention_output'):
#output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1, name='attention_output')
output = inputs * tf.expand_dims(alphas, -1)
return output
def neuron_layer(X, n_neurons, name, activation=None):
with tf.name_scope(name):
n_inputs = int(X.get_shape()[1])
W = tf.Variable(tf.random_uniform([n_inputs, n_neurons], -0.1, 0.1), name='W')
b = tf.Variable(tf.random_uniform([n_neurons], -0.1, 0.1), name='b')
variable_summaries(W, 'W')
variable_summaries(b, 'b')
if activation is not None:
return activation(tf.matmul(X, W) + b)
else:
return tf.matmul(X, W) + b
with tf.name_scope("dnn"):
hidden1 = attention(X, n_hidden1, name="hidden1_attn")
logits = neuron_layer(hidden1, n_outputs, name="outputs")
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
with tf.name_scope("train"):
opt = tf.train.GradientDescentOptimizer(learning_rate)
training_op = opt.minimize(loss)
with tf.name_scope("eval"):
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
#tensorboard saving parameters
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "/tensorboard_files/"
logdir = "{}/run-{}/".format(root_logdir, now)
# Merge all the summaries and write them out to /tmp/mnist_logs (by default)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(logdir+'/train', tf.get_default_graph())
with tf.Session() as sess:
init.run()
for epoch in range(n_epochs):
for iteration in range(mnist.train.num_examples // batch_size):
X_batch, y_batch = mnist.train.next_batch(batch_size)
if epoch%2==0:
acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
#tensorboard summary
summary = sess.run(merged, feed_dict={X: X_batch, y: y_batch})
train_writer.add_summary(summary, epoch)
acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
y: mnist.validation.labels})
print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)
save_path = saver.save(sess, path+"my_model_final.ckpt")
Upvotes: 1
Views: 503
Reputation: 53758
The key value that defines how the histogram looks like is the size of the random vector, in your case, it's attention_size=200
and n_neurons=10
(or output_dim=2
in the first snippet). Clearly, the larger the sample size, the closer the sample to uniform. That's why the distribution of b_omega
looks more more like uniform, than the distribution of b
.
Set attention_size=10
and here's what you'll see:
The actual b
values behind this chart are (note the peaks at 0.01
):
[ 0.05595738 0.01231904 -0.08605836 0.01057353 -0.03015073 -0.04255719
0.04719915 0.01116617 -0.0672287 -0.00013051]
The actual b_omega
values are:
[-0.06326838 -0.09758444 0.06982093 0.01574633 0.0039237 0.07463291
0.02308519 0.04594345 0.07912541 0.00175323]
Also note that distributions are the same in each epoch (that's the chart depth), because no weights and biases are ever updated.
Bottom line: initialization is correct, but the histogram chart can be confusing, if one doesn't pay attention to the shape of the variables.
Upvotes: 1