Reputation: 3649
I am new to TensorFlow and am trying to understand how the computation graph works. I am working through the very basic linear regression example from the TensorFlow website. I have the following piece of code:
import numpy as np
import tensorflow as tf

def manual_loss(_w, _b, _x, _y):
    _loss = 0.0
    n = len(_x)
    for j in range(n):
        _loss += (_w * _x[j] + _b - _y[j]) ** 2
    return _loss

def manual_grads(_w, _b, _x, _y):
    n = len(_x)
    g_w = 0.0
    g_b = 0
    for j in range(n):
        g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
        g_b += 2.0 * (_w * _x[j] + _b - _y[j])
    return g_w, g_b

# Model parameters
W = tf.Variable([0.3], dtype=tf.float32)
b = tf.Variable([-0.3], dtype=tf.float32)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
y = tf.placeholder(tf.float32)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y))  # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(1000):
    results = sess.run([loss, W, b, grads], {x: x_train, y: y_train})
    loss_value = results[0]
    W_value = results[1]
    b_value = results[2]
    grad_W = results[3][0]
    grad_b = results[3][1]
    manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
    manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
    new_W_value = W_value - lr * grad_W
    new_b_value = b_value - lr * grad_b
    W = tf.assign(W, value=new_W_value)
    b = tf.assign(b, value=new_b_value)
    print("***********************")
    print("loss={0}".format(loss_value))
    print("manual_loss_value={0}".format(manual_loss_value))
    print("W={0}".format(W_value))
    print("b={0}".format(b_value))
    print("manual_W={0}".format(_W))
    print("manual_b={0}".format(_b))
    print("grad_W={0}".format(grad_W))
    print("grad_b={0}".format(grad_b))
    print("manual_grad_W={0}".format(manual_grad_W))
    print("manual_grad_b={0}".format(manual_grad_b))
    print("***********************")
    _W -= lr * manual_grad_W
    _b -= lr * manual_grad_b
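For reference, here is a quick sanity check of the very first step, using the helper functions above with the initial values W = 0.3, b = -0.3:

print(manual_loss(0.3, -0.3, x_train, y_train))   # ~23.66 -- matches loss and manual_loss_value in the first output block below
print(manual_grads(0.3, -0.3, x_train, y_train))  # ~(52.0, 15.6) -- matches grad_W and grad_b in the first output block below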
I am just trying to apply plain gradient descent to the simple model whose loss is (W*x + b - y)^2. I deliberately avoid TensorFlow's own optimizers because I want to understand the underlying graph update mechanism. To check that the system computes the correct gradients, I also implemented my own loss and gradient calculation functions for linear regression. Unfortunately, TensorFlow does not seem to compute the loss and the gradients as expected. Here is the output I get:
***********************
loss=23.65999984741211
manual_loss_value=23.659999999999997
W=[ 0.30000001]
b=[-0.30000001]
manual_W=0.3
manual_b=-0.3
grad_W=[ 52.]
grad_b=[ 15.59999943]
manual_grad_W=52.0
manual_grad_b=15.599999999999998
***********************
***********************
loss=23.65999984741211
manual_loss_value=20.81095744
W=[ 0.24800001]
b=[-0.31560001]
manual_W=0.248
manual_b=-0.3156
grad_W=[ 52.]
grad_b=[ 15.59999943]
manual_grad_W=48.568
manual_grad_b=14.4352
***********************
As you can see, TensorFlow computes the wrong loss value and the wrong gradients for W and b in the second iteration; in fact they are the same ones as in the first iteration. In some runs it only starts to diverge from the expected values in the third or fourth iteration, not always in the second. Am I doing something wrong here? As soon as I obtain the values of W and b and their gradients, I update the variables with tf.assign() in the training loop. Does the problem lie there; is that the wrong way to update variables in TensorFlow? It is really discouraging to run into such problems right at the start.
Upvotes: 2
Views: 201
Reputation: 1913
I think you have a numeric-precision problem. NumPy uses double-precision floats by default (64 bits), while you are declaring your tensors as tf.float32. Try changing them to tf.float64.
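To see how large that effect is, you can evaluate the same sum of squares in both precisions with plain NumPy (a small sketch, separate from the code below):

import numpy as np

x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]

def sse(w, b, xs, ys, dtype):
    # accumulate the squared residuals entirely in the given precision
    w, b = dtype(w), dtype(b)
    return sum((w * dtype(xv) + b - dtype(yv)) ** 2 for xv, yv in zip(xs, ys))

print(sse(0.3, -0.3, x_train, y_train, np.float32))  # ~23.66, with a small single-precision rounding error
print(sse(0.3, -0.3, x_train, y_train, np.float64))  # 23.659999999999997, the double-precision value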
Edit: I think the remaining difference comes from the exponentiation in the loss function. Try replacing it with a plain multiplication, as in:
_loss += (_w * _x[j] + _b - _y[j]) * (_w * _x[j] + _b - _y[j])
The complete code with both changes:
import numpy as np
import tensorflow as tf

def manual_loss(_w, _b, _x, _y):
    _loss = 0.0
    n = len(_x)
    for j in range(n):
        diff = (_w * _x[j] + _b - _y[j])
        _loss += diff * diff
    return _loss

def manual_grads(_w, _b, _x, _y):
    n = len(_x)
    g_w = 0.0
    g_b = 0
    for j in range(n):
        g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
        g_b += 2.0 * (_w * _x[j] + _b - _y[j])
    return g_w, g_b

# Model parameters
W = tf.Variable([0.3], dtype=tf.float64)
b = tf.Variable([-0.3], dtype=tf.float64)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float64)
linear_model = W * x + b
y = tf.placeholder(tf.float64)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y))  # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(10):
    with tf.device('cpu:0'):
        results = sess.run([loss, grads], {x: x_train, y: y_train})
        loss_value = results[0]
        grad_W = results[1][0]
        grad_b = results[1][1]
        manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
        manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
        new_W_value = (W - lr * grad_W).eval(session=sess)
        new_b_value = (b - lr * grad_b).eval(session=sess)
        tf.assign(W, value=new_W_value).eval(session=sess)
        tf.assign(b, value=new_b_value).eval(session=sess)
        print("***********************")
        print("loss={0}".format(loss_value))
        print("manual_loss_value={0}".format(manual_loss_value))
        print("W={0}".format(W.eval(session=sess)))
        print("b={0}".format(b.eval(session=sess)))
        print("manual_W={0}".format(_W))
        print("manual_b={0}".format(_b))
        print("grad_W={0}".format(grad_W))
        print("grad_b={0}".format(grad_b))
        print("manual_grad_W={0}".format(manual_grad_W))
        print("manual_grad_b={0}".format(manual_grad_b))
        print("***********************")
        _W -= lr * manual_grad_W
        _b -= lr * manual_grad_b
Upvotes: 1
Reputation: 1114
I think the problem is with the use of tf.assign. Calling tf.assign only creates an assign node in the graph; that node must actually be run for the assignment to take effect. You should change your code to something like
assign_W_placeholder = tf.placeholder(tf.float32)
assign_b_placeholder = tf.placeholder(tf.float32)
assign_W_node = tf.assign(W, assign_W_placeholder)
assign_b_node = tf.assign(b, assign_b_placeholder)
and then in the for loop, add something like
sess.run(assign_W_node, feed_dict={assign_W_placeholder: new_W_value})
sess.run(assign_b_node, feed_dict={assign_b_placeholder: new_b_value})
After this change, TensorFlow and the manual computation give the same results.
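The behaviour is easy to see in isolation (a minimal sketch, separate from the code below): the variable keeps its old value until the assign node is actually run in a session.

import tensorflow as tf

v = tf.Variable(1.0)
assign_op = tf.assign(v, 5.0)   # only builds an assign node in the graph

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(v))    # 1.0 -- the assign node has not been executed yet
sess.run(assign_op)   # now the assignment actually happens
print(sess.run(v))    # 5.0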
The complete code:
import numpy as np
import tensorflow as tf

def manual_loss(_w, _b, _x, _y):
    _loss = 0.0
    n = len(_x)
    for j in range(n):
        _loss += (_w * _x[j] + _b - _y[j]) ** 2
    return _loss

def manual_grads(_w, _b, _x, _y):
    n = len(_x)
    g_w = 0.0
    g_b = 0
    for j in range(n):
        g_w += 2.0 * (_w * _x[j] + _b - _y[j]) * _x[j]
        g_b += 2.0 * (_w * _x[j] + _b - _y[j])
    return g_w, g_b

# Model parameters
W = tf.Variable([0.3], dtype=tf.float32)
b = tf.Variable([-0.3], dtype=tf.float32)
_W = 0.3
_b = -0.3
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
y = tf.placeholder(tf.float32)
assign_W_placeholder = tf.placeholder(tf.float32)
assign_b_placeholder = tf.placeholder(tf.float32)
assign_W_node = tf.assign(W, assign_W_placeholder)
assign_b_node = tf.assign(b, assign_b_placeholder)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y))  # sum of the squares
grads = tf.gradients(loss, [W, b])
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
lr = 0.001
for i in range(1000):
    results = sess.run([loss, W, b, grads], {x: x_train, y: y_train})
    loss_value = results[0]
    W_value = results[1]
    b_value = results[2]
    grad_W = results[3][0]
    grad_b = results[3][1]
    manual_loss_value = manual_loss(_w=_W, _b=_b, _x=x_train, _y=y_train)
    manual_grad_W, manual_grad_b = manual_grads(_w=_W, _b=_b, _x=x_train, _y=y_train)
    new_W_value = W_value - lr * grad_W
    new_b_value = b_value - lr * grad_b
    sess.run([assign_W_node, assign_b_node],
             feed_dict={assign_W_placeholder: new_W_value, assign_b_placeholder: new_b_value})
    print("***********************")
    print("loss={0}".format(loss_value))
    print("manual_loss_value={0}".format(manual_loss_value))
    print("W={0}".format(W_value))
    print("b={0}".format(b_value))
    print("manual_W={0}".format(_W))
    print("manual_b={0}".format(_b))
    print("grad_W={0}".format(grad_W))
    print("grad_b={0}".format(grad_b))
    print("manual_grad_W={0}".format(manual_grad_W))
    print("manual_grad_b={0}".format(manual_grad_b))
    print("***********************")
    _W -= lr * manual_grad_W
    _b -= lr * manual_grad_b
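As a side note, if you do not need new_W_value and new_b_value on the Python side, you can also build the update ops once, outside the loop, so no values have to be fed back through placeholders. A sketch of that variant, reusing the graph built above:

# Build the update ops a single time; tf.assign_sub subtracts the scaled gradient in-graph.
update_W = tf.assign_sub(W, lr * grads[0])
update_b = tf.assign_sub(b, lr * grads[1])

for i in range(1000):
    # the fetched loss corresponds to the parameters before this step's update
    loss_value, _, _ = sess.run([loss, update_W, update_b],
                                {x: x_train, y: y_train})
    print("loss={0}".format(loss_value))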
Upvotes: 1