Reputation: 422
tf.matmul (u_hat = tf.matmul(W_tiled, u_tiled)) returns different results on CPU and GPU. After the second batch, the mean value of the TensorFlow/NumPy matmul result is 3.6066954e+17/2.7731653e-06. Eventually, when the code is running on the GPU, the matrix product begins to contain NaN values.
Replicated with:
Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU Driver 431.86 | Tensorflow 2.0
Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU Driver 431.86 | Tensorflow 1.15
Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU Driver 441.08 | Tensorflow 2.0
Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU Driver 441.08 | Tensorflow 1.15
from tensorflow.keras import backend as K
from keras.layers import Layer
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist
import numpy as np
from tensorflow.keras.layers import *
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
def squash(s, axis=-1, epsilon=1e-7, name=None):
    """Scale `s` along `axis` by ``|s|^2 / (1 + |s|^2)`` times its unit vector.

    Args:
        s: Input tensor.
        axis: Axis along which the squared norm is computed.
        epsilon: Added to the squared norm before the square root so the
            division below never sees an exact zero.
        name: Accepted for interface compatibility; not used.

    Returns:
        A tensor with the same shape as `s`.
    """
    sq_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=True)
    # Direction of `s`, using the epsilon-stabilised norm as the divisor.
    direction = s / tf.sqrt(sq_norm + epsilon)
    # Scalar gain computed from the (unstabilised) squared norm.
    gain = sq_norm / (1. + sq_norm)
    return gain * direction
def softmax(x, axis=-1):
    """Return the softmax of `x` along `axis`.

    The per-axis maximum is subtracted before exponentiation so the largest
    exponent is zero.
    """
    shifted = x - K.max(x, axis=axis, keepdims=True)
    exponentials = K.exp(shifted)
    normaliser = K.sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normaliser
def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):
    """Return ``sqrt(sum(s**2) + epsilon)`` reduced along `axis`.

    Args:
        s: Input tensor.
        axis: Axis (or axes) to reduce over.
        epsilon: Added to the sum of squares before the square root.
        keep_dims: Whether the reduced axis is retained with size 1.
        name: Accepted for interface compatibility; not used.
    """
    squares = tf.square(s)
    sum_of_squares = tf.reduce_sum(squares, axis=axis, keepdims=keep_dims)
    return tf.sqrt(sum_of_squares + epsilon)
class Custom_layer(Layer):
    """Layer that multiplies a tiled weight tensor with tiled, squashed inputs.

    The input is reshaped to ``[batch, 1152, 8]``, squashed, and multiplied by
    a ``(1, 1152, 10, 16, 8)`` kernel via `tf.matmul`. The product is summed
    over axes 1 and 3, its norm is taken over the last axis, and a softmax
    over the remaining 10 entries is returned, i.e. a ``[batch, 10]`` tensor.

    When running eagerly, the TensorFlow product is also compared against
    `np.matmul` and the max/mean of both results are printed.
    """

    def __init__(self, **kwargs):
        super(Custom_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable kernel; `input_shape` is not inspected."""
        self.kernel = self.add_weight(
            shape=(1, 1152, 10, 16, 8),
            initializer=tf.keras.initializers.RandomNormal(0.0, stddev=0.01),
            trainable=True)

    def call(self, inputs):
        """Return the ``[batch, 10]`` softmax output for `inputs`."""
        reshaped = tf.reshape(inputs, [-1, 1152, 8])
        inputs = squash(reshaped)
        u_expanded_0 = tf.expand_dims(inputs, -1)       # [batch, 1152, 8, 1]
        u_expanded_1 = tf.expand_dims(u_expanded_0, 2)  # [batch, 1152, 1, 8, 1]
        u_tiled = tf.tile(u_expanded_1, [1, 1, 10, 1, 1])

        # Tile the kernel by the batch size actually present at run time.
        # Using the module-level `batch_size` constant breaks on the final,
        # smaller batch of an epoch and on any other batch size.
        runtime_batch = tf.shape(u_tiled)[0]
        W_tiled = tf.tile(self.kernel, [runtime_batch, 1, 1, 1, 1])
        u_hat = tf.matmul(W_tiled, u_tiled)             # [batch, 1152, 10, 16, 1]

        # `.numpy()` only exists on eager tensors, so the NumPy cross-check is
        # skipped while Keras traces the layer symbolically.
        if tf.executing_eagerly():
            self._print_numpy_comparison(W_tiled, u_tiled, u_hat)

        soft = softmax((safe_norm(tf.reduce_sum(u_hat, axis=[1, 3]))))
        # tf.print('\n\nW_tile max=' + str(tf.reduce_max(W_tiled)))
        # tf.print('W_tile min=' + str(tf.reduce_min(W_tiled)))
        # tf.print('u_tiled max=' + str(tf.reduce_max(u_tiled)))
        # tf.print('u_tiled min=' + str(tf.reduce_min(u_tiled)))
        # tf.print('u_hat max=' + str(tf.reduce_max(u_hat)))
        # tf.print('u_hat min=' + str(tf.reduce_min(u_hat)))
        tf.debugging.check_numerics(u_tiled, 'u_tiled')
        tf.debugging.check_numerics(W_tiled, 'W_tiled')
        tf.debugging.check_numerics(u_hat, 'u_hat')
        tf.debugging.check_numerics(soft, 'soft')
        return soft

    def _print_numpy_comparison(self, W_tiled, u_tiled, u_hat):
        """Print max/mean of the TensorFlow product next to NumPy's result."""
        try:
            numpy_result = np.matmul(W_tiled.numpy(), u_tiled.numpy())
            tf.print('\n TensorFlow/numpy max element value=' + str(tf.reduce_max(u_hat).numpy()) + '/' + str(
                numpy_result.max()))
            tf.print('\n TensorFlow/numpy mean value=' + str(tf.reduce_mean(u_hat).numpy()) + '/' + str(
                numpy_result.mean()))
        except Exception as err:
            # The comparison is a best-effort diagnostic: report the failure
            # instead of hiding it, but never abort the forward pass.
            tf.print('\n TensorFlow/numpy comparison skipped: ' + repr(err))

    def compute_output_shape(self, input_shape):
        """Return ``(batch, 10)``, matching what `call` produces."""
        # The size-16 axis is summed out and the trailing size-1 axis is
        # consumed by the norm, so only the 10 softmax entries remain.
        return (input_shape[0], 10)
# Training configuration.
batch_size = 128
epochs = 100
img_rows, img_cols = 28, 28

# Load MNIST, add a channel axis, scale pixels to [0, 1], one-hot the labels.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(len(x_train), img_rows, img_cols, 1).astype('float32') / 255
y_train = utils.to_categorical(y_train, 10)

# Two 9x9 convolutions followed by the custom layer under investigation.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(256, (9, 9), activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.Conv2D(256, (9, 9), strides=(2, 2), activation='relu'),
    Custom_layer(),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Run eagerly so the layer can call `.numpy()` for its NumPy cross-check.
model.run_eagerly = True
model.summary()

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
I expected similar results on GPU and CPU, but the actual results are quite different.
Upvotes: 2
Views: 752
Reputation: 422
The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so I recompiled the TF 2.0 branch with CUDA Toolkit 10.1. My custom version of TF works as intended and is available here.
Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) I replicated the tf.matmul bug from the official GitHub repo, but on my custom build (CUDA Toolkit 10.1) it works as intended too.
Upvotes: 0
Reputation:
Providing the solution here (Answer Section), even though it is specified by user12292000 in the question, for the benefit of the community.
The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so he recompiled the TF 2.0 branch with CUDA Toolkit 10.1. Now his custom version of TF works as intended and is available here.
Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) he replicated the tf.matmul bug from the official GitHub repo, but on his custom build (CUDA Toolkit 10.1) it works as intended too.
Upvotes: 1