Jakub Biały

Reputation: 422

Tensorflow: incorrect result of matrix multiplication (NaN) on GPU

tf.matmul (u_hat = tf.matmul(W_tiled, u_tiled)) returns different results on CPU and GPU. After the second batch, the mean value of the TensorFlow matmul is 3.6066954e+17, while the equivalent numpy matmul gives 2.7731653e-06. Eventually, when the code runs on the GPU, the matrix product starts to contain NaN values.

Replicated with:

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist
import numpy as np
from tensorflow.keras.layers import *
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf


def squash(s, axis=-1, epsilon=1e-7, name=None):
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=True)
    safe_norm = tf.sqrt(squared_norm + epsilon)
    squash_factor = squared_norm / (1. + squared_norm)
    unit_vector = s / safe_norm
    return squash_factor * unit_vector


def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=keep_dims)
    return tf.sqrt(squared_norm + epsilon)


class Custom_layer(Layer):
    def __init__(self, **kwargs):
        super(Custom_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(
            shape=(1, 1152, 10, 16, 8),
            initializer=tf.keras.initializers.RandomNormal(0.0, stddev=0.01),
            trainable=True)

    def call(self, inputs):
        reshaped = tf.reshape(inputs, [-1, 1152, 8])
        inputs = squash(reshaped)

        u_expanded_0 = tf.expand_dims(inputs, -1)
        u_expanded_1 = tf.expand_dims(u_expanded_0, 2)

        u_tiled = tf.tile(u_expanded_1, [1, 1, 10, 1, 1])
        W_tiled = tf.tile(self.kernel, [batch_size, 1, 1, 1, 1])

        u_hat = tf.matmul(W_tiled, u_tiled)
        try:
            # Compare the TF result with numpy on the same operands (only possible when running eagerly).
            numpy_result = np.matmul(W_tiled.numpy(), u_tiled.numpy())
            tf.print('\n TensorFlow/numpy max element value=' + str(tf.reduce_max(u_hat).numpy()) + '/' + str(
                numpy_result.max()))
            tf.print('\n TensorFlow/numpy mean value=' + str(tf.reduce_mean(u_hat).numpy()) + '/' + str(
                numpy_result.mean()))
        except Exception:
            # .numpy() is unavailable outside eager execution; skip the comparison in that case.
            pass

        soft = softmax((safe_norm(tf.reduce_sum(u_hat, axis=[1, 3]))))

        # tf.print('\n\nW_tile max=' + str(tf.reduce_max(W_tiled)))
        # tf.print('W_tile min=' + str(tf.reduce_min(W_tiled)))
        # tf.print('u_tiled max=' + str(tf.reduce_max(u_tiled)))
        # tf.print('u_tiled min=' + str(tf.reduce_min(u_tiled)))
        # tf.print('u_hat max=' + str(tf.reduce_max(u_hat)))
        # tf.print('u_hat min=' + str(tf.reduce_min(u_hat)))

        tf.debugging.check_numerics(u_tiled, 'u_tiled')
        tf.debugging.check_numerics(W_tiled, 'W_tiled')
        tf.debugging.check_numerics(u_hat, 'u_hat')
        tf.debugging.check_numerics(soft, 'soft')

        return soft

    def compute_output_shape(self, input_shape):
        # The layer returns a softmax over the 10 capsule norms, i.e. shape (batch_size, 10).
        return (batch_size, 10)


batch_size = 128
epochs = 100
img_rows, img_cols = 28, 28
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_train /= 255
y_train = utils.to_categorical(y_train, 10)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(256, (9, 9), activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Conv2D(256, (9, 9), strides=(2, 2), activation='relu'))
model.add(Custom_layer())

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.run_eagerly = True
model.summary()

model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs
)

I expected similar results on the GPU and CPU, but the actual results are quite different.
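For reference, a stripped-down sketch that isolates the same matmul outside the model (my own simplification, using random data of the same shapes; the smaller batch size is only to keep memory modest, and it assumes a visible GPU):

import numpy as np
import tensorflow as tf

# Same per-example shapes as in the layer above:
# W is (batch, 1152, 10, 16, 8), u is (batch, 1152, 10, 8, 1)
batch_size = 16
W = np.random.normal(0.0, 0.01, (batch_size, 1152, 10, 16, 8)).astype('float32')
u = np.random.normal(0.0, 0.01, (batch_size, 1152, 10, 8, 1)).astype('float32')

reference = np.matmul(W, u)  # numpy result used as the reference

with tf.device('/CPU:0'):
    cpu_result = tf.matmul(W, u)
with tf.device('/GPU:0'):
    gpu_result = tf.matmul(W, u)

print('CPU max abs diff vs numpy:', np.abs(cpu_result.numpy() - reference).max())
print('GPU max abs diff vs numpy:', np.abs(gpu_result.numpy() - reference).max())
print('GPU result contains NaN:', np.isnan(gpu_result.numpy()).any())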

Upvotes: 2

Views: 752

Answers (2)

Jakub Biały

Reputation: 422

The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so I recompiled the TF 2.0 branch with CUDA Toolkit 10.1. My custom build of TF works as intended and is available here.

Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) I replicated the tf.matmul bug from the official GitHub repo, but on my custom build (CUDA Toolkit 10.1) it works as intended too.
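As a quick sanity check (not part of the original answer), the following confirms whether the installed build was compiled with CUDA and which GPUs it sees; note that in TF 2.0 the device-listing API still lives under tf.config.experimental:

import tensorflow as tf

print('TF version:', tf.__version__)
print('Built with CUDA:', tf.test.is_built_with_cuda())
# The stable alias tf.config.list_physical_devices was added after TF 2.0,
# so use the experimental namespace here.
print('Visible GPUs:', tf.config.experimental.list_physical_devices('GPU'))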

Upvotes: 0

user11530462

Reputation:

Providing the solution here (in the answer section), even though it was already described by user12292000 in the question, for the benefit of the community.

The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so he recompiled the TF 2.0 branch with CUDA Toolkit 10.1. His custom build of TF now works as intended and is available here.

Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) he replicated the tf.matmul bug from the official GitHub repo, but on his custom build (CUDA Toolkit 10.1) it works as intended too.

Upvotes: 1
