Jakub Biały

Reputation: 422

Tensorflow: incorrect result of matrix multiplication (NaN) on GPU

tf.matmul (u_hat = tf.matmul(W_tiled, u_tiled)) returns different results on CPU and GPU. After the second batch, the mean value of the TensorFlow matmul is 3.6066954e+17, while the equivalent numpy matmul gives 2.7731653e-06. Eventually, when the code runs on the GPU, the matrix product starts to contain NaN values.

Replicated with:

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist
import numpy as np
from tensorflow.keras.layers import *
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf


def squash(s, axis=-1, epsilon=1e-7, name=None):
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=True)
    safe_norm = tf.sqrt(squared_norm + epsilon)
    squash_factor = squared_norm / (1. + squared_norm)
    unit_vector = s / safe_norm
    return squash_factor * unit_vector


def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=keep_dims)
    return tf.sqrt(squared_norm + epsilon)


class Custom_layer(Layer):
    def __init__(self, **kwargs):
        super(Custom_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.kernel = self.add_weight(
            shape=(1, 1152, 10, 16, 8),
            initializer=tf.keras.initializers.RandomNormal(0.0, stddev=0.01),
            trainable=True)

    def call(self, inputs):
        reshaped = tf.reshape(inputs, [-1, 1152, 8])
        inputs = squash(reshaped)

        u_expanded_0 = tf.expand_dims(inputs, -1)
        u_expanded_1 = tf.expand_dims(u_expanded_0, 2)

        u_tiled = tf.tile(u_expanded_1, [1, 1, 10, 1, 1])
        W_tiled = tf.tile(self.kernel, [batch_size, 1, 1, 1, 1])

        u_hat = tf.matmul(W_tiled, u_tiled)
        try:
            # Compare the TF result with numpy on the same operands (only possible when running eagerly).
            numpy_result = np.matmul(W_tiled.numpy(), u_tiled.numpy())
            tf.print('\n TensorFlow/numpy max element value=' + str(tf.reduce_max(u_hat).numpy()) + '/' + str(
                numpy_result.max()))
            tf.print('\n TensorFlow/numpy mean value=' + str(tf.reduce_mean(u_hat).numpy()) + '/' + str(
                numpy_result.mean()))
        except Exception:
            # .numpy() is unavailable outside eager execution; skip the comparison in that case.
            pass

        soft = softmax((safe_norm(tf.reduce_sum(u_hat, axis=[1, 3]))))

        # tf.print('\n\nW_tile max=' + str(tf.reduce_max(W_tiled)))
        # tf.print('W_tile min=' + str(tf.reduce_min(W_tiled)))
        # tf.print('u_tiled max=' + str(tf.reduce_max(u_tiled)))
        # tf.print('u_tiled min=' + str(tf.reduce_min(u_tiled)))
        # tf.print('u_hat max=' + str(tf.reduce_max(u_hat)))
        # tf.print('u_hat min=' + str(tf.reduce_min(u_hat)))

        tf.debugging.check_numerics(u_tiled, 'u_tiled')
        tf.debugging.check_numerics(W_tiled, 'W_tiled')
        tf.debugging.check_numerics(u_hat, 'u_hat')
        tf.debugging.check_numerics(soft, 'soft')

        return soft

    def compute_output_shape(self, input_shape):
        # The layer returns a softmax over the 10 capsule norms, i.e. shape (batch_size, 10).
        return (batch_size, 10)


batch_size = 128
epochs = 100
img_rows, img_cols = 28, 28
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_train /= 255
y_train = utils.to_categorical(y_train, 10)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(256, (9, 9), activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Conv2D(256, (9, 9), strides=(2, 2), activation='relu'))
model.add(Custom_layer())

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.run_eagerly = True
model.summary()

model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs
)

I expected similar results on the GPU and CPU, but the actual results are quite different.
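For reference, a stripped-down sketch that isolates the same matmul outside the model (my own simplification, using random data of the same shapes; the smaller batch size is only to keep memory modest, and it assumes a visible GPU):

import numpy as np
import tensorflow as tf

# Same per-example shapes as in the layer above:
# W is (batch, 1152, 10, 16, 8), u is (batch, 1152, 10, 8, 1)
batch_size = 16
W = np.random.normal(0.0, 0.01, (batch_size, 1152, 10, 16, 8)).astype('float32')
u = np.random.normal(0.0, 0.01, (batch_size, 1152, 10, 8, 1)).astype('float32')

reference = np.matmul(W, u)  # numpy result used as the reference

with tf.device('/CPU:0'):
    cpu_result = tf.matmul(W, u)
with tf.device('/GPU:0'):
    gpu_result = tf.matmul(W, u)

print('CPU max abs diff vs numpy:', np.abs(cpu_result.numpy() - reference).max())
print('GPU max abs diff vs numpy:', np.abs(gpu_result.numpy() - reference).max())
print('GPU result contains NaN:', np.isnan(gpu_result.numpy()).any())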

Upvotes: 2

Views: 752

Answers (2)

Jakub Biały

Reputation: 422

The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so I recompiled the TF 2.0 branch with CUDA Toolkit 10.1. My custom build of TF works as intended and is available here.

Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) I replicated the tf.matmul bug from the official GitHub repo, but on my custom build (CUDA Toolkit 10.1) it works as intended too.
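As a quick sanity check (not part of the original answer), the following confirms whether the installed build was compiled with CUDA and which GPUs it sees; note that in TF 2.0 the device-listing API still lives under tf.config.experimental:

import tensorflow as tf

print('TF version:', tf.__version__)
print('Built with CUDA:', tf.test.is_built_with_cuda())
# The stable alias tf.config.list_physical_devices was added after TF 2.0,
# so use the experimental namespace here.
print('Visible GPUs:', tf.config.experimental.list_physical_devices('GPU'))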

Upvotes: 0

user11530462

Reputation:

Providing the solution here (in the answer section), even though it was already described by user12292000 in the question, for the benefit of the community.

The official TF 2.0 build requires CUDA Toolkit 10.0 (10.1 doesn't work), so he recompiled the TF 2.0 branch with CUDA Toolkit 10.1. His custom build of TF now works as intended and is available here.

Additionally, on the official TF 2.0 build (CUDA Toolkit 10.0) he replicated the tf.matmul bug from the official GitHub repo, but on his custom build (CUDA Toolkit 10.1) it works as intended too.

Upvotes: 1
