Reputation: 1
I am trying to create a custom non-linear activation function and use a 3-hidden-layer neural network to do MNIST classification. When I use ReLU the network trains fine, but when I use my custom activation function (a parametric SoftPlus function), the network gives me NaN loss and weights.
I go through my code step by step below so that the error is reproducible.
import numpy as np
import matplotlib.image as mpim
import matplotlib.pyplot as plt
from matplotlib import pyplot
from tabulate import tabulate
from tensorflow import keras
import tensorflow as tf
import os
from tensorflow.keras.layers import Layer
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, LocallyConnected2D, GaussianNoise, ZeroPadding2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras.constraints import Constraint, MinMaxNorm, NonNeg
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.initializers import Constant
import math
import cmath
from tensorflow.keras.optimizers import Adam
class ParametricActivationFixed1_Electronic(Layer):  # Threshold change relu
    def __init__(self, units=32, x1_init=0, x2_init=0, x3_init=1, mul_init=1):
        super(ParametricActivationFixed1_Electronic, self).__init__()
        self.units = units
        self.x1_init = x1_init
        self.x2_init = x2_init
        self.x3_init = x3_init
        self.mul_init = mul_init

    def build(self, input_shape):
        self.x1 = self.add_weight("x1", shape=(input_shape[-1],),
                                  initializer=Constant(self.x1_init), trainable=False)
        self.x2 = self.add_weight("x2", shape=(input_shape[-1],),
                                  initializer=Constant(self.x2_init), trainable=False)
        self.x3 = self.add_weight("x3", shape=(input_shape[-1],),
                                  initializer=Constant(self.x3_init), trainable=False)

    def call(self, inputs):
        trnfr_func = self.x3 * tf.math.log(1 + tf.math.exp(self.x1 * (inputs - self.x2)))
        return 1 * (trnfr_func * self.mul_init)
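In other words, each unit computes f(x) = x3 * log(1 + exp(x1 * (x - x2))) * mul, with x1, x2, x3 held fixed. To check whether the activation itself can produce non-finite values (for example, whether the exp term can exceed float32 range for larger pre-activations), here is a quick standalone probe that can be run right after defining the class. This is only a sketch; the probe range of -10 to 40 is arbitrary.
# Standalone probe: evaluate the activation on a range of inputs and flag non-finite outputs.
# Uses the first parameter set from below; the probe range is arbitrary.
probe_act = ParametricActivationFixed1_Electronic(x1_init=5.18084836109840, x2_init=2.23387344968709, x3_init=5.67241315469789)
probe_in = tf.constant(np.linspace(-10.0, 40.0, 11, dtype=np.float32)[:, None])
probe_out = probe_act(probe_in)
print(probe_out.numpy().ravel())
print("any non-finite output:", bool(tf.reduce_any(tf.math.logical_not(tf.math.is_finite(probe_out)))))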
activation_fn_electronic_1_1 = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)
activation_fn_electronic_1_2 = ParametricActivationFixed1_Electronic(x1_init = 6.95386161380188,x2_init = 2.92341175020311,x3_init=4.28449509582075)
activation_fn_electronic_1_3 = ParametricActivationFixed1_Electronic(x1_init = 6.55845192271617,x2_init = 3.76618901192043,x3_init=4.65713418859783)
activation_fn_electronic_1_4 = ParametricActivationFixed1_Electronic(x1_init = 3.91675537731320,x2_init = 4.48948268363177,x3_init=7.21637142731591)
x = np.linspace(-10, 10, 400).astype("float32")  # input range for the plots (not shown in the original snippet; assumed from the plot limits)
y11 = activation_fn_electronic_1_1(x).numpy()
y12 = activation_fn_electronic_1_2(x).numpy()
y13 = activation_fn_electronic_1_3(x).numpy()
y14 = activation_fn_electronic_1_4(x).numpy()
# Plot the activation function
plt.figure(figsize=(8, 6))
#plt.plot(x, y1, label='custom1')
plt.plot(x, y11, label='Electronic Function1 Type1')
plt.plot(x, y12, label='Electronic Function1 Type2')
plt.plot(x, y13, label='Electronic Function1 Type3')
plt.plot(x, y14, label='Electronic Function1 Type4')
#plt.plot(x, y11_t, label='Electronic Function1 Type1 Trainable')
#plt.plot(x, y12_t, label='Electronic Function1 Type2 Trainable')
#plt.plot(x, y13_t, label='Electronic Function1 Type3 Trainable')
#plt.plot(x, y14_t, label='Electronic Function1 Type4 Trainable')
plt.xlabel('Input')
plt.ylabel('Output')
plt.title('Parametric Activation Function')
plt.xlim((-10,10))
plt.ylim((0,250))
plt.legend()
plt.grid(True)
plt.show()
A plot of the resulting activation functions is shown here: Plot of custom activation function
Then I preprocess the training data and create the model
h = 32
w = 32
crop_h = 32
crop_w = 32
h2 = int(crop_h/2)
w2 = int(crop_w/2)
combLines = 1
# Model / data parameters
num_classes = 10
#input_shape = (28, 28, 1)
input_shape = (h,w,1)
# the data, split between train and test sets
#(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
#(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
#### convert to greyscale
#x_train = 0.299*x_train[:,:,:,0]+0.587*x_train[:,:,:,1]+0.114*x_train[:,:,:,2]
#x_test = 0.299*x_test[:,:,:,0]+0.587*x_test[:,:,:,1]+0.114*x_test[:,:,:,2]
#print(x_train)
# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_val = x_val.astype("float32") / 255
x_test = x_test.astype("float32") / 255
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
x_val = np.expand_dims(x_val, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")
print(x_val.shape[0], "val samples")
# Resize images to (h, w), central-crop, then resize to (crop_w, crop_h)
x_train = tf.image.resize(x_train, (h,w), method='nearest')
x_test = tf.image.resize(x_test, (h,w), method='nearest')
x_val = tf.image.resize(x_val, (h,w), method='nearest')
x_train = tf.image.central_crop(x_train, (crop_h/h))
x_test = tf.image.central_crop(x_test, (crop_h/h))
x_val = tf.image.central_crop(x_val, (crop_h/h))
x_train = tf.image.resize(x_train, (crop_w,crop_h), method='nearest')
x_test = tf.image.resize(x_test, (crop_w,crop_h), method='nearest')
x_val = tf.image.resize(x_val, (crop_w,crop_h), method='nearest')
# tf.image.central_crop(x_test,(crop_h/h, crop_w/w))
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")
print(x_val.shape[0], "val samples")
#print(x_test[0])
plt.imshow(x_test[1, :, :, 0], cmap='gray')
plt.show()
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
# weight ranges
#min, max = 0.05 , 1
#min, max = 0.05,1
#min, max = -1. 1
#biasMin, biasMax = 0, 0
noise_SD = 0.1
METRICS = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]
model = Sequential()
model.add(Flatten()) #flatten the input
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(GaussianNoise(noise_SD))
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(GaussianNoise(noise_SD))
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(Dense(10,activation='softmax'))
optimizer = Adam(clipvalue=1,learning_rate=0.001)
# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy',metrics = METRICS) #(this one works)
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=3, shuffle=True, verbose=1) #No shuffle, acc drops
# evaluate the model
results = model.evaluate(x_train, y_train, verbose=0)
train_loss, train_acc, train_precision, train_recall = results
# Unpack all returned values
results = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_acc, test_precision, test_recall = results
# Unpack all returned values
print('Train: Accuracy: %.3f, Precision: %.3f, Recall: %.3f' % (train_acc, train_precision, train_recall))
print('Test: Accuracy: %.3f, Precision: %.3f, Recall: %.3f' % (test_acc, test_precision, test_recall))
# Print all metrics
The output that I get while training is
Epoch 1/3
1688/1688 [==============================] - 5s 3ms/step - loss: nan - accuracy: 0.1077 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/3
1688/1688 [==============================] - 4s 2ms/step - loss: nan - accuracy: 0.0981 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/3
1688/1688 [==============================] - 4s 2ms/step - loss: nan - accuracy: 0.0981 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Train: Accuracy: 0.098, Precision: 0.000, Recall: 0.000
Test: Accuracy: 0.098, Precision: 0.000, Recall: 0.000
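For comparison, the ReLU run that works is essentially the same model with only the activation swapped. A minimal sketch of that variant (with the metrics simplified to accuracy only for the sketch) looks like this:
# Same architecture, data, and optimizer as above; only the activation differs.
model_relu = Sequential()
model_relu.add(Flatten())
model_relu.add(Dense(100, activation='relu'))
model_relu.add(GaussianNoise(noise_SD))
model_relu.add(Dense(100, activation='relu'))
model_relu.add(GaussianNoise(noise_SD))
model_relu.add(Dense(100, activation='relu'))
model_relu.add(Dense(10, activation='softmax'))
model_relu.compile(optimizer=Adam(clipvalue=1, learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model_relu.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=3, shuffle=True, verbose=1)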
Since that ReLU variant trains without any error, the problem must lie in the definition of my custom activation function, but I cannot seem to identify it. Any ideas?
Upvotes: 0
Views: 29