Reputation: 1
I am trying to create a custom non-linear activation function and use a 3-hidden-layer neural network to do MNIST classification. When I use ReLU the network trains fine, but when I use my custom activation function (a parametric SoftPlus function), the network gives me NaN loss and weights.
I go through my code step by step below so that the error is reproducible.
import numpy as np
import matplotlib.image as mpim
import matplotlib.pyplot as plt
from matplotlib import pyplot
from tabulate import tabulate
from tensorflow import keras
import tensorflow as tf
import os
from tensorflow.keras.layers import Layer
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, LocallyConnected2D, GaussianNoise, ZeroPadding2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from tensorflow.keras.constraints import Constraint, MinMaxNorm, NonNeg
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.initializers import Constant
import math
import cmath
from tensorflow.keras.optimizers import Adam
class ParametricActivationFixed1_Electronic(Layer):  # Threshold change relu
    def __init__(self, units=32, x1_init=0, x2_init=0, x3_init=1, mul_init=1):
        super(ParametricActivationFixed1_Electronic, self).__init__()
        self.units = units
        self.x1_init = x1_init
        self.x2_init = x2_init
        self.x3_init = x3_init
        self.mul_init = mul_init

    def build(self, input_shape):
        self.x1 = self.add_weight("x1", shape=(input_shape[-1],),
                                  initializer=Constant(self.x1_init), trainable=False)
        self.x2 = self.add_weight("x2", shape=(input_shape[-1],),
                                  initializer=Constant(self.x2_init), trainable=False)
        self.x3 = self.add_weight("x3", shape=(input_shape[-1],),
                                  initializer=Constant(self.x3_init), trainable=False)

    def call(self, inputs):
        trnfr_func = self.x3 * tf.math.log(1 + tf.math.exp(self.x1 * (inputs - self.x2)))
        return 1 * (trnfr_func * self.mul_init)
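In other words, each unit computes f(x) = x3 * log(1 + exp(x1 * (x - x2))) * mul, with x1, x2, x3 held fixed. To check whether the activation itself can produce non-finite values (for example, whether the exp term can exceed float32 range for larger pre-activations), here is a quick standalone probe that can be run right after defining the class. This is only a sketch; the probe range of -10 to 40 is arbitrary.
# Standalone probe: evaluate the activation on a range of inputs and flag non-finite outputs.
# Uses the first parameter set from below; the probe range is arbitrary.
probe_act = ParametricActivationFixed1_Electronic(x1_init=5.18084836109840, x2_init=2.23387344968709, x3_init=5.67241315469789)
probe_in = tf.constant(np.linspace(-10.0, 40.0, 11, dtype=np.float32)[:, None])
probe_out = probe_act(probe_in)
print(probe_out.numpy().ravel())
print("any non-finite output:", bool(tf.reduce_any(tf.math.logical_not(tf.math.is_finite(probe_out)))))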
activation_fn_electronic_1_1 = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)
activation_fn_electronic_1_2 = ParametricActivationFixed1_Electronic(x1_init = 6.95386161380188,x2_init = 2.92341175020311,x3_init=4.28449509582075)
activation_fn_electronic_1_3 = ParametricActivationFixed1_Electronic(x1_init = 6.55845192271617,x2_init = 3.76618901192043,x3_init=4.65713418859783)
activation_fn_electronic_1_4 = ParametricActivationFixed1_Electronic(x1_init = 3.91675537731320,x2_init = 4.48948268363177,x3_init=7.21637142731591)
x = np.linspace(-10, 10, 400).astype("float32")  # input range for the plots (not shown in the original snippet; assumed from the plot limits)
y11 = activation_fn_electronic_1_1(x).numpy()
y12 = activation_fn_electronic_1_2(x).numpy()
y13 = activation_fn_electronic_1_3(x).numpy()
y14 = activation_fn_electronic_1_4(x).numpy()
# Plot the activation function
plt.figure(figsize=(8, 6))
#plt.plot(x, y1, label='custom1')
plt.plot(x, y11, label='Electronic Function1 Type1')
plt.plot(x, y12, label='Electronic Function1 Type2')
plt.plot(x, y13, label='Electronic Function1 Type3')
plt.plot(x, y14, label='Electronic Function1 Type4')
#plt.plot(x, y11_t, label='Electronic Function1 Type1 Trainable')
#plt.plot(x, y12_t, label='Electronic Function1 Type2 Trainable')
#plt.plot(x, y13_t, label='Electronic Function1 Type3 Trainable')
#plt.plot(x, y14_t, label='Electronic Function1 Type4 Trainable')
plt.xlabel('Input')
plt.ylabel('Output')
plt.title('Parametric Activation Function')
plt.xlim((-10,10))
plt.ylim((0,250))
plt.legend()
plt.grid(True)
plt.show()
A plot of the resulting activation functions is shown here: Plot of custom activation function
Then I preprocess the training data and create the model
h = 32
w = 32
crop_h = 32
crop_w = 32
h2 = int(crop_h/2)
w2 = int(crop_w/2)
combLines = 1
# Model / data parameters
num_classes = 10
#input_shape = (28, 28, 1)
input_shape = (h,w,1)
# the data, split between train and test sets
#(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
#(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)
#### convert to greyscale
#x_train = 0.299*x_train[:,:,:,0]+0.587*x_train[:,:,:,1]+0.114*x_train[:,:,:,2]
#x_test = 0.299*x_test[:,:,:,0]+0.587*x_test[:,:,:,1]+0.114*x_test[:,:,:,2]
#print(x_train)
# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_val = x_val.astype("float32") / 255
x_test = x_test.astype("float32") / 255
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
x_val = np.expand_dims(x_val, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")
print(x_val.shape[0], "val samples")
# Resize images to (h, w), central-crop, then resize to (crop_w, crop_h)
x_train = tf.image.resize(x_train, (h,w), method='nearest')
x_test = tf.image.resize(x_test, (h,w), method='nearest')
x_val = tf.image.resize(x_val, (h,w), method='nearest')
x_train = tf.image.central_crop(x_train, (crop_h/h))
x_test = tf.image.central_crop(x_test, (crop_h/h))
x_val = tf.image.central_crop(x_val, (crop_h/h))
x_train = tf.image.resize(x_train, (crop_w,crop_h), method='nearest')
x_test = tf.image.resize(x_test, (crop_w,crop_h), method='nearest')
x_val = tf.image.resize(x_val, (crop_w,crop_h), method='nearest')
# tf.image.central_crop(x_test,(crop_h/h, crop_w/w))
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")
print(x_val.shape[0], "val samples")
#print(x_test[0])
plt.imshow(x_test[1, :, :, 0], cmap='gray')
plt.show()
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
# weight ranges
#min, max = 0.05 , 1
#min, max = 0.05,1
#min, max = -1. 1
#biasMin, biasMax = 0, 0
noise_SD = 0.1
METRICS = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]
model = Sequential()
model.add(Flatten()) #flatten the input
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(GaussianNoise(noise_SD))
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(GaussianNoise(noise_SD))
model.add(Dense(100, activation = ParametricActivationFixed1_Electronic(x1_init = 5.18084836109840,x2_init = 2.23387344968709,x3_init=5.67241315469789)))
model.add(Dense(10,activation='softmax'))
optimizer = Adam(clipvalue=1,learning_rate=0.001)
# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy',metrics = METRICS) #(this one works)
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=3, shuffle=True, verbose=1) #No shuffle, acc drops
# evaluate the model
results = model.evaluate(x_train, y_train, verbose=0)
train_loss, train_acc, train_precision, train_recall = results
# Unpack all returned values
results = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_acc, test_precision, test_recall = results
# Unpack all returned values
print('Train: Accuracy: %.3f, Precision: %.3f, Recall: %.3f' % (train_acc, train_precision, train_recall))
print('Test: Accuracy: %.3f, Precision: %.3f, Recall: %.3f' % (test_acc, test_precision, test_recall))
# Print all metrics
The output that I get while training is
Epoch 1/3
1688/1688 [==============================] - 5s 3ms/step - loss: nan - accuracy: 0.1077 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/3
1688/1688 [==============================] - 4s 2ms/step - loss: nan - accuracy: 0.0981 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/3
1688/1688 [==============================] - 4s 2ms/step - loss: nan - accuracy: 0.0981 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: nan - val_accuracy: 0.1040 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Train: Accuracy: 0.098, Precision: 0.000, Recall: 0.000
Test: Accuracy: 0.098, Precision: 0.000, Recall: 0.000
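For comparison, the ReLU run that works is essentially the same model with only the activation swapped. A minimal sketch of that variant (with the metrics simplified to accuracy only for the sketch) looks like this:
# Same architecture, data, and optimizer as above; only the activation differs.
model_relu = Sequential()
model_relu.add(Flatten())
model_relu.add(Dense(100, activation='relu'))
model_relu.add(GaussianNoise(noise_SD))
model_relu.add(Dense(100, activation='relu'))
model_relu.add(GaussianNoise(noise_SD))
model_relu.add(Dense(100, activation='relu'))
model_relu.add(Dense(10, activation='softmax'))
model_relu.compile(optimizer=Adam(clipvalue=1, learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model_relu.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=3, shuffle=True, verbose=1)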
Since that ReLU variant trains without any error, the problem must lie in the definition of my custom activation function, but I cannot seem to identify it. Any ideas?
Upvotes: 0
Views: 29