yupthatsme

Reputation: 47

Numpy Backprop Cost is Not Decreasing

I'm working on a Python script that lets the user define the number of hidden layers, and the number of nodes in each, for a fully connected neural network.

The problem is that the error comes up as nan when I try larger datasets. I'm not sure why, and I'm also getting this Python warning when running in Google Colab:

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:64: RuntimeWarning: overflow encountered in exp
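(That warning points at the tanh in my nonlin() below: it calls np.exp directly, and np.exp overflows float64 once its argument passes roughly 710, so the manual tanh returns nan for large inputs where np.tanh would return 1.0. A minimal reproduction:)

import numpy as np

big = np.array([1000.0])  #a large pre-activation value
manual = (np.exp(big) - np.exp(-big)) / (np.exp(big) + np.exp(-big))  #overflow in exp -> array([nan])
stable = np.tanh(big)  #numerically stable -> array([1.])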

Here is the output from a small dataset where the error does not occur...

Network Architecture:
----------------------------------------------------------------------------
Input Layer Number of Weights: 60
Hidden Layer 1 Number of Weights: 400
Output Layer Number of Weights: 20
----------------------------------------------------------------------------
Total Number of Weights:  480

Epoch: 1 ERROR: 8.148725708134741e-05
Epoch: 2 ERROR: 8.148670920765655e-05
Epoch: 3 ERROR: 8.14861613419593e-05
Epoch: 4 ERROR: 8.14856134840336e-05
Epoch: 5 ERROR: 8.148506563421254e-05
Epoch: 6 ERROR: 8.148451779205201e-05
Epoch: 7 ERROR: 8.148396995799612e-05
Epoch: 8 ERROR: 8.148342213176729e-05
Epoch: 9 ERROR: 8.148287431336554e-05
Epoch: 10 ERROR: 8.14823265030129e-05
Epoch: 11 ERROR: 8.148177870037632e-05
Epoch: 12 ERROR: 8.148123090584436e-05
Epoch: 13 ERROR: 8.148068311908396e-05
Epoch: 14 ERROR: 8.148013534031717e-05
Epoch: 15 ERROR: 8.147958756948848e-05

Done.
Final Accuracy: 99.99185204124305%

Prediction:
array([0.])

And here is the output from sklearn's Boston housing dataset:

Network Architecture:
----------------------------------------------------------------------------
Input Layer Number of Weights: 260
Hidden Layer 1 Number of Weights: 400
Output Layer Number of Weights: 20
----------------------------------------------------------------------------
Total Number of Weights:  680

Epoch: 1 ERROR: nan
Epoch: 2 ERROR: nan
Epoch: 3 ERROR: nan
Epoch: 4 ERROR: nan
Epoch: 5 ERROR: nan
Epoch: 6 ERROR: nan
Epoch: 7 ERROR: nan
Epoch: 8 ERROR: nan
Epoch: 9 ERROR: nan
Epoch: 10 ERROR: nan
Epoch: 11 ERROR: nan
Epoch: 12 ERROR: nan
Epoch: 13 ERROR: nan
Epoch: 14 ERROR: nan
Epoch: 15 ERROR: nan

Done.
Final Accuracy: nan%

Prediction:
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:64: RuntimeWarning: overflow encountered in exp
array([nan])

Any help would be great! The full script is below.

# Python 3
import numpy as np
#note: silencing divide/invalid warnings also hides where nans first appear
np.seterr(divide='ignore', invalid='ignore')

class Model:
  def __init__(self, x, y, number_of_hidden_layers=2, number_of_hidden_nodes=30, quiet=False):
    self.x = x
    self.y = y
    self.number_of_hidden_layers = number_of_hidden_layers
    self.number_of_hidden_nodes = number_of_hidden_nodes
    self.input_layer_activation_function = "tanh"
    self.hidden_layer_activation_function = "tanh"
    self.output_layer_activation_function = "tanh"

    #fixed seed so the random weight initialization is reproducible
    np.random.seed(1)

    input_shape = self.x[0].shape[0]
    output_shape = self.y[0].shape[0]

    number_of_hidden_nodes = self.number_of_hidden_nodes
    number_of_hidden_layers = self.number_of_hidden_layers

    #one slot per layer: every hidden layer plus the input and output layers
    #weights
    self.W = [None] * (number_of_hidden_layers + 2)
    #activations
    self.A = [None] * (number_of_hidden_layers + 2)
    #deltas
    self.D = [None] * (number_of_hidden_layers + 2)

    #input layer
    input_layer_weights = 2 * np.random.random((input_shape, number_of_hidden_nodes)) - 1
    self.W[0] = input_layer_weights

    #middle
    for i in range(1, number_of_hidden_layers + 1):
      hidden_layer_weights = 2 * np.random.random((number_of_hidden_nodes, number_of_hidden_nodes)) - 1
      self.W[i] = hidden_layer_weights

    #output
    output_layer_weights = 2 * np.random.random((number_of_hidden_nodes, output_shape)) - 1
    self.W[len(self.W)-1] = output_layer_weights

    if not quiet:
      #show the architecture:
      print ("Network Architecture:")
      print ("----------------------------------------------------------------------------")
      total = 0
      for count, i in enumerate(self.W):
        total += (i.shape[0] * i.shape[1])
        if count == 0:
          print("Input Layer Number of Weights: " + str(i.shape[0] * i.shape[1]))
        elif count == (len(self.W)-1):
          print("Output Layer Number of Weights: " + str(i.shape[0] * i.shape[1]))
        else:
          print("Hidden Layer " + str(count) + " Number of Weights: " + str(i.shape[0] * i.shape[1]))
      print ("----------------------------------------------------------------------------")
      print("Total Number of Weights: ", total)
      print()

  #activation functions and their derivatives
  def nonlin(self, x, deriv, function):
    if function == "tanh":
      #hand-rolled tanh: np.exp overflows float64 for large |x|, which is the
      #source of the "overflow encountered in exp" RuntimeWarning
      t=(np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
      if (deriv==True):
          dt=1-t**2
          return dt
      return t

    elif function == "sigmoid":
      if (deriv==True):
          return (x * (1-x))
      return 1/(1 + np.exp(-x))

    elif function == "leaky_relu":
      if (deriv==True):
          dx = np.ones_like(x)
          dx[x < 0] = 0.01
          return dx
      return np.where(x > 0, x, x * 0.01)

  def predict(self, x):
    #forward pass
    input_layer_activation = self.nonlin(np.dot(x, self.W[0]), False, self.input_layer_activation_function)
    self.A[0] = (input_layer_activation)

    for i in range(1, self.number_of_hidden_layers + 1):
      hidden_layer_activation = self.nonlin(np.dot(self.A[i-1], self.W[i]), False, self.hidden_layer_activation_function)
      self.A[i] = hidden_layer_activation

    output_layer_activation = self.nonlin(np.dot(hidden_layer_activation, self.W[len(self.W)-1]), False,  self.output_layer_activation_function)
    print()
    print("Prediction:")
    return output_layer_activation


  #training
  def train(self, loss_function, epochs, alpha=0.001):
    for j in range(epochs):

        #forward pass
        input_layer_activation = self.nonlin(np.dot(self.x, self.W[0]), False, self.input_layer_activation_function)
        self.A[0] = (input_layer_activation)

        for i in range(1, self.number_of_hidden_layers + 1):
          hidden_layer_activation = self.nonlin(np.dot(self.A[i-1], self.W[i]), False, self.hidden_layer_activation_function)
          self.A[i] = hidden_layer_activation

        output_layer_activation = self.nonlin(np.dot(hidden_layer_activation, self.W[len(self.W)-1]), False,  self.output_layer_activation_function)
        self.A[len(self.A)-1] = (output_layer_activation)

        #compute the loss chosen in train(); output_layer_activation is the
        #prediction, and the backprop delta below always uses the raw residual
        residual = self.y - output_layer_activation
        if loss_function == "mse":
          error = residual ** 2
        elif loss_function == "mae":
          error = np.abs(residual)
        elif loss_function == "cce":
          clipped = np.clip(output_layer_activation, 1e-12, 1. - 1e-12)
          total_number = clipped.shape[0]
          error = -np.sum(self.y * np.log(clipped + 1e-9)) / total_number
        else:
          error = residual

        #print roughly ten times over the run (guarding against epochs < 10)
        divis = max(1, epochs // 10)
        if (j % divis) == 0:
            print ('Epoch: ' + str(j+1) + ' ERROR: ' + str(np.mean(np.abs(error))))

        #backwards pass
        output_delta = residual * self.nonlin(output_layer_activation, True, self.output_layer_activation_function)
        self.D[0] = output_delta

        #setting working vars
        working_delta = output_delta
        past_layer_weights = self.W[len(self.W)-1]

        for i in range(self.number_of_hidden_layers):
          working_index = i+1

          hidden_layer_activation_error = working_delta.dot(past_layer_weights.T)

          hidden_layer_activation_delta = hidden_layer_activation_error * self.nonlin(self.A[len(self.A)-working_index-1], True, self.hidden_layer_activation_function)

          self.D[working_index] = hidden_layer_activation_delta

          working_delta = hidden_layer_activation_delta
          past_layer_weights = self.W[len(self.W)-(working_index+1)]

        input_layer_activation_error = self.D[working_index].dot(self.W[working_index].T)

        input_layer_activation_delta = input_layer_activation_error * self.nonlin(input_layer_activation, True, self.input_layer_activation_function)
        self.D[working_index+1] = input_layer_activation_delta

        #update weights
        internal_alpha = alpha
        self.W[len(self.W)-1] += input_layer_activation.T.dot(self.D[0]) * internal_alpha

        for i, z in enumerate(range(self.number_of_hidden_layers, 0, -1), start=1):
          self.W[z] += self.A[i].T.dot(self.D[i]) * internal_alpha

        self.W[0] += self.x.T.dot(self.D[len(self.D)-1]) * internal_alpha

    #ending print out
    print()
    print("Done.")
    print("Final Accuracy: " + str(np.abs((np.mean(np.abs(error)))-1)*100) + "%")

#toy dataset (immediately overridden by the boston data loaded below)
x = np.array([[0,0,0], [1,1,1], [1,1,1], [0,0,0]])
y = np.array([[0],[1],[1],[0]])

from sklearn.datasets import load_boston
boston = load_boston()
x = boston["data"]
y = boston["target"]
y = y.reshape((x.shape[0], 1))

model = Model(x, y, number_of_hidden_layers=1, number_of_hidden_nodes=20)
model.train("mse", 15, alpha=.001)
model.predict(x[0])

Upvotes: 0

Views: 66

Answers (1)

AcrylicShrimp

Reputation: 160

I think this is a regression model, and it looks like it uses tanh activations in every layer. Since tanh's output range is [-1, +1], you should use a relu-like activation for the last layer, because the targets in sklearn's Boston dataset range over [0, 50].
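A minimal sketch of that change (untested; it reuses the Model class from the question and the leaky_relu branch its nonlin() already has, and assigns output_layer_activation_function after construction because the posted constructor hardcodes it):

from sklearn.datasets import load_boston

boston = load_boston()
x = boston["data"]
y = boston["target"].reshape(-1, 1)

#standardize the features so np.dot(x, W) stays small and np.exp doesn't overflow
x = (x - x.mean(axis=0)) / x.std(axis=0)

model = Model(x, y, number_of_hidden_layers=1, number_of_hidden_nodes=20)
model.output_layer_activation_function = "leaky_relu"  #unbounded above, so targets in [0, 50] are reachable
model.train("mse", 15, alpha=0.0001)  #smaller alpha, since raw targets are much larger than [-1, +1]

Scaling the targets into tanh's range instead (e.g. y / 50) would be the other way around the same mismatch.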

Upvotes: 1
