XOR with ReLU activation function

Question

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 10 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
print(Wo)

alpha = 0.05 # Learning rate

t_ = []

loss_ = []

def ReLU(x):
    return np.maximum(0,x)

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,3000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        Z_2 = np.dot(Wo,ReLU(Z_1))
        y = sigmoid(Z_2)
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                if Z_1[j] >= 0:
                    dWo[i,j] = dWo[i,j] + (y[i]-d)*Z_1[j]
                    #dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                else:
                    dWo[i,j] += 0

        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    if Z_1[j] >= 0:
                        dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*(y[i]-d)
                        #dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d)              
                    else:
                        dWs[j,k] += 0
                        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)

    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch',FontSize=20)
        plt.ylabel('Loss',FontSize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,ReLU(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.gca(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()       
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,ReLU(Z_1)))
    
    print(y)

If I try this with sigmoid function, it works fine but when the ReLU activation function is implemented, the the program doesn't learning anything.

The NN consist of 3 input, hidden, output layers and sigmoid activation fuction is implemented for output function. Hand calculation seems fine but can't find the flaw.

The code below with sigmoid activation function works just fine.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 5 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
#print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
#print(Wo)

alpha = 0.1 # Learning rate

t_ = []

loss_ = []

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,5000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        
        A_1 = sigmoid(Z_1)
        
        Z_2 = np.dot(Wo,A_1)

        y = sigmoid(Z_2)
        
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                
        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d) 
        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)
        
    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch',FontSize=20)
        plt.ylabel('Loss',FontSize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,sigmoid(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.gca(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()
        
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,sigmoid(Z_1)))
    
    print(y)

XOR with ReLU activation function

Answers (1)

Related Questions