Simple ANN model converges with tanh(x) as the activation function, but it doesn't with leaky ReLu

Question

I'm training a simple ANN model (MLP) using tanh(x) as the activation function and, after some iterations, it converges with an error of 10^-5. Here's my full code:

import numpy as np
import pandas as pd

# Training data set: one input column and one desired-output column.
x = pd.DataFrame({'valores x': [1, 2, 3]}).to_numpy()

desejados = pd.DataFrame({'valores desejados': [5, 4, 3]})

# Convert to an array, scaling the desired values by 1.05 * max so that they
# lie between 0 and +1 (strictly below 1, which tanh can approach).
d = (desejados / (1.05 * desejados.max())).to_numpy()


# Derivative of tanh(x): sech^2(x) = 1 - tanh(x)^2
def df(x):
    """Return the derivative of tanh evaluated element-wise at ``x``."""
    tanh_ao_quadrado = np.power(np.tanh(x), 2)
    return 1 - tanh_ao_quadrado

#def rede_mlp(n, x, d, net, k, precisao):

# Build the two-layer network (one hidden layer, one output unit, no bias terms).
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the mean squared error
net=3
n = 0.1
precisao=0.00001
# np.random.rand draws the initial weights uniformly from [0, 1).
w1 = np.random.rand(net,len(x[0]))
w2 = np.random.rand(1,net)
# Start above the threshold so the loop body runs at least once.
E_M=1
epocas=0

# Train until the epoch's mean error is no longer above `precisao`.
# There is no cap on the number of epochs, so the loop ends only on convergence.
while E_M>precisao:
    E_M=0
    errofinal=0
    # Online (per-sample) training: the weights change after every sample.
    for i in range(0,len(x)):

        # FORWARD pass: hidden layer, then output layer, both through tanh.
        i1 = np.matmul(w1, x[i].reshape(len(x[i]),1))
        y1 = np.tanh(i1)

        i2 = np.matmul(w2, y1)
        y2 = np.tanh(i2)

        # error with respect to the desired value (desired minus output)
        erro = d[i].reshape(len(d[i]),1) - y2

        # BACKPROPAGATION
        delta_2 = erro*df(i2)
        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))

        # NOTE: w2 was already updated on the line above, so this delta is
        # propagated through the new output weights, not the ones used in the
        # forward pass.
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))

        # Accumulate the half squared error of this sample.
        errofinal = errofinal + 0.5*erro**2

    # Mean of the per-sample half squared errors over the epoch.
    E_M = errofinal/len(x)
    epocas+=1
    print(E_M)

After that, I tried changing the activation function to leaky ReLU, but it didn't converge. I have changed the learning rate n several times, but the error is still high: around 7.95, which is large for my data. Here's my attempt:

import numpy as np
import pandas as pd


# Training data set: one input column and one desired-output column.
x = pd.DataFrame({'valores x': [1, 2, 3]}).to_numpy()

# Convert the desired values to an array as they are; no normalization is
# applied in this version.
d = pd.DataFrame({'valores desejados': [5, 4, 3]}).to_numpy()


def df(x):
    """Return the leaky-ReLU derivative of ``x`` element-wise.

    The result is 0.01 where ``x <= 0`` and 1 where ``x > 0``.

    Both branches are chosen from the original values in a single step.
    Assigning 0.01 first and then testing ``x > 0`` would match the freshly
    written 0.01 entries as well and turn every derivative into 1.
    """
    x = np.asarray(x)
    return np.where(x <= 0, 0.01, 1)

def f(x):
    """Leaky ReLU activation: ``x`` where ``x > 0``, otherwise ``0.01 * x``."""
    vazamento = x * 0.01
    return np.where(x > 0, x, vazamento)



#def rede_mlp(n, x, d, net, k, precisao):

# Build the two-layer network (one hidden layer, one output unit, no bias terms).
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the accumulated squared error
net=3
n = 1e-4
precisao=0.0001
# np.random.rand draws the initial weights uniformly from [0, 1).
w1 = np.random.rand(net,len(x[0]))
w2 = np.random.rand(1,net)
# Start above the threshold so the loop body runs at least once.
E_M=20
epocas=0

# Train until the epoch's summed error is no longer above `precisao`.
# There is no cap on the number of epochs, so the loop ends only on convergence.
while E_M>precisao:
    E_M=0
    errofinal=0
    # Online (per-sample) training: the weights change after every sample.
    for i in range(0,len(x)):

        # FORWARD pass: hidden layer, then output layer, both through f.
        i1 = np.matmul(w1, x[i].reshape(len(x[i]),1))
        y1 = f(i1)



        i2 = np.matmul(w2, y1)
        y2 = f(i2)


        # error with respect to the desired value (desired minus output)
        erro = d[i].reshape(len(d[i]),1) - y2


        # BACKPROPAGATION
        delta_2 = erro*df(i2)
        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))


        # NOTE: w2 was already updated above, so this delta is propagated
        # through the new output weights, not the ones used in the forward pass.
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))

        # Accumulate the half squared error of this sample.
        errofinal = errofinal + 0.5*erro**2

    # The per-epoch mean is disabled; the stopping test uses the summed error.
    #E_M = errofinal/len(x)
    E_M = errofinal
    epocas+=1
    print(E_M)

EDITED:

After some modifications, here's my ReLu code (but the error is still high ~7.77):

import numpy as np
import pandas as pd


# Training data set, built as single-column frames and converted to arrays.
entradas = pd.DataFrame({'valores x': [1, 2, 3]})
desejados = pd.DataFrame({'valores desejados': [5, 4, 3]})

# The desired values are used unscaled; no normalization is applied here.
x = entradas.to_numpy()
d = desejados.to_numpy()


def df(x):
    """Leaky ReLU derivative: 0.01 where ``x <= 0``, otherwise 1."""
    nao_positivo = x <= 0
    return np.where(nao_positivo, 0.01, 1)

def f(x):
    """Leaky ReLU activation: pass positive values through, scale the rest by 0.01."""
    positivo = x > 0
    return np.where(positivo, x, x * 0.01)


#def rede_mlp(n, x, d, net, k, precisao):

# Build the two-layer network (one hidden layer, one output unit, no bias terms).
# net = number of neurons in the first layer
# n = learning rate
# precisao = target precision for the accumulated squared error
net=3
n = 1e-3
precisao=0.1
# np.random.rand draws the initial weights uniformly from [0, 1).
w1 = np.random.rand(net,len(x[0]))
w2 = np.random.rand(1,net)
# Start above the threshold so the loop body runs at least once.
E_M=20
epocas=0

# Train until the epoch's summed error is no longer above `precisao`.
# There is no cap on the number of epochs, so the loop ends only on convergence.
while E_M>precisao:
    E_M=0
    errofinal=0
    # Online (per-sample) training: the weights change after every sample.
    for i in range(0,len(x)):

        # FORWARD pass: hidden layer, then output layer, both through f.
        i1 = np.matmul(w1, x[i].reshape(len(x[i]),1))
        y1 = f(i1)


        i2 = np.matmul(w2, y1)
        y2 = f(i2)


        # error with respect to the desired value (desired minus output)
        erro = d[i].reshape(len(d[i]),1) - y2


        # BACKPROPAGATION: both deltas are computed before any weight changes,
        # so delta_1 is propagated through the w2 used in the forward pass.
        delta_2 = erro*df(i2)
        delta_1 = (np.matmul(w2.T, delta_2))*df(i1)

        w2 = w2 + n*(np.matmul(delta_2, y1.reshape(1, net)))
        w1 = w1 + n*(np.matmul(delta_1, x[i].reshape(1, len(x[i]))))


        # Accumulate the half squared error of this sample.
        errofinal = errofinal + 0.5*erro**2

    # The per-epoch mean is disabled; the stopping test uses the summed error.
    #E_M = errofinal/len(x)
    E_M = errofinal
    epocas+=1
    print(E_M)

mcskinner · Accepted Answer

You need to add a bias to the network.

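The equation you are trying to model is y = 6 - x, which is trivial if you can use 6 as an intercept (bias), but impossible if you cannot: without biases, a leaky-ReLU network is positively homogeneous, i.e. f(c·x) = c·f(x) for every c > 0, so fitting f(1) = 5 would force f(2) = 10 instead of the target 4.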

Many functions are much easier to represent once you add the bias, which is why including one is standard practice. This Q&A on the role of bias in NNs explains more thoroughly.

I modified your code to add the bias, as well as follow more typical naming conventions, and it converges for me.

net = 3  # number of neurons in the hidden layer
n = 1e-3  # learning rate
precisao = 0.0001  # training stops once the summed epoch error is not above this

# Each layer gets a weight matrix plus a bias. The biases start as single
# random scalars; the first `-=` update below broadcasts each one into an
# array with the shape of its layer's delta.
w1 = np.random.rand(net, len(x[0]))
bias1 = np.random.rand()

w2 = np.random.rand(1, net)
bias2 = np.random.rand()

# Start above the threshold so the loop body runs at least once.
E_M = 20
epocas = 0

# No cap on the number of epochs: the loop ends only on convergence.
while E_M > precisao:
    E_M = 0
    errofinal = 0
    # Online (per-sample) training: parameters change after every sample.
    for i in range(0,len(x)):
        # Column vectors for the current input and its target.
        a0 = x[i].reshape(-1, 1)
        targ = d[i].reshape(-1, 1)

        # Forward pass: affine map plus activation f, twice.
        z1 = np.matmul(w1, a0) + bias1
        a1 = f(z1)

        z2 = np.matmul(w2, a1) + bias2
        a2 = f(z2)

        # Error is output minus target, so the updates below subtract.
        erro = a2 - targ

        # BACKPROPAGATION
        # Both deltas are computed before any parameter is modified.
        delta_2 = erro * df(z2)
        delta_1 = np.matmul(w2.T, delta_2) * df(z1)
        bias2 -= n * delta_2
        bias1 -= n * delta_1
        w2 -= n * np.matmul(delta_2, a1.T)
        w1 -= n * np.matmul(delta_1, a0.T)

        # Accumulate the half squared error of this sample.
        errofinal = errofinal + 0.5*erro**2

    # The per-epoch mean is disabled; the stopping test uses the summed error.
    #E_M = errofinal/len(x)
    E_M = errofinal
    epocas += 1
    # Report progress every 1000 epochs only.
    if epocas % 1000 == 0:
        print(epocas, E_M)

I increased the learning rate so it would converge more quickly.

1000 [[0.14401507]]
2000 [[0.00028834]]

Earlier bug fix suggestion

You are setting the derivative always equal to 1.

def df(x):
    # NOTE: this is the defective version being discussed, reproduced unchanged.
    x = np.array(x)
    # Non-positive entries are overwritten with 0.01, which is itself positive ...
    x[x<=0] = 0.01
    # ... so for float arrays this mask now matches every entry.
    x[x>0] = 1
    return x

The line x[x<=0] = 0.01 sets all non-positive values to 1/100, a positive value. After that every value is positive, since the already-positive values go through unaffected and the negative-or-zero values just turned positive. So the next line x[x>0] = 1 sets all derivatives to 1.

Try this:

def df(x):
    """Leaky ReLU derivative: 0.01 where ``x <= 0``, otherwise 1."""
    nao_positivo = np.array(x) <= 0
    return np.where(nao_positivo, 0.01, 1)

Simple ANN model converges with tanh(x) as the activation function, but it doesn't with leaky ReLu

Answers (1)

Related Questions

Simple ANN model converges with tanh(x) as the activation function, but it doesn't with leaky ReLu

Answers (1)

Related Questions

Simple ANN model converges with tanh(x) as the activation function, but it doesn't with leaky ReLu