Hair of Slytherin

Reputation: 406

Neural Network converges on answer, then oscillates wildly

In trying to understand fully connected ANNs, I'm starting with a simple 2-D linear regression example.

My network is trivial - one input layer and one output layer, with a single set of weights between them. If my understanding is correct, that weight should essentially learn the slope of the best-fit line through my data.
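
In other words, as I understand it, with an identity ("passive") activation the whole network reduces to y_hat = w * x, and gradient descent on the squared error nudges w by alpha * (y_hat - y) * x. A stripped-down sketch of that understanding (separate from my real code further down; the numbers here are made up):

import numpy as np

np.random.seed(0)
w = 2 * (np.random.rand() - .5)  # random init in [-1, 1), same style as my network below
alpha = .0005

x, y = 100.0, 50.0               # one sample from a slope-0.5 line
y_hat = w * x                    # forward pass: single weight, identity activation
dE_dw = (y_hat - y) * x          # gradient of .5 * (y_hat - y)**2 with respect to w
w -= alpha * dE_dw               # stochastic gradient step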

My training data is a slightly noisy line, shown below: just a fuzzy line with slope m = .5. My code pulls one point at a time, propagates it through the network, and backpropagates to update the weights. The weight update happens either after every datum or averaged over every 5, 10, or 20 points.

[scatter plot of the noisy training data]

My weights are randomly initialized, but I've fixed the seed for sanity's sake while debugging. When I plot the squared error from every training example, I get the following: a hump, then settling, then an explosion.

[plot of the squared error for each training example]

Corresponding to the decrease in squared error, my algorithm finds the slope of the linear data... then violently rejects it, lol.

[plot of the learned weight (slope estimate) over training]

My initial thought was that the noise I coded into my training data was too wild and might throw the solution out of the local minimum. But tightening the spread around y = .5x in my training data had little effect. To try to smooth these effects out, I also implemented averaged stochastic weight updates, so the weights only change after a batch of a few training samples (sketched below). No love.
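
For reference, the averaged update I have in mind works roughly like this (a sketch with made-up numbers, not my actual loop below):

import numpy as np

np.random.seed(0)
xs = np.arange(300, dtype=float)                    # x = 0..299, like my data
ys = .5 * xs + np.random.uniform(-1, 1, size=300)   # noisy line with slope 0.5

w, alpha, batch_size = .1, 1e-5, 5                  # made-up numbers for the sketch
accum = 0.0
for i, (x, y) in enumerate(zip(xs, ys), start=1):
    accum += (w * x - y) * x                        # per-sample gradient for the one weight
    if i % batch_size == 0:
        w -= alpha * (accum / batch_size)           # apply the averaged gradient once per batch
        accum = 0.0
print(w)                                            # should end up near the true slope of 0.5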

I'm also using a very small learning rate (.0005), as I thought the noise might have me oscillating down a gradient hill. This initially helped, but the figures above are the result of alpha = .005.
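
One back-of-the-envelope check I can think of (my own reasoning, so it may be off): for this single-weight model the per-sample update is w <- w - alpha * x * (w * x - y), so the size of the step scales with alpha * x**2, and my x values run from 0 up to 299:

alpha = .005
for x in (1.0, 10.0, 100.0, 299.0):
    print(x, alpha * x ** 2)    # effective per-sample step factor: .005, .5, 50, ~447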

Any suggestions on what I'm missing? I'd like to get a handle on this case so I can move on to multivariate regression.

import random
import matplotlib.pyplot as plt
import numpy as np

random.seed(0)


def gen_linear_regression_data(num_points, slope=.5, var=1.0, plot=False, seed=None):
    if seed is not None:
        np.random.seed(seed)
    data = [idx for idx in range(num_points)]
    labels = [data[idx] * slope for idx in range(num_points)]
    # add noise
    labels = [l + np.random.uniform(-var, var) for l in labels]
    if plot:
        plt.scatter(data, labels)
        plt.show()
    return data, labels


class Sigmoid():
    def activate(self, x):
        return 1 / (1 + np.exp(-x))

    def backtivate(self, x):
        # Derivative of the sigmoid, written in terms of the already-activated output x.
        return np.multiply(x, (1 - x))


class Passive():
    def activate(self, x):
        return x

    def backtivate(self, x):
        return 1


class Layer():

    def __init__(self, values, activation="logistic"):
        if not (isinstance(values, list) or isinstance(values, np.ndarray)):
            values = [values]
        self.values = np.matrix(values)
        if self.values.shape[-1] > self.values.shape[0]:
            self.values = self.values.reshape((self.values.shape[-1], 1))
        self.set_activation(activation_str=activation)

    def __getitem__(self, item):
        return self.values[item]

    def __setitem__(self, key, value):
        if not (isinstance(value, int) or isinstance(value, float)):
            raise TypeError("Layer values must be int or float.")
        self.values[key] = value

    def __len__(self):
        return self.values.shape[0]

    def __str__(self):
        return "\n".join([str(val) for val in self.values])

    def __mul__(self, other):
        return np.dot(other, self.values)

    def set_activation(self, activation_str):
        if activation_str == "logistic":
            self.activation = Sigmoid()
        elif activation_str == "passive":
            self.activation = Passive()

    def transpose(self):
        return self.values.reshape(len(self), 1)

    def activate(self):
        return self.activation.activate(self.values)

    def backtivate(self):
        return self.activation.backtivate(self.values)


class DataSet():
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.data_dict = [{"data": d, "label": l} for (d, l) in zip(self.data, self.labels)]

    def __getitem__(self, item):
        return self.data_dict[item]


class Weights():

    def __init__(self, weights):
        if not isinstance(weights, list) or not all(isinstance(weight, np.ndarray) for weight in weights):
            raise TypeError("Weights must be a list of numpy arrays.")
        self.data = weights

    def __len__(self):
        return sum([w.shape[0] * w.shape[1] for w in self.data])

    def __getitem__(self, item):
        weight_idx = np.cumsum([w.shape[0] * w.shape[1] - 1 for w in self.data])
        desired_idx = 0
        for idx, w_idx in enumerate(weight_idx):
            if item < w_idx:
                desired_idx = idx
                break

        if idx > 0:
            cs = np.cumsum(weight_idx)
            another_idx = item - cs[idx - 1]
        else:
            another_idx = item

        return self.data[desired_idx][another_idx]


class Network():

    def __init__(self, network_config,
                 first_layer=None,
                 random_weights=True,
                 learning_rate=.0005):

        self.network_config = network_config

        if first_layer is None:
            first_layer = Layer(np.zeros(network_config[0]["layers"]))
        first_layer.set_activation(network_config[0]["activation"])

        # Initialize layers
        self.depth = len(network_config)
        self.layers = [first_layer]

        self.layers.extend(
                [Layer(np.zeros(config["layers"]),
                       activation=config["activation"]) for config in network_config[1:]])

        # Initialize learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        self.weights = []
        for layer_idx in range(self.depth - 1):
            if random_weights:
                self.weights.append(2 * (np.random.rand(len(self[layer_idx + 1]), len(self[layer_idx])) - .5))
            else:
                self.weights.append(np.ones((len(self[layer_idx + 1]), len(self[layer_idx]))))

    def __getitem__(self, item):
        return self.layers[item]

    def __str__(self):
        max_elems = np.max([len(layer) for layer in self.layers])
        matrix = [[str(layer[elem]) if elem < len(layer) else None for layer in self.layers] for elem in
                  range(max_elems)]
        net_str = "\n".join([str(layer) for layer in matrix])
        weight_str = str(self.weights)
        try:
            deltas_str = " ".join([str(lay.delta) for lay in self.layers])
        except AttributeError:
            deltas_str = ""
        return "Net:\n%s\n\nWeights:\n%s\n\nDeltas:\n%s" % (net_str, weight_str, deltas_str)

    def forward_prop(self, debug=False):
        for layer_idx in range(1, self.depth):
            ww = self.weights[layer_idx - 1]
            layer = self.layers[layer_idx - 1]
            weighted_input = np.dot(ww, layer.values)
            self.layers[layer_idx].values = weighted_input
            self.layers[layer_idx].values = self.layers[layer_idx].activate()
            if debug:
                print("-------------")
                print(self)
        return self.layers[-1]

    def back_prop(self, answer, debug=False):
        def calc_deltas():
            for layer_idx, layer in enumerate(reversed(self.layers)):
                if layer_idx == 0:
                    # Calculate dE for Squared Error
                    outputs = self.layers[-1]
                    dE = self.layers[-1][0] - answer
                    square_error = dE ** 2
                    a = dE
                else:
                    a = np.dot(self.weights[-layer_idx].T,
                               self.layers[-layer_idx].delta)
                b = layer.backtivate()
                layer_delta = np.multiply(a, b)
                layer.delta = layer_delta
            return square_error

        def calc_dws():
            dws = []
            deltas = [l.delta for l in self.layers]
            values = [l.values for l in self.layers]
            for layer_idx, layer in enumerate(self.layers[:-1]):
                dws.append(np.multiply(deltas[layer_idx + 1], values[layer_idx].T))
            return dws

        print("Answer:\n%f" % answer)
        square_error = calc_deltas()
        dws = calc_dws()
        return dws, square_error

    def set_inputs(self, layer):
        self.layers[0] = layer
        self.layers[0].set_activation(self.network_config[0]["activation"])


def build_network_config(layers):
    network_config = []
    for n_idx, neurons in enumerate(layers):
        if n_idx != len(layers) - 1:
            network_config.append({"layers": neurons, "activation": "passive"})
        else:
            network_config.append({"layers": neurons, "activation": "passive"})
    return network_config


disp_el = 200
batch_size = 1
samples = 300
learning_rate = .00005

# Setup the network
network_config = build_network_config([1, 1])
net = Network(network_config=network_config,
              random_weights=True,
              learning_rate=learning_rate)

# Pull in the labeled data set.
dataset = DataSet(*gen_linear_regression_data(samples, seed=0, plot=True))
errs = []

cum_sum_dws = np.zeros_like(net.weights)
weights = []
for idx, dset in enumerate(dataset):
    initial_layer = Layer(dset["data"])
    net.set_inputs(initial_layer)

    # Feed forward
    net.forward_prop(True)

    # Back propagate error
    dws, sq_err = net.back_prop(dset["label"], debug=True)
    errs.append(sq_err)

    # Update weights
    if idx % batch_size == 0 and idx != 0:
        cum_sum_dws += dws
        cum_sum_dws /= batch_size
        new_weights = [net.weights[idx] - net.learning_rate * cum_sum_dws[idx] for idx in range(len(cum_sum_dws))]
        net.weights = new_weights
        weights.append(new_weights)
        print("dws:\n%s" % str([-net.learning_rate * cum_sum_dws[idx] for idx in range(len(dws))]))
        cum_sum_dws = np.zeros_like(dws)
    else:
        cum_sum_dws += dws

plt.scatter(range(len(errs[:disp_el])), errs[:disp_el])
plt.show()
plt.scatter(range(len(weights[:disp_el])), weights[:disp_el])
plt.show()
ww = Weights(net.weights)

# Validate data set
dataset = DataSet(*gen_linear_regression_data(5, seed=0))
for dset in dataset:
    initial_layer = Layer(dset["data"])
    net.set_inputs(initial_layer)
    prediction = net.forward_prop()
    print("data: %s, prediction: %s" % (str(dset["data"]), prediction))

Upvotes: 0

Views: 84

Answers (1)

A_kat

Reputation: 1537

I think your weight initialization is wrong. If you want a good result, you should draw the weights from a Gaussian distribution (mean around 0.5, kept within [-1, 1]) rather than the uniform one you have; it looks like the sigmoid is not behaving properly. I had the same problem and my network saturated very quickly. I couldn't comment, so I'm posting this as an answer. It took me a long time to figure out how to initialize the weights based on the equation.
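
Something like this is what I mean (a rough sketch only; init_weights, the 0.5 mean, the 0.1 std and the clipping to [-1, 1] are my own choices, so adapt them to your equation):

import numpy as np

def init_weights(n_out, n_in, mean=0.5, std=0.1):
    # Draw the weights from a Gaussian instead of a uniform distribution,
    # then clip to [-1, 1] so the sigmoid does not start out saturated.
    w = np.random.normal(loc=mean, scale=std, size=(n_out, n_in))
    return np.clip(w, -1.0, 1.0)

# e.g. in Network.__init__, instead of the uniform init:
# self.weights.append(init_weights(len(self[layer_idx + 1]), len(self[layer_idx])))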

Upvotes: 0
