Reputation: 406
In trying to understand fully connected ANNs, I'm starting with a simple 2-D linear regression example.
My network is trivial: one input layer and an output layer with a set of weights between them. If my understanding is correct, the weight should essentially learn the slope of the best-fit line through my data.
My training data is a slightly noisy line, as below: just a fuzzy line with slope m = .5. My code pulls one point at a time, propagates it through my network, and backpropagates to update the weights. Weight updates happen either after every datum or averaged over every 5, 10, or 20 points.
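For reference, this is the bare-bones per-sample update I believe my one-weight network should reduce to (a quick standalone sketch, not my actual code below; the learning rate here was picked just for the sketch):

import numpy as np

np.random.seed(0)
xs = np.arange(300, dtype=float)                    # x = 0..299, like my generator
ys = 0.5 * xs + np.random.uniform(-1, 1, len(xs))   # y = .5x plus a little noise

w = np.random.rand()    # the single weight, i.e. the slope estimate
lr = 5e-6               # learning rate chosen for this sketch only
for x, y in zip(xs, ys):
    err = w * x - y     # prediction error for this sample
    w -= lr * err * x   # squared-error gradient w.r.t. w (up to a constant factor)
print(w)                # lands close to 0.5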
My weights are randomized, but I've fixed the seed for sanity's sake while debugging. When I plot the square error from every training example, I get the following: a hump, a settling, then an explosion.
Corresponding to the decrease in square error, my algorithm finds the slope of the linear data...then violently rejects it, lol.
My initial thought was that the oscillations I coded into my training data were too wild and might throw the solution out of the local minimum. But tightening the spread around y = .5x in my training data didn't help much. To try to smooth things out further, I implemented averaged stochastic weight updates, only updating after a batch of a few training samples. No love.
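Concretely, the averaged update I mean just accumulates the per-sample gradients and applies their mean once per batch, along these lines (again only a sketch, not the class-based code below):

import numpy as np

np.random.seed(0)
xs = np.arange(300, dtype=float)
ys = 0.5 * xs + np.random.uniform(-1, 1, len(xs))

w, lr, batch_size, acc = np.random.rand(), 5e-6, 5, 0.0
for idx, (x, y) in enumerate(zip(xs, ys), start=1):
    acc += (w * x - y) * x            # accumulate the per-sample gradient
    if idx % batch_size == 0:
        w -= lr * (acc / batch_size)  # apply the averaged gradient once per batch
        acc = 0.0
print(w)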
I'm also using a very small learning rate (.0005), since I thought the noise might have me oscillating down a gradient hill. This helped initially, but the figures shown are the result of alpha = .005.
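For a sense of scale, a single-sample step near the far end of my data looks roughly like this (back-of-the-envelope, using the same dE/dw = (output - label) * x that my back_prop produces, and assuming the learned slope is off by 0.05):

x, slope_err = 299.0, 0.05      # last data point; assumed error in the learned slope
grad = (slope_err * x) * x      # roughly x^2 * (slope error), ignoring the +/-1 noise
for alpha in (.005, .0005, .00005):
    print(alpha, alpha * grad)  # size of a single weight update at this sample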
Any suggestions on what I'm missing? I'd like to handle this situation so I can approach multi-variate regression.
import random
import matplotlib.pyplot as plt
import numpy as np
random.seed(0)
def gen_linear_regression_data(num_points, slope=.5, var=1.0, plot=False, seed=None):
if seed is not None:
np.random.seed(seed)
data = [idx for idx in range(num_points)]
labels = [data[idx] * slope for idx in range(num_points)]
# add noise
labels = [l + np.random.uniform(-var, var) for l in labels]
if plot:
plt.scatter(data, labels)
plt.show()
return data, labels
class Sigmoid():
def activate(self, x):
return 1 / (1 + np.exp(-x))
def backtivate(self, x):
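        # Derivative of the sigmoid written in terms of its output: assumes x is already sigmoid(x), which is what a layer stores after activation.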
return np.multiply(x, (1 - x))
class Passive():
def activate(self, x):
return x
def backtivate(self, x):
return 1
class Layer():
def __init__(self, values, activation="logistic"):
if not (isinstance(values, list) or isinstance(values, np.ndarray)):
values = [values]
self.values = np.matrix(values)
if self.values.shape[-1] > self.values.shape[0]:
self.values = self.values.reshape((self.values.shape[-1], 1))
self.set_activation(activation_str=activation)
def __getitem__(self, item):
return self.values[item]
def __setitem__(self, key, value):
if not (isinstance(value, int) or isinstance(value, float)):
raise TypeError("Layer values must be int or float.")
self.values[key] = value
def __len__(self):
return self.values.shape[0]
def __str__(self):
return "\n".join([str(val) for val in self.values])
def __mul__(self, other):
return np.dot(other, self.values)
def set_activation(self, activation_str):
if activation_str == "logistic":
self.activation = Sigmoid()
elif activation_str == "passive":
self.activation = Passive()
def transpose(self):
return self.values.reshape(len(self), 1)
def activate(self):
return self.activation.activate(self.values)
def backtivate(self):
return self.activation.backtivate(self.values)
class DataSet():
def __init__(self, data, labels):
self.data = data
self.labels = labels
self.data_dict = [{"data": d, "label": l} for (d, l) in zip(self.data, self.labels)]
def __getitem__(self, item):
return self.data_dict[item]
class Weights():
def __init__(self, weights):
        if not (isinstance(weights, list) and all(isinstance(weight, np.ndarray) for weight in weights)):
            raise TypeError("Weights must be a list of numpy arrays.")
self.data = weights
def __len__(self):
return sum([w.shape[0] * w.shape[1] for w in self.data])
def __getitem__(self, item):
weight_idx = np.cumsum([w.shape[0] * w.shape[1] - 1 for w in self.data])
desired_idx = 0
for idx, w_idx in enumerate(weight_idx):
if item < w_idx:
desired_idx = idx
break
if idx > 0:
cs = np.cumsum(weight_idx)
another_idx = item - cs[idx - 1]
else:
another_idx = item
        return self.data[desired_idx][another_idx]
class Network():
def __init__(self, network_config,
first_layer=None,
random_weights=True,
learning_rate=.0005):
self.network_config = network_config
if first_layer is None:
first_layer = Layer(np.zeros(network_config[0]["layers"]))
first_layer.set_activation(network_config[0]["activation"])
# Initialize layers
self.depth = len(network_config)
self.layers = [first_layer]
self.layers.extend(
[Layer(np.zeros(config["layers"]),
activation=config["activation"]) for config in network_config[1:]])
# Initialize learning rate
self.learning_rate = learning_rate
# Initialize weights
self.weights = []
for layer_idx in range(self.depth - 1):
if random_weights:
self.weights.append(2 * (np.random.rand(len(self[layer_idx + 1]), len(self[layer_idx])) - .5))
else:
self.weights.append(np.ones((len(self[layer_idx + 1]), len(self[layer_idx]))))
def __getitem__(self, item):
return self.layers[item]
def __str__(self):
max_elems = np.max([len(layer) for layer in self.layers])
matrix = [[str(layer[elem]) if elem < len(layer) else None for layer in self.layers] for elem in
range(max_elems)]
net_str = "\n".join([str(layer) for layer in matrix])
weight_str = str(self.weights)
try:
deltas_str = " ".join([str(lay.delta) for lay in self.layers])
        except AttributeError:  # deltas only exist after the first back_prop
deltas_str = ""
return "Net:\n%s\n\nWeights:\n%s\n\nDeltas:\n%s" % (net_str, weight_str, deltas_str)
def forward_prop(self, debug=False):
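        # Forward pass: each layer's values become activation(dot(weights, previous layer's values)).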
for layer_idx in range(1, self.depth):
ww = self.weights[layer_idx - 1]
layer = self.layers[layer_idx - 1]
weighted_input = np.dot(ww, layer.values)
self.layers[layer_idx].values = weighted_input
self.layers[layer_idx].values = self.layers[layer_idx].activate()
if debug:
print("-------------")
print(self)
return self.layers[-1]
def back_prop(self, answer, debug=False):
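        # calc_deltas walks the layers in reverse and stores an error term (delta) on each one, starting from the output error; calc_dws then multiplies each delta by the previous layer's values to form the weight gradients.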
def calc_deltas():
for layer_idx, layer in enumerate(reversed(self.layers)):
if layer_idx == 0:
# Calculate dE for Squared Error
outputs = self.layers[-1]
dE = self.layers[-1][0] - answer
square_error = dE ** 2
a = dE
else:
a = np.dot(self.weights[-layer_idx].T,
self.layers[-layer_idx].delta)
b = layer.backtivate()
layer_delta = np.multiply(a, b)
layer.delta = layer_delta
return square_error
def calc_dws():
dws = []
deltas = [l.delta for l in self.layers]
values = [l.values for l in self.layers]
for layer_idx, layer in enumerate(self.layers[:-1]):
dws.append(np.multiply(deltas[layer_idx + 1], values[layer_idx].T))
return dws
print("Answer:\n%f" % answer)
square_error = calc_deltas()
dws = calc_dws()
return dws, square_error
def set_inputs(self, layer):
self.layers[0] = layer
self.layers[0].set_activation(self.network_config[0]["activation"])
def build_network_config(layers):
    # Every layer gets the passive (identity) activation, which is what plain linear regression needs.
    return [{"layers": neurons, "activation": "passive"} for neurons in layers]
disp_el = 200
batch_size = 1
samples = 300
learning_rate = .00005
# Setup the network
network_config = build_network_config([1, 1])
net = Network(network_config=network_config,
random_weights=True,
learning_rate=learning_rate)
# Pull in the labeled data set.
dataset = DataSet(*gen_linear_regression_data(samples, seed=0, plot=True))
errs = []
cum_sum_dws = np.zeros_like(net.weights)
weights = []
for idx, dset in enumerate(dataset):
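    # Training loop: forward-propagate each sample, backpropagate its squared error, and apply a (batch-averaged) weight update.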
initial_layer = Layer(dset["data"])
net.set_inputs(initial_layer)
# Feed forward
net.forward_prop(True)
# Back propagate error
dws, sq_err = net.back_prop(dset["label"], debug=True)
errs.append(sq_err)
# Update weights
if idx % batch_size == 0 and idx != 0:
cum_sum_dws += dws
cum_sum_dws /= batch_size
new_weights = [net.weights[idx] - net.learning_rate * cum_sum_dws[idx] for idx in range(len(cum_sum_dws))]
net.weights = new_weights
weights.append(new_weights)
print("dws:\n%s" % str([-net.learning_rate * cum_sum_dws[idx] for idx in range(len(dws))]))
cum_sum_dws = np.zeros_like(dws)
else:
cum_sum_dws += dws
plt.scatter(range(len(errs[:disp_el])), errs[:disp_el])
plt.show()
plt.scatter(range(len(weights[:disp_el])), weights[:disp_el])
plt.show()
ww = Weights(net.weights)
# Validate data set
dataset = DataSet(*gen_linear_regression_data(5, seed=0))
for dset in dataset:
initial_layer = Layer(dset["data"])
net.set_inputs(initial_layer)
prediction = net.forward_prop()
print("data: %s, prediction: %s" % (str(dset["data"]), prediction))
Upvotes: 0
Views: 84
Reputation: 1537
I think your weight initialization is wrong. If you want a good result, you should draw the weights from a Gaussian distribution with m = 0.5, kept within -1 to 1. It looks like the sigmoid is not behaving properly. I had the same problem and my neural network saturated very quickly. I couldn't comment, so I am posting it here; it took me a long time to figure out how to initialize the weights based on the equation.
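Roughly what I mean, as a sketch (init_weights is just a helper name I made up; pick whatever mean and spread work for you, and I clip so nothing starts outside -1, 1):

import numpy as np

def init_weights(shape, mean=0.5, std=0.25):
    # Example mean/std; clipped so every starting weight stays inside [-1, 1].
    w = np.random.normal(loc=mean, scale=std, size=shape)
    return np.clip(w, -1.0, 1.0)

weights = [init_weights((1, 1))]    # e.g. one 1x1 weight matrix for a net like the one above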
Upvotes: 0