Reputation: 2311
I have a network with two inputs, two hidden nodes in a single layer, and an output node.
I am trying to solve the XOR problem:
| i0 | i1 | desired output |
|----|----|----------------|
| 0  | 0  | 0              |
| 1  | 0  | 1              |
| 0  | 1  | 1              |
| 1  | 1  | 0              |
With my current code, I am running all 4 records above in a single epoch. I then repeat the epoch 20,000 times. I calculate the error after each record, not each epoch, and I back-propagate the error at this same time.
I use sigmoid only in the output layer, as I understand I want a result between 0 and 1.
My network, most of the time, converges. Other times, it doesn't.
I have tried using both sigmoid and tanh in the hidden layer, but neither seems to guarantee convergence.
I have tried randomly generating weights between 0 and 1, as well as between -1 and 1, using a uniform distribution. I have tried Xavier initialisation with both uniform and normal distributions. None of these seems to prevent the network from failing to converge. I have tried different combinations of activation function and weight generation.
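For reference, the two Xavier variants I tried look roughly like this (a sketch of the standard formulas for a neuron with fanIn inputs and a single output, not my exact code):

#include <cmath>
#include <cstddef>
#include <random>

// Sketch of standard Xavier/Glorot initialisation:
//   uniform: U(-sqrt(6/(fanIn+fanOut)), +sqrt(6/(fanIn+fanOut)))
//   normal:  N(0, sqrt(2/(fanIn+fanOut)))
std::uniform_real_distribution<float> xavierUniform(std::size_t fanIn, std::size_t fanOut = 1)
{
    const float limit = std::sqrt(6.f / (fanIn + fanOut));
    return std::uniform_real_distribution<float>(-limit, limit);
}

std::normal_distribution<float> xavierNormal(std::size_t fanIn, std::size_t fanOut = 1)
{
    return std::normal_distribution<float>(0.f, std::sqrt(2.f / (fanIn + fanOut)));
}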
Here is my complete code:
#include <cmath>
#include <iostream>
#include <array>
#include <random>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <algorithm>
#include <iomanip>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
const DataType learningRate = std::sqrt(2.f);
const DataType momentum = 0.25f;
const std::size_t numberEpochs = 20000;
DataType sigmoid(const DataType& x)
{
return DataType(1) / (DataType(1) + std::exp(-x));
}
DataType sigmoid_derivative(const DataType& x)
{
return x * (DataType(1) - x);
}
DataType relu(const DataType& x)
{
return x <= 0 ? 0 : x;
}
DataType relu_derivative(const DataType& x)
{
return x <= 0 ? 0 : 1;
}
DataType tanh(const DataType& x)
{
return std::tanh(x);
}
DataType tanh_derivative(const DataType& x)
{
return DataType(1) - x * x;
}
DataType leaky_relu(const DataType& x)
{
return x <= 0 ? DataType(0.01) * x : x;
}
DataType leaky_relu_derivative(const DataType& x)
{
return x <= 0 ? DataType(0.01) : 1;
}
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction, ActivationFuncPtr derivativeFunc)
:
m_activationFunction(activationFunction),
m_derivativeFunction(derivativeFunc)
{
RandomiseWeights();
}
void RandomiseWeights()
{
std::generate(m_weights.begin(),m_weights.end(),[&]()
{
return m_xavierNormalDis(m_mt);
});
m_biasWeight = m_xavierNormalDis(m_mt);
for(std::size_t i = 0; i < NumInputs+1; ++i)
m_previousWeightUpdates[i] = 0;
}
void FeedForward(const std::array<DataType,NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for(std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
DataType GetNetInput() const
{
return m_netInput;
}
std::array<DataType,NumInputs> Backpropagate(const DataType& error,
const std::array<DataType,NumInputs>& inputValues,
std::array<DataType,NumInputs+1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_derivativeFunction(m_output);
std::array<DataType,NumInputs> netInputOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
netInputOverWeight[i] = inputValues[i];
}
DataType netInputOverBias = DataType(1);
std::array<DataType,NumInputs> errorOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
}
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for(std::size_t i = 0; i < NumInputs; ++i)
{
weightAdjustments[i] = errorOverWeight[i];
}
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType,NumInputs> errorWeights;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorWeights[i] = errorOverNetInput * m_weights[i];
}
return errorWeights;
}
void AdjustWeights(const std::array<DataType,NumInputs+1>& adjustments)
{
for(std::size_t i = 0; i < NumInputs; ++i)
{
m_weights[i] = m_weights[i] - learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
m_previousWeightUpdates[i] = learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
}
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
m_previousWeightUpdates[NumInputs] = learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
}
const std::array<DataType,NumInputs>& GetWeights() const { return m_weights; }
const DataType& GetBiasWeight() const { return m_biasWeight; }
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<DataType> m_uniformDisRandom;
static std::uniform_real_distribution<DataType> m_xavierUniformDis;
static std::normal_distribution<DataType> m_xavierNormalDis;
std::array<DataType,NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
ActivationFuncPtr m_derivativeFunction;
DataType m_output;
DataType m_netInput;
std::array<DataType,NumInputs+1> m_previousWeightUpdates;
};
template<std::size_t NumInputs>
std::mt19937 Neuron<NumInputs>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_uniformDisRandom(-1,1);
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_xavierUniformDis(-std::sqrt(6.f / NumInputs+1),std::sqrt(6.f / NumInputs+1));
template<std::size_t NumInputs>
std::normal_distribution<DataType> Neuron<NumInputs>::m_xavierNormalDis(0,std::sqrt(2.f / NumInputs+1));
int main()
{
std::ofstream file("error_out.csv", std::ios::out | std::ios::trunc);
if(!file.is_open())
{
std::cout << "couldn't open file" << std::endl;
return 0;
}
file << std::fixed << std::setprecision(80);
std::array<std::array<DataType,2>,4> inputData = {{{0,0},{0,1},{1,0},{1,1}}};
std::array<std::array<DataType,1>,4> desiredOutputs = {{{0},{1},{1},{0}}};
std::array<Neuron<2>*,2> hiddenLayer1 =
{{
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative)
}};
std::array<Neuron<2>*,1> outputLayer =
{{
new Neuron<2>(sigmoid, sigmoid_derivative)
}};
std::cout << std::fixed << std::setprecision(80);
std::cout << "Initial Weights: " << std::endl;
const std::array<DataType,2>& outputWeights = outputLayer[0]->GetWeights();
const DataType& outputBias = outputLayer[0]->GetBiasWeight();
const std::array<DataType,2>& hidden1Weights = hiddenLayer1[0]->GetWeights();
const DataType& hidden1Bias = hiddenLayer1[0]->GetBiasWeight();
const std::array<DataType,2>& hidden2Weights = hiddenLayer1[1]->GetWeights();
const DataType& hidden2Bias = hiddenLayer1[1]->GetBiasWeight();
std::cout << "W0: " << hidden1Weights[0] << "\n"
<< "W1: " << hidden1Weights[1] << "\n"
<< "B0: " << hidden1Bias << "\n"
<< "W2: " << hidden2Weights[0] << "\n"
<< "W3: " << hidden2Weights[1] << "\n"
<< "B1: " << hidden2Bias << "\n"
<< "W4: " << outputWeights[0] << "\n"
<< "W5: " << outputWeights[1] << "\n"
<< "B2: " << outputBias << "\n" << std::endl;
DataType finalMSE = 0;
std::size_t epochNumber = 0;
while(epochNumber < numberEpochs)
{
DataType epochMSE = 0;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0,2.f);
epochMSE += totalError * totalError;
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType,3> weightAdjustmentsOutput;
std::array<DataType,2> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0,output1},
weightAdjustmentsOutput);
std::array<DataType,3> weightAdjustmentsHidden1;
hiddenLayer1[0]->Backpropagate(outputError[0],dataRow,weightAdjustmentsHidden1);
std::array<DataType,3> weightAdjustmentsHidden2;
hiddenLayer1[1]->Backpropagate(outputError[1],dataRow,weightAdjustmentsHidden2);
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden2);
}
epochMSE *= DataType(1) / inputData.size();
file << epochNumber << "," << epochMSE << std::endl;
finalMSE = epochMSE;
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Final Error: " << finalMSE << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
std::cout << finalOutput0 << std::endl;
}
file.close();
return 0;
}
When things are working, I get an output like:
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.00
0.00,1.00 (1.00) : 0.99
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.00
When it's not working I get an output like:
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.57
0.00,1.00 (1.00) : 0.57
1.00,0.00 (1.00) : 1.00
1.00,1.00 (0.00) : 0.00
When it's working, the error for each epoch looks like:
The initial weights were:
W0: -0.47551780939102172851562500000000000000000000000000000000000000000000000000000000
W1: 0.40949764847755432128906250000000000000000000000000000000000000000000000000000000
B0: 2.33756542205810546875000000000000000000000000000000000000000000000000000000000000
W2: 2.16713166236877441406250000000000000000000000000000000000000000000000000000000000
W3: -2.74766492843627929687500000000000000000000000000000000000000000000000000000000000
B1: 0.34863436222076416015625000000000000000000000000000000000000000000000000000000000
W4: -0.53460156917572021484375000000000000000000000000000000000000000000000000000000000
W5: 0.04940851405262947082519531250000000000000000000000000000000000000000000000000000
B2: 0.97842389345169067382812500000000000000000000000000000000000000000000000000000000
But when it doesn't work, the error for each epoch looks like:
the initial weights in this particular one was:
W0: 1.16670060157775878906250000000000000000000000000000000000000000000000000000000000
W1: -2.37987256050109863281250000000000000000000000000000000000000000000000000000000000
B0: 0.41097882390022277832031250000000000000000000000000000000000000000000000000000000
W2: -0.23449644446372985839843750000000000000000000000000000000000000000000000000000000
W3: -1.99990248680114746093750000000000000000000000000000000000000000000000000000000000
B1: 1.77582693099975585937500000000000000000000000000000000000000000000000000000000000
W4: 1.98818421363830566406250000000000000000000000000000000000000000000000000000000000
W5: 2.71223402023315429687500000000000000000000000000000000000000000000000000000000000
B2: -0.79067271947860717773437500000000000000000000000000000000000000000000000000000000
I see nothing really telling about these weights that can help me generate good starting weights (which is what I believe the problem to be, regardless of the activation function used).
Question: What can I do to ensure convergence occurs?
Do I need to change the weight initialisation? Do I need to use different activation functions? Do I need more layers or a different number of nodes?
Upvotes: 3
Views: 276
Reputation: 443
I have made another attempt at this which seems more reliable (although I don't think it's an optimal solution). I've changed the following (all visible in the code below):
- lowered the learning rate from sqrt(2) to 0.1 and set momentum to 0
- widened the hidden layer from 2 to 7 tanh neurons
- initialised the weights from a uniform distribution over (0.1, 0.9)
- stop training early once the epoch MSE drops below a convergence threshold of 1e-3
Compile with:
g++ nn_xor_test.cpp
To run it in an infinite loop (will exit if convergence fails):
unset counter; tput civis; clear; while (tput cup 0 0; ./a.out); do ((counter++)); echo -e "\nCounter: $counter"; done
I have run this several times on my desktop, and each time it has exceeded a hundred thousand executions without failing (i.e. convergence was reached and the results were correct every time). Convergence usually occurs within 1,000 epochs.
nn_xor_test.cpp
code:
#include <cmath>
#include <iostream>
#include <array>
#include <random>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <algorithm>
#include <iomanip>
#include <vector>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
const DataType learningRate = 0.1f;
const DataType momentum = 0;
const std::size_t numberEpochs = 20000;
const DataType convergence_threshold = 1e-3f;
DataType sigmoid(const DataType& x) {
return DataType(1) / (DataType(1) + std::exp(-x));
}
DataType sigmoid_derivative(const DataType& x) {
return x * (DataType(1) - x);
}
DataType tanh(const DataType& x) {
return std::tanh(x);
}
DataType tanh_derivative(const DataType& x) {
return DataType(1) - x * x;
}
int get_binary_result(const DataType& x)
{
return x < 0.5 ? 0 : 1;
}
DataType get_random_number(DataType f, DataType t)
{
static std::random_device rd;
static std::mt19937 gen(rd());
static std::uniform_real_distribution<float> dist(f, t);
DataType retval = 0.0;
while (retval == 0.0) {
retval = dist(gen);
}
return retval;
}
template <std::size_t NumInputs>
class Neuron {
public:
Neuron(ActivationFuncPtr activationFunction, ActivationFuncPtr derivativeFunc)
: m_activationFunction(activationFunction), m_derivativeFunction(derivativeFunc) {
RandomiseWeights();
}
void RandomiseWeights() {
std::generate(m_weights.begin(), m_weights.end(), [&]() { return get_random_number(0.1, 0.9); });
m_biasWeight = get_random_number(0.1, 0.9);
for (std::size_t i = 0; i < NumInputs + 1; ++i) m_previousWeightUpdates[i] = 0;
}
void FeedForward(const std::array<DataType, NumInputs>& inputValues) {
DataType sum = m_biasWeight;
for (std::size_t i = 0; i < inputValues.size(); ++i) sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const { return m_output; }
DataType GetNetInput() const { return m_netInput; }
std::array<DataType, NumInputs> Backpropagate(
const DataType& error, const std::array<DataType, NumInputs>& inputValues,
std::array<DataType, NumInputs + 1>& weightAdjustments) {
DataType errorOverOutput = error;
DataType outputOverNetInput = m_derivativeFunction(m_output);
std::array<DataType, NumInputs> netInputOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i) netInputOverWeight[i] = inputValues[i];
DataType netInputOverBias = DataType(1);
std::array<DataType, NumInputs> errorOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i)
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for (std::size_t i = 0; i < NumInputs; ++i) weightAdjustments[i] = errorOverWeight[i];
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType, NumInputs> errorWeights;
for (std::size_t i = 0; i < NumInputs; ++i) errorWeights[i] = errorOverNetInput * m_weights[i];
return errorWeights;
}
void AdjustWeights(const std::array<DataType, NumInputs + 1>& adjustments) {
for (std::size_t i = 0; i < NumInputs; ++i) {
m_weights[i] = m_weights[i] - learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
m_previousWeightUpdates[i] = learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
}
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs] +
momentum * m_previousWeightUpdates[NumInputs];
m_previousWeightUpdates[NumInputs] =
learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
}
const std::array<DataType, NumInputs>& GetWeights() const { return m_weights; }
const DataType& GetBiasWeight() const { return m_biasWeight; }
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<DataType> m_uniformDisRandom;
static std::uniform_real_distribution<DataType> m_xavierUniformDis;
static std::normal_distribution<DataType> m_xavierNormalDis;
std::array<DataType, NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
ActivationFuncPtr m_derivativeFunction;
DataType m_output;
DataType m_netInput;
std::array<DataType, NumInputs + 1> m_previousWeightUpdates;
};
template <std::size_t NumInputs>
std::mt19937 Neuron<NumInputs>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template <std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_uniformDisRandom(-1, 1);
template <std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_xavierUniformDis(-std::sqrt(6.f / (NumInputs + 1)), std::sqrt(6.f / (NumInputs + 1)));
template <std::size_t NumInputs>
std::normal_distribution<DataType> Neuron<NumInputs>::m_xavierNormalDis(0, std::sqrt(2.f / (NumInputs + 1)));
// ------------------------------------------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------------------------------------------
int main() {
std::ofstream file("error_out.csv", std::ios::out | std::ios::trunc);
if (!file.is_open()) {
std::cout << "couldn't open file" << std::endl;
return 0;
}
file << std::fixed << std::setprecision(80);
std::array<std::array<DataType, 2>, 4> inputData = {{{0, 0}, {0, 1}, {1, 0}, {1, 1}}};
std::array<std::array<DataType, 1>, 4> desiredOutputs = {{{0}, {1}, {1}, {0}}};
// hidden layer with 7 neurons
std::array<Neuron<2>*, 7> hiddenLayer1 = {
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative)};
std::array<Neuron<7>*, 1> outputLayer = {new Neuron<7>(sigmoid, sigmoid_derivative)};
// training loop
DataType finalMSE = 0;
std::size_t epochNumber = 0;
while (epochNumber < numberEpochs) {
DataType epochMSE = 0;
for (std::size_t row = 0; row < inputData.size(); ++row) {
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
// Feed forward through hidden layer
std::array<DataType, 7> hiddenOutputs;
for (std::size_t i = 0; i < hiddenLayer1.size(); ++i) {
hiddenLayer1[i]->FeedForward(dataRow);
hiddenOutputs[i] = hiddenLayer1[i]->GetOutput();
}
// Feed forward through output layer
outputLayer[0]->FeedForward(hiddenOutputs);
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0, 2.f);
epochMSE += totalError * totalError;
// backpropagate error
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType, 8> weightAdjustmentsOutput;
std::array<DataType, 7> outputError =
outputLayer[0]->Backpropagate(propagateError, hiddenOutputs, weightAdjustmentsOutput);
for (std::size_t i = 0; i < hiddenLayer1.size(); ++i) {
std::array<DataType, 3> weightAdjustmentsHidden;
hiddenLayer1[i]->Backpropagate(outputError[i], dataRow, weightAdjustmentsHidden);
hiddenLayer1[i]->AdjustWeights(weightAdjustmentsHidden);
}
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
}
epochMSE *= DataType(1) / inputData.size();
file << epochNumber << "," << epochMSE << std::endl;
finalMSE = epochMSE;
// Let's exit if the error is less than the convergence error threshold.
if (epochMSE < convergence_threshold) {
std::cout << "\nConverged at epoch: " << epochNumber << std::endl;
std::cout << "Exiting training, as error level less than " << convergence_threshold << "." << std::endl;
break;
}
++epochNumber;
}
file.close();
std::cout << "MSE: " << finalMSE << std::endl;
if (!(finalMSE < convergence_threshold)) {
std::cout << "*** FAILED TO CONVERGE ***" << std::endl;
return 1;
}
// output tests
std::vector<int> results;
std::cout << "----------------------------------------------------------------\n"
<< " FINAL TESTS"
<< "\n----------------------------------------------------------------" << std::endl;
std::cout << std::fixed << std::setprecision(8);
for (std::size_t row = 0; row < inputData.size(); ++row) {
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
std::cout << (int)dataRow[0] << " XOR " << (int)dataRow[1] << " (expected: " << (int)outputRow[0] << ") : ";
// Feed forward through hidden layer
std::array<DataType, 7> hiddenOutputs;
for (std::size_t i = 0; i < hiddenLayer1.size(); ++i) {
hiddenLayer1[i]->FeedForward(dataRow);
hiddenOutputs[i] = hiddenLayer1[i]->GetOutput();
}
// Feed forward through output layer
outputLayer[0]->FeedForward(hiddenOutputs);
DataType finalOutput0 = outputLayer[0]->GetOutput();
results.push_back(get_binary_result(finalOutput0));
std::cout << finalOutput0 << " (actual: " << get_binary_result(finalOutput0) << ")" << std::endl;
}
for (std::size_t i = 0; i < desiredOutputs.size(); i++) {
if (desiredOutputs[i][0] != results[i]) {
std::cout << "********** INVALID RESULTS - NETWORK DOES NOT WORK **********" << std::endl;
return 2;
}
}
return 0;
}
The typical error plot looks as follows:
You can make the convergence_threshold smaller if you want "more accurate" results, but then the number of epochs used will also increase. For example, if convergence_threshold is set to 1e-6, the number of epochs needed to reach convergence will typically be around 2,800. This seems like overkill though, as the network already works after approximately 1,000 epochs, as can be seen here:
Upvotes: 1
Reputation: 443
I know this is an old question, but the code you provided compiled and ran, so I thought I'd take a look :)
With your code, I would suggest the following:
- Widen the hidden layer from 2 neurons to 4 (the updated code below uses 4 sigmoid neurons). The reason convergence isn't guaranteed with only 2 hidden neurons comes down to the "vanishing gradient problem" (there is a quick numeric illustration of this after the list).
- Add an "error threshold" so the training loop can test whether convergence has been met and stop early.
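To give a feel for the "vanishing gradient" point, here is a tiny standalone illustration (my own sketch, separate from the updated code below): the logistic sigmoid's derivative never exceeds 0.25, so every sigmoid layer the error passes through can only shrink it further.

#include <cmath>
#include <iostream>

int main()
{
    // s'(x) = s(x) * (1 - s(x)) peaks at x = 0, where s(x) = 0.5
    auto s = [](double x) { return 1.0 / (1.0 + std::exp(-x)); };
    double y = s(0.0);              // 0.5, the point of steepest slope
    double grad1 = y * (1.0 - y);   // 0.25, the best case for one sigmoid layer
    double grad2 = grad1 * grad1;   // at most 0.0625 after two sigmoid layers
    std::cout << grad1 << " " << grad2 << std::endl;
    return 0;
}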
I have updated your code with the changes above. With them, it should achieve convergence in under 2,000 epochs, but there's still no guarantee.
#include <iostream>
#include <fstream>
#include <array>
#include <cmath>
#include <random>
#include <chrono>
#include <algorithm>
#include <iomanip>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
const DataType learningRate = std::sqrt(2.f);
const DataType momentum = 0.25f;
const std::size_t numberEpochs = 20000;
const DataType convergence_threshold = 1e-6;
DataType sigmoid(const DataType& x)
{
return DataType(1) / (DataType(1) + std::exp(-x));
}
DataType sigmoid_derivative(const DataType& x)
{
return x * (DataType(1) - x);
}
DataType tanh(const DataType& x)
{
return std::tanh(x);
}
DataType tanh_derivative(const DataType& x)
{
return DataType(1) - x * x;
}
DataType get_binary_result(const DataType& x)
{
return x < 0.5 ? 0 : 1;
}
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction, ActivationFuncPtr derivativeFunc)
: m_activationFunction(activationFunction),
m_derivativeFunction(derivativeFunc)
{
RandomiseWeights();
}
void RandomiseWeights()
{
std::generate(m_weights.begin(), m_weights.end(), [&]()
{
return m_xavierNormalDis(m_mt);
});
m_biasWeight = m_xavierNormalDis(m_mt);
for (std::size_t i = 0; i < NumInputs + 1; ++i)
m_previousWeightUpdates[i] = 0;
}
void FeedForward(const std::array<DataType, NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for (std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
const std::array<DataType, NumInputs>& GetWeights() const { return m_weights; }
const DataType& GetBiasWeight() const { return m_biasWeight; }
std::array<DataType, NumInputs> Backpropagate(const DataType& error,
const std::array<DataType, NumInputs>& inputValues,
std::array<DataType, NumInputs + 1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_derivativeFunction(m_output);
std::array<DataType, NumInputs> netInputOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i)
netInputOverWeight[i] = inputValues[i];
DataType netInputOverBias = DataType(1);
std::array<DataType, NumInputs> errorOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i)
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for (std::size_t i = 0; i < NumInputs; ++i)
weightAdjustments[i] = errorOverWeight[i];
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType, NumInputs> errorWeights;
for (std::size_t i = 0; i < NumInputs; ++i)
errorWeights[i] = errorOverNetInput * m_weights[i];
return errorWeights;
}
void AdjustWeights(const std::array<DataType, NumInputs + 1>& adjustments)
{
for (std::size_t i = 0; i < NumInputs; ++i)
{
m_weights[i] = m_weights[i] - learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
m_previousWeightUpdates[i] = learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
}
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
m_previousWeightUpdates[NumInputs] = learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
}
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<DataType> m_uniformDisRandom;
static std::uniform_real_distribution<DataType> m_xavierUniformDis;
static std::normal_distribution<DataType> m_xavierNormalDis;
std::array<DataType, NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
ActivationFuncPtr m_derivativeFunction;
DataType m_output;
DataType m_netInput;
std::array<DataType, NumInputs + 1> m_previousWeightUpdates;
};
template<std::size_t NumInputs>
std::mt19937 Neuron<NumInputs>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_uniformDisRandom(-1, 1);
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_xavierUniformDis(-std::sqrt(6.f / (NumInputs + 1)), std::sqrt(6.f / (NumInputs + 1)));
template<std::size_t NumInputs>
std::normal_distribution<DataType> Neuron<NumInputs>::m_xavierNormalDis(0, std::sqrt(2.f / (NumInputs + 1)));
int main()
{
std::ofstream file("error_out.csv", std::ios::out | std::ios::trunc);
if (!file.is_open())
{
std::cout << "couldn't open file" << std::endl;
return 0;
}
file << std::fixed << std::setprecision(80);
std::array<std::array<DataType, 2>, 4> inputData = {{{0, 0}, {0, 1}, {1, 0}, {1, 1}}};
std::array<std::array<DataType, 1>, 4> desiredOutputs = {{{0}, {1}, {1}, {0}}};
// define hidden layer with 4 neurons instead of 2
std::array<Neuron<2>*, 4> hiddenLayer1 =
{{
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative)
}};
// Output layer now takes 4 inputs from the hidden layer
std::array<Neuron<4>*, 1> outputLayer =
{{
new Neuron<4>(sigmoid, sigmoid_derivative)
}};
std::cout << std::fixed << std::setprecision(80);
std::cout << "Initial Weights: " << std::endl;
const std::array<DataType, 4>& outputWeights = outputLayer[0]->GetWeights();
const DataType& outputBias = outputLayer[0]->GetBiasWeight();
const std::array<DataType, 2>& hidden1Weights = hiddenLayer1[0]->GetWeights();
const DataType& hidden1Bias = hiddenLayer1[0]->GetBiasWeight();
const std::array<DataType, 2>& hidden2Weights = hiddenLayer1[1]->GetWeights();
const DataType& hidden2Bias = hiddenLayer1[1]->GetBiasWeight();
const std::array<DataType, 2>& hidden3Weights = hiddenLayer1[2]->GetWeights();
const DataType& hidden3Bias = hiddenLayer1[2]->GetBiasWeight();
const std::array<DataType, 2>& hidden4Weights = hiddenLayer1[3]->GetWeights();
const DataType& hidden4Bias = hiddenLayer1[3]->GetBiasWeight();
std::cout << "W0: " << hidden1Weights[0] << "\n"
<< "W1: " << hidden1Weights[1] << "\n"
<< "B0: " << hidden1Bias << "\n"
<< "W2: " << hidden2Weights[0] << "\n"
<< "W3: " << hidden2Weights[1] << "\n"
<< "B1: " << hidden2Bias << "\n"
<< "W4: " << hidden3Weights[0] << "\n"
<< "W5: " << hidden3Weights[1] << "\n"
<< "B2: " << hidden3Bias << "\n"
<< "W6: " << hidden4Weights[0] << "\n"
<< "W7: " << hidden4Weights[1] << "\n"
<< "B3: " << hidden4Bias << "\n"
<< "W8: " << outputWeights[0] << "\n"
<< "W9: " << outputWeights[1] << "\n"
<< "W10: " << outputWeights[2] << "\n"
<< "W11: " << outputWeights[3] << "\n"
<< "B4: " << outputBias << "\n" << std::endl;
DataType finalMSE = 0;
std::size_t epochNumber = 0;
while (epochNumber < numberEpochs)
{
DataType epochMSE = 0;
for (std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
hiddenLayer1[2]->FeedForward(dataRow);
hiddenLayer1[3]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
DataType output2 = hiddenLayer1[2]->GetOutput();
DataType output3 = hiddenLayer1[3]->GetOutput();
outputLayer[0]->FeedForward({output0, output1, output2, output3});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0, 2.f);
epochMSE += totalError * totalError;
// Backpropagation
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType, 5> weightAdjustmentsOutput;
std::array<DataType, 4> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0, output1, output2, output3},
weightAdjustmentsOutput);
std::array<DataType, 3> weightAdjustmentsHidden1_0;
std::array<DataType, 3> weightAdjustmentsHidden1_1;
std::array<DataType, 3> weightAdjustmentsHidden1_2;
std::array<DataType, 3> weightAdjustmentsHidden1_3;
hiddenLayer1[0]->Backpropagate(outputError[0], dataRow, weightAdjustmentsHidden1_0);
hiddenLayer1[1]->Backpropagate(outputError[1], dataRow, weightAdjustmentsHidden1_1);
hiddenLayer1[2]->Backpropagate(outputError[2], dataRow, weightAdjustmentsHidden1_2);
hiddenLayer1[3]->Backpropagate(outputError[3], dataRow, weightAdjustmentsHidden1_3);
// Adjust weights
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1_0);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden1_1);
hiddenLayer1[2]->AdjustWeights(weightAdjustmentsHidden1_2);
hiddenLayer1[3]->AdjustWeights(weightAdjustmentsHidden1_3);
}
epochMSE *= DataType(1) / inputData.size();
file << epochNumber << "," << epochMSE << std::endl;
finalMSE = epochMSE;
// Let's exit if the error is less than the convergence error threshold.
if (epochMSE < convergence_threshold) {
std::cout << "Exiting, as error level less than " << convergence_threshold << "." << std::endl;
break;
}
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Final Error: " << finalMSE << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
if (!(finalMSE < convergence_threshold)) {
std::cout << "*** FAILED TO CONVERGE ***" << std::endl;
return 1;
}
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for (std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
hiddenLayer1[2]->FeedForward(dataRow);
hiddenLayer1[3]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
DataType output2 = hiddenLayer1[2]->GetOutput();
DataType output3 = hiddenLayer1[3]->GetOutput();
outputLayer[0]->FeedForward({output0, output1, output2, output3});
DataType finalOutput0 = get_binary_result(outputLayer[0]->GetOutput());
std::cout << finalOutput0 << std::endl;
}
file.close();
// Clean up dynamically allocated memory
for (auto& neuron : hiddenLayer1) delete neuron;
for (auto& neuron : outputLayer) delete neuron;
return 0;
}
EDIT: I stand corrected. Even with 4 neurons in the middle layer, the network will sometimes fail to converge (although this happens a lot less frequently than with the 2-hidden-neuron model). Failure can be confirmed using:
while ./a.out > /dev/null; do ((counter++)); echo "Counter: $counter"; done
On my laptop, the failure usually happens after 1,000+ executions. So it looks like the "solution" is the dirty one mentioned in the answer above: if you detect that convergence failed, generate new random weights and start training the network again from scratch (a rough sketch of that loop is below).
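A minimal sketch of that restart strategy, reusing the variable names from the code above and assuming a hypothetical TrainNetwork() helper that wraps the epoch loop and returns the final MSE:

// Sketch only: TrainNetwork() is a hypothetical wrapper around the training loop above.
// If training fails to reach the threshold, re-randomise all weights and try again.
const int maxAttempts = 10;
DataType mse = 1;
for (int attempt = 0; attempt < maxAttempts && !(mse < convergence_threshold); ++attempt)
{
    for (auto* n : hiddenLayer1) n->RandomiseWeights();
    for (auto* n : outputLayer) n->RandomiseWeights();
    mse = TrainNetwork(hiddenLayer1, outputLayer, inputData, desiredOutputs);
}
if (!(mse < convergence_threshold))
    std::cout << "*** FAILED TO CONVERGE after " << maxAttempts << " attempts ***" << std::endl;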
Upvotes: 2
Reputation: 1599
I haven't read all your code because it is quite long, but:
- Consider introducing a NeuralNetwork class and a Connection class eventually, to avoid writing all the logic in main (a rough skeleton of the idea is shown below).
- You already have an ActivationFuncPtr typedef, which you could use to try and mix up different activation functions for different Neurons (maybe with a genetic algorithm)?
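A rough skeleton of that structure (a sketch only, with illustrative names, not a drop-in replacement for the code in the question):

#include <cstddef>
#include <vector>

// One weighted link between two neurons, with state for momentum.
struct Connection
{
    double weight = 0.0;
    double previousUpdate = 0.0;
};

// Owns the layers and the training logic, so main() only builds the
// topology (e.g. {2, 2, 1} for XOR) and calls Train()/Predict().
class NeuralNetwork
{
public:
    explicit NeuralNetwork(const std::vector<std::size_t>& topology);
    std::vector<double> Predict(const std::vector<double>& inputs);
    void Train(const std::vector<double>& inputs, const std::vector<double>& targets);
    double MeanSquaredError() const;
private:
    // layer -> neuron -> incoming Connections (last one is the bias)
    std::vector<std::vector<std::vector<Connection>>> m_layers;
};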
Now, to answer your question, there are really no definitive answers, but I can give you a few pieces of advice:
- You can try a steeper sigmoid, e.g. 1/(1+exp(-4*x)), the 4 being arbitrary, for instance (a quick sketch below).
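A sketch of what that steeper sigmoid (and its matching derivative) might look like, with the slope factor pulled out as a constant:

#include <cmath>

// Steeper logistic sigmoid: 1 / (1 + exp(-slope * x)); slope = 1 gives the usual sigmoid.
const float slope = 4.0f; // arbitrary steepness factor

float steep_sigmoid(float x)
{
    return 1.0f / (1.0f + std::exp(-slope * x));
}

// Derivative expressed in terms of the output y = steep_sigmoid(x):
// d/dx = slope * y * (1 - y), so the derivative function must include the factor too.
float steep_sigmoid_derivative(float y)
{
    return slope * y * (1.0f - y);
}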
Upvotes: 2