Nova

Reputation: 19

Why do my Sigmoid and ReLU networks converge to 0.5 on XOR approximation?

I'm learning about neural networks at the moment and decided to create a class library of general-purpose neural networks for later projects. (My sources are mostly YouTube videos and ChatGPT.) I first created a basic network, then added NEAT functionality with mutation. To test that I had succeeded, I taught a 2D drone to fly to a target.

Then I tried to add backpropagation. I am quite bad at math, but I think I understand the logic behind it, so I implemented a first version with a fixed TanH activation.
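
For reference, the update rule I believe I am implementing is the standard one (a is an activation value, z is the weighted input sum before activation, y is the expected output):

delta_output = (a_output - y) * f'(z_output)
delta_hidden = (sum of outgoing w * delta_next) * f'(z_hidden)
w -= LearningRate * delta * a_source
b -= LearningRate * delta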

It worked, mostly. With some low probability it still converged to 0.5 on some inputs, but after some fine-tuning of the parameters that only happens about once in fifty runs.

Later I added multiple activation types, so now I have Sigmoid, ReLU, LeakyReLU, GeLU and TanH. TanH, GeLU and LeakyReLU seem to be working fine, but ReLU and Sigmoid keep converging to 0.5.

I added Glorot and He initialization to the mix to help with initialization, but it did not change anything.
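
For reference, the limits used in the code below are the usual ones, with weights drawn uniformly from [-limit, limit]:

Glorot: limit = sqrt(6 / (fan_in + fan_out))
He:     limit = sqrt(2 / fan_in)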

I have spent about 40-50 hours creating this monstrosity and can feel my mind slowly disintegrating, so I would like to ask the Collective for guidance.

I know my code is very far from pretty, and even further from efficient, but that is fine for now; the real problem is that it does not work correctly.

I am testing it with unit tests. I created a 2-2-1 neural network with all input neurons having 0 bias.
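
The test looks roughly like this (MSTest-style, trimmed and partly from memory; the IRandomizer setup behind Brain.Rand is omitted, and the exact learning rate and epoch count are not the point):

[TestMethod]
public void XorTest()
{
    // Brain.Rand must be assigned before construction (omitted here)
    Brain brain = new Brain(new int[] { 2, 2, 1 }, (int[])null);
    brain.ActivationType = ActivationTypes.Sigmoid;
    brain.LearningRate = 0.1;

    double[][] inputs = { new[] { 0d, 0d }, new[] { 0d, 1d }, new[] { 1d, 0d }, new[] { 1d, 1d } };
    double[] expected = { 0d, 1d, 1d, 0d };

    // Train on all four XOR patterns
    for (int epoch = 0; epoch < 10000; epoch++)
    {
        for (int i = 0; i < inputs.Length; i++)
        {
            brain.Forward(inputs[i]);
            brain.Backward(expected[i]);
        }
    }

    // With Sigmoid or ReLU the output ends up near 0.5 for every input, so this fails
    for (int i = 0; i < inputs.Length; i++)
    {
        Assert.AreEqual(expected[i], brain.Forward(inputs[i])[0], 0.1);
    }
}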

Code for the neural network, with the irrelevant parts hidden:

public class Brain
    {
        public delegate void BrainSaver(string data);
        public event BrainSaver SaveBrain;

        public static IRandomizer Rand { get; set; }

        public ActivationTypes ActivationType { get; set; } = ActivationTypes.Tanh;

        public double LearningRate { get; set; }

        private int[] layers; // Number of neurons in each layer
        private Vector<double>[] neurons; // Stores neuron values
        private Vector<double>[] biases; // Stores bias values
        private Dictionary<int, Dictionary<(int, int), double>>[] weights; // Stores weights: [layer] -> target neuron -> (source layer, source neuron) -> weight

        private Brain() { }

        public Brain(int[] layers, bool[] connections)
        {
            this.layers = layers;

            InitializeNeurons();
            InitializeWeights(connections);
        }
        public Brain(int[] layers, int[] connections)
        {
            this.layers = layers;

            InitializeNeurons();
            if (connections != null)
                InitializeWeights(connections);
            else
                InitializeWeights((bool[])null);
        }        

        private void InitializeNeurons()
        {
            neurons = new Vector<double>[layers.Length];
            biases = new Vector<double>[layers.Length];

            for (int i = 0; i < layers.Length; i++)
            {
                neurons[i] = DenseVector.OfArray(new double[layers[i]]);
                biases[i] = DenseVector.CreateRandom(layers[i], new ContinuousUniform(-1d, 1d));
            }
            for (int i = 0; i < biases[0].Count; i++)
            {
                biases[0][i] = 0;
            }
        }

        private void InitializeWeights(bool[] connections)
        {
            weights = new Dictionary<int, Dictionary<(int, int), double>>[layers.Length - 1];

            bool usePreset = connections != null && connections.Length == Enumerable.Range(0, layers.Length - 1).Sum(i => layers[i] * layers[i + 1]); // One flag per possible connection

            int presetidx = 0;
            for (int i = 0; i < layers.Length - 1; i++)
            {
                weights[i] = new Dictionary<int, Dictionary<(int, int), double>>();
                double initializationScale = ActivationMethods.InitializationLimit(ActivationType, layers[i], layers[i + 1]);

                for (int iidx = 0; iidx < layers[i + 1]; iidx++)
                {
                    weights[i].Add(iidx, new Dictionary<(int, int), double>());
                    for (int jidx = 0; jidx < layers[i]; jidx++)
                    {
                        if ((usePreset && connections[presetidx++]) || !usePreset)
                        {
                            weights[i][iidx].Add((i, jidx), Rand.Range(-initializationScale, initializationScale)); // Random values between proper limits
                        }
                    }
                }
            }
        }

        private void InitializeWeights(int[] connections)
        {
            bool usePreset = connections != null && connections.Length % 4 == 0;

            List<(int, int, int, int)> plannedConnections = new List<(int, int, int, int)>();

            for (int i = 0; i < connections.Length / 4; i++)
            {
                plannedConnections.Add((connections[i * 4], connections[i * 4 + 1], connections[i * 4 + 2], connections[i * 4 + 3]));
            }

            plannedConnections = plannedConnections.Distinct().ToList();

            int maxConnections = 0;
            for (int i = 1; i < layers.Length; i++)
            {
                maxConnections += layers[i] * layers[i - 1];
            }

            usePreset &= maxConnections > plannedConnections.Count;

            if (usePreset)
            {

                weights = new Dictionary<int, Dictionary<(int, int), double>>[layers.Length - 1];

                for (int i = 0; i < layers.Length - 1; i++)
                {
                    weights[i] = new Dictionary<int, Dictionary<(int, int), double>>();
                    for (int iidx = 0; iidx < layers[i + 1]; iidx++)
                    {
                        weights[i].Add(iidx, new Dictionary<(int, int), double>());
                    }
                }

                foreach (var conn in plannedConnections)
                {
                    weights[conn.Item1 - 1][conn.Item2].Add((conn.Item3, conn.Item4), Rand.Range(-1d, 1d));
                }
            }
            else
            {
                InitializeWeights((bool[])null);
            }
        }

        public double[] Forward(params double[] input)
        {
            neurons[0] = Vector<double>.Build.Dense(input); // Assign input values as the first layer's neuron values

            // Loop through layers, calculate activations
            for (int i = 1; i < layers.Length; i++)
            {
                for (int j = 0; j < layers[i]; j++)
                {
                    Dictionary<(int, int), double> currWeights = weights[i - 1][j];

                    try
                    {
                        Vector<double> weightVector = Vector<double>.Build.Dense(currWeights.Values.ToArray());
                        Vector<double> neuronVector = Vector<double>.Build.Dense(currWeights.Select(x => neurons[x.Key.Item1][x.Key.Item2]).ToArray());

                        double bias = biases[i][j]; // Keep full double precision

                        neurons[i][j] = ActivationMethods.Activation(ActivationType, weightVector * neuronVector + bias);
                    }
                    catch (Exception)
                    {
                        Save(); // Dump the current network state for inspection
                    }
                }
            }

            return neurons[neurons.Length - 1].ToArray(); // Return output layer
        }

        public void Backward(params double[] expectedOutput)
        {
            // Compute output layer error (difference between predicted and expected output)
            Vector<double> outputLayer = neurons[neurons.Length - 1];
            Vector<double> outputError = Vector<double>.Build.DenseOfEnumerable(outputLayer - Vector<double>.Build.Dense(expectedOutput));

            // Store gradients for backpropagation
            Dictionary<int, Vector<double>> gradients = new Dictionary<int, Vector<double>>();

            for (int i = 0; i < layers.Length; i++)
            {
                Vector<double> vector = Vector<double>.Build.Dense(layers[i]);
                gradients.Add(i, vector);
            }

            // Compute output layer gradient
            gradients[layers.Length - 1] = outputError;

            Vector<double> derivatives;
            // Backpropagate error through hidden layers
            for (int i = layers.Length - 1; i > 0; i--)
            {
                // At this point the current layer gradients are not calculated yet,
                // but since all appropriate weight errors are already accumulated, it can be done now.
                derivatives = Vector<double>.Build.DenseOfEnumerable(neurons[i].Select(x => ActivationMethods.Derivative(ActivationType, x)));
                gradients[i] = gradients[i].PointwiseMultiply(derivatives);

                // since now the current layer gradients are all available, we can continue accumulating
                // the errors of preceding weights.
                foreach (int neuron in weights[i - 1].Keys)
                {
                    foreach ((int, int) key in weights[i - 1][neuron].Keys)
                    {
                        gradients[key.Item1][key.Item2] += weights[i - 1][neuron][key] * gradients[i][neuron];
                    }
                }
            }
            derivatives = Vector<double>.Build.DenseOfEnumerable(neurons[0].Select(x => ActivationMethods.Derivative(ActivationType, x)));
            gradients[0] = gradients[0].PointwiseMultiply(derivatives);

            // Update weights and biases using gradients
            for (int i = 0; i < weights.Length; i++) // foreach weight layer
            {
                foreach (int neuron in weights[i].Keys) // foreach neuron in that layer
                {
                    foreach ((int, int) key in weights[i][neuron].Keys.ToList()) //  adjust weight in each of their connection backwards
                    {
                        weights[i][neuron][key] -= LearningRate * gradients[i + 1][neuron] * neurons[key.Item1][key.Item2]; // Adjust weights
                    }
                }

                biases[i + 1] -= LearningRate * gradients[i + 1]; // Adjust biases
            }
        }
    }

The activation-related code looks like this:

public enum ActivationTypes
    {
        Sigmoid,
        ReLU,
        LeakyReLU,
        GeLU,
        Tanh
    }

    public static class ActivationMethods
    {
        public static double LeakyReLUAlpha { get; set; } = 0.1d;

        public static double InitializationLimit(ActivationTypes type, double input, double output)
        {
            switch (type)
            {
                case ActivationTypes.Sigmoid:
                    return Glorot(input + output);
                case ActivationTypes.ReLU:
                    return He(input);
                case ActivationTypes.LeakyReLU:
                    return He(input);
                case ActivationTypes.GeLU:
                    return He(input);
                case ActivationTypes.Tanh:
                    return Glorot(input + output);
                default:
                    return Glorot(input + output);
            }
        }

        public static double Activation(ActivationTypes type, double value)
        {
            switch (type)
            {
                case ActivationTypes.Sigmoid:
                    return Sigmoid(value);
                case ActivationTypes.ReLU:
                    return ReLu(value);
                case ActivationTypes.LeakyReLU:
                    return LeakyReLu(value);
                case ActivationTypes.GeLU:
                    return GeLU(value);
                case ActivationTypes.Tanh:
                    return Tanh(value);
                default:
                    return Sigmoid(value);
            }
        }

        public static double Derivative(ActivationTypes type, double value)
        {
            switch (type)
            {
                case ActivationTypes.Sigmoid:
                    return DSigmoid(value);
                case ActivationTypes.ReLU:
                    return DReLu(value);
                case ActivationTypes.LeakyReLU:
                    return DLeakyReLu(value);
                case ActivationTypes.GeLU:
                    return DGeLU(value);
                case ActivationTypes.Tanh:
                    return DTanh(value);
                default:
                    return DSigmoid(value);
            }
        }

        private static double Glorot(double n)
        {
            return Math.Sqrt(6.0 / n);  // Xavier (Glorot) formula
        }

        private static double He(double n)
        {
            return Math.Sqrt(2.0 / n);  // He formula
        }

        private static double Sigmoid(double x)
        {
            return 1 / (1 + Math.Pow(Math.E, -x));
        }

        private static double DSigmoid(double x)
        {
            return Sigmoid(x) * (1 - Sigmoid(x)); // Note: expects the pre-activation value x
        }

        private static double LeakyReLu(double x)
        {
            return x < 0 ? LeakyReLUAlpha * x : x;
        }

        private static double DLeakyReLu(double x)
        {
            return x < 0 ? LeakyReLUAlpha : 1;
        }

        private static double ReLu(double x)
        {
            return x < 0 ? 0 : x;
        }

        private static double DReLu(double x)
        {
            return x < 0 ? 0 : 1;
        }

        private static double GeLU(double x)
        {
            return 0.5 * x * (1 + Math.Tanh(Math.Sqrt(2.0 / Math.PI) * (x + 0.044715 * x * x * x)));
        }

        private static double DGeLU(double x)
        {
            double phi = Math.Exp(-0.5 * x * x) / Math.Sqrt(2 * Math.PI);  // Standard normal PDF
            double phiX = 0.5 * (1 + Erf(x / Math.Sqrt(2))); // Standard normal CDF

            return phiX + x * phi; // d/dx [x * CDF(x)] = CDF(x) + x * PDF(x)
        }

        private static double Tanh(double x)
        {
            return 2 / (1 + Math.Pow(Math.E, -(2 * x))) - 1;
        }
        public static double DTanh(double x)
        {
            return 1 - (x * x); // Note: expressed in terms of the tanh output, not the pre-activation
        }

        public static double Erf(double x)
        {
            // constants
            double a1 = 0.254829592;
            double a2 = -0.284496736;
            double a3 = 1.421413741;
            double a4 = -1.453152027;
            double a5 = 1.061405429;
            double p = 0.3275911;

            // Save the sign of x
            int sign = 1;
            if (x < 0)
                sign = -1;
            x = Math.Abs(x);

            // A&S formula 7.1.26
            double t = 1.0 / (1.0 + p * x);
            double y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.Exp(-x * x);

            return sign * y;
        }
    }

Upvotes: -1

Views: 32

Answers (0)
