Reputation: 774
I am learning gradient descent for calculating coefficients. Below is what I am doing:
#!/usr/bin/python
import numpy as np

# m denotes the number of examples here, not the number of features
def gradientDescent(x, y, theta, alpha, m, numIterations):
    xTrans = x.transpose()
    for i in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # avg cost per example (the 2 in 2*m doesn't really matter here.
        # But to be consistent with the gradient, I include it)
        cost = np.sum(loss ** 2) / (2 * m)
        # print("Iteration %d | Cost: %f" % (i, cost))
        # avg gradient per example
        gradient = np.dot(xTrans, loss) / m
        # update
        theta = theta - alpha * gradient
    return theta
X = np.array([41.9,43.4,43.9,44.5,47.3,47.5,47.9,50.2,52.8,53.2,56.7,57.0,63.5,65.3,71.1,77.0,77.8])
y = np.array([251.3,251.3,248.3,267.5,273.0,276.5,270.3,274.9,285.0,290.0,297.0,302.5,304.5,309.3,321.7,330.7,349.0])
n = np.max(X.shape)
x = np.vstack([np.ones(n), X]).T
m, n = np.shape(x)
numIterations = 100000
alpha = 0.0005
theta = np.ones(n)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)
print(theta)
Now my above code works fine. If I now try multiple variables and replace X with X1 like the following:
X1 = np.array([[41.9,43.4,43.9,44.5,47.3,47.5,47.9,50.2,52.8,53.2,56.7,57.0,63.5,65.3,71.1,77.0,77.8], [29.1,29.3,29.5,29.7,29.9,30.3,30.5,30.7,30.8,30.9,31.5,31.7,31.9,32.0,32.1,32.5,32.9]])
then my code fails and shows me the following error:
JustTestingSGD.py:14: RuntimeWarning: overflow encountered in square
cost = np.sum(loss ** 2) / (2 * m)
JustTestingSGD.py:19: RuntimeWarning: invalid value encountered in subtract
theta = theta - alpha * gradient
[ nan nan nan]
Can anybody tell me how I can do gradient descent using X1? My expected output using X1 is:
[-153.5 1.24 12.08]
I am open to other Python implementations as well. I just want the coefficients (also called thetas) for X1 and y.
Upvotes: 6
Views: 8403
Reputation: 579
1) Use scaling/normalization for faster convergence of gradient descent.
2) Decrease the learning rate.
3) Exit the loop early according to a tolerance parameter.
All three are applied in the script below:
# adapted from https://stackoverflow.com/questions/17784587/gradient-descent-using-python-and-numpy
import numpy as np
import matplotlib.pyplot as plt

# m denotes the number of examples here, not the number of features
def gradientDescent(x, y, theta, alpha, m, numIterations):
    theta_prev = theta
    cost_history = []
    for i in range(0, numIterations):
        Y_pred = np.dot(x, theta)
        loss = Y_pred - y
        # squared-error cost; the 1/(2*m) factor is chosen so that
        # differentiating the cost gives the simple gradient below
        squared_error_cost = (1 / (2 * m)) * np.sum(loss ** 2)
        # gradient of the cost w.r.t. theta:
        # differentiating with respect to theta gives (1/m) * X.T * (X*theta - y)
        gradient = (1 / m) * x.T.dot(loss)
        # update step for MINIMIZING the MSE: theta = theta - alpha * gradient
        theta = theta - alpha * gradient
        cost_history.append(squared_error_cost)
        # accuracy: 1 minus the mean absolute percentage error
        acc = 1 - sum(
            abs(Y_pred[j] - y[j]) / y[j] for j in range(m) if y[j] != 0
        ) / m
        if i % 100 == 0:
            print(f'at {i}-th step accuracy is {acc}')
        # tolerance check: stop early once theta barely changes between iterations
        if np.all(np.abs(theta - theta_prev) <= 10 ** (-6)):
            break
        theta_prev = theta
    return cost_history, theta, acc
X = np.array([41.9,43.4,43.9,44.5,47.3,47.5,47.9,50.2,52.8,53.2,56.7,57.0,63.5,65.3,71.1,77.0,77.8])
y = np.array([251.3,251.3,248.3,267.5,273.0,276.5,270.3,274.9,285.0,290.0,297.0,302.5,304.5,309.3,321.7,330.7,349.0])

# NORMALIZATION: min-max scale both X and y into [0, 1]
X = (X - X.min()) / (X.max() - X.min())
y = (y - y.min()) / (y.max() - y.min())

plt.plot(X, y)
plt.show()

# Add a bias feature (constant 1) to the input matrix
n = np.max(X.shape)
X_bias = np.vstack([np.ones(n), X]).T
m, n = np.shape(X_bias)
# print(m)  # 17

numIterations = 1000
alpha = 0.1
theta = np.ones(n)
cost_hist, theta, acc = gradientDescent(X_bias, y, theta, alpha, m, numIterations)
print("theta: ", theta)
print("Accuracy: ", acc)

yhat = theta @ X_bias.T
print('yhat', yhat)
plt.plot(X, y, 'bo-', X_bias[:, 1], yhat, 'k-')
plt.legend(['Data', 'Fitted line'])
plt.show()

# plot the cost history
iterations = [i for i in range(1, len(cost_hist) + 1)]
plt.plot(iterations, cost_hist)
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.title('Cost History during Gradient Descent')
plt.show()
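One caveat: because both X and y were min-max scaled, the returned theta is in normalized units and will not match coefficients fitted on the raw data. A minimal sketch of mapping the two parameters back to the original scale, assuming the original minima and maxima were saved as X_min, X_max, y_min, y_max before normalizing (these names are illustrative, not part of the script above):

# Assumes X_min, X_max, y_min, y_max were stored BEFORE the normalization step.
# The fitted model is y' = theta[0] + theta[1] * x', where
# x' = (x - X_min) / (X_max - X_min) and y' = (y - y_min) / (y_max - y_min);
# substituting back gives the slope and intercept on the original scale.
slope = theta[1] * (y_max - y_min) / (X_max - X_min)
intercept = y_min + (y_max - y_min) * theta[0] - slope * X_min
print(intercept, slope)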
P.S. For the multivariable case - see here or here, or the example below:
import numpy as np
import matplotlib.pyplot as plt

def normalize_features(X):
    """
    Normalize features to have zero mean and unit variance.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    X_normalized = (X - mean) / std
    return X_normalized, mean, std

def add_bias_feature(X):
    """
    Add a bias feature (constant 1) to the input matrix.
    """
    return np.column_stack((np.ones(len(X)), X))

def compute_cost(X, y, theta):
    """
    Compute the cost function for linear regression.
    """
    m = len(y)
    h = X.dot(theta)
    cost = (1 / (2 * m)) * np.sum((h - y) ** 2)
    return cost

def gradient_descent(X, y, theta, alpha, num_iterations):
    """
    Perform gradient descent to minimize the cost function.
    """
    m = len(y)
    cost_history = []
    for _ in range(num_iterations):
        h = X.dot(theta)
        error = h - y
        gradient = (1 / m) * X.T.dot(error)
        theta = theta - alpha * gradient
        cost = compute_cost(X, y, theta)
        cost_history.append(cost)
    return theta, cost_history

# Sample data generation
np.random.seed(42)
X = 2 * np.random.rand(100, 3)  # 100 samples with 3 features
y = 4 + X.dot(np.array([3, 1.5, 2])) + np.random.randn(100)

# Normalize features and add bias feature
X_normalized, mean, std = normalize_features(X)
X_bias = add_bias_feature(X_normalized)

# Initial parameters
theta_initial = np.zeros(X_bias.shape[1])

# Hyperparameters
alpha = 0.01
num_iterations = 1000

# Run gradient descent
theta_final, cost_history = gradient_descent(X_bias, y, theta_initial, alpha, num_iterations)

# Print the final parameters and cost
print("Final Parameters:", theta_final)
print("Final Cost:", cost_history[-1])

# Plot the cost history
plt.plot(cost_history)
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.title('Cost History during Gradient Descent')
plt.show()
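As a sanity check against the question's data (an addition here, using the closed-form least-squares solution rather than gradient descent): np.linalg.lstsq on X1 and y, with X1 transposed so that examples are rows, should land close to the expected coefficients.

import numpy as np
# X1 and y as given in the question; examples must be rows, so transpose X1
A = np.column_stack([np.ones(X1.shape[1]), X1.T])  # bias column + 2 feature columns
coef, *_ = np.linalg.lstsq(A, y, rcond=None)
print(coef)  # should be close to [-153.5, 1.24, 12.08]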
Upvotes: 0
Reputation: 23500
The problem is that your algorithm does not converge; it diverges instead. The first warning:
JustTestingSGD.py:14: RuntimeWarning: overflow encountered in square
cost = np.sum(loss ** 2) / (2 * m)
comes from the fact that at some point the square can no longer be computed: the result exceeds the largest value a 64-bit float can hold (about 1.8 * 10^308), so it overflows to inf.
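A minimal reproduction of the same warning:

import numpy as np
loss = np.array([1e200])
print(loss ** 2)  # RuntimeWarning: overflow encountered in square -> [inf]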
JustTestingSGD.py:19: RuntimeWarning: invalid value encountered in subtract
theta = theta - alpha * gradient
is just a consequence of the first error: once inf appears, the next update produces nan (inf - inf is undefined), and nan then propagates through every subsequent iteration.
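Again a minimal sketch of how the second warning follows from the first:

import numpy as np
vals = np.array([np.inf])
print(vals - vals)  # RuntimeWarning: invalid value encountered in subtract -> [nan]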
You can actually see the divergence by uncommenting your debug print line: the cost grows without bound instead of shrinking.
If you try your function with X1 and a smaller value for alpha, it converges.
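For example, a minimal sketch of that suggestion, reusing the question's gradientDescent, X1, and y (the alpha value here is illustrative and may still need tuning; on unscaled features the updates stay stable but converge slowly, so scaling the features as in the other answer is still advisable):

# stack X1 so that examples are rows, then prepend the bias column
x1 = np.vstack([np.ones(X1.shape[1]), X1]).T  # shape (17, 3): bias + both features
m = x1.shape[0]
theta = np.ones(x1.shape[1])
# alpha = 0.0005 diverges on this data; a step size around 1e-4 keeps it stable
theta = gradientDescent(x1, y, theta, 0.0001, m, 100000)
print(theta)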
Upvotes: 3