Reputation: 522
I am trying to calculate the bias and variance of a PySpark linear regression model. I start with a 3rd-degree polynomial, add some noise, and fit linear regression models with varying degrees of polynomial expansion. The goal is to show that bias decreases and variance increases as the degree of the polynomial expansion increases. In my code below, the model bias remains constant because the mean of the predictions is the same for polynomial degrees 1, 2, and 3. I must be calculating bias wrong, and I'm also wondering whether I'm calculating variance correctly. Can someone verify whether I'm calculating bias correctly and help me figure out why it stays the same regardless of the polynomial expansion degree? Any other comments about problems in the code are welcome.
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, Pipeline
from pyspark.sql import functions as fn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# create numpy arrays for x and y data
x = np.linspace(-15, 15, 250)
y = 10 + 5*x + 0.5*np.square(x) - 0.1*np.power(x,3)
reducible_error = np.random.uniform(-50, 50, len(x))
irreducible_error = np.random.normal(0, 8, len(x))
y_noise = y + reducible_error + irreducible_error
# plot x and y data
%matplotlib inline
plt.figure()
plt.plot(x,y, c='r', label="y")
plt.scatter(x, y_noise, label="y_noise")
plt.legend()
plt.title("10 + 5x + 0.5x^2 - 0.1x^3")
plt.xlabel("x")
plt.ylabel("y, y_noise")
# create a pandas dataframe from the x, y, y_hat data arrays
pd_df = pd.DataFrame({'x': x, 'y_noise': y_noise, 'y': y}, columns=['x', 'y_noise', 'y'])
# create a spark dataframe from the pandas dataframe
df = spark.createDataFrame(pd_df)
df.show()
def get_bias_squared(df):
    f_hat_mean = np.mean(df['prediction'])
    return np.mean(np.square(df['y_noise'] - f_hat_mean))

def get_variance(df):
    f_hat_mean = np.mean(df['prediction'])
    diff = df['prediction'] - f_hat_mean
    return np.mean(np.square(diff))
def plot_poly_expansion(n, df, lambda_reg=0., alpha_reg=0.):
    # build the pipeline: assemble features, expand to degree n, fit a linear regression
    va = feature.VectorAssembler(inputCols=['x'], outputCol='features')
    pe = feature.PolynomialExpansion(degree=n, inputCol='features', outputCol='poly_features')
    lr = regression.LinearRegression(featuresCol='poly_features', labelCol='y_noise', regParam=lambda_reg,
                                     elasticNetParam=alpha_reg)
    # fit the pipeline
    pipe = Pipeline(stages=[va, pe, lr]).fit(df)
    # apply the fitted pipeline to get predictions
    fit_df = pipe.transform(df)
    # convert the fitted spark dataframe to pandas and plot predicted vs. actual
    fit_pd_df = fit_df.toPandas()
    # display(fit_pd_df.head())
    fit_pd_df.plot(x='x', y=['y', 'y_noise', 'prediction'])
    plt.title("Polynomial degree = %s\nBias = %s, Variance = %s" % (n, get_bias_squared(fit_pd_df),
                                                                    get_variance(fit_pd_df)))
    plt.xlabel("x")
    plt.ylabel("y")
    return fit_pd_df
for i in np.arange(1, 4):
    plot_poly_expansion(float(i), df)
Upvotes: 2
Views: 659
Reputation: 6802
Your variance calculation looks fine, but the bias should be measured against the true (noise-free) function rather than against y_noise. Because a least-squares fit with an intercept has residuals that sum to zero, the mean prediction equals the mean of y_noise for every polynomial degree, so your current bias formula just reproduces the variance of y_noise and stays constant. I would make a small modification:
def get_bias_squared(df, true_function):
    # Compare predictions to the true (noise-free) function, not to y_noise.
    # With a single fitted model, the prediction at each x is the best available
    # stand-in for the expected prediction E[f_hat(x)].
    true_values = true_function(df['x'])
    return np.mean(np.square(df['prediction'] - true_values))

def get_variance(df):
    f_hat_mean = np.mean(df['prediction'])
    diff = df['prediction'] - f_hat_mean
    return np.mean(np.square(diff))

def true_function(x):
    # This is your true function without noise
    return 10 + 5*x + 0.5*np.square(x) - 0.1*np.power(x,3)
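To use it inside your plot_poly_expansion, just pass the true function through when building the title, e.g.:

    plt.title("Polynomial degree = %s\nBias = %s, Variance = %s" % (n, get_bias_squared(fit_pd_df, true_function),
                                                                    get_variance(fit_pd_df)))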
Polynomial expansion typically reduces bias and increases variance as the degree of the polynomial increases, because the model becomes more flexible and begins to fit the noise in the training data (i.e. overfitting).
Also note that your code computes bias and variance from a single model realisation. To estimate them properly, fit the model many times on different subsets of the data (or on different bootstrap samples) and then compare, for each data point, the mean and spread of the predictions across the different model realisations. A sketch of that approach follows below.
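Here is a minimal sketch of that multi-realisation approach, reusing the same pipeline stages as in your code. It assumes the df and true_function defined above; n_boot=20 is an arbitrary choice.

import numpy as np
from pyspark.ml import feature, regression, Pipeline

def bias_variance_bootstrap(df, degree, n_boot=20, seed=42):
    """Estimate squared bias and variance by refitting on bootstrap samples."""
    va = feature.VectorAssembler(inputCols=['x'], outputCol='features')
    pe = feature.PolynomialExpansion(degree=degree, inputCol='features', outputCol='poly_features')
    lr = regression.LinearRegression(featuresCol='poly_features', labelCol='y_noise')
    pipe = Pipeline(stages=[va, pe, lr])

    preds = []
    for b in range(n_boot):
        # sample with replacement so each realisation sees a different training set
        boot = df.sample(withReplacement=True, fraction=1.0, seed=seed + b)
        model = pipe.fit(boot)
        # always predict on the same grid (the original df) so points line up
        pred = model.transform(df).select('x', 'prediction').toPandas().sort_values('x')
        preds.append(pred['prediction'].to_numpy())

    preds = np.vstack(preds)                       # shape: (n_boot, n_points)
    x_sorted = np.sort(df.select('x').toPandas()['x'].to_numpy())
    f_true = true_function(x_sorted)               # noise-free target at each x
    f_hat_mean = preds.mean(axis=0)                # E[f_hat(x)] estimated across refits

    bias_sq = np.mean((f_hat_mean - f_true) ** 2)  # squared bias, averaged over x
    variance = np.mean(preds.var(axis=0))          # prediction variance, averaged over x
    return bias_sq, variance

for degree in (1, 2, 3):
    b_sq, var = bias_variance_bootstrap(df, degree)
    print("degree=%d: bias^2=%.2f, variance=%.2f" % (degree, b_sq, var))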
However, in actual applications, particularly with big data tools like PySpark, this can be computationally expensive because it requires fitting the model many times. For big datasets and models, bias and variance are frequently assessed indirectly using other methods, such as cross-validation.
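As a rough illustration of that cross-validation route (it compares held-out error across degrees rather than estimating bias and variance directly), PySpark's CrossValidator can be used with the same pipeline; numFolds=3 here is an arbitrary choice.

from pyspark.ml import feature, regression, Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

va = feature.VectorAssembler(inputCols=['x'], outputCol='features')
pe = feature.PolynomialExpansion(degree=2, inputCol='features', outputCol='poly_features')
lr = regression.LinearRegression(featuresCol='poly_features', labelCol='y_noise')
pipe = Pipeline(stages=[va, pe, lr])

# try degrees 1-3 and score each by held-out RMSE
grid = ParamGridBuilder().addGrid(pe.degree, [1, 2, 3]).build()
evaluator = RegressionEvaluator(labelCol='y_noise', predictionCol='prediction', metricName='rmse')
cv = CrossValidator(estimator=pipe, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
cv_model = cv.fit(df)

# avgMetrics holds the mean held-out RMSE for each degree in the grid
for params, rmse in zip(grid, cv_model.avgMetrics):
    print(params[pe.degree], rmse)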
Upvotes: 0