Reputation: 11098
I would like to produce a linear best fit trendline of the average CPU usage per day.
My data looks like this:
host_df_means['cpu_usage_percent']
history_datetime
2020-03-03 9.727273
2020-03-04 9.800000
2020-03-05 9.727273
2020-03-06 10.818182
2020-03-07 9.500000
2020-03-08 10.909091
2020-03-09 15.000000
2020-03-10 14.333333
2020-03-11 15.333333
2020-03-12 16.000000
2020-03-13 21.000000
2020-03-14 28.833333
Name: cpu_usage_percent, dtype: float64
I then plot this with:
# Plot the daily mean CPU usage; the returned object is kept because it is
# used below to adjust the axis (presumably a matplotlib Axes - confirm).
plot = host_df_means['cpu_usage_percent'].plot()
# Stretch the x-axis to the end of March, well past the last sample
# (2020-03-14), to leave room for a projected trendline.
plot.set_xlim([datetime.date(2020, 3, 3), datetime.date(2020, 3, 31)])
# NOTE(review): bare expression with a trailing semicolon - presumably a
# notebook idiom to suppress the cell's echoed output; it has no other effect.
plot;
That creates a plot like this
So now I would like to add a trendline that extends into the future, something like this:
Upvotes: 4
Views: 6848
Reputation: 400
I'd use SciPy's stats.linregress function to get the best-fit equation and predict future usage; you can also use np.polyfit and np.poly1d, as Chris A recommended. The following uses both:
#Import the necessary libraries
import matplotlib.pyplot as plot
from scipy import stats
import numpy as np
# Recreate the data from the question as [ISO date string, mean CPU usage %]
data = [
    ["2020-03-03", 9.727273],
    ["2020-03-04", 9.800000],
    ["2020-03-05", 9.727273],
    ["2020-03-06", 10.818182],
    ["2020-03-07", 9.500000],
    ["2020-03-08", 10.909091],
    ["2020-03-09", 15.000000],
    ["2020-03-10", 14.333333],
    ["2020-03-11", 15.333333],
    ["2020-03-12", 16.000000],
    ["2020-03-13", 21.000000],
    ["2020-03-14", 28.833333],
]
fig, ax = plot.subplots()

# Separate the date strings from the CPU usage values
dates = [row[0] for row in data]
usage = [row[1] for row in data]

# The samples are one per consecutive day, so regress against the day index
# (0, 1, 2, ...) and keep the date strings only as tick labels.
observed_days = np.arange(len(usage))
bestfit = stats.linregress(observed_days, usage)
slope, intercept = bestfit[0], bestfit[1]
equation = str(round(slope, 2)) + "x + " + str(round(intercept, 2))

ax.plot(observed_days, usage)
# The same least-squares line, this time obtained via np.polyfit / np.poly1d
trend = np.poly1d(np.polyfit(observed_days, usage, 1))
ax.plot(observed_days, trend(observed_days), '--', label=equation)

# How many days ahead you want to predict
extension = 5

# Extended prediction: evaluate mx + c from the linear regression over the
# observed days plus `extension` future days. The measured `usage` list is
# left untouched instead of being padded with predicted values and refit
# (refitting points that already lie on the line yields the same line).
all_days = np.arange(len(usage) + extension)
ax.plot(all_days, slope * all_days + intercept, ':',
        label=str(extension) + " day prediction")

# Build the future tick labels with real calendar arithmetic so they stay
# zero-padded and roll over correctly at month and year boundaries
# (string slicing would turn "2020-03-31" into "2020-03-32").
last_day = np.datetime64(dates[-1], "D")
dates.extend(
    str(last_day + np.timedelta64(offset, "D"))
    for offset in range(1, extension + 1)
)

# Set date tick labels and legend
ax.set_xticks(all_days)
ax.set_xticklabels(dates)
ax.legend()

# Display plot
plot.show()
Upvotes: 1
Reputation: 5686
Keeping your data as a pd.DataFrame, the trick is to convert the dates to a numeric type which can be used to perform the linear regression.
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from io import StringIO
# Set up data as in question (date, mean CPU usage %), indexed by date
host_df_means = pd.read_csv(StringIO("""
2020-03-03 9.727273
2020-03-04 9.800000
2020-03-05 9.727273
2020-03-06 10.818182
2020-03-07 9.500000
2020-03-08 10.909091
2020-03-09 15.000000
2020-03-10 14.333333
2020-03-11 15.333333
2020-03-12 16.000000
2020-03-13 21.000000
2020-03-14 28.833333
"""),
    # Raw string: '\s' is not a valid escape in a normal string literal
    sep=r'\s+', header=None, parse_dates=[0], index_col=0)
host_df_means.columns = ['cpu_usage_percent']
host_df_means.index.name = 'history_datetime'

fig, ax = plt.subplots(1, 1)
ax.plot(host_df_means.index, host_df_means['cpu_usage_percent'])
ax.set_xlim([datetime.date(2020, 3, 3), datetime.date(2020, 3, 31)])

# To perform the linear regression we need the dates to be numeric.
# Keep the ordinals in a separate array so the DataFrame retains its
# datetime index, which is still needed for plotting.
day_numbers = host_df_means.index.map(datetime.date.toordinal).to_numpy()

# Perform linear regression
slope, y0, r, p, stderr = stats.linregress(
    day_numbers, host_df_means['cpu_usage_percent'].to_numpy())

# x co-ordinates for the start and end of the line, as dates: the axes were
# set up with datetime values, so the line must use the same units (raw
# ordinals would land far off-axis with matplotlib's 1970-based date numbers).
x_endpoints = host_df_means.index[[0, -1]]

# Compute predicted values from linear regression (evaluated at the ordinals)
y_endpoints = y0 + slope * day_numbers[[0, -1]]

# Overlay the line
ax.plot(x_endpoints, y_endpoints, c='r')
ax.set_xlabel('history_datetime')

# Display the figure when run as a script (harmless in a notebook)
plt.show()
Upvotes: 2