# !pip install seaborn==0.9.0
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
sns.set()
import warnings
warnings.simplefilter('ignore',FutureWarning)
advertising = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv')
advertising.drop(columns='Unnamed: 0', inplace=True)
df = advertising.copy()
df_dependent = 'sales'
df_independent = ['TV','radio','newspaper']
df.columns
lm = smf.ols('sales ~ TV', data=df).fit()
_, ax = plt.subplots(figsize=(10,10))
sm.graphics.plot_fit(lm, exog_idx=1, ax=ax);  # exog_idx=1 is the TV column (0 is the intercept)
Regression results
More attributes of OLS results can be looked up. Some important ones are:
params: estimates for the parameters (usually called beta, $\beta_i$).
pvalues: p-values for those estimates.
df_model: degrees of freedom of the model. (The y-intercept is not counted, so a model with an intercept plus $p$ other predictors has df_model $= p$.)
mse_total: total mean squared error: the total sum of squares of $y$ about its mean, divided by its degrees of freedom ($n-1$), so essentially the sample variance of $y$.
bse: standard errors of the coefficient estimates ("beta standard error").
lm.params
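As a quick sketch of looking these up on our fitted model (all of these are documented RegressionResults attributes; the last line is my own check that mse_total matches the $n-1$-denominator sample variance of sales):
print(lm.params)     # coefficient estimates
print(lm.pvalues)    # their p-values
print(lm.df_model)   # 1.0 here: one predictor, intercept not counted
print(lm.bse)        # standard errors of the estimates
print(lm.mse_total, df['sales'].var())  # should agree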
rss = np.sum(np.square(df.sales - lm.params[0] - lm.params[1]*df.TV))  # residual sum of squares, by hand
rss
In StatsModels, the RSS usually appears as an 'average': the RSS divided by the number of degrees of freedom (DOF) left over. This probably makes more sense after you study stats for another year or two. (I hope.)
df_resid tells you the degrees of freedom of the residuals: it's just $n-p$, where $p$ counts all estimated coefficients, including the intercept. mse_resid is the mean squared error of the residuals; the "mean" means divide by the DOF of the residuals, not by $n$. We will see a use for mse_resid in Section 3.1.2.
lm.mse_resid * lm.df_resid
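In fact the results object also exposes the RSS directly, as ssr ("sum of squared residuals"), so we can check the pieces against each other:
print(lm.ssr)                      # residual sum of squares
print(lm.mse_resid * lm.df_resid)  # reassembles the same number
print(np.isclose(lm.ssr, rss))     # matches our hand computation above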
For making decisions, you will probably read values off of the summary.
lm.summary()
You should never retype values from the summary! If you need to do further analysis, find the programmatic way to access the data, usually by looking it up on the regression results documentation page.
Example: Which parameters have significances < 0.01?
lm.pvalues < 0.01
lm.params
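Combining the two: a boolean Series can index params directly, so pulling out just the significant coefficients is a one-liner sketch:
lm.params[lm.pvalues < 0.01]  # only the coefficients that clear the 0.01 threshold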
Reproducing Figure 3.2, left side, we want to make a contour plot showing how the RSS would change as we changed our estimates for $\beta_0$ and $\beta_1$. First we write a function that takes in the two parameters and produces the RSS for the model $sales = \beta_0 + \beta_1 \cdot TV$ on the data set.
def f_1(b0, b1):
    "Return the RSS of the model sales = b0 + b1*TV on the data set."
    return np.sum(np.square(df['sales'] - (b0 + b1*df['TV'])))
ff = np.vectorize(f_1)  # vectorized so we can evaluate it over a whole grid
Verify that our formula is finding the RSS...
ff(7.032594, 0.047537)
lm.mse_resid * lm.df_resid
Good! Now we can make a contour plot. Remember we use meshgrid to get all of the ordered pairs of points out of an x-range and a y-range.
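If meshgrid is hazy, here is a tiny made-up example:
aa, bb = np.meshgrid([1, 2, 3], [10, 20])  # expands two 1-D ranges into the full grid of ordered pairs
print(aa)  # [[1 2 3], [1 2 3]]
print(bb)  # [[10 10 10], [20 20 20]]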
(beta0, beta1) = lm.params
b0 = np.linspace(5, 9, 100)
b1 = np.linspace(0.025, 0.07, 100)
bb0, bb1 = np.meshgrid(b0, b1)
zz = ff(bb0, bb1)
fig, ax = plt.subplots(figsize=(5,5))
cplot = ax.contour(bb0, bb1, zz/1000, levels=[2.15, 2.2, 2.3, 2.5, 3, 3.5])
ax.clabel(cplot, inline=1, fontsize=10)
ax.plot(beta0, beta1, 'go')  # the least-squares estimate
plt.show();
My conclusion: this is why you do your own data analysis. Figure 3.2 (page 63) in the book appears to be a contour plot of RSS/1000, judging from the labels on the level curves. But the level curves in my plot are not circles, and I do not think we are close enough to the book's picture to explain the difference by scaling.
Question: why does the 3.5 contour not appear on the graph? The Matplotlib documentation is not very clear about how the top contour level is used. Apparently it is not drawn, but it still affects the colors of the contours that are drawn (try changing it from 3.5 to 6). Dislike.
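One diagnostic sketch (not an answer): check whether 3.5 even lies inside the range of zz/1000 on this grid, since a level curve can only be drawn where the surface actually crosses it.
print(zz.min()/1000, zz.max()/1000)  # if 3.5 is outside this range, there is nothing to draw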
samplesize = 50
# Draw x values from a (shifted) exponential and keep only those in [-2, 2].
xs = scipy.stats.expon.rvs(size=samplesize, loc=-2, scale=5)
xs = xs[(-2 <= xs) & (xs <= 2)]
noise = scipy.stats.norm.rvs(size=len(xs), scale=5)
def f33(x): return 2 + 3*x  # the true population regression line
ys = f33(xs) + noise
fig33df = pd.DataFrame({'x': xs, 'y': ys})
fig33lm = smf.ols('y ~ x', data=fig33df).fit()
NOTE: if you switch the xs (exog) and ys (endog), OLS does not give an error until you ask for a summary.
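For reference, here is a sketch of the same fit through the array interface, where that argument order matters: sm.OLS takes endog ($y$) first, then exog ($X$), and the constant column must be added by hand.
X = sm.add_constant(fig33df['x'])           # prepend a column of ones
fit_arrays = sm.OLS(fig33df['y'], X).fit()  # OLS(endog, exog)
fit_arrays.params                           # should match fig33lm.params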
ax = sns.scatterplot(x='x', y='y', data=fig33df);  # scatter of the simulated data
_, ax = plt.subplots();
sns.regplot(x='x',y='y',data=fig33df,ax=ax);
ax.plot([-2,2],[f33(-2),f33(2)],color='orange', linewidth=2);
(b0,b1) = fig33lm.params
ax.plot([-2,2],[b0+b1*(-2),b0+b1*(2)], color='lightblue');
def newfit(samplesize=50):
    "Simulate a fresh data set from y = 2 + 3x + noise and return the fitted (b0, b1)."
    xs = scipy.stats.expon.rvs(size=samplesize, loc=-2, scale=5)
    xs = xs[(-2 <= xs) & (xs <= 2)]
    noise = scipy.stats.norm.rvs(size=len(xs), scale=5)
    ys = 2 + 3*xs + noise
    fig33df = pd.DataFrame({'x': xs, 'y': ys})
    fig33lm = smf.ols('y ~ x', data=fig33df).fit()
    (b0, b1) = fig33lm.params
    return (b0, b1)
def qplot(ax, b0=None, b1=None, samplesize=50, color='blue'):
    "Plot the line b0 + b1*x on ax; if neither is given, fit a fresh simulated data set."
    if (b0 is None) ^ (b1 is None): raise RuntimeError('Must specify both b0 and b1')
    if b0 is None and b1 is None: (b0, b1) = newfit(samplesize)
    ax.plot([-2, 2], [b0 + b1*(-2), b0 + b1*2], color=color);
_, ax = plt.subplots()
ax.set_xlim(-2,2)
ax.set_ylim(-15,20)
for _ in range(10):
    qplot(ax)                            # ten fits from fresh simulated samples
qplot(ax=ax, b0=b0, b1=b1, color='red')  # the fit from our original sample
This section is dense. You want to have seen it before you hear it. Answer the following reading questions.
What is a $p$-value? How does it relate to a large value for $|t|$?
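To make the connection concrete, here is a sketch that recomputes the p-values from the t statistics by hand (tvalues and df_resid are standard RegressionResults attributes). A two-sided p-value is the probability, under the t distribution with df_resid degrees of freedom, of a statistic at least as extreme as the one observed, so a larger $|t|$ forces a smaller p-value.
tstats = lm.params / lm.bse  # t = estimate / standard error
print(tstats)
print(lm.tvalues)            # same numbers
print(2 * scipy.stats.t.sf(np.abs(tstats), lm.df_resid))  # p = 2 * P(T > |t|)
print(lm.pvalues)            # should agree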
TODO: Make our own simulated data and analyze different runs. (Create our own graphs like Figure 3.3.)
In this section we learn what mse_resid is good for: it is an estimator for the variance $\sigma^2$, and its square root is the residual standard error (RSE), an estimate of $\sigma$.
np.sqrt(lm.mse_resid)
Check that it is the same as the result from the book.
rss = sum( np.square(df.sales - lm.params[0] - lm.params[1]*df.TV) )
np.sqrt(rss / (lm.nobs-2) )
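The book goes on to express the RSE relative to the mean response as a rough "percentage error"; reproducing that is one line:
np.sqrt(lm.mse_resid) / df['sales'].mean()  # RSE as a fraction of mean sales, roughly 23%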
There is a confidence interval function. It has an option for selecting columns (cols) which appears broken to me; just pull the info you need out of the resulting Pandas data frame.
lm.conf_int(alpha=0.01) # 99% confidence
lm.conf_int() # 95% confidence is the default
lm.conf_int().loc['TV']
Reading questions: what do ess and mse_resid measure?
lm.ess
lm.mse_resid
Look at the source code behind conf_int, and do what appears to be the exact same thing with our results: it works when done manually, but somehow not when the library code runs with the cols option.
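Here is my manual version (a sketch of the formula conf_int implements: estimate plus or minus the critical t value times the standard error, with the t quantile taken at df_resid degrees of freedom):
t_crit = scipy.stats.t.ppf(1 - 0.05/2, lm.df_resid)  # critical value for 95%
pd.DataFrame({0: lm.params - t_crit*lm.bse,
              1: lm.params + t_crit*lm.bse})  # matches lm.conf_int()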
## FIXME: lm2 was never defined above. To keep this code runnable, one guess
## (my assumption) is the multiple regression on all three predictors:
lm2 = smf.ols('sales ~ TV + radio + newspaper', data=df).fit()
a = lm2.params
a
a[1]
a[[2,3]]
cols = np.asarray([2,3])
a[cols]
# ERRORS: the cols option fails even though the manual indexing above works
#lm2.conf_int(cols=[2,3])