# !pip install seaborn==0.9.0
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels
import patsy
import math
sns.set()
import warnings
warnings.simplefilter('ignore',FutureWarning)
# 4.6.3
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# 4.6.4
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# 4.6.5
from sklearn.neighbors import KNeighborsClassifier
# 4.6.6
import sklearn.preprocessing
# sklearn better than statsmodels?
from sklearn.linear_model import LogisticRegression
# debug printing appears immediately
import sys
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
smarket_1 = pd.read_csv('Smarket.csv', na_values='?');
#smarket_1 = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/ISLR/Smarket.csv', na_values = '?')
smarket = smarket_1.drop(columns=['Unnamed: 0'])
smarket.corr()
sns.scatterplot(x=smarket.index, y='Volume', data=smarket);
An informative article compares logistic regression (the logit link) with other possible ways to map onto the 0-to-1 range, such as probit.
There are some snags when using StatsModels for logistic regression.
The biggest issue is that the Patsy formula processor does not work directly on the response (y values): for a categorical response it generates two complementary columns that sum to one, and since you can only use one column for the response you have to pick one. At that point you might as well convert the original Yes/No answers to 1/0 yourself... so that is what I do.
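A quick illustration of that snag (not part of the original lab, just a sketch): asking Patsy for design matrices with the categorical Direction as the response produces one indicator column per level.
y_demo, X_demo = patsy.dmatrices("Direction ~ Lag1 + Lag2", data=smarket)
y_demo.design_info.column_names  # expect complementary columns such as 'Direction[No]' and 'Direction[Yes]'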
LogitResults is the output of the fitting process. Some important fields of LogitResults are listed here:

params
: Coefficients.

pvalues
: p-values for the coefficients.

fittedvalues
: Fitted values.

bse
: Coefficient standard errors.

aic
: A measure of model fit.

There are also methods to get items like residual deviance. Check the documentation.
smarket["DirectionUp"] = np.array(
[1 if up else 0 for up in smarket.Direction == "Up"])
dm = sm.Logit.from_formula("DirectionUp ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume", data=smarket).fit()
dm.params
dm.pvalues
dm.fittedvalues.head() # there are lots of them; head() keeps us from seeing them all
dm.aic
dm.llf
dm.bse
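The residual deviance that R reports is not a field here, but for a binary logistic model it is just minus twice the log-likelihood (a small aside, not in the original output):
print("residual deviance:", -2 * dm.llf)
print("null deviance:", -2 * dm.llnull)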
Predictions for the existing data set come from the predict()
method. To get predictions for another dataset, make another data frame.
predictions_1 = dm.predict();
len(predictions_1)
smarket.head()
Make a new data frame to get the probability of "up" for a given set of predictors.
new_prediction_data = pd.DataFrame({'Lag1': 0.8, 'Lag2': 0.5, 'Lag3': -0.5, 'Lag4': -0.2, 'Lag5': 3, 'Volume': 7 },
index = [0])
predictions_2 = dm.predict(new_prediction_data)
predictions_2
We still need to make the confusion matrix and make a prediction from some data. For the confusion matrix we need to turn the probabilities into 0 or 1 decisions, so we round them (a 0.5 cutoff). For a different cutoff you could compare against that threshold instead of rounding.
up_predicted = pd.Series( dm.predict().round(), name = "Predicted")
up_actual = pd.Series(smarket.DirectionUp, name = "Actual")
confusion_matrix = pd.crosstab(up_actual, up_predicted)
confusion_matrix
correct_count = confusion_matrix[0][0] + confusion_matrix[1][1]
total = len(smarket)
print("supposed accuracy (but bad): ", correct_count / total)
Assessing a model on the same data you used for training is suspicious. To keep things legit, we need to hold out some data, not use it for training, and use it only to check the quality of the model. The code below selects a random ~10% of the rows as the held-out data.
test_data = np.random.random(1250) < 0.10
#test_data = smarket.Year >= 2005 # book does this
training_data = ~test_data
sum(test_data)
dm_parts = sm.Logit.from_formula("DirectionUp ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume", data=smarket[training_data]).fit()
correct = pd.Series(smarket.DirectionUp[test_data], name = "Actual")
predicted = pd.Series(dm_parts.predict(smarket[test_data]).round(), name="Predicted")
confusion_matrix = pd.crosstab(correct,predicted)
confusion_matrix
accuracy = np.mean(correct == predicted)
print("accuracy (done right): ", accuracy)
error_rate = np.mean(correct != predicted)
error_rate
Just predicting "up" gives only a 52% accuracy. That is called predicting the "baseline rate". Your prediction method should do better than that to be useful.
sum(smarket.DirectionUp == 1) / len(smarket)
The book says to try again with just Lag1 and Lag2, but that does not seem to do any better for me.
dm_parts = sm.Logit.from_formula("DirectionUp ~ Lag1+Lag2", data=smarket[training_data]).fit()
correct = pd.Series(smarket.DirectionUp[test_data], name = "Actual")
predicted = pd.Series(dm_parts.predict(smarket[test_data]).round(), name="Predicted")
accuracy = np.mean(correct == predicted)
accuracy
As usual, the predict method gives you probabilities for the value being predicted, in this case DirectionUp. Both of the values below are solidly in the middle of the probability range, but 0.48 would be predicted "Down" and 0.50 would be predicted "Up".
more_demo_data = pd.DataFrame({'Lag1': [1.2, 1.1], 'Lag2': [1.5, -0.8]})
dm_parts.predict(more_demo_data)
The biggest thing I notice about using LDA here is that it does not have the nice interface of Pandas DataFrames and Patsy formulas built in. No actual deficiencies noted. R prints nicer summary information for LDA, but do you really use it?
Note: do not include a constant term in the design matrix for Linear Discriminant Analysis (nor for Quadratic Discriminant Analysis in the next section)! The sketch below shows how the "+0" in the Patsy formulas handles this.
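Here is that sketch (not in the original): Patsy adds an intercept column by default, and the "+0" term removes it.
print(patsy.dmatrix("Lag1+Lag2", data=smarket).design_info.column_names)    # includes 'Intercept'
print(patsy.dmatrix("Lag1+Lag2+0", data=smarket).design_info.column_names)  # intercept dropped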
test_data = np.random.random(1250) < 0.10
train_data = ~ test_data
lda = LinearDiscriminantAnalysis()
X = patsy.dmatrix("Lag1+Lag2+0", data=smarket[train_data])
y = smarket.DirectionUp[train_data]
lda_fit = lda.fit(X,y)
I do not know what the warning about collinear variables means.
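R's lda() summary prints the group prior probabilities and group means; the fitted scikit-learn object exposes similar information through attributes (a quick look, not part of the original notebook):
print(lda_fit.priors_)  # class prior probabilities
print(lda_fit.means_)   # per-class means of Lag1 and Lag2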
Assess the quality of your model by seeing what fraction of your predictions are correct on the test data. The cells below show 46% accuracy.
X_test = patsy.dmatrix("Lag1+Lag2+0", data=smarket[test_data])
ypredict = lda_fit.predict(X_test)
ycorrect = smarket.DirectionUp[test_data]
np.mean(ypredict == ycorrect)
The score method returns the accuracy without you having to do any of the work by hand. LDA gives models that can score, but not all classifiers do.
lda_fit.score(X_test, ycorrect)
Predict the probabilities for each class (one column per class). The book explains that these are "posterior probabilities": the class probabilities after conditioning on the observed predictors. Check!
lda_test_predictions = lda_fit.predict_proba(X_test)
lda_test_predictions[1:5]
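The column order of predict_proba matches the classes_ attribute, and each row of probabilities sums to one (a quick check, not in the original):
print(lda_fit.classes_)                       # column 0 is class 0, column 1 is class 1
print(lda_test_predictions.sum(axis=1)[1:5])  # rows sum to 1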
Decision function gives a number, positive is one class, negative is the other.
lda.decision_function(X_test)[1:5]
The book suggests we try changing the probability threshold. How many observations have a posterior probability above 55% for class 1 (the second column)?
np.sum(lda_test_predictions[:,1] > 0.55)
Quadratic discriminant analysis (QDA) is coded the same way as LDA; the difference is that its decision boundaries can be more flexible. Read the book! Note the nice high score, almost 60% right on the test data, when you split training and test data by year. Unfortunately, when you randomly choose about 10% of the data for test cases, the amazing prediction power goes away and you are stuck with a lousy 45% correct.
#test_data = np.random.random(1250) < 0.10
test_data = smarket.Year >= 2005
train_data = ~ test_data
qda = QuadraticDiscriminantAnalysis()
X = patsy.dmatrix("Lag1+Lag2+0", data=smarket[train_data])
y = smarket.DirectionUp[train_data]
qda_fit = qda.fit(X,y)
X_test = patsy.dmatrix("Lag1+Lag2+0", data=smarket[test_data])
ypredict = qda_fit.predict(X_test)
ycorrect = smarket.DirectionUp[test_data]
np.mean(ypredict == ycorrect)
K-nearest neighbors is very similar to LDA and QDA in its execution.
There is also a related classifier, RadiusNeighborsClassifier, that uses all of the points within a given radius to classify a point. That is good when the points are not evenly distributed in the sample space (a sketch follows the KNN fit below).
neighbors = 3
knn = KNeighborsClassifier(neighbors)
knn_fit = knn.fit(X, y)
knn_predict = knn.predict(X_test)
np.mean(knn_predict == ycorrect)
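Here is the promised sketch of the radius-based variant. It is not part of the book's lab, and the radius and outlier_label values (the label used for test points with no training neighbors inside the radius) are arbitrary assumptions.
from sklearn.neighbors import RadiusNeighborsClassifier
rnn = RadiusNeighborsClassifier(radius=1.0, outlier_label=0)  # radius chosen arbitrarily
rnn_fit = rnn.fit(X, y)
np.mean(rnn_fit.predict(X_test) == ycorrect)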
Investigate the accuracy of KNN as the number of neighbors increases.
X_train = patsy.dmatrix("Lag1+Lag2+0", data=smarket[train_data])
y_train = smarket.DirectionUp[train_data]
X_test = patsy.dmatrix("Lag1+Lag2+0", data=smarket[test_data])
y_test = smarket.DirectionUp[test_data]
def knn_accuracy(neighbors):
    knn = KNeighborsClassifier(neighbors)
    knn_fit = knn.fit(X_train, y_train)
    knn_predict = knn_fit.predict(X_test)
    return np.mean(knn_predict == y_test)
ns = np.arange(1,25)
accuracies = [knn_accuracy(n) for n in ns]
ax = sns.lineplot(x=ns, y=accuracies);
ax.set_ylim([0,0.6]);
ax.set_xlabel('neighbors');
ax.set_ylabel('accuracy');
Important point #1: standardize your data before you do KNN. Or anything, really. Use sklearn.preprocessing.scale.
Caravan dataset documentation: column 86, 'Purchase', indicates whether the person purchased a caravan insurance policy ('Yes' or 'No').
#df_caravan_1 = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/ISLR/Caravan.csv')
df_caravan_1 = pd.read_csv('Caravan.csv')
df_caravan = df_caravan_1.drop(columns=['Unnamed: 0'])
predictors = set(df_caravan.columns) - set(['Purchase'])
purchase = df_caravan.Purchase
How many data points are there? How many purchases vs not? What is the rate of sales?
purchase.count()
purchase.value_counts()
purchase.value_counts() / purchase.count()
Scikit-learn has a preprocessing module that will automatically rescale your data to have mean zero and standard deviation one: sklearn.preprocessing.scale.
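A quick sanity check (not in the original; scaled_demo is just a throwaway name): after scaling, every column should have mean approximately zero and standard deviation approximately one.
scaled_demo = sklearn.preprocessing.scale(df_caravan[list(predictors)])
print(scaled_demo.mean(axis=0)[:5].round(6))  # approximately 0
print(scaled_demo.std(axis=0)[:5].round(6))   # approximately 1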
test = df_caravan.index < 1000
train = ~ test
X_scaled = sklearn.preprocessing.scale(df_caravan[list(predictors)])
X = X_scaled[train]
y = purchase[train]
knn = KNeighborsClassifier(1)
knn_fit = knn.fit(X,y)
y_correct = purchase[test]
X_predict = X_scaled[test]
y_predict = knn_fit.predict(X_predict)
accuracy = np.mean(y_correct == y_predict)
errors = 1-accuracy
accuracy
errors
How many of the people who are predicted to buy insurance actually buy it? We want to know whether we should concentrate on the people the model says will buy. According to this calculation, a bit over 11% of those predictions are correct.
buy = y_predict == 'Yes'
correct_predictions = y_correct == y_predict
np.mean(correct_predictions[buy])
We automate the process of finding out how many of the predicted buyers actually buy, so we can see how changing the number of neighbors affects the percentage of predicted buyers who actually buy.
test = df_caravan.index < 1000
train = ~ test
X_scaled = sklearn.preprocessing.scale(df_caravan[list(predictors)])
X = X_scaled[train]
y = purchase[train]
y_correct = purchase[test]
X_predict = X_scaled[test]
def knn_success_rate(neighbors):
    knn = KNeighborsClassifier(neighbors)
    knn_fit = knn.fit(X, y)
    y_predict = knn_fit.predict(X_predict)
    correct = y_correct == y_predict
    buy = y_predict == 'Yes'
    return np.mean(correct[buy])
We see that as the number of neighbors goes up, the prediction rate goes up. The book does not carry this analysis far enough to get to the suspicious part: a 100% prediction rate with k=9 neighbors. As k increases, the pool of "Yes" predictions shrinks to almost zero.
neighbors = np.arange(1,10)
brate = [knn_success_rate(k) for k in neighbors]
pframe = pd.DataFrame({'neighbors': neighbors, 'rate': brate})
#odds = [ n%2 == 1 for n in neighbors ] # are odds better than evens?
sns.lineplot(x='neighbors',y='rate',data=pframe);
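A small check (not in the original; knn_yes_count is just a throwaway helper) of the claim that the pool of "Yes" predictions shrinks: count how many test observations each k predicts as 'Yes'.
def knn_yes_count(k):
    knn = KNeighborsClassifier(k)
    knn_fit = knn.fit(X, y)
    return np.sum(knn_fit.predict(X_predict) == 'Yes')
[knn_yes_count(k) for k in neighbors]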
def score(correct):
    # accuracy that tolerates an empty selection (avoids taking the mean of nothing)
    if len(correct) == 0: return 0
    return np.mean(correct)
Next we investigate a "saturated" logistic model (all predictors) and find out what fraction of predicted buyers actually buy. With a cutoff of 50% the model is terrible, but if you change the cutoff to 25% or so, it gets better.
I had trouble with this. Get the raw probabilities with predict_proba. If you train with the answers as strings, your predictions will come back as those strings, so make your life easier and train with numbers for the response. Predictions from predict_proba have one column for each possible class, so even though we are modeling a binary response we get two columns (for 0 and 1).
# The book holds out the first 1000 rows as the test set; the random split
# below overrides that choice (comment it out to reproduce the book's split).
test = np.zeros(len(df_caravan), dtype=bool)
test[0:1000] = True
test = np.random.random(len(df_caravan)) < 0.10
train = ~ test
X_scaled = sklearn.preprocessing.scale(df_caravan[list(predictors)])
noyes = np.vectorize(lambda s: 1 if s=='Yes' else 0)
X = X_scaled[train]
y = noyes(purchase[train])
y_correct = noyes(purchase[test])
X_predict = X_scaled[test]
model_fit = LogisticRegression().fit(X,y)
y_predict_p = model_fit.predict_proba(X_predict)
y_predict = (y_predict_p > 0.25)[:,1] # column 0 = No, 1=Yes
correct = y_correct == y_predict
buy = y_predict == 1
print("Predicted to purchase:",np.sum(buy))
print("Score:", score(correct[buy]))
Note on the text: with the first 1000 items held out as the test set, the prediction is very good: 35 predicted to purchase, an accuracy score of over 30%. With a random 10% of the data set held out, the predictions are much worse: roughly 18 predicted to purchase and a score around 17%.
noyes = np.vectorize(lambda s: 1 if s=='Yes' else 0)
X_scaled = sklearn.preprocessing.scale(df_caravan[list(predictors)])
def caravan_predict(test, threshold = 0.25):
    """Given a boolean test vector of length len(df_caravan), return (score, number of yes).
    The threshold parameter controls the probability at which a yes is returned."""
    train = ~ test
    X = X_scaled[train]
    y = noyes(purchase[train])
    y_correct = noyes(purchase[test])
    X_predict = X_scaled[test]
    model_fit = LogisticRegression().fit(X, y)
    y_predict_p = model_fit.predict_proba(X_predict)
    y_predict = (y_predict_p > threshold)[:, 1]  # column 0 = No, 1 = Yes
    correct = y_correct == y_predict
    buy = y_predict == 1
    return (score(correct[buy]), np.sum(buy))
results = np.array([caravan_predict(np.random.random(len(df_caravan)) < 0.10)
for _ in range(20)]) # 100 takes a long time...
fig, ax = plt.subplots(1,2,figsize=(16,4))
ax[0].set_title("Accuracy given YES prediction")
sns.boxplot(results[:,0], ax=ax[0]);
ax[1].set_title("Number of YES predictions")
sns.boxplot(results[:,1], ax=ax[1]);
We get about 22% accuracy and about 15 predicted BUY, averaged over the runs above.
np.mean(results,axis=0)
We study the threshold and find that thresholds of roughly 20-30% give about the same accuracy on predicted-YES customers (about 22%). This was compute-intensive.
def threshold_finder_2(thresh):
    test_set = np.random.random(len(df_caravan)) < 0.10
    return caravan_predict(test_set, threshold = thresh)
threshold_finder_2v = np.vectorize(threshold_finder_2)
def threshold_finder_1(thresh, N=50):
    return np.mean([threshold_finder_2(thresh) for _ in range(N)], axis=0)
def threshold_finder(thresholds, N=50):
    return np.array([threshold_finder_1(t, N=N) for t in thresholds])
## This takes a long time to run (maybe 15 minutes on my computer)
#ts = np.linspace(0.01,0.50,50)
#vals = threshold_finder(ts)
long_work = pd.read_csv('insurance-predictions.csv')
ts = long_work['threshold']
vals = np.zeros((len(long_work),2))
vals[:,0] = long_work['accuracy']
vals[:,1] = long_work['predictedYES']
fig, axs = plt.subplots(1,2,figsize=(16,4))
axs[0].set_title('accuracy on predicted YES customers')
axs[0].set_xlabel('threshold')
axs[0].set_ylabel('accuracy')
sns.scatterplot(ts, vals[:,0], ax=axs[0]);
#axs[1].set_title('number of predicted YES customers')
axs[1].set_title('total predicted')
axs[1].set_xlabel('threshold')
axs[1].set_ylabel('YES')
sns.scatterplot(ts, vals[:,1], ax=axs[1]);
#long_work = pd.DataFrame({'threshold':ts, 'accuracy': vals[:,0], 'predictedYES': vals[:,1]});
#long_work.to_csv('insurance-predictions.csv')