import numpy as np
import pandas as pd
import scipy
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence FutureWarning messages raised by the libraries used below.
warnings.simplefilter("ignore", category=FutureWarning)
# The integers 0..5 as a 1-D array (shown once, then kept as `a`).
np.arange(0, 6)
a = np.arange(0, 6)
# The same six values arranged as 2 rows by 3 columns.
b = a.reshape(2, 3)
b
# Elementwise square root of the 2x3 array.
np.sqrt(b)
# Standard normal distribution: loc is the mean (0), scale is the standard
# deviation (1).
rnorm = scipy.stats.norm(loc=0, scale=1)
x = rnorm.rvs(size=50)
# Noise term centred on 50 with a small spread (standard deviation 0.1).
err = scipy.stats.norm(loc=50, scale=0.1)
# y is x plus the noise term, so x and y are strongly correlated.
# Previously y was drawn from `err` alone, which left x and y independent
# and made the correlation below essentially zero.
y = x + err.rvs(size=50)
# 2x2 correlation matrix; the off-diagonal entries are corr(x, y).
np.corrcoef(x, y)
# Fix the seed of NumPy's global random generator before drawing.
np.random.seed(1303)
rnorm.rvs(size=8)
# Notice: with a fixed seed, the same random numbers come out every time.
np.random.seed(3)
y = rnorm.rvs(size=100)
# Sample mean, variance, and the standard deviation computed two ways.
y.mean()
y.var()
np.sqrt(y.var())
y.std()
# Two independent samples of 100 draws each to plot against each other.
x = rnorm.rvs(size=100)
y = rnorm.rvs(size=100)
# Pass the data by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 onwards.
ax = sns.scatterplot(x=x, y=y);
We have to dig back into Matplotlib to set axis labels, so all is not perfect.
# Pass the data by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 onwards.
ax = sns.scatterplot(x=x, y=y);
# Axes.set applies both axis labels in a single call.
ax.set(xlabel="the x-axis", ylabel="the y-axis")
plt.show()
Adding a title is a little more annoying, per the Stack Overflow explanation of adding a title to a Seaborn plot. There are more complex approaches that work with multiple subplots.
# Pass the data by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 onwards.
ax = sns.scatterplot(x=x, y=y);
# Labels and title are set on the returned Matplotlib Axes.
ax.set_xlabel('independent var')
ax.set_ylabel('dependent var')
ax.set_title('Massive Title')
plt.show();
Saving an image to a file is also pretty straightforward using `savefig` from pyplot.
# Pass the data by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 onwards.
ax = sns.scatterplot(x=x, y=y);
ax.set_title('Save this plot')
# Write the current figure to a PNG file in the working directory.
plt.savefig('unlabeled-axes.png');
# ugliness to avoid showing figure: close it once it has been saved
fig = plt.gcf()
plt.close(fig)
`np.linspace` makes equally spaced steps between the start and end.
# 50 evenly spaced points from -pi to pi, both ends included.
x = np.linspace(-np.pi, np.pi, num=50)
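A contour plot needs a 2D array of z values, one for each grid point: (x, y) -> f(x, y).
The hard part is getting the inputs to the function, or convincing f not to vectorize over x and y in parallel: we want f evaluated at every (x, y) combination, not only at matching positions, which is what `np.meshgrid` sets up.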
# 50 evenly spaced sample points on [-pi, pi], shared by both axes.
x = np.linspace(-np.pi, np.pi, num=50)
y = x  # same array object; the second name is for clarity only
# 2-D coordinate grids covering every (x, y) combination.
xx, yy = np.meshgrid(x, y)
def fbasic(x, y):
    """Return cos(y) / (1 + x**2).

    Built from np.cos and plain arithmetic, so scalar and NumPy array
    arguments are both accepted.
    """
    denominator = 1 + x**2
    return np.cos(y) / denominator
# Elementwise wrapper around cos(y) / (1 + x**2).
# NOTE(review): np.cos and the arithmetic already broadcast over arrays, so
# np.vectorize is probably unnecessary here - confirm before simplifying.
f = np.vectorize(lambda u, v: np.cos(v) / (1 + u**2))
# Evaluate the function at every point of the (xx, yy) grid.
z = f(xx, yy)
# Contours with the default levels, then again with 45 levels.
plt.contour(z);
plt.contour(z, levels=45);
def g1(x, y):
    """Return the average of fbasic(x, y) and fbasic(y, x)."""
    forward = fbasic(x, y)
    swapped = fbasic(y, x)
    return (forward + swapped) / 2
# Elementwise wrapper around g1, evaluated on the (xx, yy) grid.
g = np.vectorize(g1)
z2 = g(xx, yy)
# Contour plot of the result with 15 levels.
plt.contour(z2, levels=15);
`imshow` shows an image, like the R command `image`.
Surely there is a way to supply the coordinates as input as well as the `z` values, but in practice a regular grid seems most likely.
# 16x16 array of random values, drawn as an image.
randompix = np.random.random(size=(16, 16))
plt.imshow(randompix);
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
# New figure holding a single 3-D axes.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection="3d")
# Surface of z over the (xx, yy) grid, coloured with the coolwarm colormap.
surf = ax.plot_surface(xx, yy, z, cmap=cm.coolwarm);
plt.show();
# 1..16 filled down the columns of a 4x4 array (matches the R example).
a = np.arange(1, 17).reshape(4, 4).transpose()
print(a)
# Element at row index 1, column index 2 (zero-based).
a[1, 2]
Beware when following the code in the book: R indices start at 1, while Python indices start at 0.
# Two index lists in one pair of brackets are matched up position by position.
a[[0, 2], [1, 3]]
# Rows 0 and 2, every column.
a[[0, 2], :]
# Every row, columns 1 and 3.
a[:, [1, 3]]
If you combine the two in one set of brackets, they are traversed in parallel, getting you a[0,1] and a[2,3].
# Paired index lists: selects a[0, 1] and a[2, 3].
a[[0, 2], [1, 3]]
When you want a sub-array, index twice.
# Pick rows 0 and 2 first, then columns 1 and 3 of that result.
a[[0, 2], :][:, [1, 3]]
The `np.ix_` function makes grids out of the indices that you give it. Clearer for this!
# Rows 0 and 2 crossed with columns 1 and 3, in a single indexing step.
a[np.ix_([0, 2], [1, 3])]
Note: R ranges include the last item, Python ranges do not.
# Rows 0-2 crossed with columns 1-3; the stop value of each range is excluded.
a[np.ix_(np.arange(3), np.arange(1, 4))]
# The first two rows.
a[[0, 1], :]
# The first two columns.
a[:, [0, 1]]
# Row 1 on its own.
a[1, :]
Dropping columns is not as convenient in Python.
# Drop rows 0 and 2 of `a`.
b = np.delete(a, [0, 2], axis=0)
b
# Drop columns 0, 2 and 3 from what is left.
c = np.delete(b, [0, 2, 3], axis=1)
c
# np.delete returns new arrays, so `a` keeps its original shape.
a.shape
Note: To get the data from a preloaded R dataset, I run `write.table(the_data, file="whatever", sep="\t")` in R.
Cool fact: `read_table` can load straight from a URL.
#auto = pd.read_table("Auto.data")
# ISLR's Auto data marks missing horsepower readings with "?". Without
# na_values those entries load as ordinary strings, the column becomes text,
# and a later dropna() has nothing to remove.
# NOTE(review): confirm that the file served at this URL uses the "?" marker.
auto = pd.read_csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv", na_values="?")
Get rid of any rows with missing data. This is not always a good idea.
# Keep only the rows in which no value is missing, then inspect the result.
auto = auto.dropna(axis=0, how="any")
auto.shape
auto.columns
# Name the columns by keyword, as the boxplot and stripplot calls do: seaborn
# deprecated positional x/y in 0.11 and rejects them from 0.12 onwards.
sns.scatterplot(x="cylinders", y="mpg", data=auto);
# mpg by cylinder count, first as box plots and then as individual points.
sns.boxplot(data=auto, x="cylinders", y="mpg");
sns.stripplot(data=auto, x="cylinders", y="mpg");
# Distribution plot of mpg with seaborn's default settings.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the minimum supported seaborn version is known.
sns.distplot(auto["mpg"]);
# 15 bins, no density curve, drawn with the vertical option switched on.
sns.distplot(auto["mpg"], kde=False, vertical=True, bins=15);
# Pairwise plots for the whole frame, then for five selected columns.
sns.pairplot(data=auto);
sns.pairplot(
    data=auto[["mpg", "displacement", "horsepower", "weight", "acceleration"]]
);
I am not aware of a way to interactively identify points on a matplotlib plot that is similar to the R command `identify`.
# Summary statistics for the frame.
auto.describe()
# Counts per car name; head keeps the first five entries.
auto["name"].value_counts().head(n=5)
# Summary statistics for the mpg column alone.
auto["mpg"].describe()
Categorical data can be constructed using `astype('category')` in Pandas. Read more about categorical data if you need the information.
# Re-type the cylinders column as a pandas categorical, then summarise it.
auto["cylinders"] = auto["cylinders"].astype("category")
auto["cylinders"].describe()
Easy access to ISL datasets if you have internet access.
# Load the ISL College dataset straight from its URL (needs network access).
college = pd.read_csv("http://www-bcf.usc.edu/~gareth/ISL/College.csv")