sklearn.feature_selection.SelectFromModel
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_context('notebook', font_scale=1.5)
%matplotlib inline
X,y = make_regression(n_samples=1000, n_features=10,
n_informative=3,random_state=1234,
shuffle=False, noise=10)
X = np.append(X, X[:,[0]], axis=1)  # append an exact copy of feature 0 as an 11th column
train_X, test_X, train_y, test_y = train_test_split(X,y,
test_size=0.2,
random_state=1235)
enet = ElasticNetCV(l1_ratio=np.array([.1, .5, .7, .9, .95, .99, 1]),
n_alphas=500,
                    normalize=True,  # note: removed in scikit-learn 1.2; newer versions need a StandardScaler pipeline instead
selection='random',
random_state=1351)
enet.fit(train_X, train_y)
coefs = pd.DataFrame({'Feature': np.arange(len(enet.coef_)),
'Importance': np.abs(enet.coef_)})
coefs = coefs.set_index('Feature')\
.sort_values('Importance', ascending=False)
fig, ax=plt.subplots(figsize=(12,6))
plot = coefs.plot(kind='bar', title='Example feature importance chart', ax=ax)
from matplotlib import rc
rc('text', usetex=True)
from sklearn.metrics import mean_squared_error
from math import floor
features_to_test = {
'All': np.arange(0,11),
'0-9': np.arange(0,10),
'1-9': np.arange(1,10),
'1-10': np.arange(1,11)
}
fig, axarr = plt.subplots(ncols=2, nrows=2, figsize=(12,8))
i = 0
for model_name, features in features_to_test.items():
col = i % 2
row = int(floor(i/2))
_train_X = train_X[:, features]
_test_X = test_X[:, features]
enet = ElasticNetCV(l1_ratio=np.array([.1, .5, .7, .9, .95, .99, 1]),
n_alphas=1000,
normalize=True,
selection='random',
random_state=1351)
enet.fit(_train_X, train_y)
coefs = pd.DataFrame({'Feature': features,
'Importance': np.abs(enet.coef_)})
coefs = coefs.set_index('Feature')\
.sort_values('Importance', ascending=False)
plot = coefs.plot(kind='bar', title='Typical feature importance chart', ax=axarr[(row,col)])
title = 'Features:' + model_name + r'; $R^2$ =' + \
str(round(enet.score(_test_X,test_y),3))
axarr[(row,col)].set_title(title)
i += 1
plt.tight_layout()
X,y = make_regression(n_samples=1000, n_features=10,
n_informative=3, random_state=1234,
shuffle=False, noise=10)
X = np.append(X, X[:,[0]], axis=1)
from IPython import display
from sklearn.datasets import load_iris
from sklearn import tree
import graphviz
iris = load_iris()
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
display.display(graph)
$$imp(X_m) = \sum_{t:\,v(s_t)=X_m}p(t)\,\Delta i(s_t,t)$$
where the sum runs over the internal nodes $t$ whose split $s_t$ uses $X_m$, $p(t) = N_t/N$ is the (weighted) fraction of samples reaching node $t$, and $\Delta i(s_t,t)$ is the impurity decrease produced by the split.
from collections import defaultdict
import numpy as np
def feature_importance_single_tree(tree, feature_names=iris.feature_names):
# Returns normed feature importance for a single sklearn.tree
# Note sklearn trees can handle instance weights, so we need
# to use weighted_n_node_samples
    total_samples = tree.weighted_n_node_samples[0]  # N: weighted count at the root, so p(t) = N_t / N
feature_importance = defaultdict(float)
# Identify leaves as described in sklearn's plot_unveil_tree_structure.html
is_leaf = (tree.children_right == tree.children_left)
for ix in range(len(is_leaf)):
if not is_leaf[ix]:
impurity = tree.impurity[ix]
split_feature = tree.feature[ix]
num_at_node = tree.weighted_n_node_samples[ix]
# Get left child contribution
left_child = tree.children_left[ix]
left_decrease = tree.weighted_n_node_samples[left_child]/num_at_node * \
tree.impurity[left_child]
# Get right child contribution
right_child = tree.children_right[ix]
right_decrease = tree.weighted_n_node_samples[right_child]/num_at_node * \
tree.impurity[right_child]
delta = impurity - left_decrease - right_decrease
feature_importance[feature_names[split_feature]] \
+= num_at_node / total_samples * delta
    norm = sum(feature_importance.values())  # use the builtin sum: np.sum does not reduce a dict view in Python 3
feature_importance = {key: val/norm for key, val in feature_importance.items()}
return feature_importance
imp = feature_importance_single_tree(clf.tree_)
display.display(pd.Series(imp).rename('Importance').to_frame()\
.reset_index().rename(columns={'index':'Feature'}))
| | Feature | Importance |
|---|---|---|
| 0 | petal length (cm) | 0.585616 |
| 1 | petal width (cm) | 0.414384 |
display.display(pd.DataFrame({'Feature': iris.feature_names,
'Importance':clf.feature_importances_})\
.sort_values('Importance', ascending=False))
| | Feature | Importance |
|---|---|---|
| 2 | petal length (cm) | 0.585616 |
| 3 | petal width (cm) | 0.414384 |
| 0 | sepal length (cm) | 0.000000 |
| 1 | sepal width (cm) | 0.000000 |
$$Imp(X_m) = \frac{1}{N_T}\sum_T\sum_{t \in T:v(s_t)=X_m}p(t)\Delta i(s_t,t)$$
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100) # Not tuned - just example
forest.fit(iris.data, iris.target)
display.display(pd.DataFrame({'Feature': iris.feature_names,
'Importance':forest.feature_importances_})\
.sort_values('Importance', ascending=False))
| | Feature | Importance |
|---|---|---|
| 2 | petal length (cm) | 0.449774 |
| 3 | petal width (cm) | 0.433002 |
| 0 | sepal length (cm) | 0.098417 |
| 1 | sepal width (cm) | 0.018807 |
forest_importance = defaultdict(float)
for tree in forest.estimators_:
tree_importance = feature_importance_single_tree(tree.tree_)
for key, val in tree_importance.items():
forest_importance[key] += val
forest_importance = {key:val/len(forest.estimators_)\
for key, val in forest_importance.items()}
display.display(pd.Series(forest_importance).rename('Importance').to_frame()\
.reset_index().rename(columns={'index':'Feature'})\
.sort_values('Importance', ascending=False))
| | Feature | Importance |
|---|---|---|
| 0 | petal length (cm) | 0.449774 |
| 1 | petal width (cm) | 0.433002 |
| 2 | sepal length (cm) | 0.098417 |
| 3 | sepal width (cm) | 0.018807 |
$$imp(X_m) = \big\vert w_m \big\vert$$
We describe it using an arbitrary held-out test set:
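A minimal sketch of that procedure (the helper name and defaults below are illustrative, not from the original notebook): shuffle one column of the held-out features, re-score the already-fitted model, and record how far the score drops.
import numpy as np

def permutation_importance_sketch(model, X_test, y_test, n_repeats=5, random_state=0):
    # Baseline score on the untouched held-out set
    rng = np.random.RandomState(random_state)
    baseline = model.score(X_test, y_test)
    importances = np.zeros(X_test.shape[1])
    for j in range(X_test.shape[1]):
        drops = []
        for _ in range(n_repeats):
            X_perm = X_test.copy()
            rng.shuffle(X_perm[:, j])  # break the association between feature j and y
            drops.append(baseline - model.score(X_perm, y_test))
        importances[j] = np.mean(drops)  # average score drop over the repeats
    return importances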
Question: Is this any different from the following procedure? If so, how?
import numpy as np
size = 1000
x1 = np.random.uniform(-10,10,size)
z1 = np.random.uniform(-10,10,size)
x2 = z1 + np.random.normal(0,2,size)
x3 = z1 + np.random.normal(0,2,size)
x4 = z1 + np.random.normal(0,2,size)
def actual_function(x1,z1):
return -1./2.*x1 + z1 + np.random.normal(0,1,len(x1))
y = actual_function(x1,z1)
X = np.stack([x1,x2,x3,x4]).T
corr = pd.DataFrame(X).corr()
corr.index = ['$x_{}$'.format(x+1) for x in range(4)]    # label as x_1..x_4 to match the variable names above
corr.columns = ['$x_{}$'.format(x+1) for x in range(4)]
display.display(display.Markdown('## Correlation matrix:'))
display.display(corr)
| | $x_1$ | $x_2$ | $x_3$ | $x_4$ |
|---|---|---|---|---|
| $x_1$ | 1.000000 | -0.008428 | -0.004025 | -0.018237 |
| $x_2$ | -0.008428 | 1.000000 | 0.900929 | 0.899367 |
| $x_3$ | -0.004025 | 0.900929 | 1.000000 | 0.897771 |
| $x_4$ | -0.018237 | 0.899367 | 0.897771 | 1.000000 |
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.2,
random_state=1345134)
reg = GradientBoostingRegressor(n_estimators=200,subsample=0.5,
random_state=3141)
grid = GridSearchCV(reg,
param_grid={
'max_leaf_nodes':[10,25,50],
'min_samples_leaf':[10,25,50]
}, n_jobs=5, cv=5)
grid = grid.fit(train_X, train_y)
imp = pd.DataFrame({'Feature': ['$x_{}$'.format(i) for i in range(1,5)],
'GBM Importance':grid.best_estimator_.feature_importances_})
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(grid)
perm.fit(test_X, test_y)
imp['GBM Permutation'] = perm.feature_importances_
from sklearn.linear_model import ElasticNetCV
enet = ElasticNetCV(l1_ratio=np.array([.1, .5, .7, .9, .95, .99, 1]),
n_alphas=500,
normalize=True,
selection='random',
cv=5,
random_state=1351)
enet.fit(train_X, train_y)
imp['Elastic Net $\\vert w \\vert$'] = np.abs(enet.coef_)
perm2 = PermutationImportance(enet)
perm2.fit(test_X, test_y)
imp['Elastic Net Permutation'] = perm2.feature_importances_
display.display(imp.set_index('Feature'))
| Feature | GBM Importance | GBM Permutation | Elastic Net $\vert w \vert$ | Elastic Net Permutation |
|---|---|---|---|---|
| $x_1$ | 0.269611 | 0.376578 | 0.493210 | 0.371113 |
| $x_2$ | 0.252952 | 0.163739 | 0.270237 | 0.137522 |
| $x_3$ | 0.240531 | 0.263513 | 0.377563 | 0.257554 |
| $x_4$ | 0.236906 | 0.191071 | 0.307340 | 0.192642 |
sns.set_context('notebook',font_scale=2)
fig, ax = plt.subplots(figsize=(12,8))
plot = imp.set_index('Feature').plot(kind='bar', ax=ax)
This dataset includes $N$ observations $y_i$ of a target $y$ for $i = 1,2,\cdots,N$, along with $p$ features denoted $x_{i,j}$ for $j=1,2,\cdots,p$ and $i=1,2,\cdots,N$. The fitted model generates predictions of the form: $$\hat{y}_i = f(x_{i,1},x_{i,2},\cdots,x_{i,p})$$
In the case of a single feature $x_j$, Friedman's partial dependence plots are obtained by computing the following average and plotting it over a useful range of $x$ values:
$$\phi_j(x) = \frac{1}{N}\sum_{i=1}^N f(x_{i,1},\cdots,x_{i,j-1},x,x_{i,j+1},\cdots,x_{i,p})$$
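Computed directly from this definition, for any fitted model with a predict method, a brute-force sketch looks like the following (the helper is illustrative, not a library routine):
import numpy as np

def partial_dependence_sketch(predict, X, j, grid):
    # phi_j(x): average prediction with column j clamped to each grid value
    phi = []
    for x in grid:
        X_mod = X.copy()
        X_mod[:, j] = x  # replace feature j with the value x for every observation
        phi.append(predict(X_mod).mean())
    return np.array(phi)
# e.g. grid = np.linspace(X[:, j].min(), X[:, j].max(), 50)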
PDP example, using sklearn
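The data-loading and model-fitting cells are hidden in the slideshow; the later cells refer to cal_housing, X_train, names, and a fitted reg, so the setup was presumably something like the sketch below (modeled on the classic scikit-learn California housing PDP example, not the exact hidden code).
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

cal_housing = fetch_california_housing()
names = cal_housing.feature_names
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data, cal_housing.target,
                                                    test_size=0.2, random_state=1)
# A GBM roughly like the one in the sklearn example; the hyperparameters here are assumptions
reg = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1,
                                loss='huber', random_state=1)
reg = reg.fit(X_train, y_train)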
print(cal_housing.DESCR)
California housing dataset.

The original database is available from StatLib: http://lib.stat.cmu.edu/datasets/

The data contains 20,640 observations on 9 variables. This dataset contains the average house value as target variable and the following input variables (features): average income, housing average age, average rooms, average bedrooms, population, average occupation, latitude, and longitude, in that order.

References
----------
Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions, Statistics and Probability Letters, 33 (1997) 291-297.
fig, ax = plt.subplots(figsize=(12,6))
plot = pd.Series(reg.feature_importances_,
index=names).rename('MDI').to_frame()\
.sort_values('MDI', ascending=False).plot(kind='bar',ax=ax,
title='Importance: Mean Decrease Impurity')
# plot_partial_dependence lived in sklearn.ensemble.partial_dependence in older
# scikit-learn; newer releases provide it via sklearn.inspection instead
from sklearn.ensemble.partial_dependence import plot_partial_dependence
features = [0, 5, 1, 2, (5, 1)]
fig, ax = plt.subplots(figsize=(12,10))
fig, ax = plot_partial_dependence(reg, X_train, features,
feature_names=names,
n_jobs=3, grid_resolution=50,
ax=ax)
fig.suptitle('Partial dependence of house value on nonlocation features')
plt.subplots_adjust(top=0.9) # tight_layout causes overlap with suptitle
plt.show()
geopandas.
fig, ax = plt.subplots(figsize=(12,8))
plot_partial_dependence(grid.best_estimator_, Xs, [1],
n_jobs=1, grid_resolution=25, ax=ax)
plt.xlim(-1,1)
plt.ylim(-6,6)
plt.xlabel('$X_2$')  # column 1 of Xs is X2
pass
size = 10000
X1 = np.random.uniform(-1,1,size)
X2 = np.random.uniform(-1,1,size)
X3 = np.random.uniform(-1,1,size)
eps = np.random.normal(0,0.5,size)
Y = 0.2*X1 - 5*X2 + 10*X2*np.where(X3>=0,1,0) + eps
Xs = np.stack([X1,X2,X3]).T
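The cell that fits the model used in the partial dependence plot above is also hidden; grid there presumably refers to a grid-searched GradientBoostingRegressor refit on Xs and Y, roughly as below (a sketch reusing the earlier grid-search settings, not the exact hidden code).
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gbm = GradientBoostingRegressor(n_estimators=200, subsample=0.5, random_state=3141)
grid = GridSearchCV(gbm,
                    param_grid={'max_leaf_nodes': [10, 25, 50],
                                'min_samples_leaf': [10, 25, 50]},
                    n_jobs=5, cv=5)
grid = grid.fit(Xs, Y)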
fig, ax = plt.subplots(figsize=(12,8))
plt.scatter(X2[X3>=0], Y[X3>=0], color='blue', label='$X_3>=0$')
plt.scatter(X2[X3<0], Y[X3<0], color='green', label='$X_3<0$')
plt.ylabel('$Y$')
plt.xlabel('$X_2$')
plt.legend()
plt.show()
In his gradient boosting paper, Friedman identified this issue (here presented in our notation): $\phi_j$ averages $f$ over the distribution of the other features, so it faithfully summarizes the effect of $x_j$ only when interactions involving $x_j$ are weak.
The plots below use the SauceCat/PDPbox implementation.
from pdpbox import pdp
train = pd.DataFrame(Xs)
train.columns = [str(x) for x in train.columns]
pdp_one = pdp.pdp_isolate(grid, train, '1')
pdp.pdp_plot(pdp_one, '$X_2$',  # column '1' of the training frame is X2
plot_org_pts=True, plot_lines=True,
center=False, frac_to_plot=1000)
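For comparison, scikit-learn 1.0+ can draw the same kind of individual (ICE) curves directly, which also reveals the two branches that the averaged PDP hides; a sketch, assuming grid and Xs from above:
from sklearn.inspection import PartialDependenceDisplay

# kind='both' overlays the averaged partial dependence on a subsample of individual curves
PartialDependenceDisplay.from_estimator(grid.best_estimator_, Xs, [1],
                                        kind='both', subsample=200, random_state=0)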