
Better Data Science | Feature Importance Techniques with Python


● Library imports
● load_breast_cancer for the dataset
● rcParams is only here for plot styling
In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

import matplotlib.pyplot as plt


from matplotlib import rcParams
rcParams['figure.figsize'] = 14, 7
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False

# Load data
data = load_breast_cancer()
● Convert to a Pandas DataFrame:
In [2]:
df = pd.concat([
    pd.DataFrame(data.data, columns=data.feature_names),
    pd.DataFrame(data.target, columns=['y'])
], axis=1)

df.head()

● Prepare the dataset:
  ○ Scale features
  ○ Split into training and testing subsets
In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
In [4]:
X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [5]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

Method #1 - Get importances from coefficients


In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_scaled, y_train)
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.coef_[0]
})
importances = importances.sort_values(by='Importance', ascending=False)
In [7]:
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()
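
● Note: the sign of a coefficient only indicates the direction of the relationship, not how strong it is. If you only care about magnitude, you can also rank by absolute value. The sketch below is an optional extension, not part of the original flow (the AbsImportance column name is just illustrative):

importances['AbsImportance'] = importances['Importance'].abs()  # illustrative helper column
importances_abs = importances.sort_values(by='AbsImportance', ascending=False)

plt.bar(x=importances_abs['Attribute'], height=importances_abs['AbsImportance'], color='#087E8B')
plt.title('Feature importances (absolute coefficient values)', size=20)
plt.xticks(rotation='vertical')
plt.show()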

Method #2 - Get importances from a fitted tree-based model


In [8]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train_scaled, y_train)
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)
In [9]:
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from the tree-based model', size=20)
plt.xticks(rotation='vertical')
plt.show()
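
● XGBoost can report several kinds of importance ('weight', 'gain', 'cover', ...), so the scores above depend on which type the fitted model uses. A minimal sketch of switching the type via the importance_type parameter (the model_gain and importances_gain names are just illustrative):

model_gain = XGBClassifier(importance_type='gain')
model_gain.fit(X_train_scaled, y_train)

importances_gain = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model_gain.feature_importances_
}).sort_values(by='Importance', ascending=False)
importances_gain.head()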
Method #3 - Get importances from PCA
In [10]:
from sklearn.decomposition import PCA
In [11]:
pca = PCA().fit(X_train_scaled)

plt.plot(pca.explained_variance_ratio_.cumsum(), lw=3, color='#087E8B')


plt.title('Cumulative explained variance by number of principal components', size=20)
plt.show()
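
● If you want a concrete cut-off, you can read the number of components needed to reach a variance threshold straight from the cumulative curve. A small sketch assuming a 90% threshold (the threshold itself is arbitrary):

cumulative = pca.explained_variance_ratio_.cumsum()
n_components_90 = np.argmax(cumulative >= 0.90) + 1  # first component count reaching the assumed 90% threshold
print(f'{n_components_90} components explain at least 90% of the variance')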

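● Loadings are the principal axes scaled by the square root of their explained variance; since the features were standardized, they can be read as correlations between the original features and the components: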
In [12]:
loadings = pd.DataFrame(
    data=pca.components_.T * np.sqrt(pca.explained_variance_),
    columns=[f'PC{i}' for i in range(1, len(X_train.columns) + 1)],
    index=X_train.columns
)
In [13]:
loadings.head()
In [14]:
pc1_loadings = loadings.sort_values(by='PC1', ascending=False)[['PC1']]
pc1_loadings = pc1_loadings.reset_index()
pc1_loadings.columns = ['Attribute', 'CorrelationWithPC1']

plt.bar(x=pc1_loadings['Attribute'], height=pc1_loadings['CorrelationWithPC1'], color='#087E8B')


plt.title('PCA loading scores (first principal component)', size=20)
plt.xticks(rotation='vertical')
plt.show()
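
● The ranking above only uses the first component. One illustrative way to get a single score per feature, not part of the original notebook, is to weight the absolute loadings by each component's explained variance ratio and sum across components:

weights = pca.explained_variance_ratio_
combined_importance = (loadings.abs() * weights).sum(axis=1).sort_values(ascending=False)  # illustrative aggregate score
combined_importance.head(10)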
