Better Data Science - Feature Importance Techniques With Python
from sklearn.datasets import load_breast_cancer

# Load data
data = load_breast_cancer()
● Convert to a Pandas DataFrame:
In [2]:
import pandas as pd

# Combine features and target into a single DataFrame
df = pd.concat([
    pd.DataFrame(data.data, columns=data.feature_names),
    pd.DataFrame(data.target, columns=['y'])
], axis=1)
df.head()
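The cells below reference X_train, y_train, and X_train_scaled, which aren't defined in this excerpt. A minimal sketch of the usual preparation step, assuming a standard train/test split and StandardScaler (the split parameters are assumptions; the variable names are taken from the cells that follow):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features from the target, then hold out a test set
X = df.drop('y', axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale features so coefficient magnitudes are comparable across features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)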
Method #1 - Get Importances from Coefficients

from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the scaled training data
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Coefficient values serve as importance scores
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.coef_[0]
})
importances = importances.sort_values(by='Importance', ascending=False)
In [7]:
import matplotlib.pyplot as plt

plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()
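One caveat worth keeping in mind: logistic regression coefficients can be negative, and a large negative coefficient is just as influential as a large positive one. If you want a ranking by influence rather than by sign, a small variation is to sort on the absolute value (the AbsImportance column name here is just for illustration):

# Rank features by coefficient magnitude, regardless of sign
importances['AbsImportance'] = importances['Importance'].abs()
importances.sort_values(by='AbsImportance', ascending=False).head()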
Method #2 - Get Importances from a Tree-Based Model

from xgboost import XGBClassifier

# Fit a gradient-boosted tree ensemble
model = XGBClassifier()
model.fit(X_train_scaled, y_train)

# Tree-based models expose importance scores directly
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)
In [9]:
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from the tree-based model', size=20)
plt.xticks(rotation='vertical')
plt.show()
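Note that feature_importances_ is only one summary: XGBoost can compute importances in several ways (e.g. gain, weight, cover), and the ranking can change depending on which type you ask for. A quick way to inspect an alternative type, assuming a recent xgboost version:

# Importance by split count ('weight'); other options include 'gain' and 'cover'
booster = model.get_booster()
print(booster.get_score(importance_type='weight'))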
Method #3 - Get Importances from PCA
In [10]:
from sklearn.decomposition import PCA
In [11]:
pca = PCA().fit(X_train_scaled)
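Before looking at loadings, it's common to check how much variance the components capture. A minimal sketch of that check, in the same style as the charts above:

# Cumulative share of variance explained by the principal components
plt.plot(pca.explained_variance_ratio_.cumsum(), lw=3, color='#087E8B')
plt.title('Cumulative explained variance', size=20)
plt.show()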
In [12]:
import numpy as np

# Loadings: correlations between the original features and the principal components
loadings = pd.DataFrame(
    data=pca.components_.T * np.sqrt(pca.explained_variance_),
    columns=[f'PC{i}' for i in range(1, len(X_train.columns) + 1)],
    index=X_train.columns
)
In [13]:
loadings.head()
In [14]:
# Keep only the first principal component, sorted by loading strength
pc1_loadings = loadings.sort_values(by='PC1', ascending=False)[['PC1']]
pc1_loadings = pc1_loadings.reset_index()
pc1_loadings.columns = ['Attribute', 'CorrelationWithPC1']
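The notebook presumably visualizes these loadings next; a minimal sketch of that bar chart, reusing the styling from the earlier plots:

plt.bar(x=pc1_loadings['Attribute'], height=pc1_loadings['CorrelationWithPC1'], color='#087E8B')
plt.title('PCA loading scores (first principal component)', size=20)
plt.xticks(rotation='vertical')
plt.show()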