Random Forest: Implementaciones de Scikit-Learn Sobre QSAR

13/4/2020

Implementaciones de Scikit-learn sobre QSAR

Random Forest
In [2]:

import pandas as pd
import numpy as np
dataset= pd.read_csv("qsar_oral_toxicity.csv", sep=';', prefix='x', header=None)


x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1015 x1016 x1017 x1018 x1019 x1020

0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

2 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0

3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0

4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

5 rows × 1025 columns

In [3]:

from sklearn import preprocessing, model_selection

enc = preprocessing.OrdinalEncoder()
for i, cat in enumerate(enc.categories_[0]): print("{} -> {}".format(cat, i))
dataset['output'] = enc.transform(dataset[['x1024']])

negative -> 0
positive -> 1


x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1016 x1017 x1018 x1019 x1020 x1021

0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

2 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0

3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0

4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

5 rows × 1026 columns

13/4/2020

In [4]:

train, test = model_selection.train_test_split(dataset, test_size=0.2, random_state=42)



negative 6609
positive 584
Name: x1024, dtype: int64

In [5]:

from sklearn.ensemble import RandomForestClassifier

X_train = train.iloc[:, 0:1024].values
Y_train = train.output
clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", max_depth=None, min
clf = clf.fit(X_train, Y_train)

In [6]:

X_test = test.iloc[:, 0:1024].values

Y_test = test.output

In [7]:

from sklearn import metrics

print("\nAcierto:", metrics.accuracy_score(test.output, test_pred))
print(metrics.classification_report(test.output, test_pred))

Acierto: 0.9394107837687604
precision recall f1-score support

0.0 0.95 0.99 0.97 1642

1.0 0.79 0.42 0.55 157

accuracy 0.94 1799

macro avg 0.87 0.70 0.76 1799
weighted avg 0.93 0.94 0.93 1799

In [8]:

from sklearn.metrics import roc_auc_score

roc_value = roc_auc_score(test.output, test_pred)



13/4/2020

In [9]:

from sklearn import tree

clf1 = tree.DecisionTreeClassifier()
clf1 = clf1.fit(X_train, Y_train)

In [10]:

print("\nAcierto:", metrics.accuracy_score(test.output, test_pred1))

print(metrics.classification_report(test.output, test_pred1))

Acierto: 0.9049471928849361
precision recall f1-score support

0.0 0.95 0.94 0.95 1642

1.0 0.46 0.52 0.49 157

accuracy 0.90 1799

macro avg 0.71 0.73 0.72 1799
weighted avg 0.91 0.90 0.91 1799

In [11]:

roc_value1 = roc_auc_score(test.output, test_pred1)



Cross Validation
In [12]:

seed = 1
scoring = 'accuracy'

In [13]:

models = []
models.append(('CART', tree.DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=None)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold,
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())

CART: 0.910466 (0.010126)

RF: 0.939664 (0.010440)

13/4/2020

In [18]:

from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print (random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max
_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 8
0, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_lea
f': [1, 2, 4], 'bootstrap': [True, False]}

13/4/2020

In [15]:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_ite
r = 10, cv = 5, verbose=2, random_state=42)
rf_random.fit(X_train, Y_train)

13/4/2020

Fitting 5 folds for each of 10 candidates, totalling 50 fits

[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent wo


[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat

ures=sqrt, max_depth=50, bootstrap=True, total= 12.9s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True

[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 12.8s remaining:


13/4/2020

[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat

ures=sqrt, max_depth=50, bootstrap=True, total= 13.1s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.6s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.8s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat
ures=sqrt, max_depth=50, bootstrap=True, total= 13.9s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.0min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_featu
res=sqrt, max_depth=90, bootstrap=False
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=4, max_feat
ures=sqrt, max_depth=90, bootstrap=False, total= 1.1min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=49.4min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=30.2min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total=18.2min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total= 8.3min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=60, bootstrap=False, total= 9.5min
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 27.8s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
13/4/2020

res=sqrt, max_depth=30, bootstrap=True, total= 28.6s

[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.6s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.4s
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featur
es=sqrt, max_depth=30, bootstrap=True
[CV] n_estimators=700, min_samples_split=5, min_samples_leaf=1, max_featu
res=sqrt, max_depth=30, bootstrap=True, total= 21.0s
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=12.0min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=15.1min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=15.4min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=13.1min
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_featu
res=auto, max_depth=80, bootstrap=False
[CV] n_estimators=500, min_samples_split=10, min_samples_leaf=1, max_feat
ures=auto, max_depth=80, bootstrap=False, total=16.0min
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 9.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.1s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 9.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.2s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_featu
res=sqrt, max_depth=60, bootstrap=False
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_feat
ures=sqrt, max_depth=60, bootstrap=False, total= 10.1s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=25.0min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=28.5min
13/4/2020

[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu

res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=34.1min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=30.8min
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=False
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=False, total=31.1min
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.2s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.2s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featur
es=sqrt, max_depth=10, bootstrap=True
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_featu
res=sqrt, max_depth=10, bootstrap=True, total= 2.1s
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total=11.2min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.5min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.6min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.8min
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featur
es=auto, max_depth=100, bootstrap=True
[CV] n_estimators=600, min_samples_split=2, min_samples_leaf=4, max_featu
res=auto, max_depth=100, bootstrap=True, total= 9.0min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=15.8min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.6min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
13/4/2020

res=auto, max_depth=50, bootstrap=True

[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.4min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=16.7min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_featu
res=auto, max_depth=50, bootstrap=True
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_feat
ures=auto, max_depth=50, bootstrap=True, total=15.5min

[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 476.0min finished


{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 60,
'bootstrap': False}

In [22]:

clf2 = RandomForestClassifier(n_estimators=200, max_features="sqrt", max_depth=60, min_

samples_split=10, min_samples_leaf=1, bootstrap= False)
clf2 = clf.fit(X_train, Y_train)

13/4/2020

In [23]:

print("\nAcierto:", metrics.accuracy_score(test.output, test_pred2))
print(metrics.classification_report(test.output, test_pred2))

Acierto: 0.9382990550305725
precision recall f1-score support

0.0 0.95 0.99 0.97 1642

1.0 0.78 0.41 0.54 157

accuracy 0.94 1799

macro avg 0.86 0.70 0.75 1799
weighted avg 0.93 0.94 0.93 1799

