Random Forest: Implementaciones de Scikit-Learn Sobre QSAR

13/4/2020 Scikit-Learn
Implementaciones de Scikit-learn sobre QSAR
Random Forest
In [2]:
import pandas as pd
import numpy as np
dataset= pd.read_csv("qsar_oral_toxicity.csv", sep=';', prefix='x', header=None)
dataset.head()
Out[2]:
x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1015 x1016 x1017 x1018 x1019 x1020
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
5 rows × 1025 columns
In [3]:
from sklearn import preprocessing, model_selection

enc = preprocessing.OrdinalEncoder()
enc.fit(dataset[['x1024']])
for i, cat in enumerate(enc.categories_[0]): print("{} -> {}".format(cat, i))
dataset['output'] = enc.transform(dataset[['x1024']])
dataset.head()
negative -> 0
positive -> 1
Out[3]:
x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1016 x1017 x1018 x1019 x1020 x1021
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
1 0 0 1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0
5 rows × 1026 columns
localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 1/11

In [4]:
train, test = model_selection.train_test_split(dataset, test_size=0.2, random_state=42)

train.x1024.value_counts()
Out[4]:
negative 6609
positive 584
Name: x1024, dtype: int64
In [5]:
from sklearn.ensemble import RandomForestClassifier

X_train = train.iloc[:, 0:1024].values
Y_train = train.output
clf = RandomForestClassifier(n_estimators=100, max_features="sqrt", max_depth=None, min
_samples_split=2)
clf = clf.fit(X_train, Y_train)
In [6]:
X_test = test.iloc[:, 0:1024].values

Y_test = test.output
test_pred=clf.predict(X_test)
In [7]:
from sklearn import metrics

print("\nAcierto:", metrics.accuracy_score(test.output, test_pred))
print(metrics.classification_report(test.output, test_pred))
Acierto: 0.9394107837687604
precision recall f1-score support
0.0 0.95 0.99 0.97 1642

1.0 0.79 0.42 0.55 157
accuracy 0.94 1799

macro avg 0.87 0.70 0.76 1799
weighted avg 0.93 0.94 0.93 1799
In [8]:
from sklearn.metrics import roc_auc_score

roc_value = roc_auc_score(test.output, test_pred)
print(roc_value)
0.7047099622178948
ID3

In [9]:
from sklearn import tree

clf1 = tree.DecisionTreeClassifier()
clf1 = clf1.fit(X_train, Y_train)
test_pred1=clf1.predict(X_test)
In [10]:
print("\nAcierto:", metrics.accuracy_score(test.output, test_pred1))

print(metrics.classification_report(test.output, test_pred1))
Acierto: 0.9049471928849361
0.0 0.95 0.94 0.95 1642

1.0 0.46 0.52 0.49 157
accuracy 0.90 1799

macro avg 0.71 0.73 0.72 1799
weighted avg 0.91 0.90 0.91 1799
In [11]:
roc_value1 = roc_auc_score(test.output, test_pred1)

print(roc_value1)
0.731913853697138
Cross Validation
In [12]:
seed = 1
scoring = 'accuracy'
In [13]:
models = []
models.append(('CART', tree.DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=None)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold,
scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
CART: 0.910466 (0.010126)

RF: 0.939664 (0.010440)

In [18]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,

'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print (random_grid)
{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max
_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 8
0, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_lea
f': [1, 2, 4], 'bootstrap': [True, False]}

In [15]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_ite
r = 10, cv = 5, verbose=2, random_state=42)
rf_random.fit(X_train, Y_train)
rf_random.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits

[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_featu
res=sqrt, max_depth=50, bootstrap=True
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent wo

rkers.
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat

ures=sqrt, max_depth=50, bootstrap=True, total= 12.9s
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 12.8s remaining:

0.0s


res=sqrt, max_depth=90, bootstrap=False
ures=sqrt, max_depth=90, bootstrap=False, total= 1.0min
[CV] n_estimators=300, min_samples_split=2, min_samples_leaf=2, max_featur
es=auto, max_depth=60, bootstrap=False
res=auto, max_depth=60, bootstrap=False, total=49.4min
res=auto, max_depth=60, bootstrap=False, total= 8.3min
res=auto, max_depth=60, bootstrap=False, total= 9.5min
es=sqrt, max_depth=30, bootstrap=True
res=sqrt, max_depth=30, bootstrap=True, total= 27.8s

res=auto, max_depth=80, bootstrap=False
ures=auto, max_depth=80, bootstrap=False, total=12.0min
ures=sqrt, max_depth=60, bootstrap=False, total= 9.8s

es=auto, max_depth=100, bootstrap=True
res=auto, max_depth=100, bootstrap=True, total=11.2min
res=auto, max_depth=100, bootstrap=True, total= 9.5min
res=auto, max_depth=50, bootstrap=True
ures=auto, max_depth=50, bootstrap=True, total=15.8min

[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 476.0min finished
Out[15]:
{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 60,
'bootstrap': False}
In [16]:
rf_random.best_params_
Out[16]:
{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 60,
'bootstrap': False}
comparamos
In [22]:
clf2 = RandomForestClassifier(n_estimators=200, max_features="sqrt", max_depth=60, min_

samples_split=10, min_samples_leaf=1, bootstrap= False)
clf2 = clf.fit(X_train, Y_train)

In [23]:
test_pred2=clf2.predict(X_test)
print("\nAcierto:", metrics.accuracy_score(test.output, test_pred2))
print(metrics.classification_report(test.output, test_pred2))
Acierto: 0.9382990550305725
0.0 0.95 0.99 0.97 1642

1.0 0.78 0.41 0.54 157
accuracy 0.94 1799

macro avg 0.86 0.70 0.75 1799
weighted avg 0.93 0.94 0.93 1799
In [ ]:

Random Forest: Implementaciones de Scikit-Learn Sobre QSAR

Uploaded by

Copyright:

Available Formats

You might also like

Random Forest: Implementaciones de Scikit-Learn Sobre QSAR

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Random Forest: Implementaciones de Scikit-Learn Sobre QSAR

Uploaded by

Copyright:

Available Formats

13/4/2020 Scikit-Learn

Implementaciones de Scikit-learn sobre QSAR

x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1015 x1016 x1017 x1018 x1019 x1020

5 rows × 1025 columns

from sklearn import preprocessing, model_selection

x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x1016 x1017 x1018 x1019 x1020 x1021

5 rows × 1026 columns

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 1/11

train, test = model_selection.train_test_split(dataset, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier

X_test = test.iloc[:, 0:1024].values

from sklearn import metrics

0.0 0.95 0.99 0.97 1642

accuracy 0.94 1799

from sklearn.metrics import roc_auc_score

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 2/11

from sklearn import tree

print("\nAcierto:", metrics.accuracy_score(test.output, test_pred1))

0.0 0.95 0.94 0.95 1642

accuracy 0.90 1799

roc_value1 = roc_auc_score(test.output, test_pred1)

CART: 0.910466 (0.010126)

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 3/11

from sklearn.model_selection import RandomizedSearchCV

random_grid = {'n_estimators': n_estimators,

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 4/11

from sklearn.ensemble import RandomForestRegressor

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 5/11

Fitting 5 folds for each of 10 candidates, totalling 50 fits

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent wo

[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat

[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 12.8s remaining:

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 6/11

[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_feat

res=sqrt, max_depth=30, bootstrap=True, total= 28.6s

[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_featu

res=auto, max_depth=50, bootstrap=True

[Parallel(n_jobs=1)]: Done 50 out of 50 | elapsed: 476.0min finished

clf2 = RandomForestClassifier(n_estimators=200, max_features="sqrt", max_depth=60, min_

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 10/11

0.0 0.95 0.99 0.97 1642

accuracy 0.94 1799

localhost:8888/nbconvert/html/Entrega2/ Scikit-Learn.ipynb?download=false 11/11

You might also like