Professional Documents
Culture Documents
Import: Sys - Executable - M Pip Install
Import: Sys - Executable - M Pip Install
Import: Sys - Executable - M Pip Install
In [1]: # If you want to install any missing packages, then uncomment the lines given
below and run them
# to ensure that you have all the dependencies you need to run the notebook.
import sys
!{sys.executable} -m pip install 'missingno'
Collecting missingno
Downloading missingno-0.4.2-py3-none-any.whl (9.7 kB)
Collecting seaborn
Downloading seaborn-0.11.0-py3-none-any.whl (283 kB)
|████████████████████████████████| 283 kB 4.4 MB/s eta 0:00:01
Requirement already satisfied: scipy in /srv/conda/envs/notebook/lib/python3.
6/site-packages (from missingno) (1.5.2)
Requirement already satisfied: numpy in /srv/conda/envs/notebook/lib/python3.
6/site-packages (from missingno) (1.19.1)
Requirement already satisfied: matplotlib in /srv/conda/envs/notebook/lib/pyt
hon3.6/site-packages (from missingno) (3.3.2)
Requirement already satisfied: pandas>=0.23 in /srv/conda/envs/notebook/lib/p
ython3.6/site-packages (from seaborn->missingno) (1.1.2)
Requirement already satisfied: python-dateutil>=2.1 in /srv/conda/envs/notebo
ok/lib/python3.6/site-packages (from matplotlib->missingno) (2.8.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /srv/conda/envs/notebook/
lib/python3.6/site-packages (from matplotlib->missingno) (1.2.0)
Requirement already satisfied: pillow>=6.2.0 in /srv/conda/envs/notebook/lib/
python3.6/site-packages (from matplotlib->missingno) (7.2.0)
Requirement already satisfied: certifi>=2020.06.20 in /srv/conda/envs/noteboo
k/lib/python3.6/site-packages (from matplotlib->missingno) (2020.6.20)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /s
rv/conda/envs/notebook/lib/python3.6/site-packages (from matplotlib->missingn
o) (2.4.7)
Requirement already satisfied: cycler>=0.10 in /srv/conda/envs/notebook/lib/p
ython3.6/site-packages/cycler-0.10.0-py3.6.egg (from matplotlib->missingno)
(0.10.0)
Requirement already satisfied: pytz>=2017.2 in /srv/conda/envs/notebook/lib/p
ython3.6/site-packages (from pandas>=0.23->seaborn->missingno) (2020.1)
Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/pytho
n3.6/site-packages (from python-dateutil>=2.1->matplotlib->missingno) (1.15.
0)
Installing collected packages: seaborn, missingno
Successfully installed missingno-0.4.2 seaborn-0.11.0
In [2]: # Libraries
import pandas as pd
import numpy as np
import missingno as mno
import seaborn as sns
import matplotlib.pyplot as plt
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 1/23
9/29/2020 L20-2055-Assignment1-Part2
Out[3]:
surface_area agricultural_land forest_area armed_forces_total urban_pop_major_cities urb
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 2/23
9/29/2020 L20-2055-Assignment1-Part2
In [6]: # Description
data.describe()
Out[6]:
surface_area agricultural_land forest_area armed_forces_total urban_pop_major_cities
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 3/23
9/29/2020 L20-2055-Assignment1-Part2
In [8]: data.head
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 4/23
9/29/2020 L20-2055-Assignment1-Part2
mobile_subscriptions internet_users \
0 less than 1 per person 0 per 1000 people
1 less than 1 per person 154 per 1000 people
2 more than 1 per person 90 per 100 people
3 more than 1 per person 76 per 100 people
4 more than 1 per person 350 per 1000 people
.. ... ...
357 more than 1 per person 90 per 100 people
358 less than 1 per person 84 per 100 people
359 more than 1 per person 58 per 100 people
360 more than 1 per person 44 per 100 people
361 unknown 45 per 100 people
secure_internet_servers_total improved_sanitation \
0 NaN high access
1 2.623624e+06 low access
2 1.656589e+09 no info
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 5/23
9/29/2020 L20-2055-Assignment1-Part2
women_parliament_seats_rate life_expectancy
0 [0%-25%) 69.494195
1 [0%-25%) 59.237366
2 unknown 81.300000
3 [25%-50%) 81.373197
4 [25%-50%) 73.193561
.. ... ...
357 [0%-25%) 80.956098
358 [0%-25%) 78.841463
359 [0%-25%) 76.836195
360 [0%-25%) 75.756488
361 unknown 79.624390
Out[9]: True
Out[10]: surface_area 0
agricultural_land 4
forest_area 5
armed_forces_total 44
urban_pop_major_cities 2
urban_pop_minor_cities 2
national_income 0
inflation_annual 216
inflation_monthly 206
inflation_weekly 342
mobile_subscriptions 0
internet_users 0
secure_internet_servers_total 10
improved_sanitation 0
women_parliament_seats_rate 0
life_expectancy 0
dtype: int64
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 6/23
9/29/2020 L20-2055-Assignment1-Part2
In [11]: # print(data['mobile_subscriptions'].dtype)
Out[11]: ['national_income',
'mobile_subscriptions',
'internet_users',
'improved_sanitation',
'women_parliament_seats_rate']
In [12]: data['women_parliament_seats_rate'].unique().tolist()
column: national_income
['unknown', 'very low', 'high', 'medium low', 'medium high', 'low', 'very hig
h']
[4, 6, 0, 3, 2, 1, 5]
column: mobile_subscriptions
['less than 1 per person', 'more than 1 per person', 'more than 2 per perso
n', 'unknown', 'more than 3 per person']
[0, 1, 2, 4, 3]
column: improved_sanitation
['high access', 'low access', 'no info', 'very high access', 'medium access',
'very low access']
[0, 1, 3, 4, 2, 5]
column: women_parliament_seats_rate
['[0%-25%)', 'unknown', '[25%-50%)', '[50%-75%)']
[0, 3, 1, 2]
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 7/23
9/29/2020 L20-2055-Assignment1-Part2
In [17]: train_data['internet_users'].head()
Out[17]: 0 0 per 1000 people
1 154 per 1000 people
2 90 per 100 people
3 76 per 100 people
4 350 per 1000 people
Name: internet_users, dtype: object
In [22]: train_data.isnull().sum()
Out[22]: surface_area 0
agricultural_land 4
forest_area 5
armed_forces_total 44
urban_pop_major_cities 2
urban_pop_minor_cities 2
national_income 0
inflation_annual 216
inflation_monthly 206
inflation_weekly 342
mobile_subscriptions 0
internet_users 0
secure_internet_servers_total 10
improved_sanitation 0
women_parliament_seats_rate 0
life_expectancy 0
dtype: int64
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 8/23
9/29/2020 L20-2055-Assignment1-Part2
for c in columns:
missing_vals = train_data[c].isnull().sum()
observed_vals = train_data.loc[train_data[c].notnull(), c]
train_data.loc[train_data[c].isnull(), c] = observed_vals.mean()
# df.loc[df[c].isnull(), c] = np.random.choice(observed_vals, missing_val
s, replace = True)
In [25]: train_data.isnull().sum()
Out[25]: surface_area 0
agricultural_land 0
forest_area 0
armed_forces_total 0
urban_pop_major_cities 0
urban_pop_minor_cities 0
national_income 0
inflation_annual 216
mobile_subscriptions 0
internet_users 0
secure_internet_servers_total 0
improved_sanitation 0
women_parliament_seats_rate 0
life_expectancy 0
dtype: int64
Out[27]: LinearRegression()
(216, 13)
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 9/23
9/29/2020 L20-2055-Assignment1-Part2
In [30]: train_data.isnull().sum()
Out[30]: surface_area 0
agricultural_land 0
forest_area 0
armed_forces_total 0
urban_pop_major_cities 0
urban_pop_minor_cities 0
national_income 0
inflation_annual 0
mobile_subscriptions 0
internet_users 0
secure_internet_servers_total 0
improved_sanitation 0
women_parliament_seats_rate 0
life_expectancy 0
dtype: int64
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 10/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 11/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 12/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 13/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 14/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 15/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 16/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 17/23
9/29/2020 L20-2055-Assignment1-Part2
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 18/23
9/29/2020 L20-2055-Assignment1-Part2
Out[32]: <AxesSubplot:>
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 19/23
9/29/2020 L20-2055-Assignment1-Part2
/srv/conda/envs/notebook/lib/python3.6/site-packages/seaborn/distributions.p
y:2551: FutureWarning: `distplot` is a deprecated function and will be remove
d in a future version. Please adapt your code to use either `displot` (a figu
re-level function with similar flexibility) or `histplot` (an axes-level func
tion for histograms).
warnings.warn(msg, FutureWarning)
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 20/23
9/29/2020 L20-2055-Assignment1-Part2
randomf = RandomForestRegressor(n_estimators=200)
randomf.fit(X_train, y_train)
Out[37]: RandomForestRegressor(n_estimators=200)
In [38]: # Measure mean absolute error for training and validation sets
print('Mean Absolute Error for Training Set:', mean_absolute_error(y_train, ra
ndomf.predict(X_train)))
print('Mean Absolute Error for Test Set:', mean_absolute_error(y_test, randomf
.predict(X_test)))
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 21/23
9/29/2020 L20-2055-Assignment1-Part2
importances = randomf.feature_importances_
indices = np.argsort(importances)
features = X.columns
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 22/23
9/29/2020 L20-2055-Assignment1-Part2
k = 5
score_param = 'neg_mean_absolute_error'
rf = RandomForestRegressor()
rf_grid = GridSearchCV(estimator=rf, param_grid=params, scoring=score_param, n
_jobs=-1, cv=k, verbose=True)
rf_grid.fit(X_train, y_train)
In [ ]: # get best model (trained on best set of params) from grid search cv
best_model = rf_grid.best_estimator_
# print mean absolute error for predictions taken from the best model
print('Mean Absolute Error for Training Set:', metrics.mean_absolute_error(y_t
rain, predict_train))
print('Mean Absolute Error for Validation Set:', metrics.mean_absolute_error(y
_test, predict_test))
In [ ]:
https://hub.gke2.mybinder.org/user/ipython-ipython-in-depth-3yryl251/nbconvert/html/L20-2055-Assignment1-Part2.ipynb?download=false 23/23