Download as pdf or txt
Download as pdf or txt
You are on page 1of 17

Middle_town_school EDA and Multiclass Classification

In [57]:

#Importing libraries
import pandas as pd
import numpy as np

import math

import seaborn as sns


import plotly
from matplotlib import pyplot as plt

import scipy.stats as stats

from sklearn.preprocessing import LabelEncoder


from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest


from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score


from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_orig= pd.read_csv("middle_tn_schools.csv")
data= data_orig.copy()
data.head()
Out[2]:

name school_rating size reduced_lunch state_percentile_16 state_percentile_15 stu_teach_ratio school_type avg_score

Allendale
0 Elementary 5.0 851.0 10.0 90.2 95.8 15.7 Public
School

Anderson
1 2.0 412.0 71.0 32.8 37.3 12.8 Public
Elementary

Avoca
2 4.0 482.0 43.0 78.4 83.6 16.6 Public
Elementary

Bailey Public
3 0.0 394.0 91.0 1.6 1.0 13.1
Middle Magnet

Barfield
4 4.0 948.0 26.0 85.3 89.2 14.8 Public
Elementary

In [3]:

#Checking for nan values


#Checking for nan values
sns.heatmap(data.isnull())
Out[3]:
<AxesSubplot:>

In [4]:
data.fillna(method='ffill',inplace=True)

In [5]:
data.shape
Out[5]:

(347, 15)

In [6]:
data.columns
Out[6]:
Index(['name', 'school_rating', 'size', 'reduced_lunch', 'state_percentile_16',
'state_percentile_15', 'stu_teach_ratio', 'school_type', 'avg_score_15',
'avg_score_16', 'full_time_teachers', 'percent_black', 'percent_white',
'percent_asian', 'percent_hispanic'],
dtype='object')

In [7]:
## Data Analysis
#1. Univariate Analysis

In [8]:
sns.set(rc = {'figure.figsize':(7,4)})
sns.set_style('darkgrid')

In [9]:
sns.countplot(data['school_rating'])
plt.show()
-The rating got by most of the schools is 4.

-The rating got by least of the schools is 1.

-Around 47.26% of schools has got ratings between 4 and 5.

-Around 12.39% of schools has got rating 0.

In [10]:
data.describe()
Out[10]:

school_rating size reduced_lunch state_percentile_16 state_percentile_15 stu_teach_ratio avg_score_15 avg_score

count 347.000000 347.000000 347.000000 347.000000 347.000000 347.000000 347.000000 347.0000

mean 2.968300 699.472622 50.279539 58.801729 58.113545 15.461671 56.932277 57.0498

std 1.690377 400.598636 25.480236 32.540747 32.633912 5.725170 26.606800 27.9689

min 0.000000 53.000000 2.000000 0.200000 0.600000 4.700000 1.500000 0.1000

25% 2.000000 420.500000 30.000000 30.950000 27.450000 13.700000 37.650000 37.0000

50% 3.000000 595.000000 51.000000 66.400000 65.200000 15.000000 61.400000 60.7000

75% 4.000000 851.000000 71.500000 88.000000 88.500000 16.700000 79.500000 80.2500

max 5.000000 2314.000000 98.000000 99.800000 99.800000 111.000000 99.000000 98.9000

In [11]:
df= data[['avg_score_15','avg_score_16']]
sns.kdeplot(data=df)
skew_15= stats.skew(df['avg_score_15'])
skew_16= stats.skew(data['avg_score_16'])
print(f'The skewness of avg_score_15 is{skew_15} and of avg_score_16 is{skew_16}')

The skewness of avg_score_15 is-0.41573558302013613 and of avg_score_16 is-0.404255894126


4576

-The distubutions of avg_score_15 and avg_score_16 is nearly to normal. So we could say that the score are
equally distibuted.
In [12]:
total=np.sum([data['percent_black'],data['percent_asian'],data['percent_hispanic'],data['
percent_white']],axis=0)
total_100= [100]
for i in range(346):
total_100.append(100)

In [13]:
data['Remaining_Students']= np.subtract(np.array(total_100),total)

In [14]:
data['Remaining_Students'].describe()
Out[14]:
count 347.000000
mean 3.321326
std 2.847219
min -0.100000
25% 1.300000
50% 2.600000
75% 4.250000
max 14.800000
Name: Remaining_Students, dtype: float64

-Approximately there are at average of 3% students who are not from blacks,asian,hispanic and white. So most
probably they would be from Japan, egypt etc.

-Liberty Elementary school has the highest students which do not belong to any of the above group.

-Moses McKissack Middle school has a little issue in their data as they are showing negitive value for remaining
students.

In [15]:
data.drop([197],axis=0, inplace=True)

In [16]:
df2= data[['percent_black','percent_white','percent_asian','percent_hispanic','Remaining_
Students']]
sns.kdeplot(data=df2)
Out[16]:
<AxesSubplot:ylabel='Density'>

-There are 40 education institutes in which the majority students are Africans. Names: ['Bailey Middle', 'Bellshire
Elementary', 'Buena Vista Elementary', 'Caldwell Elementary', 'Carter-Lawrence Elementary', 'Cumberland
Elementary', 'East End Preparatory School', 'Fall-Hamilton Elementary', 'Glenn Elementary', 'Gra-Mar Middle',
'Hattie Cotton Elementary', 'Haynes Middle', 'Head Middle', 'Hillsboro High', 'Hull-Jackson Elementary', 'Hunters
'Hattie Cotton Elementary', 'Haynes Middle', 'Head Middle', 'Hillsboro High', 'Hull-Jackson Elementary', 'Hunters
Lane High', 'Inglewood Elementary', 'Isaiah T. Creswell Middle', 'Jere Baxter Middle', 'John Early Middle', 'Jones
Elementary', 'K I P P Academy Nashville', 'Kirkpatrick Elementary', 'Liberty Collegiate Academy', 'Maplewood
High', 'Middle College High', 'Moses McKissack Middle', 'Murrell School', 'Napier Elementary', 'Nashville Big
Picture High School', 'Nashville Prep', 'Park Avenue Elementary', 'Pearl-Cohn High', 'Robert Churchwell
Elementary', 'Robert E. Lilliard Elementary', 'Rose Park Middle', 'Rosebank Elementary', 'Stratford High', 'Tom
Joy Elementary', 'Warner Elementary']

-There are no education institutes in which Asians has majority.

-There are 7 education institutes in which Hispanics has majority. Name:['Glencliff Elementary', 'Glengarry
Elementary', 'Haywood Elementary', 'John B. Whitsitt Elementary', 'Paragon Mills Elementary', 'STEM
Preparatory Academy', 'Tusculum Elementary']

-Apart from this the other 237 schools has majority of whites.

In [17]:
# Lets have a threshold value of 20 ie. 20 students over 1 teacher.
threshold= 20
data[data['stu_teach_ratio'] > threshold]
Out[17]:

name school_rating size reduced_lunch state_percentile_16 state_percentile_15 stu_teach_ratio school_type avg_sc

East End
Public
63 Preparatory 4.0 383.0 87.0 81.9 78.1 22.5
Charter
School

John
144 Overton 1.0 1894.0 67.0 17.5 10.7 20.5 Public
High

Liberty Bell
169 Middle 5.0 1141.0 46.0 95.5 92.6 22.3 Public
School

Martin
Luther King Public
181 5.0 1224.0 37.0 98.9 97.7 21.1
Junior Magnet
School

Middle
190 College 5.0 122.0 19.0 94.3 96.7 20.3 Public
High

Mt. Juliet
201 Middle 5.0 1547.0 23.0 93.9 90.0 22.4 Public
School

Spring Hill
278 Middle 4.0 790.0 38.0 86.4 80.5 20.7 Public
School

Tennessee
Online Public
296 5.0 222.0 27.0 90.8 62.5 111.0
Public Virtual
School

The Middle
College @
298 Austin Peay 5.0 124.0 20.0 98.3 99.7 31.0 Public
State
University

-There are 9 education institutes where student teacher ratio is more than the threshold value.

-Among them, Tennessee Online Public School is the school where student_teacher ratio is 111. It means that 1
teacher over 111 students.

In [18]:
sns.countplot(data['school_type'])
Out[18]:
<AxesSubplot:xlabel='school_type', ylabel='count'>

-There are no public virtual schools.

In [19]:
#Bivariate Analysis

In [20]:
#-Do stu_teach_ration and ratings has any relation?
ttest,p_value= stats.ttest_ind(data['school_rating'],data['stu_teach_ratio'])
if p_value > 0.05:
print('School_Rating and Stu_teach_ratio have no relation')
else:
print('-School_Rating and Stu_teach_ratio do have relation.')

-School_Rating and Stu_teach_ratio do have relation.

In [21]:
#-Do school size and ratings has any relation?
ttest,p_value= stats.ttest_ind(data['size'],data['school_rating'])
if p_value > 0.05:
print('School_Rating and Size have no relation')
else:
print('-School_Rating and Size do have relation.')

-School_Rating and Size do have relation.

In [22]:
#-Do avg_score_15 and ratings has any relation?
ttest,p_value= stats.ttest_ind(data['avg_score_15'],data['school_rating'])
if p_value > 0.05:
print('School_Rating and avg_score_15 have no relation')
else:
print('-School_Rating and avg_score_15 do have relation.')

-School_Rating and avg_score_15 do have relation.

In [23]:
#-Do average_score_16 and ratings has any relation?
ttest,p_value= stats.ttest_ind(data['avg_score_16'],data['school_rating'])
if p_value > 0.05:
print('School_Rating and avg_score_16 have no relation')
else:
print('-School_Rating and avg_score_16 do have relation.')

-School_Rating and avg_score_16 do have relation.


In [24]:
#Multivariate Analysis
sns.pairplot(data)
Out[24]:
<seaborn.axisgrid.PairGrid at 0x2409c24c430>

In [25]:
sns.set(rc = {'figure.figsize':(12,12)})
sns.heatmap(data.corr(),annot=data.corr())
Out[25]:
<AxesSubplot:>
In [26]:
#Preprocesssing
data['school_rating']=data['school_rating'].astype(str)

In [27]:

data.dtypes
Out[27]:
name object
school_rating object
size float64
reduced_lunch float64
state_percentile_16 float64
state_percentile_15 float64
stu_teach_ratio float64
school_type object
avg_score_15 float64
avg_score_16 float64
full_time_teachers float64
percent_black float64
percent_white float64
percent_asian float64
percent_hispanic float64
Remaining_Students float64
dtype: object

In [28]:
data.drop(['name'],axis=1,inplace=True)

In [29]:
# Utility functions
def lab_enc(dataframe,col_name):
lab= LabelEncoder()
dataframe[f'{col_name}']= lab.fit_transform(dataframe[[f'{col_name}']])
def std_sca(dataframe,col_name):
std= StandardScaler()
dataframe[f'{col_name}']= std.fit_transform(dataframe[[f'{col_name}']])
def out_rem(dataframe,col_name):
ul= dataframe[f'{col_name}'].mean() + 2*dataframe[f'{col_name}'].std()
ll= dataframe[f'{col_name}'].mean() - 2*dataframe[f'{col_name}'].std()
ul_index= dataframe[dataframe[f'{col_name}'] > ul].index
ll_index= dataframe[dataframe[f'{col_name}'] < ll].index
req_inde= np.append(ul_index,ll_index)
dataframe.drop(req_inde,axis=0,inplace=True)
def judge_model(model):
mod= model()
mod.fit(X_train,y_train)
prediction_train= mod.predict(X_train)
train_accuracy= accuracy_score(y_train,prediction_train)
prediction_test= mod.predict(X_test)
test_accuracy= accuracy_score(y_test,prediction_test)
if train_accuracy - test_accuracy > 10 :
print(f'The model is overfitting as the training accuracy is {train_accuracy} whi
le the test accuracy is {test_accuracy}')
elif train_accuracy - test_accuracy == 0 :
print(f'The model is niether overfitting nor underfitting as the training accurac
y is {train_accuracy} while the test accuracy is {test_accuracy}')
else:
print(f'The model is gud to go as the training accuracy is {train_accuracy} while
the test accuracy is {test_accuracy}')

best_values=[]
def parameters_optimizer(model,parameters):
gsc_model= GridSearchCV(model(),param_grid= parameters,cv=5)
gsc_model.fit(X_train,y_train)
best_values.append(gsc_model.best_score_)
print(f'The best score of the {model} model is {gsc_model.best_score_} with best para
meters such as {gsc_model.best_params_}.')

In [30]:
#Encoding school_type feature
lab_enc(data,'school_type')

In [31]:
# Feature Selection
X= data.drop(['school_rating'],axis=1)
Y= data[['school_rating']]
skb= SelectKBest(chi2,k=len(X.columns)).fit_transform(X,Y)

In [32]:
df_new= pd.DataFrame(columns=['Columns','Chi_val'])
df_new['Columns']= X.columns
df_new['Chi_val']= skb[0]
df_new.head()
Out[32]:

Columns Chi_val

0 size 851.0

1 reduced_lunch 10.0

2 state_percentile_16 90.2

3 state_percentile_15 95.8
3 state_percentile_15 95.8
Columns Chi_val
4 stu_teach_ratio 15.7

In [33]:
df_new.sort_values('Chi_val',ascending=False)
req_out=df_new[df_new['Chi_val'] > 9]
req_out_cols= req_out.Columns.values

In [34]:
X_=data[req_out_cols]
Y_=Y

In [35]:
updated_data= pd.concat([X_,Y_],axis=1)

In [36]:
updated_data.head()
Out[36]:

size reduced_lunch state_percentile_16 state_percentile_15 stu_teach_ratio avg_score_15 avg_score_16 full_time_teachers

0 851.0 10.0 90.2 95.8 15.7 89.4 85.2 54.0

1 412.0 71.0 32.8 37.3 12.8 43.0 38.3 32.0

2 482.0 43.0 78.4 83.6 16.6 75.7 73.0 29.0

3 394.0 91.0 1.6 1.0 13.1 2.1 4.4 30.0

4 948.0 26.0 85.3 89.2 14.8 81.3 79.6 64.0

In [37]:
sns.heatmap(X.corr(),annot=data.corr())
Out[37]:
<AxesSubplot:>
-Size and Full time teachers have high degree of correlation. (0.97)

-Avg_score_15 and Avg_score_16 have high degree of correlation. (0.94)

-state_percentile_15 and state_percentile_16 have high degree of correlation. (0.93)

-State_percentile_16 and avg_score_15 have high degree of correlation. (0.93)

-State_percentile_15 and avg_score_16 have high degree of correlation. (0.99)

Conclusions-

-Higher the full time teachers higher the size.

-Higher the average score of 15, higher the average score of 15.

-Higher the state_percentile_15, higher the state_percentie_16.

-Higher the state_percentile , higher the average score.

-As state_percentile_15, state_percentile_16, avg_score_15 and avg_score_16 have high degree of correlation,
So only one feature can be used among them.

-As Size and full time have high degree of correlation , so we can use one feature in place of them.

In [38]:

updated_data.drop(['full_time_teachers','state_percentile_16','state_percentile_15','avg_
score_15'], axis=1,inplace=True)

In [39]:
updated_data.sample()

Out[39]:

size reduced_lunch stu_teach_ratio avg_score_16 percent_white school_rating

245 394.0 74.0 13.1 6.3 51.3 0.0

In [40]:
cols_final= updated_data.columns
for i in range(len(cols_final)):
typ= str(updated_data[f'{cols_final[i]}'].dtype)
if typ == 'object':
pass
else:
out_rem(updated_data,cols_final[i])
std_sca(updated_data,cols_final[i])
In [41]:

updated_data.sample()
Out[41]:

size reduced_lunch stu_teach_ratio avg_score_16 percent_white school_rating

272 -0.910943 0.505586 -0.07846 0.205073 0.151864 3.0

In [42]:
#Plotting new distibutions
sns.set(rc = {'figure.figsize':(6,6)})
for i in range(len(cols_final)):
print(cols_final[i])
plt.figure()
sns.histplot(data[f'{cols_final[i]}'], kde=True)
plt.show()

size

reduced_lunch

stu_teach_ratio
stu_teach_ratio

avg_score_16

percent_white
school_rating

In [43]:

#Applying train test and split


X_train, X_test, y_train, y_test =train_test_split(updated_data[['size','reduced_lunch',
'stu_teach_ratio','avg_score_16','percent_white']], updated_data[['school_rating']], trai
n_size=0.7)

Model building
In [44]:

#LogisticRegression
judge_model(LogisticRegression)

The model is gud to go as the training accuracy is 0.7824074074074074 while the test accu
racy is 0.7634408602150538

In [45]:
#SVC
judge_model(SVC)

The model is gud to go as the training accuracy is 0.8379629629629629 while the test accu
racy is 0.7741935483870968

In [46]:

#DecisionTreeClassifier
judge_model(DecisionTreeClassifier)

The model is gud to go as the training accuracy is 1.0 while the test accuracy is 0.946236
559139785

In [47]:

#RandomForestClassifier
judge_model(RandomForestClassifier)

The model is gud to go as the training accuracy is 1.0 while the test accuracy is 0.946236
559139785

In [48]:
#AdAdaBoostClassifier
#AdAdaBoostClassifier
judge_model(AdaBoostClassifier)

The model is gud to go as the training accuracy is 0.6574074074074074 while the test accu
racy is 0.5698924731182796

In [49]:

#Naive_Bayes
judge_model(GaussianNB)

The model is gud to go as the training accuracy is 0.8888888888888888 while the test accu
racy is 0.9032258064516129

Optimization
In [50]:
#LogisticRegression
lr_grid= {
'penalty':['l2','none','l1','elasticnet'],
'C':[5,10,20,40,70,100],
'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
parameters_optimizer(LogisticRegression,lr_grid)

The best score of the <class 'sklearn.linear_model._logistic.LogisticRegression'> model i


s 0.8935517970401691 with best parameters such as {'C': 5, 'penalty': 'none', 'solver': '
newton-cg'}.

In [51]:

#SVC
svc_grid= {
'C':[5,10,20,40,70,100],
'kernel':['linear', 'poly', 'rbf']
}
parameters_optimizer(SVC,svc_grid)

The best score of the <class 'sklearn.svm._classes.SVC'> model is 0.8752642706131077 with


best parameters such as {'C': 20, 'kernel': 'linear'}.

In [52]:

#DecisionTreeClassifier
dtc_grid={
'criterion':['gini','entropy'],
'max_depth':[1,2,3,4,5,6,7,8,9,10]
}
parameters_optimizer(DecisionTreeClassifier,dtc_grid)

The best score of the <class 'sklearn.tree._classes.DecisionTreeClassifier'> model is 0.9


306553911205073 with best parameters such as {'criterion': 'entropy', 'max_depth': 3}.

In [53]:

#RandomForestClassifier
rfc_grid={
'n_estimators':[100,150,200,300,400],
'criterion':['gini','entropy'],
'max_depth':[1,2,3,4,5,6,7,8,9,10]
}
parameters_optimizer(RandomForestClassifier,rfc_grid)

The best score of the <class 'sklearn.ensemble._forest.RandomForestClassifier'> model is


0.9168076109936575 with best parameters such as {'criterion': 'entropy', 'max_depth': 8,
'n_estimators': 100}.

In [54]:

#AdaBoostClassifier
#AdaBoostClassifier
abc_grid={
'n_estimators':[100,150,200,250,300],
'learning_rate':[2,3,4,5,6,7],
'algorithm':['SAMME', 'SAMME.R']
}
parameters_optimizer(AdaBoostClassifier,abc_grid)

The best score of the <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'> mode


l is 0.9028541226215644 with best parameters such as {'algorithm': 'SAMME.R', 'learning_r
ate': 2, 'n_estimators': 300}.

In [55]:
#Naive_Bayes
nb_grid={

}
parameters_optimizer(GaussianNB,nb_grid)

The best score of the <class 'sklearn.naive_bayes.GaussianNB'> model is 0.856871035940803


4 with best parameters such as {}.

In [69]:
#Plotting accuracies
sns.set(rc = {'figure.figsize':(8,8)})
model_used=['LogisticRegression','SVC','DecisionTreeClassifier','RandomForestClassifier',
'AdaBoostClassifier','Naive_Bayes']
best_values=[89, 87, 93, 91, 90,85]
plt.barh(model_used, best_values,color=["blue","blue","red","blue","blue","blue"])
plt.title('Best-Scores')
plt.xlabel("Accuracy")
plt.ylabel("Models")
plt.show()

DECISION TREE CLASSIFIER IS THE BEST CLASSIFIER AMONG ALL.

In [ ]:
In [ ]:

You might also like