Professional Documents
Culture Documents
Project Advanced Statistics UMESHHASIJA SEP2021 Jupyter File
Project Advanced Statistics UMESHHASIJA SEP2021 Jupyter File
In [130…
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
%matplotlib inline
plt.show()
In [131…
df=pd.read_csv("SalaryData.csv")
In [132…
df.head()
In [133…
df
In [134…
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
In [135… df.describe().T
In [136…
# Store Education as a categorical column.
df["Education"] = df["Education"].astype("category")
In [137…
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
In [138…
# Store Occupation as a categorical column.
df["Occupation"] = df["Occupation"].astype("category")
In [139…
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
In [140…
# One-way ANOVA of Salary across Education levels.
formula = 'Salary ~ C(Education)'
# anova_lm operates on a fitted model, so fit the OLS model for this formula first
# (previously `model` was never defined in this cell).
model = ols(formula, data=df).fit()
aov_table = anova_lm(model)
print(aov_table)
In [141…
# One-way ANOVA of Salary across Occupation levels.
formula2 = 'Salary ~ C(Occupation)'
# anova_lm operates on a fitted model, so fit the OLS model for this formula first
# (previously `model2` was never defined in this cell).
model2 = ols(formula2, data=df).fit()
aov_table2 = anova_lm(model2)
print(aov_table2)
In [142…
sns.pointplot(x='Education', y='Salary', data=df)
In [143…
sns.pointplot(x='Occupation', y='Salary', data=df)
In [144…
from scipy.stats import f_oneway
In [145…
# Since the p-value for Education was low, there is a significant difference between its groups;
# hence performing the test for the Education treatment only.
In [146…
import statsmodels.formula.api as smf
In [147…
# Tukey HSD pairwise comparison of Salary between Education groups (alpha = 0.05).
salary_values = df['Salary']
education_groups = df['Education']
tukey = pairwise_tukeyhsd(salary_values, education_groups, alpha=0.05)
In [148… print(tukey)
=========================================================================
-------------------------------------------------------------------------
-------------------------------------------------------------------------
In [149…
# Tukey HSD pairwise comparison of Salary between Occupation groups (alpha = 0.05).
salary_values = df['Salary']
occupation_groups = df['Occupation']
tukey2 = pairwise_tukeyhsd(salary_values, occupation_groups, alpha=0.05)
In [150…
print(tukey2)
====================================================================================
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------
In [151…
# Two-way ANOVA of Salary on Education and Occupation, including their interaction term.
formula3 = 'Salary ~ C(Education) + C(Occupation) + C(Education):C(Occupation)'
# anova_lm operates on a fitted model, so fit the OLS model for this formula first
# (previously `model3` was never defined in this cell).
model3 = ols(formula3, data=df).fit()
aov_table3 = anova_lm(model3)
print(aov_table3)
df sum_sq mean_sq F \
PR(>F)
C(Education) 5.466264e-12
C(Occupation) 7.211580e-02
C(Education):C(Occupation) 2.232500e-05
Residual NaN
In [152…
from matplotlib.pyplot import figure
plt.figure(figsize=(8, 6))
sns.pointplot(x="Occupation",y="Salary",data=df,hue="Education",ci=None);
In [153…
df1=pd.read_csv("Education+-+Post+12th+Standard.csv")
In [154…
df1.head()
Out[154… Names Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.B
Abilene
0 Christian 1660 1232 721 23 52 2885 537 7440
University
Adelphi
1 2186 1924 512 16 29 2683 1227 12280
University
Adrian
2 1428 1097 336 22 50 1036 99 11250
College
Agnes
3 Scott 417 349 137 60 89 510 63 12960
College
Alaska
4 Pacific 193 146 55 16 44 249 869 7560
University
In [155…
df1.shape
In [156…
df1.info()
<class 'pandas.core.frame.DataFrame'>
In [157…
df1.describe().T
# Boxplot and histogram (with KDE) of Apps, side by side.
# Create the figure here so the cell does not rely on an `axes` object left over
# from some other cell (it was previously undefined at this point).
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(15,5),squeeze=False)
sns.boxplot(x='Apps',data=df1,ax=axes[0][0])
sns.histplot(df1["Apps"],kde=True,ax=axes[0][1])
In [159…
# Boxplot (left) and histogram with KDE (right) of Accept.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Accept', ax=box_ax)
sns.histplot(data=df1, x='Accept', kde=True, ax=hist_ax)
In [160…
# Boxplot (left) and histogram with KDE (right) of Enroll.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Enroll', ax=box_ax)
sns.histplot(data=df1, x='Enroll', kde=True, ax=hist_ax)
In [161…
# One row per feature: boxplot on the left, histogram with KDE on the right.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 5), squeeze=False)
for row, column in enumerate(['Top10perc', 'Top25perc']):
    sns.boxplot(data=df1, x=column, ax=axes[row][0])
    sns.histplot(data=df1, x=column, kde=True, ax=axes[row][1])
In [162…
# One row per feature: boxplot on the left, histogram with KDE on the right.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 5), squeeze=False)
for row, column in enumerate(['P.Undergrad', 'F.Undergrad']):
    sns.boxplot(data=df1, x=column, ax=axes[row][0])
    sns.histplot(data=df1, x=column, kde=True, ax=axes[row][1])
In [163…
# Boxplot (left) and histogram with KDE (right) of Outstate.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Outstate', ax=box_ax)
sns.histplot(data=df1, x='Outstate', kde=True, ax=hist_ax)
In [164…
# Boxplot (left) and histogram with KDE (right) of Room.Board.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Room.Board', ax=box_ax)
sns.histplot(data=df1, x='Room.Board', kde=True, ax=hist_ax)
# Boxplot and histogram (with KDE) of Books, side by side.
# Start a new figure: without it these plots are drawn onto the axes of the
# preceding Room.Board figure and overlay that feature's plots.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(15,5),squeeze=False)
sns.boxplot(x='Books',data=df1,ax=axes[0][0])
sns.histplot(df1["Books"],kde=True,ax=axes[0][1])
In [166…
# Boxplot (left) and histogram with KDE (right) of Personal.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Personal', ax=box_ax)
sns.histplot(data=df1, x='Personal', kde=True, ax=hist_ax)
In [167…
# Boxplot (left) and histogram with KDE (right) of PhD.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='PhD', ax=box_ax)
sns.histplot(data=df1, x='PhD', kde=True, ax=hist_ax)
In [168…
# Boxplot (left) and histogram with KDE (right) of Terminal.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Terminal', ax=box_ax)
sns.histplot(data=df1, x='Terminal', kde=True, ax=hist_ax)
In [169…
# Boxplot (left) and histogram with KDE (right) of S.F.Ratio.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='S.F.Ratio', ax=box_ax)
sns.histplot(data=df1, x='S.F.Ratio', kde=True, ax=hist_ax)
# Boxplot and histogram (with KDE) of perc.alumni, side by side.
# Start a new figure: without it these plots are drawn onto the axes of the
# preceding S.F.Ratio figure and overlay that feature's plots.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(15,5),squeeze=False)
sns.boxplot(x='perc.alumni',data=df1,ax=axes[0][0])
sns.histplot(df1['perc.alumni'],kde=True,ax=axes[0][1])
In [171…
# Boxplot (left) and histogram with KDE (right) of Expend.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Expend', ax=box_ax)
sns.histplot(data=df1, x='Expend', kde=True, ax=hist_ax)
In [172…
# Boxplot (left) and histogram with KDE (right) of Grad.Rate.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), squeeze=False)
box_ax, hist_ax = axes[0]
sns.boxplot(data=df1, x='Grad.Rate', ax=box_ax)
sns.histplot(data=df1, x='Grad.Rate', kde=True, ax=hist_ax)
In [173…
df1["Grad.Rate"].unique()
Out[173… array([ 60, 56, 54, 59, 15, 55, 63, 73, 80, 52, 76, 74, 68,
69, 100, 46, 34, 48, 70, 65, 88, 58, 71, 85, 79, 91,
72, 84, 49, 82, 35, 51, 75, 53, 96, 67, 18, 33, 97,
89, 93, 78, 83, 61, 81, 64, 62, 118, 24, 66, 47, 50,
21, 87, 77, 43, 95, 37, 99, 45, 42, 98, 94, 38, 86,
44, 22, 57, 29, 36, 39, 40, 26, 90, 92, 32, 27, 41,
In [174…
# Annotated correlation heatmap of the numeric features of df1.
# df1 still contains the string column "Names" here; select the numeric columns
# explicitly, because DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently dropping them.
plt.figure(figsize=(15,8))
sns.heatmap(df1.select_dtypes(include='number').corr(),annot=True,fmt='.2f');
In [175…
# Drop the string/object column, i.e. the college name
df1_pca = df1.drop(columns="Names")
df1_pca.head()
Out[175… Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Boo
In [176…
from scipy.stats import zscore
In [177…
df1_scaled=df1_pca.apply(zscore)
df1_scaled.head()
Out[177… Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.B
In [178…
df1_scaled.describe().T
In [179…
corrMatrix = df1_scaled.corr()
corrMatrix
In [180…
# Covariance matrix of the numeric columns of df1 (unscaled data).
# The string column "Names" is excluded explicitly: DataFrame.cov() raises on
# non-numeric columns in pandas >= 2.0 instead of silently dropping them.
covarMatrix = df1.select_dtypes(include='number').cov()
covarMatrix
In [181…
# Grid of boxplots, one per column of df1_pca (before scaling).
plt.figure(figsize = (12,8))
feature_list = df1_pca.columns
for i, feature in enumerate(feature_list):
    # 4 x 5 grid; subplot indices are 1-based.
    plt.subplot(4, 5, i + 1)
    # Draw the boxplot itself; previously each subplot only received a title.
    sns.boxplot(y = df1_pca[feature])
    plt.title('Boxplot of {}'.format(feature))
# Adjust spacing once, after all subplots exist.
plt.tight_layout()
In [182…
# Grid of boxplots, one per column of df1_scaled (after z-score scaling).
plt.figure(figsize = (12,8))
feature_list = df1_scaled.columns
for i, feature in enumerate(feature_list):
    # 4 x 5 grid; subplot indices are 1-based.
    plt.subplot(4, 5, i + 1)
    # Draw the boxplot itself; previously each subplot only received a title.
    sns.boxplot(y = df1_scaled[feature])
    plt.title('Boxplot of {}'.format(feature))
# Adjust spacing once, after all subplots exist.
plt.tight_layout()
In [183…
from sklearn.decomposition import PCA
# Instantiate the estimator before fitting (previously `pca` was never defined).
# Keep one component per scaled feature so the full variance spectrum is available
# to the eigenvalue / scree-plot cells; the seed matches the later PCA(n_components=6) cell.
pca = PCA(n_components=df1_scaled.shape[1], random_state=123)
pca_transformed = pca.fit_transform(df1_scaled)
In [184…
#Extract eigen vectors
pca.components_
3.18908750e-01, 2.52315654e-01],
-1.31689865e-01, -1.69240532e-01],
2.26743985e-01, -2.08064649e-01],
7.92734946e-02, 2.69129066e-01],
7.59581203e-02, -1.09267913e-01],
-2.98118619e-01, 2.16163313e-01],
-2.26584481e-01, 5.59943937e-01],
-5.41593771e-02, -5.33553891e-03],
-4.91388809e-02, 4.19043052e-02],
1.32286331e-01, -5.90271067e-01],
6.92088870e-01, 2.19839000e-01],
3.25982295e-01, 1.22106697e-01],
-9.37464497e-02, -6.91969778e-02],
7.31225166e-02, 3.64767385e-02],
-2.27742017e-01, -3.39433604e-03],
-4.38803230e-02, -5.00844705e-03],
-3.53098218e-02, -1.30710024e-02]])
In [185…
#Check the eigen values
pca.explained_variance_
0.03672545, 0.02302787])
In [186…
#Check the explained variance for each PC
#Note: Explained variance = (eigen value of each PC)/(sum of eigen values of all PCs)
pca.explained_variance_ratio_
0.00215754, 0.00135284])
In [187…
#Create a dataframe containing the loadings or coefficients of all PCs
# Rows are the original (scaled) features, columns are the principal components
# named PC1, PC2, ... so later cells can select them by label (e.g. 'PC1'..'PC6').
df1_pca_loadings = pd.DataFrame(pca.components_.T,
                                columns=['PC{}'.format(k) for k in range(1, pca.components_.shape[0] + 1)],
                                index=df1_scaled.columns)
In [188…
df1_pca_loadings
Apps 0.248766 0.331598 -0.063092 0.281311 0.005741 -0.016237 -0.042486 -0.103090 -0.09
Accept 0.207602 0.372117 -0.101249 0.267817 0.055786 0.007535 -0.012950 -0.056271 -0.17
Enroll 0.176304 0.403724 -0.082986 0.161827 -0.055694 -0.042558 -0.027693 0.058662 -0.12
Top10perc 0.354274 -0.082412 0.035056 -0.051547 -0.395434 -0.052693 -0.161332 -0.122678 0.34
Top25perc 0.344001 -0.044779 -0.024148 -0.109767 -0.426534 0.033092 -0.118486 -0.102492 0.40
F.Undergrad 0.154641 0.417674 -0.061393 0.100412 -0.043454 -0.043454 -0.025076 0.078890 -0.05
P.Undergrad 0.026443 0.315088 0.139682 -0.158558 0.302385 -0.191199 0.061042 0.570784 0.56
Outstate 0.294736 -0.249644 0.046599 0.131291 0.222532 -0.030000 0.108529 0.009846 -0.00
Room.Board 0.249030 -0.137809 0.148967 0.184996 0.560919 0.162755 0.209744 -0.221453 0.27
Books 0.064758 0.056342 0.677412 0.087089 -0.127289 0.641055 -0.149692 0.213293 -0.13
Personal -0.042529 0.219929 0.499721 -0.230711 -0.222311 -0.331398 0.633790 -0.232661 -0.09
PhD 0.318313 0.058311 -0.127028 -0.534725 0.140166 0.091256 -0.001096 -0.077040 -0.18
Terminal 0.317056 0.046429 -0.066038 -0.519443 0.204720 0.154928 -0.028477 -0.012161 -0.25
S.F.Ratio -0.176958 0.246665 -0.289848 -0.161189 -0.079388 0.487046 0.219259 -0.083605 0.27
perc.alumni 0.205082 -0.246595 -0.146989 0.017314 -0.216297 -0.047340 0.243321 0.678524 -0.25
Expend 0.318909 -0.131690 0.226744 0.079273 0.075958 -0.298119 -0.226584 -0.054159 -0.04
Grad.Rate 0.252316 -0.169241 -0.208065 0.269129 -0.109268 0.216163 0.559944 -0.005336 0.04
In [189…
#Create a scree plot
plt.figure(figsize=(8,5))
# Derive the x positions from the fitted PCA instead of hard-coding 17 components,
# so the plot stays valid if the number of features/components changes.
n_pcs = len(pca.explained_variance_ratio_)
sns.lineplot(y=pca.explained_variance_ratio_ ,x=range(1, n_pcs + 1),marker='o')
plt.xlabel('Number of Components',fontsize=10)
plt.ylabel('Variance Explained',fontsize=10)
plt.title('Scree Plot',fontsize=12)
plt.grid()
plt.show()
In [190…
#Check the cumulative explained variance ratio to find a cut-off for selecting the number of components
np.cumsum(pca.explained_variance_ratio_)
0.99864716, 1. ])
In [191…
df1_selected = df1_pca_loadings[['PC1','PC2', 'PC3', 'PC4', 'PC5','PC6']]
df1_selected
In [192…
pca2 = PCA(n_components=6, random_state=123)
pca_final = pca2.fit_transform(df1_scaled)
In [193…
pca_final
...,
In [194…
pca_final_df1 = pd.DataFrame(pca_final, columns = df1_selected.columns)
pca_final_df1
In [195…
plt.figure(figsize = (10,8))
sns.heatmap(pca_final_df1.corr(), annot=True,fmt='.2f');
In [207…
for i in range (0,df1_scaled.shape[1]):
In [208…
#Compare how the original features influence various PCs
plt.figure(figsize = (12,8))
In [ ]: