
filename = "pima-indians-diabetes.data.

csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
data.plot(kind='density', subplots=True, figsize=(15,17),
layout=(3,3),sharex=False, sharey=False)
plt.show()

import pandas as pd              # for handling datasets
import statsmodels.api as sm     # for statistical modeling
import pylab as pl               # for plotting
import numpy as np               # for numerical computation
pd.set_option('display.max_rows', None)

filename = "train_university_admission_data.csv"
names = ['admit', 'gre', 'gpa', 'prestige']
dfTrain = pd.read_csv(filename, names=names)
dfTest = pd.read_csv(filename, names=names)   # in practice this would point to a separate test file

# the same load using an absolute path (a raw string keeps the backslashes from being treated as escapes)
filename = r"D:\AIML-L2\Dataset\train_university_admission_data.csv"
dfTrain = pd.read_csv(filename, names=names)
dfTest = pd.read_csv(filename, names=names)

Sample rows of the admissions data:

Serial No.  GRE Score  TOEFL Score  University Rating  SOP  LOR  CGPA  Research  Chance of Admit
1           337        118          4                  4.5  4.5  9.65  1         0.92
2           324        107          4                  4.0  4.5  8.87  1         0.76
3           316        104          3                  3.0  3.5  8.00  1         0.72
4           322        110          3                  3.5  2.5  8.67  1         0.80

filename = "train_university_admission_data.csv"
names = ['admit','gre','gpa','prestige']
dfTrain = pd.read_csv(filename, names=names)
#dfTest = pd.read_csv(filename, names=names)

import pandas as pd              # for handling datasets
import statsmodels.api as sm     # for statistical modeling
import pylab as pl               # for plotting
import numpy as np               # for numerical computation
pd.set_option('display.max_rows', None)

filename = "train_university_admission_data.csv"
names = ['admit','gre','gpa','prestige']
dfTrain = pd.read_csv(filename, names=names)
#dfTest = pd.read_csv(filename, names=names)
dfTrain.groupby('prestige').size()

dfTrain.groupby('admit').mean()

https://wipro365.sharepoint.com/sites/ku-practice-4044/ANALYTICS-COMPETENCY/Recordings/Forms/AllItems.aspx?id=%2Fsites%2Fku%2Dpractice%2D4044%2FANALYTICS%2DCOMPETENCY%2FRecordings%2FNon%20Academy%2FAIML%2DPython%20for%20DS%5FL1L2%2DGuru%2DMay20&viewid=aa63c8dc%2D8568%2D4868%2Daad3%2Def4ed0b40a5f

dfTrain.groupby('prestige').mean()[['gre','gpa']]

print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))

We can also use dfTrain.groupby(['admit','prestige']).size() to get the same counts, as shown in the sketch below.
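A minimal sketch, assuming dfTrain is loaded as above: chaining unstack() onto the groupby result reshapes it into the same admit-by-prestige table that pd.crosstab produces.

# counts per (admit, prestige) pair, reshaped so the prestige values become columns
counts = dfTrain.groupby(['admit', 'prestige']).size().unstack(fill_value=0)
print(counts)   # same table as pd.crosstab(dfTrain['admit'], dfTrain['prestige'])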

pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']).unstack()

pd.crosstab(dfTrain['admit'], dfTrain['prestige'],
rownames=['Admit']).plot(kind='bar')

pd.crosstab(dfTrain['admit'],dfTrain['prestige'],rownames=['Admit'])

prestige  best  good  ok  veryGood
Admit
0           28    58  45        64
1           11    25  21        47


pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']).unstack()

To add up the totals we can use:

dfTrain.groupby('admit').sum()

Can you explain this command: pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']).unstack()?
from Sanjay to everyone:
pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit'],
margins=True)

prestige  best  good  ok  veryGood  All
admit
0           20    73  47        68  208
1           25    19   9        39   92
All         45    92  56       107  300
from Prashant Kumar to everyone:
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']).sum())

unstack() function?
from Shivansh Chaudhri to everyone:
dfTrain[dfTrain['admit']==0].sum()
from Gururajan Narasimhan to everyone:
pd.crosstab(dfTrain['admit'], dfTrain['prestige'],
rownames=['Admit']).plot(kind='bar')

print(pd.crosstab(dfTrain['prestige'], dfTrain['admit'], rownames=['prestige']))
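
To answer the earlier question about unstack(): crosstab returns a table with admit as the row index and prestige as the columns; unstack() pivots those columns into the index, giving one long Series keyed by (prestige, admit) pairs. A minimal sketch, assuming dfTrain is loaded as above:

ct = pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit'])
print(ct)             # 2 x 4 table: rows = admit (0/1), columns = prestige
print(ct.unstack())   # one Series, indexed by (prestige, admit) pairs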

doing normalization:
----------------------
from numpy import set_printoptions
from sklearn.preprocessing import Normalizer

array = data.values              # the pima-indians-diabetes data loaded earlier
X = array[:, 0:8]                # first eight columns are the input features
Y = array[:, 8]                  # last column is the class label
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(normalizedX[0:7, :])
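
Normalizer works row-wise: each sample is rescaled so its L2 norm is 1 (unlike StandardScaler and MinMaxScaler, which work column-wise). A quick check, assuming normalizedX from the snippet above:

import numpy as np
# every row of the normalized data should have (approximately) unit length
print(np.linalg.norm(normalizedX[0:7, :], axis=1))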

doing standardization:
----------------------
from numpy import set_printoptions
from sklearn.preprocessing import StandardScaler

array = data.values
X = array[:, 0:8]
Y = array[:, 8]
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[1:6, :])
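
Standardization rescales each column to zero mean and unit variance (a z-score). A minimal check against one column, assuming X and rescaledX from the snippet above:

# z-score of the first feature by hand; StandardScaler uses the population std (ddof=0)
col = X[:, 0]
z = (col - col.mean()) / col.std()
print(z[1:6])            # should match rescaledX[1:6, 0]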

Here, set_printoptions(precision=3) makes NumPy display floating-point numbers to three decimal places.
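
A short illustration of the effect:

import numpy as np
np.set_printoptions(precision=3)
print(np.array([0.123456, 9.8765432]))   # -> [0.123 9.877]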

from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

array = data.values
X = array[:, 0:8]
Y = array[:, 8]
scaler = MinMaxScaler(feature_range=(10, 30))
rescaledX = scaler.fit_transform(X)
# summarize transformed data
set_printoptions(precision=4)
print(rescaledX[0:5, :])
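
MinMaxScaler maps each column linearly onto the requested range, here (10, 30): scaled = (x - min) / (max - min) * (30 - 10) + 10. A quick check on the first feature, assuming X and rescaledX from the snippet above:

# rescale the first column by hand and compare with MinMaxScaler's output
col = X[:, 0]
manual = (col - col.min()) / (col.max() - col.min()) * (30 - 10) + 10
print(manual[0:5])       # should match rescaledX[0:5, 0]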

Pearson is used for normally distributed data -- the most commonly used one.
Kendall is used for ranked (ordinal) data.
Spearman is used for ranked as well as normally distributed data.
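
A minimal sketch of computing all three with pandas (assuming dfTrain has numeric columns such as gre and gpa):

# pairwise correlations between the numeric columns, one method at a time
for method in ('pearson', 'kendall', 'spearman'):
    print(method)
    print(dfTrain[['gre', 'gpa']].corr(method=method))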

What is the difference between a correlation and a similarity measure?

Correlation measures how two variables change together; a similarity measure quantifies how alike two values (or vectors) are.
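
A small illustration of the distinction, comparing Pearson correlation with cosine similarity on two hypothetical vectors:

import numpy as np

a = np.array([1.0, 2.0, 3.0, 4.0])
b = a + 10.0                     # same trend, shifted by a constant

# correlation: do a and b move together around their means?
corr = np.corrcoef(a, b)[0, 1]   # 1.0 -- the constant shift does not matter
# cosine similarity: how closely do the two vectors point in the same direction?
cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))   # about 0.95 -- the shift does matter
print(corr, cos)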
