# Professional Documents
# Culture Documents
# Assignment DMW
# Assignment DMW
# Assignment DMW
# Theory Assignment
# 12132006
# Saarim Salim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import ranksums
from google.colab import drive
# Mount Google Drive at /content/drive so the files under `floc` can be read.
drive.mount('/content/drive')

# Base location of the numbered CSV files (1.csv, 2.csv, ...).
# NOTE(review): the path component is named "data.zip" but is used as a
# directory prefix -- confirm this is really a folder on the drive.
floc = '/content/drive/MyDrive/data.zip/'

# --- pandas view of the first file: summary statistics and correlations ---
fname = floc+'1.csv'
# Column names are supplied explicitly (20 features F1..F20 plus the class
# column CLS), so pandas reads the first row of the file as data.
# The 'F13' literal was previously split across two lines, which is an
# unterminated string; the list is now wrapped only between elements.
data = pd.read_csv(
    fname,
    names=['F1','F2','F3','F4','F5','F6','F7','F8','F9','F10',
           'F11','F12','F13','F14','F15','F16','F17','F18','F19','F20',
           'CLS'])
# In a notebook cell these bare expressions display their result; when run as
# a plain script they are evaluated and discarded.
data.describe()
data.corr()
sns.heatmap(data.corr())

# --- NumPy view of the same file: heatmap of absolute correlations ---
fname = floc+'1.csv'
data = np.genfromtxt(fname, delimiter=',')
# rowvar=False: each column is a variable, each row an observation.
r = np.corrcoef(data, rowvar=False)
sns.heatmap(abs(r))
def chivalue(f,cls,p,fn):
    """Plot and save the 95% confidence interval of a feature's mean per class.

    Despite the name, no chi-square statistic is computed. For the rows with
    class label 1 and the rows with class label 0, the interval
    ``mean +/- 1.96 * std / sqrt(n)`` is calculated and the two intervals are
    drawn side by side with ``plt.boxplot``.

    Args:
        f: 1-D array of feature values.
        cls: 1-D array of class labels aligned with ``f``; only rows equal to
            1 or 0 are used.
        p: Identifier used as the first part of the output file name
            (the caller passes the dataset number).
        fn: Identifier used as the second part of the output file name
            (the caller passes the feature index).

    Side effects:
        Saves the figure to ``floc + 'fig/' + '<p>_<fn>.png'`` (the folder is
        created if it does not exist) and closes the current figure.
    """
    import os  # local import: only needed to make sure the output folder exists

    def _ci95(values):
        # Normal-approximation bounds: mean -/+ 1.96 standard errors.
        centre = np.mean(values)
        half_width = 1.96*np.std(values)/math.sqrt(len(values))
        return centre-half_width, centre+half_width

    in1 = np.where(cls==1)
    in2 = np.where(cls==0)
    f1 = f[in1[0]]
    f0 = f[in2[0]]

    # Row 0 holds the lower bounds, row 1 the upper bounds;
    # column 0 is class 1, column 1 is class 0.
    civ = np.zeros((2,2))
    civ[0][0], civ[1][0] = _ci95(f1)
    civ[0][1], civ[1][1] = _ci95(f0)

    # boxplot draws one box per column of civ, labelled in column order.
    # NOTE(review): column 0 (class 1) is labelled 'No Job' and column 1
    # (class 0) is labelled 'Job' -- confirm this matches the label encoding.
    plt.boxplot(civ, labels=['No Job', 'Job'])
    plt.grid(True)
    plt.xlabel("Features")
    plt.ylabel("Value")

    # savefig does not create missing directories, so make sure it is there.
    fig_dir = floc+'fig/'
    os.makedirs(fig_dir, exist_ok=True)
    fna = fig_dir+str(p)+'_'+str(fn)+'.png'
    plt.savefig(fna)
    plt.close()
# For each of the 51 numbered CSV files, save one confidence-interval plot per
# feature column (columns 0..19); the last column is passed as the class label.
for i in range(1, 52):
    fname = f"{floc}{i}.csv"
    data = np.genfromtxt(fname, delimiter=',')
    class_column = data[:, -1]
    for j in range(20):
        feature_column = data[:, j]
        chivalue(feature_column, class_column, i, j)
def wilrank(data):
    """Return Wilcoxon rank-sum p-values of class 1 vs class 0 for each feature.

    Args:
        data: 2-D array whose last column is the class label (1 or 0) and
            whose preceding columns are features.

    Returns:
        1-D array with one entry per column of ``data``. Entry ``i`` of a
        feature column is the p-value of ``scipy.stats.ranksums`` between the
        class-1 rows and the class-0 rows of that column. The final entry,
        the slot of the class column, is left at 0, so a ``pv <= threshold``
        filter on the result also keeps the class column.
    """
    labels = data[:, -1]
    rows_pos = np.where(labels == 1)[0]
    rows_neg = np.where(labels == 0)[0]

    # Size everything from the data instead of assuming 20 features.
    n_cols = data.shape[1]
    pv = np.zeros(n_cols)
    for i in range(n_cols - 1):
        pv[i] = ranksums(data[rows_pos, i], data[rows_neg, i]).pvalue
    return pv
# Wilcoxon filter: for files 1..51 keep only the columns whose wilrank p-value
# is <= 0.05 and write the result to files 52..102.
# The class column is not appended explicitly here: wilrank leaves the last
# slot of its result at 0, so that column passes the threshold as well.
for i in range(1, 52):
    fname = f"{floc}{i}.csv"
    data = np.genfromtxt(fname, delimiter=',')
    significant = wilrank(data) <= 0.05
    (selected,) = np.where(significant)
    datan = data[:, selected]
    fname = f"{floc}{51 + i}.csv"
    np.savetxt(fname, datan, delimiter=',', fmt='%f')
# Variance filter: for files 1..51 keep the five feature columns with the
# largest variance, append the class column, and write files 103..153.
for i in range(1,52):
    fname=floc+str(i)+'.csv'
    data=np.genfromtxt(fname,delimiter=',')
    # Rank the feature columns only. Ranking every column (class label
    # included) could select the label itself as one of the five "features",
    # after which it would be appended a second time below.
    varf=np.var(data[:,:-1],axis=0)
    # Negate so that argsort's ascending order yields largest variance first.
    in1=np.argsort(varf*-1)
    datan=data[:,in1[0:5]]
    datan=np.concatenate((datan,data[:,-1].reshape(-1,1)), axis=1)
    fname=floc+str(51*2+i)+'.csv'
    np.savetxt(fname,datan,delimiter=',', fmt='%f')
def cosscorr(data):
    """Flag the features that survive a pairwise-correlation filter.

    Features are scanned in column order. Each feature that is still kept
    eliminates every later feature whose absolute Pearson correlation with it
    is at least 0.7.

    Args:
        data: 2-D array whose last column is the class label; all preceding
            columns are features.

    Returns:
        1-D float array with one entry per feature: 1 for a kept feature,
        0 for a feature dropped as redundant.
    """
    features = data[:, 0:-1]
    # Derive the feature count from the data instead of assuming 20.
    n_features = features.shape[1]
    imp = np.ones(n_features)
    if n_features < 2:
        # Nothing to compare; also avoids corrcoef returning a scalar for a
        # single variable.
        return imp

    r = np.abs(np.corrcoef(features, rowvar=False))
    for i in range(n_features - 1):
        if imp[i] == 1:
            # Offsets (relative to i+1) of later features too close to i.
            in1 = np.where(r[i, i+1:] >= 0.7)
            imp[i+1+in1[0]] = 0
    return imp
# Correlation filter: for files 1..51 keep the feature columns that cosscorr
# flags with 1, append the class column, and write files 154..204.
for i in range(1, 52):
    fname = f"{floc}{i}.csv"
    data = np.genfromtxt(fname, delimiter=',')
    keep_flags = cosscorr(data)
    (kept_columns,) = np.where(keep_flags == 1)
    class_column = data[:, -1].reshape(-1, 1)
    datan = np.concatenate((data[:, kept_columns], class_column), axis=1)
    fname = f"{floc}{51 * 3 + i}.csv"
    np.savetxt(fname, datan, delimiter=',', fmt='%f')