Assignment DMW

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 3

Data Mining

Theory Assignment
12132006
Saarim Salim

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import ranksums
from google.colab import drive

# Mount Google Drive so the CSV files under `floc` are reachable from Colab.
drive.mount('/content/drive')
floc = '/content/drive/MyDrive/data.zip/'
fname = floc+'1.csv'

# Load the first dataset with explicit column names: F1..F20 followed by the
# class column CLS.
data = pd.read_csv(
    fname,
    names=[
        'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
        'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
        'CLS',
    ],
)

# Summary statistics and pairwise correlations of the named columns.
# NOTE(review): these bare expressions only display output when each is the
# last statement of a notebook cell - presumably they were separate cells.
data.describe()

data.corr()

sns.heatmap(data.corr())

# Reload the same file as a plain NumPy array (no header names) and plot the
# absolute column-wise correlation matrix.
fname = floc+'1.csv'
data = np.genfromtxt(fname, delimiter=',')

r = np.corrcoef(data, rowvar=False)
sns.heatmap(abs(r))

def chivalue(f, cls, p, fn):
    """Plot the 95% confidence interval of a feature's mean for each class.

    The rows of ``f`` are split by ``cls`` (1 versus 0). For each group the
    interval ``mean +/- 1.96 * std / sqrt(n)`` is computed, and the two
    intervals are drawn side by side with ``plt.boxplot``. The figure is
    written to ``floc + 'fig/' + '<p>_<fn>.png'`` and then closed.

    Args:
        f: 1-D array of feature values.
        cls: 1-D array of class labels aligned with ``f``; values 1 and 0
            are the two groups compared.
        p: Identifier used as the first part of the output file name.
        fn: Identifier used as the second part of the output file name.
    """
    # Row 0 holds the lower bounds, row 1 the upper bounds; one column per
    # class, so boxplot draws one box per class.
    bounds = np.zeros((2, 2))
    for column, class_value in enumerate((1, 0)):
        sample = f[np.where(cls == class_value)[0]]
        centre = np.mean(sample)
        half_width = 1.96 * np.std(sample) / math.sqrt(len(sample))
        bounds[0][column] = centre - half_width
        bounds[1][column] = centre + half_width

    # NOTE(review): column 0 holds the cls == 1 rows but is labelled
    # 'No Job' - confirm the label order matches the class encoding.
    plt.boxplot(bounds, labels=['No Job', 'Job'])
    plt.grid(True)
    plt.xlabel("Features")
    plt.ylabel("Value")

    target = floc + 'fig/' + str(p) + '_' + str(fn) + '.png'
    plt.savefig(target)
    plt.close()

# For each of the 51 input files, save one confidence-interval figure per
# feature column (columns 0..19); the last column supplies the class labels.
for i in range(1, 52):
    fname = floc + str(i) + '.csv'
    data = np.genfromtxt(fname, delimiter=',')
    class_column = data[:, -1]
    for j in range(20):
        chivalue(data[:, j], class_column, i, j)

def wilrank(data):
    """Return Wilcoxon rank-sum p-values for every feature column.

    The last column of ``data`` is taken as the class label. For each other
    column, the values from rows labelled 1 are compared against the values
    from rows labelled 0 with ``scipy.stats.ranksums``.

    The number of features is taken from ``data.shape[1]`` rather than being
    fixed at 20, so inputs of any width work; 21-column input gives the same
    result as before.

    Args:
        data: 2-D array whose last column holds the class labels (1 / 0).

    Returns:
        1-D array with one entry per column of ``data``. Entry ``i`` is the
        p-value of feature ``i``. The final entry, which corresponds to the
        class column, is left at 0, so a ``pv <= threshold`` filter always
        keeps the class column.
    """
    labels = data[:, -1]
    group_one = data[labels == 1]
    group_zero = data[labels == 0]

    n_columns = data.shape[1]
    pv = np.zeros(n_columns)
    # Only the feature columns are tested; the class column keeps its 0.
    for col in range(n_columns - 1):
        pv[col] = ranksums(group_one[:, col], group_zero[:, col]).pvalue
    return pv

# Write a Wilcoxon-filtered copy of every input file as files 52..102:
# only columns whose entry in the wilrank() result is <= 0.05 are kept.
# wilrank() leaves the class column's entry at 0, so that column is kept too.
for i in range(1, 52):
    data = np.genfromtxt(floc + str(i) + '.csv', delimiter=',')
    significant = np.where(wilrank(data) <= 0.05)[0]
    fname = floc + str(51 + i) + '.csv'
    np.savetxt(fname, data[:, significant], delimiter=',', fmt='%f')
# Write a variance-filtered copy of every input file as files 103..153:
# the five highest-variance feature columns followed by the class column.
for i in range(1, 52):
    fname = floc + str(i) + '.csv'
    data = np.genfromtxt(fname, delimiter=',')
    # Rank the feature columns only. Including the class column (the last
    # one) in the ranking could select the label as a "feature" and then
    # append it a second time below.
    varf = np.var(data[:, :-1], axis=0)
    # Negate so that argsort yields descending variance.
    in1 = np.argsort(varf * -1)
    datan = data[:, in1[0:5]]
    datan = np.concatenate((datan, data[:, -1].reshape(-1, 1)), axis=1)
    fname = floc + str(51 * 2 + i) + '.csv'
    np.savetxt(fname, datan, delimiter=',', fmt='%f')

def cosscorr(data):
    """Flag features to keep after removing highly correlated ones.

    The last column of ``data`` is treated as the class label and ignored.
    Features are scanned left to right: every feature still marked as kept
    drops all later features whose absolute Pearson correlation with it is
    at least 0.7. A feature that has already been dropped does not drop
    others.

    The number of features is taken from ``data`` rather than being fixed at
    20, so inputs of any width work; 21-column input gives the same result
    as before.

    Args:
        data: 2-D array; all columns but the last are features.

    Returns:
        1-D float array with one entry per feature: 1 to keep it, 0 to
        drop it.
    """
    features = data[:, 0:-1]
    n_features = features.shape[1]
    imp = np.ones(n_features)
    # With fewer than two features there is nothing to compare, and
    # np.corrcoef would not return a 2-D matrix.
    if n_features < 2:
        return imp

    r = np.abs(np.corrcoef(features, rowvar=False))
    for i in range(n_features - 1):
        if imp[i] == 1:
            # Offsets are relative to column i + 1 because of the slice.
            correlated = np.where(r[i, i + 1:] >= 0.7)[0]
            imp[i + 1 + correlated] = 0
    return imp

# Write a correlation-filtered copy of every input file as files 154..204:
# the features that cosscorr() marks with 1, followed by the class column.
for i in range(1, 52):
    data = np.genfromtxt(floc + str(i) + '.csv', delimiter=',')
    kept = np.where(cosscorr(data) == 1)[0]
    class_column = data[:, -1].reshape(-1, 1)
    datan = np.concatenate((data[:, kept], class_column), axis=1)
    fname = floc + str(51 * 3 + i) + '.csv'
    np.savetxt(fname, datan, delimiter=',', fmt='%f')

You might also like