Code

import pandas as pd
import numpy as np
import math
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
# Path template for every CSV this script reads and writes: "{}" is filled with
# a file number for the inputs, or with "coeff" / "infogain" for the outputs.
# NOTE(review): the literal contains no path separators after the drive letter,
# which looks like backslashes were lost somewhere - confirm the intended path.
filePath = "E:assignmentdata{}.csv"
# Number of shuffled train/test rounds logreg() runs per input file; also the
# divisor that sizes the held-out test slice (len(df) / folds rows).
folds = 5
# Number of gradient-descent steps taken in each round.
iterations = 1000
# Gradient-descent step size (learning rate).
lr = 0.0005
# Module-level accumulator: logreg() appends one result row per processed file
# and main() writes the collected rows out as a CSV.
coeff = list()
def main():
    """Run the information-gain pass, then fit a logistic model per input file.

    Input files are numbered 1 through 56.  Every ``logreg`` call appends one
    row to the module-level ``coeff`` list; once all files are processed the
    rows are printed and written to the "coeff" CSV without index or header.
    """
    infogainexercise()
    for file_no in range(1, 57):
        logreg(file_no)
    print(coeff)
    print("length of final ", len(coeff))
    dfout = pd.DataFrame(coeff)
    print(dfout)
    # ``header`` is documented as a bool or a list of names; False is the
    # explicit way to suppress the header row.
    dfout.to_csv(filePath.format("coeff"), index=False, header=False)
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of *z*, element-wise.

    The value is computed from ``exp(-|z|)`` so the exponential never receives
    a positive argument and cannot overflow, however large ``|z|`` is.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise a float ndarray with
        the same shape as *z*.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is 1 / (1 + e^-z); for z < 0 the algebraically equal
    # e^z / (1 + e^z) is used so the exponent stays non-positive.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves real
    # arrays untouched.
    return result[()]
def loss_fn(h, y):
    """Return the mean binary cross-entropy between *h* and the labels *y*.

    Args:
        h: Predicted probabilities, each in [0, 1].
        y: True labels (0 or 1), the same length as *h*.

    Returns:
        The mean of ``-y*log(h) - (1-y)*log(1-h)`` over all samples.
    """
    # A probability of exactly 0 or 1 would make a log term -inf, and the
    # 0 * -inf product NaN; keep h a hair inside the open interval instead.
    eps = 1e-15
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def predict(X, theta, threshold=0.5):
    """Return hard 0/1 class predictions for the rows of *X*.

    Args:
        X: Feature matrix (rows are samples) including the intercept column.
        theta: Weight vector with one entry per column of *X*.
        threshold: Probability at or above which a row is labelled 1.

    Returns:
        A list of Python ints (0 or 1), one per row of *X*.
    """
    prob = sigmoid(np.dot(X, theta))
    # Vectorised comparison, then back to a plain list of ints
    return (np.asarray(prob) >= threshold).astype(int).tolist()
def logreg(k):
    """Fit a logistic-regression model on input file *k* and record the best round.

    The file is read without a header; columns 0-19 are used as features and
    column 20, thresholded at ``> 0``, as the binary label.  For each of
    ``folds`` rounds the rows are reshuffled, the first ``len(df) // folds``
    rows are held out as the test set, and the model is trained on the rest
    with ``iterations`` steps of batch gradient descent at learning rate
    ``lr``.  Note that every round draws a fresh shuffle, so the held-out
    sets of different rounds may overlap.

    The round with the highest test F1 score is kept (falling back to the
    highest accuracy when every F1 is 0).  Its weights - stored reversed, so
    the intercept comes first - followed by its F1 score and accuracy are
    appended as one row to the module-level ``coeff`` list.

    Args:
        k: Value substituted into the ``filePath`` template to pick the file.

    Returns:
        The module-level ``coeff`` list, including the newly appended row.
    """
    df = pd.read_csv(filePath.format(k), header=None)
    sample = math.floor(len(df) / folds)
    weights = []
    loss = []
    accuracy = []
    fscore = []
    print("processing file {}".format(k), ">>>>>>>>>>>>>")
    for _ in range(folds):
        shuffledata = shuffle(df)
        test = shuffledata[:sample]
        train = shuffledata[sample:]

        # .copy() so that adding the intercept column does not write into a
        # slice of the shuffled frame (chained-assignment warning).
        train_X = train.iloc[:, 0:20].copy()
        train_X["intercept"] = 1
        train_Y = train[20].gt(0).astype(int)
        test_X = test.iloc[:, 0:20].copy()
        test_X["intercept"] = 1
        test_Y = test[20].gt(0).astype(int)

        # Convert to arrays once instead of on every gradient step
        X = train_X.to_numpy(dtype=float)
        y = train_Y.to_numpy()

        theta = np.zeros(X.shape[1])
        for _step in range(iterations):
            h = sigmoid(np.dot(X, theta))
            gradient = np.dot(X.T, h - y) / y.size
            theta -= lr * gradient

        # Final training loss for this round.  The previous check
        # ``i % iterations == 0`` was only true on the very first step, so it
        # recorded the loss of a barely-trained model.
        loss.append(loss_fn(sigmoid(np.dot(X, theta)), y))

        # Reversed so the intercept weight comes first
        weights.append(list(reversed(theta)))

        # Evaluate on the held-out rows
        y_hat = predict(test_X, theta)
        accuracy.append((np.asarray(y_hat) == test_Y.to_numpy()).mean())
        fscore.append(f1_score(test_Y, y_hat))

    print("weights", weights)
    print("accuracy", accuracy)
    print("f1 score", fscore)

    # Pick the best round by F1; if every F1 is 0, fall back to accuracy
    if max(fscore) != 0:
        maxpos = fscore.index(max(fscore))
    else:
        maxpos = accuracy.index(max(accuracy))
    print("max value::", maxpos, fscore[maxpos], accuracy[maxpos])

    # Result row: the round's weights followed by its F1 score and accuracy
    x1 = weights[maxpos] + [fscore[maxpos], accuracy[maxpos]]
    coeff.append(x1)
    return coeff
def infogainexercise():
    """Compute the information gain of every feature in every input file.

    For each file numbered 1 through 56, columns 0-19 are treated as features
    and column 20, thresholded at ``> 0``, as the binary label.  Each feature
    is binarised at its own mean (1 when strictly above the mean, else 0) and
    scored with ``infogain_fn``.  The result - one row per file, one column
    per feature - is printed and written to the "infogain" CSV without index
    or header.
    """
    infogainfinal = []
    for i in range(1, 57):
        print("file processed {}".format(i))
        df = pd.read_csv(filePath.format(i), header=None)
        features = df.loc[:, :19]
        labels = df[20].gt(0).astype(int).to_numpy()
        infogain = []
        for col in features.columns:
            # Binarise into a fresh Series rather than writing a scratch
            # column into ``features``, which is a slice of ``df``.
            above_mean = features[col].gt(features[col].mean()).astype(int)
            df1 = pd.DataFrame({"x": above_mean.to_numpy(), "y": labels})
            infogain.append(infogain_fn(df1))
        infogainfinal.append(infogain)
    dfout = pd.DataFrame(infogainfinal)
    print(dfout)
    # ``header`` is documented as a bool or a list of names; False is the
    # explicit way to suppress the header row.
    dfout.to_csv(filePath.format("infogain"), index=False, header=False)
def _entropy(counts):
    """Return the Shannon entropy, in bits, of a sequence of class counts."""
    total = sum(counts)
    if total == 0:
        return 0.0
    # Empty classes contribute nothing (0 * log 0 is taken as 0)
    return -sum((c / total) * math.log2(c / total) for c in counts if c)


def infogain_fn(df):
    """Return the information gain of binary feature ``x`` about binary label ``y``.

    Information gain is the entropy of ``y`` minus the entropy of ``y`` within
    each value of ``x``, weighted by how often that value occurs.  Only rows
    where both ``x`` and ``y`` are 0 or 1 are counted.

    Args:
        df: DataFrame with a column ``"x"`` (feature) and a column ``"y"``
            (label), both holding 0/1 values.

    Returns:
        The information gain in bits as a float; 0.0 when no rows are counted.
    """
    x = df["x"]
    y = df["y"]

    # 2x2 contingency table: c<x><y>
    c00 = int(((x == 0) & (y == 0)).sum())
    c01 = int(((x == 0) & (y == 1)).sum())
    c10 = int(((x == 1) & (y == 0)).sum())
    c11 = int(((x == 1) & (y == 1)).sum())

    t0 = c00 + c01
    t1 = c10 + c11
    tot = t0 + t1
    if tot == 0:
        # Nothing to split, and avoids dividing by zero below
        return 0.0

    # Entropy of the label after splitting on x, weighted by branch size
    entchild = (t0 / tot) * _entropy((c00, c01)) + (t1 / tot) * _entropy((c10, c11))
    # Entropy of the label before the split
    eparent = _entropy((c00 + c10, c01 + c11))
    return eparent - entchild
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Code

Uploaded by

Copyright:

Available Formats

You might also like

Code

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Code

Uploaded by

Copyright:

Available Formats

import pandas as pd

from sklearn.utils import shuffle

from sklearn.metrics import confusion_matrix

from sklearn.metrics import f1_score

for i in range(1, 57):

print("length of final ", len(coeff))

dfout.to_csv(filePath.format("coeff"), index=False, header=None)

return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

def predict(X, theta):

prob = sigmoid(np.dot(X, theta))

return [x for x in map(lambda x: 1 if x else 0, prob >= threshold)]

sample = math.floor(len(df) / folds)

print("processing file {}".format(k), ">>>>>>>>>>>>>")

for f in range(1, folds + 1):

train_X =train.iloc[:, 0:20]

# add intercept to train data

test_X =test.iloc[:, 0:20]

# add intercept to test data

gradient = np.dot(train_X.T, (h -train_Y)) / train_Y.size

weights.append([x for x in reversed(theta)])

# predictions on test data

y_hat = predict(test_X, theta)

# accuracy on test data

print("f1 score", fscore)

# pick best weights based on max f1 measure

# if max f1 is 0, then pick max accuracy

print("max value::", maxpos, fscore[maxpos], accuracy[maxpos])

for i in range(1, 57):

print("file processed {}".format(i))

features = df.loc[:, :19]

# print([x for x in df.columns])

for col in features.columns:

df1 = pd.DataFrame(list(zip(features["new"], labels)), columns=["x",

# print("total rows:", tot)

dfout.to_csv(filePath.format("infogain"), index=False, header=None)

# calcuate entropy of child

c00 = df[(df["x"] == 0) & (df["y"] == 0)].count()[0]

c01 = df[(df["x"] == 0) & (df["y"] == 1)].count()[0]

c10 = df[(df["x"] == 1) & (df["y"] == 0)].count()[0]

c11 = df[(df["x"] == 1) & (df["y"] == 1)].count()[0]

# print(c10, c11, c00, c01)

ig00 = -1 * (c00 / t0) * math.log2(c00 / t0)

ig01 = -1 * (c01 / t0) * math.log2(c01 / t0)

ig1 = -1 * (c10 / t1) * math.log2(c10 / t1)

ig11 = -1 * (c11 / t1) * math.log2(c11 / t1)

ig0 = ig00 + ig01

ig1 = ig10 + ig11

# calculate entropy of parent

pc0 = df[(df["y"] == 0)].count()[0]

pc1 = df[(df["y"] == 1)].count()[0]

e0 = -1 * (pc0 / tot) * math.log2(pc0 / tot)

e1 = -1 * (pc1 / tot) * math.log2(pc1 / tot)

return eparent -entchild

You might also like