import pandas as pd

import numpy as np

import math

from sklearn.utils import shuffle

from sklearn.metrics import confusion_matrix

from sklearn.metrics import f1_score

# Path template; "{}" is filled with a file number when reading inputs and
# with "coeff" / "infogain" when writing results.
# NOTE(review): the literal has no path separators - confirm it points at the
# intended location.
filePath = "E:assignmentdata{}.csv"

# Number of train/test rounds run per input file; also sets the held-out
# share (len(df) // folds rows).
folds = 5

# Gradient-descent steps per round.
iterations = 1000

# Gradient-descent step size.
lr = 5e-4

# Module-level result rows; returned by logreg() and written out by main().
coeff = []

def main():
    """Process input files 1..56 and write the collected rows to a CSV.

    After the loop, the module-level ``coeff`` list is written to
    ``filePath.format("coeff")`` with neither an index column nor a header
    row.
    """
    for i in range(1, 57):
        # NOTE(review): the loop body was missing from the source this file
        # was restored from. logreg() is the only function here that takes a
        # file number and returns ``coeff``, so it is assumed to be the
        # intended call - confirm.
        logreg(i)

    print("length of final ", len(coeff))

    dfout = pd.DataFrame(coeff)
    # header=False is the documented way to suppress the header row.
    dfout.to_csv(filePath.format("coeff"), index=False, header=False)

def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    ``z`` may be a scalar or a NumPy array; arrays are handled element-wise.
    """
    return 1 / (1 + np.exp(-z))
def loss_fn(h, y):
    """Return the mean binary cross-entropy of probabilities ``h`` against ``y``.

    Args:
        h: predicted probabilities, element-wise aligned with ``y``.
        y: labels (0 or 1).

    Probabilities are clipped slightly away from 0 and 1 before taking the
    logarithm, so a saturated prediction gives a large finite loss instead
    of NaN/inf from ``log(0)``.
    """
    eps = 1e-15
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

def predict(X, theta):
    """Return 0/1 class predictions for the rows of ``X`` as a list of ints.

    Args:
        X: feature matrix with one row per sample.
        theta: weight vector, one entry per column of ``X``.

    A row is labelled 1 when ``sigmoid(X . theta)`` is at least 0.5.
    """
    threshold = 0.5
    # NOTE(review): the source read ``sigmoid(, theta))``; the lost fragment
    # is reconstructed as ``np.dot(X`` - confirm.
    prob = np.asarray(sigmoid(np.dot(X, theta)))
    return (prob >= threshold).astype(int).tolist()

def _features_and_labels(frame):
    """Return (X, y): columns 0-19 plus a trailing column of ones, and column 20 binarised as > 0."""
    # assign() builds a new frame, so the intercept column is never written
    # into a slice of the caller's data.
    X = frame.iloc[:, 0:20].assign(intercept=1).to_numpy(dtype=float)
    y = frame[20].gt(0).astype(int).to_numpy()
    return X, y


def logreg(k):
    """Fit logistic regression on input file ``k`` and record the best round.

    The file ``filePath.format(k)`` is read without a header row. For each of
    ``folds`` rounds the rows are shuffled, the first ``len(df) // folds``
    rows are held out as the test set, and a model is fitted on the rest with
    ``iterations`` gradient-descent steps of size ``lr``. The round with the
    highest test F1 score is kept; if every F1 score is 0, the round with
    the highest test accuracy is kept instead.

    NOTE(review): the source this was restored from had lost its indentation
    and several statements (the ``np.dot`` calls, the loss and F1
    bookkeeping, the ``else`` of the selection, and the append to ``coeff``).
    These are reconstructed from the surrounding code, and the placement of
    the summary prints after the loop is a guess - confirm against the
    original.

    Args:
        k: value substituted into ``filePath`` to locate the input CSV.

    Returns:
        The module-level ``coeff`` list, with one row appended: the selected
        weights in reversed order (intercept first), then that round's F1
        score and accuracy.
    """
    df = pd.read_csv(filePath.format(k), header=None)
    sample = math.floor(len(df) / folds)

    weights = []
    loss = []
    accuracy = []
    fscore = []

    print("processing file {}".format(k), ">>>>>>>>>>>>>")

    for _ in range(folds):
        # Each round draws a fresh shuffle, so held-out sets of different
        # rounds may overlap.
        shuffledata = shuffle(df)
        test_X, test_Y = _features_and_labels(shuffledata[:sample])
        train_X, train_Y = _features_and_labels(shuffledata[sample:])

        theta = np.zeros(train_X.shape[1])
        for _step in range(iterations):
            h = sigmoid(np.dot(train_X, theta))
            gradient = np.dot(train_X.T, h - train_Y) / train_Y.size
            theta -= lr * gradient

        # The source sampled the loss only when ``i % iterations == 0``,
        # which is true solely for the first step; the post-training loss is
        # recorded instead. The list is not read anywhere in this function.
        loss.append(loss_fn(sigmoid(np.dot(train_X, theta)), train_Y))

        # Stored reversed, so the intercept weight comes first.
        weights.append(list(reversed(theta)))

        y_hat = predict(test_X, theta)
        accuracy.append(np.mean(np.asarray(y_hat) == test_Y))
        fscore.append(f1_score(test_Y, y_hat))

    print("weights", weights)
    print("accuracy", accuracy)
    print("f1 score", fscore)

    # Pick the best weights by F1; fall back to accuracy when all F1 are 0.
    best_f1 = max(fscore)
    if best_f1 != 0:
        maxpos = fscore.index(best_f1)
    else:
        maxpos = accuracy.index(max(accuracy))

    print("max value::", maxpos, fscore[maxpos], accuracy[maxpos])

    x1 = weights[maxpos]
    x1.extend([fscore[maxpos], accuracy[maxpos]])
    coeff.append(x1)

    return coeff
def infogainexercise():
    """Compute per-feature information gain for files 1..56 and save it.

    For every input file, each of columns 0-19 is binarised at its own mean
    (1 when strictly greater), paired with the label (column 20 > 0), and
    scored with ``infogain_fn``. One row of scores per file is written to
    ``filePath.format("infogain")`` without index or header.

    NOTE(review): the source this was restored from had lost the tail of the
    ``pd.DataFrame(...)`` call and the statements that collect the results.
    The ``"y"`` column name and both appends are reconstructed from
    ``infogain_fn`` and the surrounding code - confirm.
    """
    infogainfinal = []

    for i in range(1, 57):
        print("file processed {}".format(i))
        df = pd.read_csv(filePath.format(i), header=None)

        features = df.loc[:, :19]
        labels = df[20].gt(0).astype(int).to_numpy()

        infogain = []
        for col in features.columns:
            threshold = np.mean(df[col])
            # Build the binarised column on its own, not as a scratch column
            # written into the ``features`` slice of ``df``.
            binarised = features[col].gt(threshold).astype(int).to_numpy()
            df1 = pd.DataFrame({"x": binarised, "y": labels})
            infogain.append(infogain_fn(df1))

        infogainfinal.append(infogain)

    dfout = pd.DataFrame(infogainfinal)
    # header=False is the documented way to suppress the header row.
    dfout.to_csv(filePath.format("infogain"), index=False, header=False)

def _entropy_term(count, total):
    """Return ``-p * log2(p)`` for ``p = count / total``, or 0.0 when ``count`` is 0."""
    if count == 0:
        return 0.0
    p = count / total
    return -p * math.log2(p)


def infogain_fn(df):
    """Return the information gain of binary column ``"x"`` about ``"y"``.

    Args:
        df: DataFrame with columns ``"x"`` and ``"y"``; rows are counted
            where each column equals 0 or 1.

    Returns:
        Entropy of ``y`` minus the size-weighted entropy of ``y`` within the
        ``x == 0`` and ``x == 1`` groups, in bits. An empty frame yields 0.0.
    """
    total = len(df)
    if total == 0:
        return 0.0

    x = df["x"]
    y = df["y"]

    # Weighted entropy of the two child groups.
    child_entropy = 0.0
    for x_val in (0, 1):
        in_group = x == x_val
        counts = [int((in_group & (y == y_val)).sum()) for y_val in (0, 1)]
        group_size = sum(counts)
        # A zero count contributes nothing, which also keeps an empty group
        # from dividing by zero.
        group_entropy = sum(_entropy_term(c, group_size) for c in counts)
        child_entropy += (group_size / total) * group_entropy

    # Entropy of the parent; each class term is guarded by its own count.
    parent_entropy = sum(
        _entropy_term(int((y == y_val).sum()), total) for y_val in (0, 1)
    )

    return parent_entropy - child_entropy

if __name__ == "__main__":
    # NOTE(review): the guarded statement was missing from the source this
    # file was restored from; calling main() is the assumed intent - confirm.
    main()
