# Download as txt, pdf, or txt
# Download as txt, pdf, or txt
# You are on page 1 of 3

# Get the data into R.
# The path is specific to the original author's machine; change it to the
# location of Default.csv on yours.
# NOTE(review): the path was split across lines in this copy of the script;
# it is rejoined here assuming the break fell at the space in
# "Predictive Analytics" -- confirm against the real directory name.
# stringsAsFactors = TRUE is given explicitly because R >= 4.0 no longer
# converts text columns to factors by default; the summary() and contrasts()
# calls in this script expect `default` to be a factor.
Default <- read.csv(
  "/home/debanjan/Documents/Teaching related/Analytics/Predictive Analytics/PA 2020/Sessions/S6-Logistic/Default.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Print the first few rows of the data
head(Default)

# Delete the first (sl.no.) column, as R already provides row numbers
Default <- Default[-1]

# Print the variable names in the dataset Default
names(Default)

# This gives the size of the dataset (rows, columns)
dim(Default)

# This gives a numerical summary of the quantitative variables and level
# counts for the qualitative (factor) variables
summary(Default)

# Boxplots to get an idea of the predictor-response relationship.
# The colour name is written as a single token ("lightblue"): in this copy of
# the script the literal was wrapped across two lines, which put a newline
# inside the string and made it an invalid colour name.
boxplot(balance ~ default, data = Default,
        xlab = "Default", ylab = "Balance",
        col = c("lightblue", "orange"))
boxplot(income ~ default, data = Default,
        xlab = "Default", ylab = "Income",
        col = c("lightblue", "orange"))

# Now we separate the dataset into a training and a validation set.
# The dataset is divided in the ratio 60% training / 40% validation
# (the original notes state 10000 observations in total, i.e. 6000 / 4000).
# set.seed() fixes the random number generator so that the results can be
# replicated.
# sample() draws a random sample of row indices without replacement; these
# rows are used as the training set. The sizes are derived from
# nrow(Default) rather than hard-coded, so for 10000 rows this is exactly
# sample(10000, 6000).
set.seed(111)
train <- sample(nrow(Default), round(0.6 * nrow(Default)))

# The validation set is the entire dataset minus the training observations
valid <- Default[-train, ]

# Now we fit the models. glm() performs logistic regression on the training
# set (rows selected through subset = train).
# The dependent variable is default; it goes on the left-hand side of the ~
# sign, and the independent variables go on the right-hand side.
# family = binomial tells R to fit a logistic model, among the general class
# of models covered by glm().

# Individual predictors
glm.fit.balance <- glm(default ~ balance, data = Default, subset = train,
                       family = binomial)
summary(glm.fit.balance)

glm.fit.student <- glm(default ~ student, data = Default, subset = train,
                       family = binomial)
summary(glm.fit.student)

# Full model with all predictors (the "." stands for every other column)
glm.fit <- glm(default ~ ., data = Default, subset = train,
               family = binomial)

# Summary of the model fit: coefficients, full summary table, and the names
# of the components stored in the fitted object
coef(glm.fit)
summary(glm.fit)
names(glm.fit)

# Investigating the relation between student status and balance
# ("confounding"), using the training rows only.
# The colour name is written as a single token ("lightblue"): in this copy of
# the script the literal was wrapped across two lines, which put a newline
# inside the string and made it an invalid colour name.
boxplot(balance ~ student, data = Default, subset = train,
        xlab = "Student status", ylab = "Balance",
        col = c("lightblue", "orange"))

# How does R code the categorical dependent variable?
# factor() makes this work whether `default` was read in as a factor or as a
# character column (the default in R >= 4.0); contrasts() only accepts
# factors.
contrasts(factor(Default$default))
# The above shows the dummy coding: the level coded 1 is the one glm()
# treats as "success" for a binomial response.

# Using predict() we compute the probabilities associated with the training
# data (no newdata is given, so the rows used to fit glm.fit are used).
# type = "response" returns the probability of default for each individual
# rather than the value on the logit scale.
glm.probs <- predict(glm.fit, type = "response")

# Print the estimated probabilities for the first ten observations
head(glm.probs, 10)

# To predict whether a person will default we need a probability cutoff;
# here the cutoff is 0.5.
# Start from a vector of zeros (no default), one per training prediction,
# then change zeros to ones (default) wherever the probability exceeds the
# cutoff. The length is taken from glm.probs instead of hard-coding 6000.
glm.pred <- rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] <- 1

# table() produces the confusion matrix: predicted class vs. actual class
table(glm.pred, Default$default[train])

# Finding the validation error rate: same steps on the validation rows.
# The length is taken from valid instead of hard-coding 4000.
glm.probs.valid <- predict(glm.fit, valid, type = "response")
glm.pred.valid <- rep(0, nrow(valid))
glm.pred.valid[glm.probs.valid > 0.5] <- 1
table(glm.pred.valid, valid$default)

# ROC curves and area under the curve (AUC), via the pROC package
library(pROC)

# Training data: actual default status of the training rows against the
# fitted probabilities
r <- roc(Default$default[train], glm.probs)
plot.roc(r)
auc(r)

# Validation data: actual default status of the held-out rows against their
# predicted probabilities
r.valid <- roc(valid$default, glm.probs.valid)
plot.roc(r.valid)
auc(r.valid)

# Lift (cumulative gain) charts, via the gains package.
# Install gains only when it is missing, instead of reinstalling it on every
# run of the script.
if (!requireNamespace("gains", quietly = TRUE)) {
  install.packages("gains")
}
library(gains)

# For training data
# Create a numeric 1-0 response (1 = "Yes", i.e. default)
train.y <- ifelse(Default[train, ]$default == "Yes", 1, 0)
# One group per training observation is requested
gain.train <- gains(train.y, glm.probs, groups = length(train.y))
# Cumulative number of defaults captured vs. number of cases considered
plot(c(0, gain.train$cume.pct.of.total * sum(train.y)) ~
       c(0, gain.train$cume.obs),
     xlab = "No. of cases", ylab = "Cumulative", main = "Gain Chart",
     type = "l")
# Reference line from (0, 0) to (number of cases, total defaults)
lines(c(0, sum(train.y)) ~ c(0, length(train.y)), col = "gray", lty = 2)

# For validation data
valid.y <- ifelse(valid$default == "Yes", 1, 0)
gain.valid <- gains(valid.y, glm.probs.valid, groups = length(valid.y))
plot(c(0, gain.valid$cume.pct.of.total * sum(valid.y)) ~
       c(0, gain.valid$cume.obs),
     xlab = "No. of cases", ylab = "Cumulative", main = "Gain Chart",
     type = "l")
lines(c(0, sum(valid.y)) ~ c(0, length(valid.y)), col = "gray", lty = 2)

# You might also like