# Logistic Regression - Default

# Get the data into R.
# NOTE(review): the path literal had been hard-wrapped across several lines,
# which put newline characters inside the file name. It is rejoined here on
# the assumption that the wrap fell on the space in "Predictive Analytics" --
# confirm against the actual directory name.
# stringsAsFactors = TRUE keeps text columns such as `default` as factors on
# R >= 4.0 (where the default became FALSE); the later contrasts() call
# requires a factor.
Default <- read.csv(
  file.path(
    "/home/debanjan/Documents/Teaching related/Analytics",
    "Predictive Analytics/PA 2020/Sessions/S6-Logistic/Default.csv"
  ),
  header = TRUE,
  stringsAsFactors = TRUE
)
# Peek at the first six rows of the raw data
head(Default, n = 6L)

# Drop the first column (the serial number from the file); R already numbers
# the rows itself
Default <- Default[, -1, drop = FALSE]

# Column (variable) names of the dataset
colnames(Default)

# Size of the dataset: number of rows, then number of columns
c(nrow(Default), ncol(Default))

# Numerical summary of every variable, quantitative and qualitative
summary(Default)
# Boxplots to get some idea of the predictor-response relationship.
# The first colour is spelled "lightblue": the original literal had been
# wrapped onto two lines, so it contained a newline and was not a valid
# colour name.
boxplot(
  balance ~ default,
  data = Default,
  xlab = "Default", ylab = "Balance",
  col = c("lightblue", "orange")
)
boxplot(
  income ~ default,
  data = Default,
  xlab = "Default", ylab = "Income",
  col = c("lightblue", "orange")
)
# Now we separate the dataset into a training and a validation set.
# The observations are divided in the ratio 60% training and 40% validation.
# The sizes are derived from nrow(Default) instead of being hard-coded; with
# the 10000-row dataset this selects the same 6000 training rows as before.
# We fix the random number generator with set.seed() so that we can replicate
# our results.
set.seed(111)
n.obs <- nrow(Default)

# sample() draws a random sample of row indices without replacement; these
# rows form the training set.
train <- sample(n.obs, round(0.6 * n.obs))

# The validation set is everything that is not in the training set.
valid <- Default[-train, ]
# Now we fit the model. glm() performs logistic regression on the training
# set (the rows selected through subset = train).
# The dependent variable is default. It sits on the LHS of the ~ sign; the
# independent variables are on the RHS of the ~ sign.
# family = binomial tells R to fit a logistic model, among the general class
# of models covered by glm().

# Individual predictors
glm.fit.balance <- glm(
  default ~ balance,
  data = Default, subset = train, family = binomial
)
summary(glm.fit.balance)

glm.fit.student <- glm(
  default ~ student,
  data = Default, subset = train, family = binomial
)
summary(glm.fit.student)

# Full model with all predictors (`.` means every other column of Default)
glm.fit <- glm(
  default ~ .,
  data = Default, subset = train, family = binomial
)

# Summary of the model fit
coef(glm.fit)
summary(glm.fit)
# Components stored in the fitted model object
names(glm.fit)
# Investigating the relation between student status and balance
# ("confounding"), using the training rows only.
# The first colour is spelled "lightblue": the original literal had been
# wrapped onto two lines, so it contained a newline and was not a valid
# colour name.
boxplot(
  balance ~ student,
  data = Default, subset = train,
  xlab = "Student status", ylab = "Balance",
  col = c("lightblue", "orange")
)
# How does R code the categorical dependent variable?
# contrasts() only accepts factors, so the column is coerced with as.factor()
# first; this is a no-op when it already is a factor and avoids an error when
# read.csv() delivered it as character (the default since R 4.0).
contrasts(as.factor(Default$default))
# The above tells us which level of default corresponds to "success"
# (the level coded 1).
# Using the predict function we predict the probabilities associated with the
# training data (no newdata is given, so the fitted observations are used).
# type = "response" returns the probability of default for each individual.
glm.probs <- predict(glm.fit, type = "response")

# Estimated probabilities for the first ten observations
glm.probs[1:10]

# In order to predict whether a person will default or not we need a cutoff
# on the probability; let the cutoff probability be 0.5.
# Start from a vector of zeros (no default), one per fitted probability, so
# the length follows the training set instead of a hard-coded 6000. Then,
# according to the cutoff, change zeros to ones (default cases) as necessary.
glm.pred <- rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] <- 1

# table() produces the confusion matrix (rows: predicted, columns: actual).
table(glm.pred, Default$default[train])
# Finding the validation error rate for this case.
# Predicted probabilities of default for the held-out validation rows.
glm.probs.valid <- predict(glm.fit, newdata = valid, type = "response")

# Classify with the same 0.5 cutoff. The vector length follows nrow(valid)
# instead of a hard-coded 4000.
glm.pred.valid <- rep(0, nrow(valid))
glm.pred.valid[glm.probs.valid > 0.5] <- 1

# Confusion matrix (rows: predicted, columns: actual).
table(glm.pred.valid, valid$default)

# Validation error rate: share of validation cases whose predicted class
# differs from the observed one ("Yes" is coded as 1, as in the lift-chart
# section of this script).
mean(glm.pred.valid != ifelse(valid$default == "Yes", 1, 0))
# ROC curves and the area under each curve, via the pROC package
library(pROC)

# Training data: observed default status against fitted probabilities
r <- roc(response = Default$default[train], predictor = glm.probs)
plot.roc(r)
auc(r)

# Validation data: observed default status against predicted probabilities
r.valid <- roc(response = valid$default, predictor = glm.probs.valid)
plot.roc(r.valid)
auc(r.valid)
# Lift (cumulative gains) chart, via the gains package.
# Install the package only when it is missing, instead of re-installing it
# (and needing network access) on every run of the script.
if (!requireNamespace("gains", quietly = TRUE)) {
  install.packages("gains")
}
library(gains)

# For training data
# Create a numeric 1-0 response (1 = "Yes", i.e. default).
train.y <- as.numeric(Default$default[train] == "Yes")
# One group per training observation.
gain.train <- gains(train.y, glm.probs, groups = length(train.y))
# Cumulative number of defaults captured against number of cases taken.
plot(
  c(0, gain.train$cume.pct.of.total * sum(train.y)) ~
    c(0, gain.train$cume.obs),
  xlab = "No. of cases", ylab = "Cumulative",
  main = "Gain Chart", type = "l"
)
# Straight reference line from the origin to (all cases, all defaults).
lines(c(0, sum(train.y)) ~ c(0, length(train.y)), col = "gray", lty = 2)

# For validation data
valid.y <- as.numeric(valid$default == "Yes")
# One group per validation observation.
gain.valid <- gains(valid.y, glm.probs.valid, groups = nrow(valid))
plot(
  c(0, gain.valid$cume.pct.of.total * sum(valid.y)) ~
    c(0, gain.valid$cume.obs),
  xlab = "No. of cases", ylab = "Cumulative",
  main = "Gain Chart", type = "l"
)
lines(c(0, sum(valid.y)) ~ c(0, nrow(valid)), col = "gray", lty = 2)

Logistic Regression - Default

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Logistic Regression - Default

Uploaded by

Copyright:

Available Formats

#Get the data in R

Default <- read.csv("/home/debanjan/Documents/Teaching related/Analytics/Predictive

#Print the first few rows of the data

#Print the variable names in the dataset Default

#This gives you the size of the dataset

#This gives a numerical summary of the quantitative and qualitative variables

#Boxplots to make some idea about the predictor-response relationship

#Now we separate the dataset into training and validation set

glm.fit.student <- glm(default~student,data=Default,subset=train,family=binomial)

#Full model with all predictors

#Summary of the model fit

# Investigating relation between student status and balance

#How does R code the categorical dependent variable?

#using the predict function we predcit the probabilities

#in order to make a prediction if a person will default or not

# table function produces the confusion matrix.

# finding the validation error rate for this case

You might also like