# R Codes
# From the perspective of employees and employers, it is essential to know the
# yearly salary of a particular candidate or position in the organization.
# In this analysis, an attempt is made to determine an appropriate salary based
# on a historical survey of existing employees across different parameters.
# This type of study is part of HR analytics, which can help HR teams set goals,
# measure success, and optimize procedures based on desired skill sets and the
# skill sets offered by their applicants.
# The company may concentrate on training its personnel based on this analysis,
# which will help the company generate income in the future.
# This prediction is very helpful for aspiring IT candidates, who can estimate
# their salary from their skill sets, current salary and other variables.
# It can also be used by the HR department to determine the salary for an
# existing or new position in an organization.
# Load the required libraries and prepare the "IT salary survey EU" data set.
library(readxl)
library(descr)
library(caTools)
library(caret)
library(ISLR)

# NOTE(review): the import call in this copy of the script has lost its
# arguments (it read `read.table()` / `data.frame()`), so the file name and
# format are unknown. Load the survey into `data1` before running this section.
if (!exists("data1")) {
  stop("`data1` not found: import the IT salary survey data first.",
       call. = FALSE)
}

sum(is.na(data1))
str(data1)

# Drop the columns that are not related to the problem. The indices refer to
# the column positions of the data as imported.
data1 <- data1[-c(1, 21, 22, 23)]
# utils::View() is used because tidyverse (which provides view()) is not
# attached at this point of the script.
View(data1)

# Count the missing values, then remove incomplete rows so that the dependent
# variable (yearly salary) is modelled on complete cases only.
sum(is.na(data1))
data1 <- na.omit(data1)
summary(data1)

# Train/test split with SplitRatio = 0.7, drawn on City; the seed is fixed so
# the split is reproducible.
set.seed(100)
split1 <- sample.split(data1$City, SplitRatio = 0.7)
summary(split1)
datatrain <- subset(data1, split1 == TRUE)
datatest <- subset(data1, split1 == FALSE)
summary(datatrain)
str(datatrain)
# Logistic regression of contract duration on all remaining columns.
library(caret)
lreg1 <- train(
  Contract.duration ~ .,
  method = "glm",
  family = "binomial",
  data = datatrain
)
lreg1
summary(lreg1)

# Rows flagged as outliers in the diagnostic plots of `reg1` are blanked out.
# The original did this in two statements with overlapping index lists (805
# appeared twice and the second list was a subset of the first); a single
# de-duplicated vector has the same effect.
outlier_rows <- c(
  631, 659, 805, 442, 854, 565, 575, 779, 153,
  839, 728, 1224, 495, 574, 967, 959, 674
)
data1[outlier_rows, ] <- NA

# Linear regression of the yearly gross salary. All predictors are referenced
# by bare column name and resolved through `data = datatrain`; prefixing them
# with `datatrain$` would pin them to the training vectors and make
# predict(reg1, newdata = ...) ignore the new data.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Contract.duration +
    Age,
  data = datatrain
)
library(tidyverse)

# Keep the analysis columns, label missing positions as "none" and open the
# result in the data viewer. `data1` itself is not modified.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Years.of.experience.in.Germany,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status,
    Contract.duration,
    Main.language.at.work,
    Company.size,
    Company.type
  ) %>%
  mutate(Position = replace_na(Position, "none")) %>%
  view()

# -------------------------------------------------------
# Shorter selection printed to the console. The original listed
# Total.years.of.experience twice; select() keeps a repeated column only once,
# so the duplicate is dropped here.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status
  )
# Student-engagement regression: data preparation, model fit and diagnostics.
library(readxl)

# NOTE(review): the import call in this copy of the script has lost its
# arguments (it read `read.table()` / `data.frame()`). Load the data set for
# this section into `data1` before running it; a `data1` left over from a
# previous section will pass this check but is the wrong data.
if (!exists("data1")) {
  stop("`data1` not found: import the data set for this section first.",
       call. = FALSE)
}
data1 <- na.omit(data1)

# Train/test split with SplitRatio = 0.7, drawn on Course, with a fixed seed.
library(caTools)
set.seed(100)
split1 <- sample.split(data1$Course, SplitRatio = 0.7)
summary(split1)
datatrain <- subset(data1, split1 == TRUE)
datatest <- subset(data1, split1 == FALSE)
summary(datatrain)
str(datatrain)

library(ggplot2)
cor(datatrain$Student_engagement, datatrain$goal_orientation)
# looks linear

# NOTE(review): this call was truncated after `goal_orientation+` in the
# original; the second predictor is reconstructed from the equivalent model
# below - confirm.
reg1 <- lm(
  datatrain$Student_engagement ~
    datatrain$goal_orientation +
    datatrain$academic_self_eficacy
)
summary(reg1)
# OR (preferred: bare column names work with predict() on new data)
reg1 <- lm(
  Student_engagement ~ goal_orientation + academic_self_eficacy,
  data = datatrain
)
summary(reg1)
# Use the adjusted R-squared when there is more than one independent variable
# (it adjusts for the degrees of freedom).
# beta2: 73%

# Multicollinearity
library(car)

# Assumptions
plot(reg1$residuals, seq_along(reg1$residuals))
descr(datatrain)
summary(datatrain)
library(moments)
datatrain$residual1 <- reg1$residuals
moments::skewness(datatrain$residual1)
moments::kurtosis(datatrain$residual1)
# OR
library(e1071)
# e1071 is attached after moments, so an unqualified kurtosis() resolves to
# e1071's version; the namespace is spelled out to make that explicit.
e1071::kurtosis(reg1$residuals)

# Outliers
plot(reg1$residuals, reg1$fitted.values)
library(lmtest)
plot(reg1)
# OR
influenceIndexPlot(reg1) # 261
# Remove the rows flagged as outliers from the main data set. The original
# only overwrote them with NA and never dropped them, so predict() returned NA
# for those rows and RMSE() (na.rm = FALSE by default) evaluated to NA.
# Dropping the rows leaves the fitted models unchanged, because lm() already
# omits rows with missing values.
outlier_rows <- c(261, 799, 96, 426, 248)
data1 <- data1[-outlier_rows, ]

# Full model with both axle variables.
reg1 <- lm(
  emission ~ mass + wheelbase + axlesteering + axleother +
    enginecapacity + emisionreduction,
  data = data1
)
summary(reg1)
library(car)
vif(reg1)

# Model 1: as above without axlesteering.
regm1 <- lm(
  emission ~ mass + wheelbase + axleother +
    enginecapacity + emisionreduction,
  data = data1
)
summary(regm1)

# Model 2: as above without axleother.
regm2 <- lm(
  emission ~ mass + wheelbase + axlesteering +
    enginecapacity + emisionreduction,
  data = data1
)
summary(regm2)
vif(regm2)

# RMSE of both reduced models on the data they were fitted on.
prem1 <- predict(regm1, data1)
prem2 <- predict(regm2, data1)
library(caret)
library(Metrics)
# caret::RMSE() takes (pred, obs); the result is symmetric in the two
# arguments, they are only put in the documented order here.
RMSE(prem1, data1$emission)
RMSE(prem2, data1$emission)
# The coefficients and R-squared values are similar, so either model can be
# used to predict "emission".
# Load the required libraries and prepare the "IT salary survey EU" data set.
library(readxl)
library(descr)

# NOTE(review): the import call in this copy of the script has lost its
# arguments (it read `read.table()` / `data.frame()`), so the file name and
# format are unknown. Load the survey into `data1` before running this section.
if (!exists("data1")) {
  stop("`data1` not found: import the IT salary survey data first.",
       call. = FALSE)
}
sum(is.na(data1))
str(data1)

# Rows identified as outliers from the diagnostic plots of `reg1` (see the end
# of this section) are set to NA here and removed by na.omit() below.
# Index 805 was listed twice in the original; it is kept once.
outlier_rows <- c(
  631, 659, 805, 442, 854, 855, 565, 575, 779,
  153, 839, 728, 1224, 495, 574, 967, 959, 674
)
data1[outlier_rows, ] <- NA

# Drop the columns that are not related to the problem. The indices refer to
# the column positions of the data as imported.
data1 <- data1[-c(1, 21, 22, 23)]
View(data1)

# Count the missing values, then remove incomplete rows so that the dependent
# variable (yearly salary) is modelled on complete cases only.
sum(is.na(data1))
data1 <- na.omit(data1)
summary(data1)
library(mice)
md.pattern(data1)
unique(data1$Position)
unique(data1$Seniority.level)
# NOTE(review): fix() opens an interactive editor and overwrites `data1` with
# whatever is edited by hand; this step is not reproducible.
fix(data1)
summary(data1)

# Train/test split with SplitRatio = 0.7, drawn on City, with a fixed seed.
library(caTools)
set.seed(100)
split1 <- sample.split(data1$City, SplitRatio = 0.7)
summary(split1)
datatrain <- subset(data1, split1 == TRUE)
datatest <- subset(data1, split1 == FALSE)
summary(datatrain)
str(datatrain)

# Exploratory plots and correlations. Columns are referenced by bare name
# inside aes() so they are looked up in the data passed to ggplot().
library(ggplot2)
ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Total.years.of.experience
  )
) +
  geom_smooth()
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Total.years.of.experience
)
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Years.of.experience.in.Germany
)
cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Yearly.bonus...stocks.in.EUR
)
ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR
  )
) +
  geom_smooth()

# Linear regression, first with all numeric predictors.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Age,
  data = datatrain
)
summary(reg1)
plot(reg1)

# Reduced model with four of the predictors.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Yearly.bonus...stocks.in.EUR +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
  data = datatrain
)
summary(reg1)

# Outliers were detected through these regression plots and removed at the top
# of this section.
plot(reg1)
plot(reg1$residuals, seq_along(reg1$residuals))
shapiro.test(reg1$residuals)
# HR status model: does a candidate join or not after an offer is given?
library(readr)
View(data1)
library(caTools)
library(caret)
library(ISLR)

# Drop the first two columns, then split with SplitRatio = 0.7 on Status.
data1 <- data1[, -c(1, 2)]
set.seed(100)
split1 <- sample.split(data1$Status, SplitRatio = 0.7)
data1train <- subset(data1, split1 == TRUE)
data1test <- subset(data1, split1 == FALSE)
str(data1train)

# Logistic regression on all predictors.
lreg1 <- train(
  Status ~ .,
  method = "glm",
  family = "binomial",
  data = data1train
)
lreg1
summary(lreg1)
# Consider the variables marked with stars (***) together with some common
# sense. The sign of each estimate shows a positive or negative effect.

# Model fit --------
library(blorr)
library(Rcpp)
blr_model_fit_stats(lreg1$finalModel)
# Author's note: the model is a good fit (p-value less than 0.05, reject the
# null hypothesis), but the R-squared and other values are not great.
blr_test_hosmer_lemeshow(lreg1$finalModel)

# Choose the cut-off value such that the sensitivity is high.
gaintable <- blr_gains_table(lreg1$finalModel)
blr_roc_curve(gaintable)
# Sensitivity should be high while giving equal weight to both classes
# (joining and not joining in this data).
# OR
# Read the cut-off value from the graph; it can be 0.4, 0.5, 0.6 or 0.7.
# Find a cut-off value in the graph such that both conditions are satisfied.
lreg1$finalModel

# Confusion matrix on the held-out data. The original used `predict1` without
# ever computing it.
predict1 <- predict(lreg1, data1test)
confusionMatrix(predict1, data1test$Status)

# With a large number of variables, use forward or backward selection.
# AIC: the lower the value, the better the model.
# NOTE(review): the three stepwise calls were truncated in the original
# (`blr_step_`, `regbackward<-blr`). They are reconstructed as blorr's
# AIC-based stepwise functions because the surrounding comment refers to AIC.
# Confirm the intended variant, and that these functions accept caret's
# `finalModel` rather than a model fitted directly with glm().
regforward <- blr_step_aic_forward(lreg1$finalModel)
plot(regforward)
# OR
regbackward <- blr_step_aic_backward(lreg1$finalModel)
# OR
regboth <- blr_step_aic_both(lreg1$finalModel)
plot(regboth)
regboth$model
summary(regboth$model)

# Second model on the selected predictors. It is trained on `data1train`; the
# original referenced `datatrain`, which is not created in this section.
lreg2 <- train(
  Status ~ Notice.period +
    Candidate.Source +
    Location +
    LOB +
    Age +
    Offered.band +
    DOJ.Extended +
    Rex.in.Yrs,
  method = "glm",
  family = "binomial",
  data = data1train
)
lreg2
lreg2$finalModel
summary(lreg2$finalModel)
gaintable <- blr_gains_table(lreg2$finalModel)
blr_roc_curve(gaintable)
# Logistic regression of Hospitalization on Age.
library(readr)
# NOTE(review): the original repeated the next two steps, the first time as
# `View(dataset)` followed by an unterminated factor() call; only the complete
# variant on `data1` is kept.
View(data1)

# NOTE(review): the factor() call was cut off after `levels = c(0,1),` in the
# original; a `labels =` argument may have been lost - confirm.
data1$Hospitalization <- factor(data1$Hospitalization, levels = c(0, 1))

# The age column appears as "ï..Age" in the original formula, which looks like
# a byte-order mark garbled into the first column name at import, while the
# term test below refers to "Age". Renaming makes the two agree; nothing
# happens if the column is already called "Age".
names(data1)[names(data1) == "\u00ef..Age"] <- "Age"

library(caret)
library(ISLR)
str(data1)
lreg1 <- train(
  Hospitalization ~ Age,
  method = "glm",
  family = "binomial",
  data = data1
)
lreg1
lreg1$finalModel
# Author's note: the results say residual deviance and null deviance show no
# significant change.
# beta0 = -16.7, beta1 = 0.25
lreg1$finalModel$coefficients
coef(lreg1$finalModel)

# Odds for the author's linear-predictor value.
exp(0.5769003)
# Probability = odds / (1 + odds) = 1.78 / 2.78 = 0.64
# (this was written as the statement `1.78/2.78=0.64`, which is an invalid
# assignment in R).
1.78 / 2.78
lreg1$finalModel$fitted.values

# Install the optional packages only when missing, and before attaching them.
if (!requireNamespace("blorr", quietly = TRUE)) {
  install.packages("blorr")
}
library(blorr)
library(Rcpp)
blr_model_fit_stats(lreg1$finalModel)

if (!requireNamespace("survey", quietly = TRUE)) {
  install.packages("survey")
}
library(survey)
regTermTest(lreg1$finalModel, "Age")

# Confusion matrix
blr_confusion_matrix(lreg1$finalModel)
# Predicting the contract duration of an IT employee from the EU region as
# "Temporary contract" or "Unlimited contract".
# From the perspective of employees and employers, it is essential to know the
# contract duration of a particular candidate for a particular position in the
# organization.
# In this analysis, an attempt is made to determine the right contract based on
# a historical survey of existing employees across different parameters.
# This type of study is part of HR analytics, which can help HR teams set goals,
# measure success, and optimize procedures based on desired/expected skill sets
# and the skill sets offered by their applicants.
# The company may concentrate on training its personnel based on this analysis,
# which will help the company generate income in the future.
# This prediction is very helpful for aspiring IT candidates, who can predict
# their contract duration from their skill sets, current salary and other
# variables.
# It can also be used by the HR department to determine the contract duration
# to be signed for an existing or new position in an organization.
# Load the required libraries and prepare the "IT salary survey EU" data set
# for the contract-duration model.
library(readxl)
library(descr)
library(caret)
library(ISLR)

# NOTE(review): the import call in this copy of the script has lost its
# arguments (it read `read.table()` / `data.frame()`), so the file name and
# format are unknown. Load the survey into `data1` before running this section.
if (!exists("data1")) {
  stop("`data1` not found: import the IT salary survey data first.",
       call. = FALSE)
}
sum(is.na(data1))
str(data1)

# Drop the columns that are not related to the problem. The indices refer to
# the column positions of the data as imported.
data1 <- data1[-c(1, 4, 9, 10, 21, 22, 23)]
View(data1)

# Treat empty strings as missing, count the missing values, then remove
# incomplete rows so that the dependent variable (contract duration) is
# modelled on complete cases only. The blanks are converted before counting so
# that the count includes them.
data1[data1 == ""] <- NA
sum(is.na(data1))
data1 <- na.omit(data1)
summary(data1)
View(data1)

# The data is split on the contract-duration column with SplitRatio = 0.7; the
# model is trained on one part and tested on the other.
library(caTools)
set.seed(100)
split1 <- sample.split(data1$Contract.duration, SplitRatio = 0.7)
summary(split1)
datatrain <- subset(data1, split1 == TRUE)
datatest <- subset(data1, split1 == FALSE)
summary(datatrain)
str(datatrain)
# Logistic regression of contract duration on all remaining columns.
library(caret)
lreg1 <- train(
  Contract.duration ~ .,
  method = "glm",
  family = "binomial",
  data = datatrain
)
lreg1
# Author's note: the model has an accuracy of 84% and an error rate of 16%.
summary(lreg1)
# Author's note: variables such as Position, seniority level, yearly salary
# and employment status have a positive impact on the dependent variable
# Contract duration.
# The estimates in the table show the positive or negative effect of each
# independent variable on Y.

# Model fit --------
library(blorr)
library(Rcpp)
blr_model_fit_stats(lreg1$finalModel)
# The model is a good fit (p-value greater than 0.05, so the null hypothesis
# is accepted).
blr_test_hosmer_lemeshow(lreg1$finalModel)

# Choose the cut-off value such that the sensitivity is high.
gaintable <- blr_gains_table(lreg1$finalModel)
blr_roc_curve(gaintable)

# Confusion matrix (training data), author's notes:
# - Sensitivity should be high while giving equal weight to both classes
#   (temporary contract and unlimited contract in this data).
# - The contract-duration model shows an accuracy of 97%; the independent
#   variables are highly accurate in determining the contract duration.
# - The cut-off value from the graph is taken as the mid point, i.e. 0.5,
#   a value at which both conditions are satisfied.
# - Sensitivity 99%: the share of actual positive cases predicted correctly.
# - Specificity 54%: the share of actual negative cases predicted correctly.
lreg1$finalModel
predict1 <- predict(lreg1, datatest)

# Confusion matrix (test data).
confusionMatrix(predict1, datatest$Contract.duration)
# Author's notes:
# - The contract-duration model shows an accuracy of 86% on the test data.
# - Sensitivity 30%: the share of actual positive cases predicted correctly.
# - Specificity 89%: the share of actual negative cases predicted correctly.
# Logistic regression 2: contract duration on a selected set of predictors.
library(caret)
lreg2 <- train(
  Contract.duration ~ Age +
    Gender +
    Position +
    Total.years.of.experience +
    Seniority.level +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Number.of.vacation.days +
    Main.language.at.work +
    Employment.status,
  method = "glm",
  family = "binomial",
  data = datatrain
)
lreg2
summary(lreg2)

# Model fit test for the second model --------
library(blorr)
library(Rcpp)
# The model is a good fit (p-value greater than 0.05, so the null hypothesis
# is accepted).
blr_test_hosmer_lemeshow(lreg2$finalModel)

# Choose the cut-off value such that the sensitivity is high.
gaintable <- blr_gains_table(lreg2$finalModel)
blr_roc_curve(gaintable)

# Confusion matrix, author's notes:
# - Sensitivity should be high while giving equal weight to both classes
#   (temporary contract and unlimited contract in this data).
# - The contract-duration model shows an accuracy of 97%; the independent
#   variables are highly accurate in determining the contract duration.
# - The cut-off value from the graph is taken as the mid point, i.e. 0.5,
#   a value at which both conditions are satisfied.
# - Sensitivity 99%: the share of actual positive cases predicted correctly.
# - Specificity 54%: the share of actual negative cases predicted correctly.
# Models 1 and 2 have similar sensitivity and specificity, so either one could
# be chosen. Hence, stepwise logistic regression is used next.
#If the variables are in high number should use this forward or backward method
library(blorr)
plot(regforward)