R Codes

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 23

#“Predicting the salary of an IT employee from EU region”.

#From the perspective of employees and employers, it is essential to know the yearly salary
#of a particular candidate or a particular position in the organization.

#In this analysis, an attempt is made to determine the right salary based on a
#historical survey of existing employees across different parameters.

#This type of study is part of HR analytics, which can help HR teams set goals, measure success, and
#optimize procedures based on desired skill sets and the skill sets offered by their applicants.

#The company may concentrate on training its personnel based on this analysis, which will help the
#company generate income in the future.

#This prediction is very helpful for aspiring IT candidates who want to predict their salary based on their
#skill sets, current salary and other variables.

#This can also be used by the HR department to determine the salary for an existing or new position in
#an organization.

# Load the packages used by the salary analysis and import the
# "IT Salary Survey EU 2020" data set.

library(readxl)

library(descr)

library(caTools)

library(caret)

library(ISLR)

# NOTE(review): the original script called read.table() and data.frame() with
# no arguments at this point. read.table() stops because `file` is missing and
# data.frame() only prints an empty frame, so both stray calls were removed.

data1 <- read.csv(
  "C:/Users/hp/Downloads/2027911 MLA Cia1/IT Salary Survey EU 2020...csv",
  stringsAsFactors = TRUE
)

# Count missing cells and inspect the column types before cleaning.
sum(is.na(data1))

str(data1)

# Drop the columns that are not related to the problem. The original removed
# columns 23, 22, 21 and 1 one at a time (right to left), which is the same as
# removing original columns 1, 21, 22 and 23 in a single step.
data1 <- data1[-c(1, 21, 22, 23)]

# utils::View() (capital V) opens the data viewer. Lower-case view() is not
# defined in base R and no package attached above provides it.
View(data1)

# Count the remaining missing values, then drop every row that contains an NA
# so the dependent variable (yearly salary) is complete.
sum(is.na(data1))

data1 <- na.omit(data1)

summary(data1)

# Train/test split: about 70% of the rows go to training, the rest to testing.

library(caTools)

set.seed(100)

# Logical vector, TRUE for rows assigned to the training set.
split1 <- sample.split(data1$City, SplitRatio = 0.7)

summary(split1)

# The held-out rows are used to check how well the trained model generalises.
datatrain <- data1[split1, , drop = FALSE]

datatest <- data1[!split1, , drop = FALSE]

summary(datatrain)

str(datatrain)

# Logistic regression: Contract.duration against every other column,
# fitted through caret::train() with a binomial glm.

library(caret)

lreg1 <- train(
  Contract.duration ~ .,
  data = datatrain,
  method = "glm",
  family = "binomial"
)

lreg1

summary(lreg1)
# Rows flagged as outliers in the diagnostic plots of "reg1" are set to NA so
# that a later na.omit(data1) drops them.
# The diagnostic plots label points by row name, and data1 has already been
# through na.omit(), so rows are matched by row name rather than by position.
# The original repeated this step with a subset of the same rows; the
# redundant second assignment was removed.
# NOTE(review): this changes data1 only. datatrain/datatest have to be rebuilt
# from data1 for the removal to affect the model below.
data1[rownames(data1) %in% c(
  631, 659, 805, 442, 854, 565, 575, 779, 153,
  839, 728, 1224, 495, 574, 967, 959, 674
), ] <- NA

# Linear model for the yearly gross salary. Every term is a bare column name
# looked up in `data = datatrain`; the original mixed in `datatrain$...` terms
# and had identifiers broken across lines, which does not parse. Bare names
# also let predict() work on new data.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Contract.duration +
    Age,
  data = datatrain
)

# Replace blank entries of a categorical column with "none".

library(tidyverse)

# Position is a factor when the file is read with stringsAsFactors = TRUE, and
# blank cells are "" rather than NA. replace_na() cannot insert a value that is
# not an existing factor level, so the column is converted to character, blanks
# are turned into NA, the NAs are replaced, and the result is made a factor
# again.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Years.of.experience.in.Germany,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status,
    Contract.duration,
    Main.language.at.work,
    Company.size,
    Company.type
  ) %>%
  mutate(
    Position = as.character(Position),
    Position = factor(replace_na(na_if(Position, ""), "none"))
  ) %>%
  view()

# -------------------------------------------------------

# Print a narrower selection of columns.
# NOTE(review): the original listed Total.years.of.experience twice here; the
# duplicate selected nothing extra and was dropped. Confirm whether
# Years.of.experience.in.Germany was intended instead.
data1 %>%
  select(
    Age,
    Gender,
    City,
    Position,
    Total.years.of.experience,
    Seniority.level,
    Your.main.technology...programming.language,
    Other.technologies.programming.languages.you.use.often,
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR,
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country,
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
    Number.of.vacation.days,
    Employment.status
  )

# Student-engagement data: load, drop incomplete rows, split 70:30.

library(readxl)

# NOTE(review): the stray argument-less read.table() / data.frame() calls of
# the original were removed; read.table() errors without a `file` argument.

data1 <- read.csv("C:/Users/hp/Downloads/Sem4/satya 1.csv", stringsAsFactors = TRUE)

# Drop rows that contain NA. Rows flagged as outliers in the outlier section
# at the bottom are set to NA there and disappear here on the next run.
# The original called na.omit() twice in a row; once is enough.
data1 <- na.omit(data1)

library(caTools)

set.seed(100)

# Logical vector, TRUE for about 70% of the rows (training set).
split1 <- sample.split(data1$Course, SplitRatio = 0.7)

summary(split1)

datatrain <- subset(data1, split1 == TRUE)

datatest <- subset(data1, split1 == FALSE)

summary(datatrain)

str(datatrain)

library(ggplot2)

# Scatter plot of student engagement against goal orientation. Columns are
# named bare inside aes() so ggplot2 looks them up in `datatrain`; using
# `datatrain$...` inside aes() bypasses the data argument.
ggplot(datatrain, aes(Student_engagement, goal_orientation)) +
  geom_point()

cor(datatrain$Student_engagement, datatrain$goal_orientation)

# Author's note: a relationship exists (correlation about 0.68; cor() returns
# the correlation coefficient, not a p-value).

cor.test(datatrain$Student_engagement, datatrain$goal_orientation)

ggplot(datatrain, aes(Student_engagement, goal_orientation)) +
  geom_smooth()

# Author's note: looks linear.

# Multiple linear regression of engagement on goal orientation and academic
# self-efficacy. The original fitted the same model twice, first with
# `datatrain$...` terms and then with bare column names; only the bare-name
# form is kept because it is the one that works with predict() on new data.
reg1 <- lm(
  Student_engagement ~ goal_orientation + academic_self_eficacy,
  data = datatrain
)

summary(reg1)

# Author's notes on the summary:
# F-statistic: p < 0.05, reject the null; the model is a good fit.
# How good the model is, is answered by R2, which is 66%.
# Adjusted R2 is used with more than one independent variable (it adjusts for
# the degrees of freedom).
# Intercept: not significantly affecting the dependent variable if p > 0.05.
# beta1: 22% (author's note: no significant effect)
# beta2: 73%

# Multicollinearity

library(car)

vif(reg1) # variance inflation factor; author's rule: < 4 means no multicollinearity

# Assumptions

# Randomness: residuals plotted against their index. Author's note: the error
# term looks randomly distributed.
plot(reg1$residuals, seq_along(reg1$residuals))

# Normality of the error terms (boxplot, Shapiro-Wilk test, histogram, skewness)

boxplot(reg1$residuals) # author's note: the midline is not in the middle, deviations exist

shapiro.test(reg1$residuals) # author's note: p < 0.05, reject the null, not normally distributed

hist(reg1$residuals) # author's note: not normal

descr(datatrain)

summary(datatrain)

library(moments)

# Skewness and kurtosis as a normality check

datatrain$residual1 <- reg1$residuals

moments::skewness(datatrain$residual1)

moments::kurtosis(datatrain$residual1)

# OR

library(e1071)

# Attaching e1071 masks moments::kurtosis(), so the package is named
# explicitly; this is the function the unqualified call resolved to.
e1071::kurtosis(reg1$residuals)

# Heteroscedasticity: look for a cone shape rather than an even spread on both
# sides.

plot(reg1$residuals, reg1$fitted.values)

library(lmtest)

bptest(reg1) # author's note: the spread is not equal, the model is not a good fit

# Outliers

plot(reg1)

# OR

library(car)

influenceIndexPlot(reg1) # author's note: observation 261 stands out

# Flagged outliers are set to NA here and removed from the main data set by
# na.omit() on the next run. The plots label points by row name and data1 has
# already been through na.omit(), so rows are matched by row name instead of
# by position.
data1[rownames(data1) %in% c(261, 799, 96, 426, 248), ] <- NA
# Full emission model with every candidate predictor.
reg1 <- lm(
  emission ~ mass + wheelbase + axlesteering + axleother +
    enginecapacity + emisionreduction,
  data = data1
)

summary(reg1)

library(car)

vif(reg1)

# Author's note: axlesteering and axleother show the highest VIF values, so two
# reduced models are built, each keeping only one of the two.

# Model 1: keeps axleother, drops axlesteering.
regm1 <- lm(
  emission ~ mass + wheelbase + axleother + enginecapacity + emisionreduction,
  data = data1
)

summary(regm1)

vif(regm1) # author's note: no multicollinearity, no high values

# Model 2: keeps axlesteering, drops axleother.
regm2 <- lm(
  emission ~ mass + wheelbase + axlesteering + enginecapacity +
    emisionreduction,
  data = data1
)

summary(regm2)

vif(regm2)

# RMSE of both models on data1.

prem1 <- predict(regm1, data1)

prem2 <- predict(regm2, data1)

library(caret)

library(Metrics)

# caret::RMSE() takes predictions first and observations second. The value is
# symmetric in the two, but the arguments are named to make the roles clear.
# na.rm = TRUE keeps rows with missing predictions or observations from
# turning the result into NA.
RMSE(pred = prem1, obs = data1$emission, na.rm = TRUE)

RMSE(pred = prem2, obs = data1$emission, na.rm = TRUE)

# Author's note: the coefficients and R2 values are similar, so either model
# can be used to predict "emission".
# Load the required libraries and import the "IT Salary Survey EU" data set.

library(readxl)

library(descr)

# NOTE(review): the stray argument-less read.table() / data.frame() calls of
# the original were removed; read.table() errors without a `file` argument.

data1 <- read.csv("C:/Users/hp/Downloads/IT Salary Survey EU 2020...csv", stringsAsFactors = TRUE)

sum(is.na(data1))

str(data1)

# Rows flagged as outliers when plotting "reg1" further down; repeated runs of
# the script produced this list. They are set to NA here, directly after
# loading (so positions equal the original row numbers), and are dropped by
# na.omit() below.
data1[c(
  631, 659, 805, 442, 854, 855, 565, 575, 779, 153,
  839, 728, 1224, 495, 574, 967, 959, 674
), ] <- NA

# Drop the columns unrelated to the problem: original columns 1, 21, 22, 23
# (the original removed them one at a time from right to left).
data1 <- data1[-c(1, 21, 22, 23)]

# utils::View() (capital V); lower-case view() is not defined in base R.
View(data1)

# Count the missing values, then remove incomplete rows so the dependent
# variable (yearly salary) is complete.
sum(is.na(data1))

data1 <- na.omit(data1)

summary(data1)

# mice::md.pattern() shows the pattern of missing data; unique() lists the
# distinct values of a column.
library(mice)

md.pattern(data1)

unique(data1$Position)

unique(data1$Seniority.level)

# Opens an interactive editor for manual corrections; needs an interactive
# session.
fix(data1)

summary(data1)

# Split the data set 70:30 into training and test rows.

library(caTools)

set.seed(100)

# Logical vector, TRUE for rows assigned to the training set.
split1 <- sample.split(data1$City, SplitRatio = 0.7)

summary(split1)

# Training rows are those marked TRUE, test rows those marked FALSE.
datatrain <- data1[split1, , drop = FALSE]

datatest <- data1[!split1, , drop = FALSE]

summary(datatrain)

str(datatrain)

# Plotting library
library(ggplot2)

# Smoothed relationship between yearly salary (x) and total years of
# experience (y). Column names are given bare inside aes() so that ggplot2
# resolves them in `datatrain`.
ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Total.years.of.experience
  )
) +
  geom_smooth()

cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Total.years.of.experience
)

# Author's result: correlation about 0.43, a moderate positive relationship.
# The value is a correlation coefficient, not a p-value.

cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Years.of.experience.in.Germany
)

# Author's note: no correlation between salary and experience in Germany.

cor(
  datatrain$Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
  datatrain$Yearly.bonus...stocks.in.EUR
)

ggplot(
  datatrain,
  aes(
    Yearly.brutto.salary..without.bonus.and.stocks..in.EUR,
    Yearly.bonus...stocks.in.EUR
  )
) +
  geom_smooth()

# Linear regression of the yearly gross salary on all numeric candidates.
# The long column names were split across lines in the original, which does
# not parse; they are rejoined here.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Years.of.experience.in.Germany +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country +
    Number.of.vacation.days +
    Age,
  data = datatrain
)

summary(reg1)

# Outliers seen in these plots are set to NA and omitted in the loading
# section above.
plot(reg1)

# Reduced model built from the results above.
reg1 <- lm(
  Yearly.brutto.salary..without.bonus.and.stocks..in.EUR ~
    Total.years.of.experience +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Yearly.bonus...stocks.in.EUR +
    Annual.bonus.stocks.one.year.ago..Only.answer.if.staying.in.same.country,
  data = datatrain
)

summary(reg1)

# Author's notes on the summary:
# F-statistic: p < 0.05, reject the null; the model is a good fit.
# R-squared = 0.41, i.e. the model explains about 41% of the variation in
# salary.

# Outliers were detected through the regression plots and removed.
plot(reg1)

# Author's note: variance inflation factor < 4 (small model, no
# multicollinearity).

# Randomness: residuals against their index. Author's note: the error term
# looks randomly distributed.
plot(reg1$residuals, seq_along(reg1$residuals))

# Normality of the error terms (Shapiro-Wilk test)
shapiro.test(reg1$residuals)

# Author's note: p < 0.05, reject the null, not normally distributed.

hist(reg1$residuals) # author's note: the histogram is close to normal


# LOGISTIC REGRESSION - HR data

library(readr)

# The original call had an empty trailing argument; it was dropped.
data1 <- read_csv("C:/Users/hp/Downloads/IMB533_HR_Data_No_Missing_Value.csv")

View(data1)

library(caTools)

library(caret)

library(ISLR)

# Remove the first two columns (unwanted ID columns).
data1 <- data1[, -c(1, 2)]

# read_csv() does not create factors. The outcome is made a factor so the
# binomial model is treated as classification and caret::confusionMatrix()
# can later compare predictions with the observed classes.
data1$Status <- factor(data1$Status)

# Create a 70% training subset.
set.seed(100)

split1 <- sample.split(data1$Status, SplitRatio = 0.7)

data1train <- subset(data1, split1 == TRUE)

data1test <- subset(data1, split1 == FALSE)

# The held-out test rows are used to check how well the trained model
# generalises.
str(data1train)

# Logistic regression of Status on all remaining columns.
library(caret)

lreg1 <- train(
  Status ~ .,
  method = "glm",
  family = "binomial",
  data = data1train
)

lreg1

summary(lreg1)

# Consider the variables marked with stars (***) together with some common
# sense. The sign of each estimate shows a positive or negative effect.

# Model fit --------

library(blorr)

library(Rcpp)

# AIC: the lower the value, the better the model. Look at the p-values too.
blr_model_fit_stats(lreg1$finalModel)

# Author's note: McFadden's R2 is 13.5%.

# Hosmer-Lemeshow test, H0: the model is a good fit. A p-value above 0.05
# means H0 is not rejected; a p-value below 0.05 indicates lack of fit.
# Author's note: the fit was judged acceptable, but R2 and the other values
# are not great.
blr_test_hosmer_lemeshow(lreg1$finalModel)

# Confusion matrix on the training data
blr_confusion_matrix(lreg1$finalModel, cutoff = 0.5)

# Choose the cutoff so that the sensitivity is high.
gaintable <- blr_gains_table(lreg1$finalModel)

blr_roc_curve(gaintable)

# Sensitivity should be high while giving equal weight to both classes
# (joining and not joining in this data).
# OR
# Read the cutoff from the graph; it can be 0.4, 0.5, 0.6 or 0.7. Pick a value
# for which both conditions are satisfied.
# Author's result: sensitivity 6%, specificity 98%.
# Problem statement: HR status - whether the candidate joins or does not join
# after an offer has been made.

lreg1$finalModel

# Predict on the test rows with the trained model.
predict1 <- predict(lreg1, data1test)

# Confusion matrix on the test data. confusionMatrix() needs both arguments
# as factors with the same levels, so the observed Status is aligned with the
# levels of the predictions.
confusionMatrix(predict1, factor(data1test$Status, levels = levels(predict1)))

# Stepwise logistic regression
# With many candidate variables, use forward, backward or both-direction
# selection based on AIC.

library(blorr)

# Forward selection. The original ran this call twice, once unassigned and
# once assigned; a single assigned call is kept and its result printed.
regforward <- blr_step_aic_forward(lreg1$finalModel, details = TRUE)

regforward

# AIC: the lower the value, the better the model.
plot(regforward)

# OR

# The original contained the truncated statements `blr_step_`,
# `regbackward<-blr` and used an undefined `regboth`. They are completed with
# the matching blorr functions.
# NOTE(review): confirm these are the calls the author intended.
regbackward <- blr_step_aic_backward(lreg1$finalModel, details = TRUE)

regboth <- blr_step_aic_both(lreg1$finalModel, details = TRUE)

plot(regboth)

regboth$model

summary(regboth$model)

# Refit the model with the selected variables.

library(caret)

# The HR training set of this script is `data1train`; the original passed
# `datatrain`, which belongs to a different data set.
lreg2 <- train(
  Status ~ Notice.period +
    Candidate.Source +
    Location +
    LOB +
    Age +
    Offered.band +
    DOJ.Extended +
    Rex.in.Yrs,
  method = "glm",
  family = "binomial",
  data = data1train
)

lreg2

lreg2$finalModel

summary(lreg2$finalModel)

gaintable <- blr_gains_table(lreg2$finalModel)

blr_roc_curve(gaintable)
# Logistic regression of Hospitalization on Age.

# Install the optional packages only when they are missing, and before they
# are attached. The original called install.packages() unconditionally after
# library().
if (!requireNamespace("blorr", quietly = TRUE)) {
  install.packages("blorr")
}

if (!requireNamespace("survey", quietly = TRUE)) {
  install.packages("survey")
}

library(readr)

# The original first read the file into `dataset` and converted
# data1$Hospitalization before data1 had been loaded from this file. Only the
# correct load-then-convert sequence is kept.
data1 <- read_csv("C:/Users/hp/Downloads/Sem4-R/sem 4.3/logistic 1.csv")

View(data1)

# Recode the 0/1 outcome as a labelled factor.
data1$Hospitalization <- factor(
  data1$Hospitalization,
  levels = c(0, 1),
  labels = c("No Admission", "Admission")
)

library(caret)

library(ISLR)

str(data1)

# The original formula used `ï..Age`, the byte-order-mark-mangled name that
# base read.csv() gives the first column. regTermTest() below refers to "Age",
# so the formula uses the same name.
# NOTE(review): assumes read_csv() strips the byte-order mark so that the
# column is called Age - check against the str(data1) output.
lreg1 <- train(
  Hospitalization ~ Age,
  method = "glm",
  family = "binomial",
  data = data1
)

lreg1

lreg1$finalModel

# Author's notes: the results say residual deviance and null deviance show no
# significant change; the higher the difference, the greater the effect of the
# independent variables on Y. Recorded coefficients: beta0 = -16.7,
# beta1 = 0.25.
# NOTE(review): the hard-coded value used below for the slope is 0.5769003,
# which does not match the 0.25 recorded here.

lreg1$finalModel$coefficients

# Odds ratios: exponentiated coefficients. The literal values are the ones the
# author copied from an earlier run.
exp(-16.719781)

exp(0.5769003)

# The same computed from the fitted model instead of hard-coded numbers.
exp(coef(lreg1$finalModel))

coef(lreg1$finalModel)

# Probability from the odds: odds / (1 + odds) = 1.78 / 2.78, about 0.64.
# The original line `1.78/2.78=0.64` is not valid R, since a value cannot be
# assigned to an expression.
1.78 / 2.78

# Fitted probabilities for every observation (author's note: look up the
# probability for age 28).
lreg1$finalModel$fitted.values

library(blorr)

library(Rcpp)

blr_model_fit_stats(lreg1$finalModel)

# Author's question: why is a t test used in regression?

library(survey)

regTermTest(lreg1$finalModel, "Age")

# Confusion matrix
blr_confusion_matrix(lreg1$finalModel)
#"Predicting the contract duration of an IT employee from the EU region as 'Temporary contract' or
#'Unlimited contract'".

#From the perspective of employees and employers, it is essential to know the contract
#duration of a particular candidate for a particular position in the organization.

#In this analysis, an attempt is made to determine the right contract based on a
#historical survey of existing employees across different parameters.

#This type of study is part of HR analytics, which can help HR teams set goals, measure success, and
#optimize procedures based on desired/expected skill sets and the skill sets offered by their applicants.

#The company may concentrate on training its personnel based on this analysis, which will help the
#company generate income in the future.

#This prediction is very helpful for aspiring IT candidates who want to predict their contract duration
#based on their skill sets, current salary and other variables.

#This can also be used by the HR department to determine the contract duration to be offered for an
#existing or new position in an organization.

# Load the required libraries and import the "IT Salary Survey EU" data set.

library(readxl)

library(descr)

library(caret)

library(ISLR)

# NOTE(review): the stray argument-less read.table() / data.frame() calls of
# the original were removed; read.table() errors without a `file` argument.

# The file path was broken across several lines inside the string literal in
# the original, which put line breaks into the path; it is restored to a
# single line.
data1 <- read.csv(
  "C:/Users/hp/Downloads/2027911 MLA Cia1/IT Salary Survey EU 2020...csv",
  stringsAsFactors = TRUE
)

sum(is.na(data1))

str(data1)

# Drop the columns unrelated to the problem. The original removed columns 23,
# 22, 21, 10, 9, 4 and 1 one at a time from right to left, which equals
# removing these original columns in one step.
data1 <- data1[-c(1, 4, 9, 10, 21, 22, 23)]

View(data1)

# Count the missing values, mark blank cells as missing, and remove every
# incomplete row so the dependent variable (contract duration) is complete.
sum(is.na(data1))

data1[data1 == ""] <- NA

data1 <- na.omit(data1)

summary(data1)

View(data1)

# Split the data set: about 70% of the rows for training, the rest for testing.
# The split is made on the Contract.duration column; the model is trained on
# one part and tested on the other.

library(caTools)

set.seed(100)

split1 <- sample.split(data1$Contract.duration, SplitRatio = 0.7)

summary(split1)

# 70:30 train/test split; the test rows are used to check how well the model
# generalises.

datatrain <- subset(data1, split1 == TRUE)

datatest <- subset(data1, split1 == FALSE)

summary(datatrain)

str(datatrain)

# Logistic regression of Contract.duration on all other columns.

library(caret)

lreg1 <- train(
  Contract.duration ~ .,
  method = "glm",
  family = "binomial",
  data = datatrain
)

lreg1

# Author's result: accuracy of 84%, error of 16%.

summary(lreg1)

# Consider the variables with a p-value below 0.05.
# Author's note: variables such as Position, seniority level, yearly salary
# and employment status have a positive impact on the dependent variable
# Contract.duration.
# The sign of each estimate shows the positive or negative effect of the
# independent variable on Y.

# Model fit --------

library(blorr)

library(Rcpp)

# AIC: the lower the value, the better the model. Look at the p-values too.
blr_model_fit_stats(lreg1$finalModel)

# Author's note: McFadden's R2 is 13.5%.
# NOTE(review): the same figure appears in the HR script; confirm it belongs
# to this model.

# Hosmer-Lemeshow test, H0: the model is a good fit. With a p-value above 0.05
# the null hypothesis is not rejected.
blr_test_hosmer_lemeshow(lreg1$finalModel)

# Author's conclusion: the model is a good fit.

# Choose the cutoff so that the sensitivity is high.
gaintable <- blr_gains_table(lreg1$finalModel)

blr_roc_curve(gaintable)

# The cutoff is chosen as 0.5 based on the plotted graph.

# Confusion matrix on the training data
blr_confusion_matrix(lreg1$finalModel, cutoff = 0.5)

# Sensitivity should be high while giving equal weight to both classes
# (temporary contract and unlimited contract in this data).
# Author's results on the training data: accuracy 97%; the independent
# variables are highly accurate in determining contract duration.
# The cutoff is taken from the graph as the midpoint, 0.5, a value for which
# both conditions are satisfied.
# Sensitivity 99%: the share of actual positive cases that were predicted
# correctly.
# Specificity 54%: the share of actual negative cases that were predicted
# correctly.

# Predict on the test rows with the trained model.
lreg1$finalModel

predict1 <- predict(lreg1, datatest)

# Confusion matrix on the test data
confusionMatrix(predict1, datatest$Contract.duration)

# Author's results on the test data: accuracy 86%.
# Sensitivity 30%: the share of actual positive cases predicted correctly.
# Specificity 89%: the share of actual negative cases predicted correctly.
# Logistic regression 2: Contract.duration on a reduced set of predictors.
# The long column name was split across two lines in the original, which does
# not parse; it is rejoined here.

library(caret)

lreg2 <- train(
  Contract.duration ~ Age +
    Gender +
    Position +
    Total.years.of.experience +
    Seniority.level +
    Yearly.bonus...stocks.in.EUR +
    Annual.brutto.salary..without.bonus.and.stocks..one.year.ago..Only.answer.if.staying.in.the.same.country +
    Number.of.vacation.days +
    Main.language.at.work +
    Employment.status,
  method = "glm",
  family = "binomial",
  data = datatrain
)

lreg2

# Author's result: accuracy of 83%, error of 17%.

summary(lreg2)

# Model fit test for the second model --------

library(blorr)

library(Rcpp)

# Hosmer-Lemeshow test, H0: the model is a good fit. With a p-value above 0.05
# the null hypothesis is not rejected.
blr_test_hosmer_lemeshow(lreg2$finalModel)

# Author's conclusion: the model is a good fit.

# Choose the cutoff so that the sensitivity is high.
gaintable <- blr_gains_table(lreg2$finalModel)

blr_roc_curve(gaintable)

# The cutoff is chosen as 0.5 based on the plotted graph.

# Confusion matrix on the training data
blr_confusion_matrix(lreg2$finalModel, cutoff = 0.5)

# Sensitivity should be high while giving equal weight to both classes
# (temporary contract and unlimited contract in this data).
# Author's results: accuracy 97%; the independent variables are highly
# accurate in determining contract duration.
# The cutoff is taken from the graph as the midpoint, 0.5, a value for which
# both conditions are satisfied.
# Sensitivity 99%: the share of actual positive cases predicted correctly.
# Specificity 54%: the share of actual negative cases predicted correctly.

# Models 1 and 2 have similar sensitivity and specificity, so either could be
# chosen. Stepwise logistic regression is therefore used next.

# Stepwise logistic regression.
# With a large number of candidate variables, forward or backward selection
# should be used.

library(blorr)

# Forward selection by AIC on the fitted glm of model 1, printing each step.
regforward <- blr_step_aic_forward(
  lreg1$finalModel,
  details = TRUE
)

# Plot the AIC values of the selection steps.
plot(regforward)

You might also like