Professional Documents
Culture Documents
20BCE1205 Lab3
20BCE1205 Lab3
20BCE1205 Lab3
SHUBHAM OJHA
EDA
LAB3
2023-01-30
1. Compute the relationship that exists between each variable (x1, x2, x3) and the
target (y) of the given dataset by finding the correlation. Out of the three given
variables, which one would you choose for modelling? Simulate the above task
using R code.
# Exercise data: three candidate predictors (x1, x2, x3) and the target y.
df <- data.frame(
  x1 = c(1, 3, 4, 5, 6, 7, 8),
  x2 = c(8, 7, 5, 6, 4, 3, 2),
  x3 = c(8, 2, 4, 6, 3, 7, 5),
  y = c(2, 5, 7, 11, 12, 15, 17)
)
df
## x1 x2 x3 y
## 1 1 8 8 2
## 2 3 7 2 5
## 3 4 5 4 7
## 4 5 6 6 11
## 5 6 4 3 12
## 6 7 3 7 15
## 7 8 2 5 17
x1 <- df$x1
x2 <- df$x2
x3 <- df$x3
y <- df$y

# Pearson correlation of each predictor with y. cor() computes
# sum((x - mean(x)) * (y - mean(y))) / sqrt(sum((x - mean(x))^2) * sum((y - mean(y))^2)),
# i.e. the same quantity the script previously spelled out by hand.

# for x1 and y (needed to justify choosing x1 for modelling)
correlation1 <- cor(x1, y)

# for x2 and y
sdx2 <- sd(x2)
correlation2 <- cor(x2, y)

# for x3 and y
sdx3 <- sd(x3)
correlation3 <- cor(x3, y)
cat("\n")
## [1] "x1 and y have a strong correlation between them, so we will be choosing x1
## for modelling"
# library() attaches exactly one package per call: its second positional
# argument is `help`, so `library(ggplot2, ggpubr)` loaded ggplot2 only and
# silently ignored ggpubr. Attach each package with its own call.
library(ggplot2)
library(ggpubr)
# Simple linear regression of y on x1, computed by hand from `df`.
# `df` has columns x1, x2, x3 and y but no column `x`; `df$x` partial-matches
# all three predictors ambiguously and therefore returns NULL, which made
# every downstream result NA / numeric(0). Use the chosen predictor x1.
ux <- mean(df$x1)
uy <- mean(df$y)
sxx <- sum((df$x1 - ux)^2) # sum of squared deviations of the predictor
b1 <- sum((df$x1 - ux) * (df$y - uy)) / sxx # slope
b0 <- uy - b1 * ux # intercept
print(paste("y = ",b0," + ",b1," x"))
#13
# Total sum of squares of the response.
tss <- sum((df$y - uy)^2)
tss
## [1] 176.8571
# Fitted values on the training data.
pred <- b0 + b1 * df$x1
pred
# approx. 1.24 5.71 7.94 10.18 12.41 14.64 16.88
rss <- sum((pred - df$y)^2)
r2 <- 1 - (rss / tss)
print(paste("R Square = ",r2))
#14
# Residuals (fitted minus observed; the sign does not affect the errors below).
e <- pred - df$y
e
# approx. -0.76 0.71 0.94 -0.82 0.41 -0.36 -0.12
# Residual standard error: two coefficients are estimated, so the residual
# sum of squares is divided by n - 2 (sd(e) would divide by n - 1).
sde <- sqrt(sum(e^2) / (nrow(df) - 2))
sde
# approx. 0.7688
seb0 <- sde * sqrt(1 / nrow(df) + ux^2 / sxx)
seb1 <- sde * sqrt(1 / sxx)
print(paste("Standard Error for B0 = ",seb0," Standard Error for B1 = ",seb1))
# Exercise data and model. The assignments of `x`, `y` and `model` are absent
# from this file although the statements below use them; they are restored
# here from the echoed output (the printed `dt` table and the
# `lm(formula = y ~ x)` call with coefficients 2.0 and 5.1).
x <- c(2, 3, 4, 5, 6)
y <- c(12, 17, 23, 28, 32)
# Least-squares fit of y on x.
model <- lm(y ~ x)
model
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 2.0 5.1
dt <- data.frame(X = x, Y = y)
dt
## X Y
## 1 2 12
## 2 3 17
## 3 4 23
## 4 5 28
## 5 6 32
# Residual sum of squares from the model residuals ...
rss <- sum(residuals(model)^2)
rss
## [1] 1.1
#OR
# ... or equivalently via deviance(), which for an lm fit is the RSS.
rss <- deviance(model)
rss
## [1] 1.1
#(d) Draw the residual plot for the best linear fit and comment on the
# suitability of the linear model to this training data.
# library() attaches only its first argument as a package (the following
# positional arguments are `help` and `pos`), so each package needs its own call.
library(ggplot2)
library(caTools)
library(dplyr)
# The second argument of plot() for an lm fit is `which` (the diagnostic panel
# to draw). Passing `rss` only worked because its value happened to truncate
# to 1; request the residuals-vs-fitted panel explicitly.
plot(model, which = 1)
# Horizontal reference line at residual = 0.
abline(h = 0)
#(e) Evaluate the standard errors associated with a and b.
# Hand computation of the least-squares line Y = b0 + b1 * X on `dt`.
ux <- mean(dt$X)
uy <- mean(dt$Y)
sxx <- sum((dt$X - ux)^2) # sum of squared deviations of the predictor
b1 <- sum((dt$X - ux) * (dt$Y - uy)) / sxx
b0 <- uy - b1 * ux
b0
## [1] 2
pred <- b0 + b1 * dt$X
pred
e <- dt$Y - pred
e
# Residual standard error. Two coefficients are estimated, so the residual
# sum of squares is divided by n - 2, not n; dividing by n understated the
# standard errors relative to what lm()/confint() report for the same fit.
sde <- sqrt(sum(e^2) / (nrow(dt) - 2))
sde
## [1] 0.6055301
# Standard errors of the intercept (a) and the slope (b).
seb0 <- sde * sqrt(1 / nrow(dt) + ux^2 / sxx)
seb1 <- sde * sqrt(1 / sxx)
# 95% confidence intervals for the intercept and slope of `model`.
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -0.5854316 4.585432
## x 4.4906079 5.709392
# Coefficient of determination: 1 - RSS / TSS.
RSQUARE <- 1 - (rss / sum((y - mean(y))^2))
cat("RSQUARE - ",RSQUARE,"\n")
## RSQUARE - 0.9957887
#or
#summary(model)$r.squared
# The column in `newdata` must carry the predictor's name from the model
# formula (`x`). With a column called `X`, predict() cannot find the predictor
# in `newdata` and silently falls back to the global `x`, ignoring `newdata`.
p <- data.frame(x = x)
y2 <- predict(model, newdata = p)
y2
## 1 2 3 4 5
## 12.2 17.3 22.4 27.5 32.6
# Regress fuel economy (mpg) on vehicle weight (wt) using the built-in
# mtcars data set, then show the full model summary.
input <- mtcars
fit <- lm(formula = mpg ~ wt, data = input)
summary(fit)
##
## Call:
## lm(formula = mpg ~ wt, data = input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
## [1] "The relationship between mpg (miles per gallon) and wt (weight of the
## car) in the mtcars dataset is negative, meaning that as the weight of the
## car increases, the miles per gallon decreases. This makes sense, as heavier
## cars generally require more fuel to operate and therefore have lower fuel
## efficiency."