20BCE1205 Lab3

20BCE1205
SHUBHAM OJHA
EDA
LAB3
2023-01-30
1. Compute the relationship between each variable (x1, x2, x3) and the
target (y) of the given dataset by finding the correlation. Out of the three
given variables, which one would you choose for modelling? Simulate the above
task using R code.
# Question 1 data: three candidate predictors (x1, x2, x3) and the target y,
# seven observations each.
df <- data.frame(
  x1 = c(1, 3, 4, 5, 6, 7, 8),
  x2 = c(8, 7, 5, 6, 4, 3, 2),
  x3 = c(8, 2, 4, 6, 3, 7, 5),
  y  = c(2, 5, 7, 11, 12, 15, 17)
)
df
## x1 x2 x3 y
## 1 1 8 8 2
## 2 3 7 2 5
## 3 4 5 4 7
## 4 5 6 6 11
## 5 6 4 3 12
## 6 7 3 7 15
## 7 8 2 5 17

# Pull every column out as a plain numeric vector for the hand calculations.
x1 <- df[["x1"]]
x2 <- df[["x2"]]
x3 <- df[["x3"]]
y <- df[["y"]]
#without inbuilt function

# Pearson correlation from its definition:
#   covariance  = sum((x - mean(x)) * (y - mean(y))) / (n - 1)
#   correlation = covariance / (sd(x) * sd(y))
# The (n - 1) factors cancel, so the coefficient is the sum of cross-products
# divided by the square root of the product of the two sums of squares.
manual_cor <- function(x, y) {
  dx <- x - mean(x)
  dy <- y - mean(y)
  sum(dx * dy) / sqrt(sum(dx^2) * sum(dy^2))
}

# Standard deviations as in the original script; the sums-of-squares form
# used by manual_cor() does not need them.
sdy <- sd(y)

# for x1 and y
sdx1 <- sd(x1)
correlation1 <- manual_cor(x1, y)
# for x2 and y
sdx2 <- sd(x2)
correlation2 <- manual_cor(x2, y)
# for x3 and y
sdx3 <- sd(x3)
correlation3 <- manual_cor(x3, y)

print("Without inbuilt method: ")
## [1] "Without inbuilt method: "
# Each message names the variable pair it actually reports.
paste("The value of correlation without inbuilt function between x1 and y is ",
      correlation1)
## [1] "The value of correlation without inbuilt function between x1 and y is  0.991610842427341"
paste("The value of correlation without inbuilt function between x2 and y is ",
      correlation2)
## [1] "The value of correlation without inbuilt function between x2 and y is  -0.937893697625288"
paste("The value of correlation without inbuilt function between x3 and y is ",
      correlation3)
## [1] "The value of correlation without inbuilt function between x3 and y is  0.0142105105700801"
cat("\n")
#with inbuilt function

# Same three coefficients via stats::cor() (Pearson by default), as a check on
# the hand calculation. Each message names the pair it reports.
print("With inbuilt method: ")
## [1] "With inbuilt method: "
paste("The value of correlation with inbuilt function between x1 and y is ",
      cor(df$x1, df$y))
## [1] "The value of correlation with inbuilt function between x1 and y is  0.991610842427341"
paste("The value of correlation with inbuilt function between x2 and y is ",
      cor(df$x2, df$y))
## [1] "The value of correlation with inbuilt function between x2 and y is  -0.937893697625288"
paste("The value of correlation with inbuilt function between x3 and y is ",
      cor(df$x3, df$y))
## [1] "The value of correlation with inbuilt function between x3 and y is  0.0142105105700801"

# Inference: |r| is largest for x1 (0.99), so x1 is the predictor to model with.
print(paste0("x1 and y has strong correlation among them ",
             "so we will be choosing x1 for modelling"))
## [1] "x1 and y has strong correlation among them so we will be choosing x1 for modelling"
library(ggplot2,ggpubr)
## Warning: package 'ggplot2' was built under R version 4.2.2
# NOTE(review): library() attaches one package per call; the second positional
# argument binds to `help`, so ggpubr is not attached by the line above.

# Least-squares fit of y on x1, the predictor chosen from the correlations.
# The data frame has no column named `x` (`df$x` is NULL), so the column must
# be referenced as `df$x1`; otherwise every result below is NaN/NA or empty.
ux <- mean(df$x1)
uy <- mean(df$y)
sxx <- sum((df$x1 - ux)^2)                      # sum of squares of x1
b1 <- sum((df$x1 - ux) * (df$y - uy)) / sxx     # slope
b0 <- uy - b1 * ux                              # intercept
print(paste("y = ",b0," + ",b1," x"))
# prints the fitted line, approximately y = -0.9918 + 2.2336 x
#13
tss <- sum((df$y - uy)^2)                       # total sum of squares
tss
## [1] 176.8571
pred <- b0 + b1 * df$x1                         # fitted values
pred
# approximately 1.24 5.71 7.94 10.18 12.41 14.64 16.88
rss <- sum((pred - df$y)^2)                     # residual sum of squares
r2 <- 1 - (rss / tss)
print(paste("R Square = ",r2))
# approximately 0.9833 (the square of the x1-y correlation, 0.9916)
#14
e <- pred - df$y
e
# approximately -0.76 0.71 0.94 -0.82 0.41 -0.36 -0.12
# Residual standard error: two coefficients are estimated, so the residual
# sum of squares is divided by n - 2 (sd() would divide by n - 1).
sde <- sqrt(rss / (nrow(df) - 2))
sde
# approximately 0.7688
seb0 <- sde * sqrt((1 / nrow(df)) + (ux^2 / sxx))
seb1 <- sde * sqrt(1 / sxx)
print(paste("Standard Error for B0 = ",seb0," Standard Error for B1 = ",seb1))
# approximately 0.6960 for B0 and 0.1302 for B1
Question 2. Consider the following five training examples:
X = [2 3 4 5 6]
Y = [12 17 23 28 32]
Write the R script for the following and give your inference.
(a) Find the best linear fit (Y = aX + b).
# Training data for question 2 and its ordinary least-squares line.
x <- c(2, 3, 4, 5, 6)
y <- c(12, 17, 23, 28, 32)
model <- lm(formula = y ~ x)
model
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 2.0 5.1
#(b) Plot the graph of the model
# Collect the training points in a data frame so ggplot can map its columns.
dt <- data.frame(X = x, Y = y)
dt
## X Y
## 1 2 12
## 2 3 17
## 3 4 23
## 4 5 28
## 5 6 32
# Scatter of the observations with the least-squares line drawn over them.
ggplot(data = dt, mapping = aes(x = X, y = Y)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#(c) Determine the minimum RSS
# The least-squares fit minimises the residual sum of squares, so the minimum
# RSS is the sum of the model's squared residuals. residuals(model) is used
# rather than predict(model, dt): `dt` has columns X and Y but the model term
# is `x`, so passing it as newdata would not be what drives the prediction.
print(sum(residuals(model)^2))
## [1] 1.1
#OR
# deviance() of an lm fit returns the same residual sum of squares.
rss <- deviance(model)
rss
## [1] 1.1
#(d) Draw the residual plot for the best linear fit and comment on the
# suitability of the linear model to this training data.
library(ggplot2,caTools,dplyr)
# NOTE(review): library() attaches one package per call; here caTools and dplyr
# bind to the `help` and `pos` parameters and are not attached.

# plot.lm()'s second argument is `which` (the diagnostic panel to draw), not a
# numeric summary such as the RSS; which = 1 is the residuals-vs-fitted panel.
plot(model, which = 1)
abline(h = 0)  # zero-residual reference line
# Reading the plot: the residuals of this fit (-0.2, -0.3, 0.6, 0.5, -0.6) are
# small next to Y (12 to 32), which supports a linear model, though five
# points are too few to rule out mild curvature.
#(e) Evaluate the standard errors associated with a and b.
# Refit the line by hand: b1 is the slope (a), b0 the intercept (b).
ux <- mean(dt$X)
uy <- mean(dt$Y)
sxx <- sum((dt$X - ux)^2)                       # sum of squares of X
b1 <- sum((dt$X - ux) * (dt$Y - uy)) / sxx
b0 <- uy - b1 * ux
b0
## [1] 2
pred <- b0 + b1 * dt$X
pred
## [1] 12.2 17.3 22.4 27.5 32.6
e <- dt$Y - pred
e
## [1] -0.2 -0.3 0.6 0.5 -0.6
# Residual standard error: the line has two estimated coefficients, so the
# squared residuals are averaged over n - 2 degrees of freedom, not n.
# Dividing by n understates both standard errors and makes them disagree with
# the intervals reported by confint() for the same fit.
sde <- sqrt(sum(e^2) / (nrow(dt) - 2))
sde
## [1] 0.6055301
seb0 <- sde * sqrt((1 / nrow(dt)) + (ux^2 / sxx))
seb1 <- sde * sqrt(1 / sxx)
print(paste("Standard Error for B0 = ",seb0," Standard Error for B1 = ",seb1))
# approximately 0.8124 for B0 and 0.1915 for B1
#(f) Determine the 95% confidence interval for a and b
# cat() is used for the heading because print() would show the "\n" escape
# literally instead of emitting a line break.
cat("Confidence Interval- \n")
## Confidence Interval-
# Rows are the intercept (b) and the slope on x (a).
confint(model, level=0.95)
## 2.5 % 97.5 %
## (Intercept) -0.5854316 4.585432
## x 4.4906079 5.709392
#(g) Compute R2 statistic
# R^2 = 1 - RSS / TSS, with TSS the squared deviations of y about its mean.
total_ss <- sum((y - mean(y))^2)
RSQUARE <- 1 - (rss / total_ss)
cat("RSQUARE - ", RSQUARE, "\n")
## RSQUARE - 0.9957887
# Alternative: read the same statistic from the model summary.
#summary(model)$r.squared
#(h) Predict the value of a test instance from the dataset
# The newdata column must carry the model's term name (`x`). With a column
# named `X`, predict() does not find the predictor in newdata and the supplied
# values are not what gets predicted.
p <- data.frame(x = x)
y2 <- predict(model, newdata = p)
y2
## 1 2 3 4 5
## 12.2 17.3 22.4 27.5 32.6
Question 3. Apply linear regression analysis on any prominent dataset and
state the inferences.
#mtcars
# Load the built-in mtcars data and preview its first rows.
data("mtcars")
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

# Regress fuel economy (mpg) on weight (wt) and report the fit.
input <- mtcars
fit <- lm(formula = mpg ~ wt, data = input)
summary(fit)
##
## Call:
## lm(formula = mpg ~ wt, data = input)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
# Plot the relationship that was modelled: the fit is mpg ~ wt, so weight (the
# predictor) goes on the x-axis and mpg (the response) on the y-axis. With the
# axes the other way round, geom_smooth() would draw the regression of wt on
# mpg, which is a different line from the fitted model.
ggplot(input, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Inference for question 3. The message is assembled from whole-word pieces
# joined by single spaces, so no line break lands inside a word of the text.
print(paste(
  "The relationship between mpg (miles per gallon) and wt (weight of the car)",
  "in the mtcars dataset is typically negative, meaning that as the weight of",
  "the car increases, the miles per gallon decreases. This makes sense as",
  "heavier cars generally require more fuel to operate and therefore have",
  "lower fuel efficiency"
))
## [1] "The relationship between mpg (miles per gallon) and wt (weight of the
## car) in the mtcars dataset is typically negative, meaning that as the weight
## of the car increases, the miles per gallon decreases. This makes sense as
## heavier cars generally require more fuel to operate and therefore have lower
## fuel efficiency"

20BCE1205 Lab3

Uploaded by

Copyright:

Available Formats

You might also like

20BCE1205 Lab3

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

20BCE1205 Lab3

Uploaded by

Copyright:

Available Formats

20BCE1205

#without inbuilt function

print("Without inbuilt method: ")

## [1] "Without inbuilt method: "

paste("The value of correlation without inbuilt function between x1 and y is

## [1] "The value of correlation without inbuilt function between x1 and y is

paste("The value of correlation without inbuilt function between x1 and y is

## [1] "The value of correlation without inbuilt function between x1 and y is

paste("The value of correlation without inbuilt function between x1 and y is

## [1] "The value of correlation without inbuilt function between x1 and y is

#with inbuilt function

## [1] "With inbuilt method: "

paste("The value of correlation with inbuilt function between x1 and y is ",c

## [1] "The value of correlation with inbuilt function between x1 and y is 0

paste("The value of correlation with inbuilt function between x1 and y is ",c

## [1] "The value of correlation with inbuilt function between x1 and y is -

paste("The value of correlation with inbuilt function between x1 and y is ",c

print("x1 and y has strong correlation among them so we will be chossing x1 f

## Warning: package 'ggplot2' was built under R version 4.2.2

## Warning in mean.default(df$x): argument is not numeric or logical: returni

## [1] "y = NaN + NaN x"

## [1] "R Square = 1"

## [1] "Standard Error for B0 = NA Standard Error for B1 = NA"

Question 2. Consider the following five training examples # X = [2 3 4 5 6] # Y = [12

#(b) Plot the graph of the model

ggplot(dt, aes(X, Y)) +

## `geom_smooth()` using formula = 'y ~ x'

## [1] 12.2 17.3 22.4 27.5 32.6

## [1] -0.2 -0.3 0.6 0.5 -0.6

print(paste("Standard Error for B0 = ",seb0," Standard Error for B1 = ",seb1)

## [1] "Standard Error for B0 = 0.629285308902089 Standard Error for B1 =

#(f) Determine the 95% confidence interval for a and b

print("Confidence Interval- \n")

## [1] "Confidence Interval- \n"

#(g) Compute R2 statistic

#(h) Predict the value of a test instance from the dataset

Question 3- Apply linear regression analysis on any prominent dataset

ggplot(input, aes(mpg, wt)) +

## `geom_smooth()` using formula = 'y ~ x'

You might also like