
# PANGANTIHON, Norman O.

STT061-M14

# Activity on Regression Analysis

# Your task is to consider the Anscombe data set again

# These are your new problems

# Problem 1: Encode the Anscombe data set using Excel (Save the file)

# Problem 2: Export the xls file into a csv file (use the Export

# command in Excel's File menu)

# csv means comma separated values

# Problem 3: Use R command to load the csv file in R

homedir <- "~/NORMAN/STT061/"

setwd(homedir)

Anscombe <- read.csv("Anscombe.csv")
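
# If the csv file is not at hand, note that R also ships the same
# data as the built-in data set datasets::anscombe (with lower-case
# column names x1..x4, y1..y4). A minimal fallback sketch, assuming
# only base R:

if (!file.exists("Anscombe.csv")) {
  Anscombe <- datasets::anscombe              # built-in copy of the quartet
  names(Anscombe) <- toupper(names(Anscombe)) # rename x1..y4 to X1..Y4
}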

# Problem 4: Use R command to compute the mean of X1,X2,X3,X4

mean(Anscombe$X1)

mean(Anscombe$X2)

mean(Anscombe$X3)

mean(Anscombe$X4)
# Problem 5: Use R command to compute the mean of Y1,Y2,Y3,Y4

mean(Anscombe$Y1)

mean(Anscombe$Y2)

mean(Anscombe$Y3)

mean(Anscombe$Y4)
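
# A more compact alternative (same results as the individual calls
# above) is to apply mean() over all eight columns at once with
# base R's sapply():

sapply(Anscombe[c("X1","X2","X3","X4","Y1","Y2","Y3","Y4")], mean)

# All four X means are 9 and all four Y means are about 7.5, a first
# hint that the four data sets look alike numerically.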

# Problem 6: Use R command to compute the variance of Y1,Y2,Y3,Y4

var(Anscombe$Y1)

var(Anscombe$Y2)

var(Anscombe$Y3)

var(Anscombe$Y4)

# Problem 7: Use R command to compute the variance of X1,X2,X3,X4

var(Anscombe$X1)

var(Anscombe$X2)

var(Anscombe$X3)

var(Anscombe$X4)

# Problem 8: Use R command to compute the sd of Y1,Y2,Y3,Y4

sd(Anscombe$Y1)

sd(Anscombe$Y2)

sd(Anscombe$Y3)

sd(Anscombe$Y4)
# Problem 9: Use R command to compute the sd of X1,X2,X3,X4

sd(Anscombe$X1)

sd(Anscombe$X2)

sd(Anscombe$X3)

sd(Anscombe$X4)

# Problem 10: Use R command to compute the correlation (X1,Y1),

# and also for (X2,Y2), (X3,Y3), (X4,Y4).

cor(Anscombe$X1,Anscombe$Y1)

cor(Anscombe$X2,Anscombe$Y2)

cor(Anscombe$X3,Anscombe$Y3)

cor(Anscombe$X4,Anscombe$Y4)
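
# The four correlations can also be computed in one loop; they agree
# to three decimal places (about 0.816), which is the whole point of
# the Anscombe quartet: near-identical summary statistics can hide
# very different data. A short base-R sketch:

for (i in 1:4) {
  r <- cor(Anscombe[[paste0("X", i)]], Anscombe[[paste0("Y", i)]])
  cat("cor(X", i, ", Y", i, ") = ", round(r, 7), "\n", sep = "")
}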

# Establish causal relationship by validating the four assumptions

# Problem 11: Build the Simple Linear Regression Model for (X1, Y1).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y1 ~ X1, data = Anscombe) # create the linear regression model
plot(Anscombe$X1,Anscombe$Y1) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X1,Anscombe$Y1) # get correlation coefficient

# correlation = 0.8164205 - this value is high

# I can say that there is a strong linear relationship between X1 and Y1

# Findings for Assumption 1:

# The linearity of the data is satisfied

# because the correlation is high

# and the scatter plot shows that

# as X1 increases, Y1 also increases.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: Independence of error terms is satisfied.

# In the residuals versus fits plot,

# the points seem randomly scattered,

# and it does not appear that there is a pattern.

# Also, there is a red line which is

# approximately horizontal at 0.
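
# A formal complement to the residuals-versus-fits plot is the
# Durbin-Watson test for autocorrelation of the error terms.
# A minimal sketch, assuming the lmtest package is installed
# (it is not required anywhere else in this activity):

library(lmtest)

dwtest(model) # H0: no autocorrelation; a large p-value supports independence
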
# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y1,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y1),col=2,lwd = 3)

# Findings: it appears that the distribution

# is approximately normal

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y1)

kurtosis(Anscombe$Y1)

# skew = -0.065 > -0.5 indicating nearly symmetrical data

# Kurtosis = -0.535 < 0 indicating that the curve is flatter than normal

# Findings: The distribution of the response variable is nearly symmetric

# Conclusion: The distribution of the response variable is approximately normal

# Perform Shapiro-Wilk test for Normality


shapiro.test(Anscombe$Y1)

# W = 0.97693, p-value = 0.9467

# The p-value of the test turns out to be 0.9467.

# Since this value is greater than .05, we fail to reject the null

# hypothesis: there is no evidence against normality, so it is

# reasonable to treat the sample data as coming from a population

# that is normally distributed.
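
# The same test can be applied to all four response columns in one
# call, which makes the contrast across the quartet easy to see
# (the p-values match the ones reported per problem in this activity):

sapply(Anscombe[c("Y1","Y2","Y3","Y4")], function(y) shapiro.test(y)$p.value)

# Only Y1 (clearly) and Y4 (marginally) give p-values above .05.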

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y1

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 81.82/100/100 which satisfies the empirical rule

# and there is a validation that the curve is symmetric

# using the histogram skewness and kurtosis.

# Therefore the result about the actual empirical coverage


# should be considered

# because it has already been established that the curve is nearly symmetric.

# Findings: the Empirical Normality rule is satisfied

# Decision: The result of Empirical rule is valid

# since the conditions are satisfied and

# the shape of the distribution is nearly symmetric.
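
# Since this empirical-rule check is repeated for Y2, Y3, and Y4
# below, it can be wrapped in a small helper. A sketch; the function
# name empirical_coverage is my own label, not from any package:

empirical_coverage <- function(x) {
  within <- sapply(1:3, function(k)
    round(sum(abs(x - mean(x)) < k * sd(x)) * (100 / length(x)), 2))
  paste0("Actual Percentage: ", paste(within, collapse = "/"))
}

empirical_coverage(Anscombe$Y1) # same output as the step-by-step version above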

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: residuals seem to fall along the straight line.

# This means the error terms are normally distributed.

# Final Conclusion about normality of error terms:

# Assumption is satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot


plot(model,3)

# Findings: There is constant variance of Y for each X value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X1, Y1)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X1, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit. Its width stays roughly the same

# from the smallest to the largest value of x.

# This suggests that the variance of Y is constant along the values

# of x.

# Conclusion: Assumption 4 is satisfied
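
# A formal counterpart to the scale-location plot is the
# Breusch-Pagan test for non-constant error variance. A minimal
# sketch, again assuming the lmtest package:

library(lmtest)

bptest(model) # H0: constant error variance; a large p-value supports Assumption 4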

# ---------------------------------------------------------

# Final Conclusion: All 4 assumptions are satisfied.

# Therefore the simple linear regression model is


# appropriate for the data. This means that we do not have

# to look for other methods to improve the model.

# Validation of the assumptions showed that the Simple linear regression

# model is valid to represent the causal relationship

# between the X1 variable and the Y1 variable.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 12: Build the Simple Linear Regression Model for (X2, Y2).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y2 ~ X2, data = Anscombe) # create the linear regression model
plot(Anscombe$X2,Anscombe$Y2) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X2,Anscombe$Y2) # get correlation coefficient

# correlation = 0.8162365 - this value is high

# Findings for Assumption 1:

# Although the correlation is high, the scatter plot shows a

# clear curved (non-linear) pattern, so the linearity of the

# data is not satisfied for this pair.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The residuals versus fits plot shows a strong

# arched pattern, so the independence of error terms

# is not satisfied.

# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y2,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y2),col=2,lwd = 3)

# Findings: the distribution appears left-skewed rather than normal

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y2) # skew = -1.316 < -1 indicating strong skewness

# on the left side

kurtosis(Anscombe$Y2) # kurtosis = 0.846 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is strongly left-skewed.

# Conclusion: The distribution of the response variable is not normal.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y2)

# W = 0.82837, p-value = 0.02222

# The p-value of the test turns out to be 0.02222.

# Since this value is less than .05, we reject the null hypothesis:

# the sample data do not appear to come from a population

# that is normally distributed.

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7


# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y2

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 100/100/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: the points appear to deviate systematically from the straight line.

# Decision: the error terms do not appear normally distributed.

# Final Conclusion about normality of error terms: Assumption is not satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: No constant variance of Y for each

# x value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X2, Y2)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X2, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit.

# Conclusion: Assumption 4 is not satisfied.

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X2, Y2);

# the curvature suggests the relationship is not linear.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 13: Build the Simple Linear Regression Model for (X3, Y3).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model


model <- lm(Y3 ~ X3, data = Anscombe) # create the linear regression model

plot(Anscombe$X3,Anscombe$Y3) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X3,Anscombe$Y3) # get correlation coefficient

# correlation = 0.8162867 - this value is high

# Findings for Assumption 1:

# The scatter plot shows the points lying almost exactly on a

# straight line except for a single outlier, so linearity holds

# for most of the data but is distorted by that outlier.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The residuals show a systematic pattern around one

# large outlier, so the independence of error terms is questionable.

# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable


# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y3,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y3),col=2,lwd = 3)

# Findings: the distribution appears right-skewed because of the outlier

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y3) # skew = 1.855 > 1 indicating strong skewness

# on the right side

kurtosis(Anscombe$Y3) # kurtosis = 4.384 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is strongly right-skewed.

# Conclusion: The distribution of the response variable is not normal.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y3)

# W = 0.83361, p-value = 0.02604

# The p-value of the test turns out to be 0.02604.

# Since this value is less than .05, we reject the null hypothesis:

# the sample data do not appear to come from a population

# that is normally distributed.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y3

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)


qqnorm(model$residuals)

qqline(model$residuals)

# Findings: most points fall near the line, but the outlier departs far from it.

# Decision: the error terms do not appear normally distributed.

# Final Conclusion about normality of error terms: Assumption is not satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: No constant variance of Y for each

# x value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X3, Y3)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X3, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit.

# Conclusion: Assumption 4 is not satisfied.

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X3, Y3);

# the outlier should be investigated first.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 14: Build the Simple Linear Regression Model for (X4, Y4).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y4 ~ X4, data = Anscombe) # create the linear regression model

plot(Anscombe$X4,Anscombe$Y4) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X4,Anscombe$Y4) # get correlation coefficient

# correlation = 0.8165214 - this value is high

# Findings for Assumption 1:

# The scatter plot shows that all but one observation share the

# same X4 value; the apparent linear relationship is produced by a

# single high-leverage point, so linearity is not genuinely supported.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The fitted values take only two distinct values and

# the plot is dominated by the single high-leverage point, so the

# independence of error terms cannot be meaningfully assessed.

# Step 4: Verify Assumption 3: Normality of Error terms.


# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y4,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y4),col=2,lwd = 3)

# Findings: the distribution appears right-skewed because of one extreme value

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y4) # skew = 1.507 > 1 indicating strong skewness

# on the right side

kurtosis(Anscombe$Y4) # kurtosis = 3.151 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is right-skewed,

# driven by one extreme value.

# Conclusion: The distribution does not look normal from the histogram.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y4)
# W = 0.87536, p-value = 0.09081

# The p-value of the test turns out to be 0.09081.

# Since this value is greater than .05, we fail to reject the null

# hypothesis, although with only 11 observations the test has

# little power to detect the visible skew.

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y4

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: the residuals fall fairly close to the straight line.

# Decision: the QQ plot does not clearly contradict normality of the error terms.

# Final Conclusion about normality of error terms: the evidence is

# mixed (Shapiro-Wilk does not reject, but the response is visibly

# skewed), so the assumption is doubtful.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: the spread is driven by the single high-leverage point,

# so constant variance cannot be confirmed.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)
ggplot(augment_model, aes(X4, Y4)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X4, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the standard error of the fit.

# Conclusion: Assumption 4 is not satisfied

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X4, Y4):

# the fit depends entirely on one high-leverage point.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Finally, which pair is suitable for Simple Linear Regression Analysis?

# Short Answer: Only the pair (X1, Y1) satisfies all four assumptions,

# so it is the only pair suitable for Simple Linear Regression Analysis.

# ---------------------------------------------------------
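
# To see why the assumption checks matter, fit all four models side
# by side: the estimated coefficients are practically identical
# (intercept close to 3, slope close to 0.5) even though only the
# first pair passes the checks. A short base-R sketch:

for (i in 1:4) {
  m <- lm(Anscombe[[paste0("Y", i)]] ~ Anscombe[[paste0("X", i)]])
  cat("Set ", i, ": intercept = ", round(coef(m)[1], 3),
      ", slope = ", round(coef(m)[2], 3), "\n", sep = "")
}

# Identical-looking fits with very different underlying data:
# the coefficients alone cannot identify the suitable pair.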
