
# PANGANTIHON, Norman O.

STT061-M14

# Activity on Regression Analysis

# Your task is to consider the Anscombe data set again

# These are your new problems

# Problem 1: Encode the Anscombe data set using Excel (Save the file)

# Problem 2: Export the xls file into a csv file (use the Export

# command in Excel's File menu)

# csv means comma separated values

# Problem 3: Use R command to load the csv file in R

homedir <- "~/NORMAN/STT061/"

setwd(homedir)

Anscombe <- read.csv("Anscombe.csv")
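
# If the csv file is not at hand, note that R also ships the same
# data as the built-in data set datasets::anscombe (with lower-case
# column names x1..x4, y1..y4). A minimal fallback sketch, assuming
# only base R:

if (!file.exists("Anscombe.csv")) {
  Anscombe <- datasets::anscombe              # built-in copy of the quartet
  names(Anscombe) <- toupper(names(Anscombe)) # rename x1..y4 to X1..Y4
}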

# Problem 4: Use R command to compute the mean of X1,X2,X3,X4

mean(Anscombe$X1)

mean(Anscombe$X2)

mean(Anscombe$X3)

mean(Anscombe$X4)
# Problem 5: Use R command to compute the mean of Y1,Y2,Y3,Y4

mean(Anscombe$Y1)

mean(Anscombe$Y2)

mean(Anscombe$Y3)

mean(Anscombe$Y4)
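
# A more compact alternative (same results as the individual calls
# above) is to apply mean() over all eight columns at once with
# base R's sapply():

sapply(Anscombe[c("X1","X2","X3","X4","Y1","Y2","Y3","Y4")], mean)

# All four X means are 9 and all four Y means are about 7.5, a first
# hint that the four data sets look alike numerically.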

# Problem 6: Use R command to compute the variance of Y1,Y2,Y3,Y4

var(Anscombe$Y1)

var(Anscombe$Y2)

var(Anscombe$Y3)

var(Anscombe$Y4)

# Problem 7: Use R command to compute the variance of X1,X2,X3,X4

var(Anscombe$X1)

var(Anscombe$X2)

var(Anscombe$X3)

var(Anscombe$X4)

# Problem 8: Use R command to compute the sd of Y1,Y2,Y3,Y4

sd(Anscombe$Y1)

sd(Anscombe$Y2)

sd(Anscombe$Y3)

sd(Anscombe$Y4)
# Problem 9: Use R command to compute the sd of X1,X2,X3,X4

sd(Anscombe$X1)

sd(Anscombe$X2)

sd(Anscombe$X3)

sd(Anscombe$X4)

# Problem 10: Use R command to compute the correlation (X1,Y1),

# and also for (X2,Y2), (X3,Y3), (X4,Y4).

cor(Anscombe$X1,Anscombe$Y1)

cor(Anscombe$X2,Anscombe$Y2)

cor(Anscombe$X3,Anscombe$Y3)

cor(Anscombe$X4,Anscombe$Y4)
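
# The four correlations can also be computed in one loop; they agree
# to three decimal places (about 0.816), which is the whole point of
# the Anscombe quartet: near-identical summary statistics can hide
# very different data. A short base-R sketch:

for (i in 1:4) {
  r <- cor(Anscombe[[paste0("X", i)]], Anscombe[[paste0("Y", i)]])
  cat("cor(X", i, ", Y", i, ") = ", round(r, 7), "\n", sep = "")
}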

# Establish causal relationship by validating the four assumptions

# Problem 11: Build the Simple Linear Regression Model for (X1, Y1).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y1 ~ X1, data = Anscombe) # create the linear regression model
plot(Anscombe$X1,Anscombe$Y1) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X1,Anscombe$Y1) # get correlation coefficient

# correlation = 0.8164205 - this value is high

# I can say that there is a strong linear relationship between X1 and Y1

# Findings for Assumption 1:

# The linearity of the data is satisfied

# because the correlation is high

# and the scatter plot shows that

# as X1 increases, Y1 also increases.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: Independence of error terms is satisfied.

# In the residuals versus fits plot,

# the points seem randomly scattered,

# and it does not appear that there is a pattern.

# Also, there is a red line which is

# approximately horizontal at 0.
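
# A formal complement to the residuals-versus-fits plot is the
# Durbin-Watson test for autocorrelation of the error terms.
# A minimal sketch, assuming the lmtest package is installed
# (it is not required anywhere else in this activity):

library(lmtest)

dwtest(model) # H0: no autocorrelation; a large p-value supports independence
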
# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y1,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y1),col=2,lwd = 3)

# Findings: it appears that the distribution

# is approximately normal

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y1)

kurtosis(Anscombe$Y1)

# skew = -0.065 > -0.5 indicating nearly symmetrical data

# Kurtosis = -0.535 < 0 indicating that the curve is flatter than normal

# Findings: The distribution of the response variable is nearly symmetric

# Conclusion: The distribution of the response variable is approximately normal

# Perform Shapiro-Wilk test for Normality


shapiro.test(Anscombe$Y1)

# W = 0.97693, p-value = 0.9467

# The p-value of the test turns out to be 0.9467.

# Since this value is greater than .05, we fail to reject the null

# hypothesis: there is no evidence against normality, so it is

# reasonable to treat the sample data as coming from a population

# that is normally distributed.
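
# The same test can be applied to all four response columns in one
# call, which makes the contrast across the quartet easy to see
# (the p-values match the ones reported per problem in this activity):

sapply(Anscombe[c("Y1","Y2","Y3","Y4")], function(y) shapiro.test(y)$p.value)

# Only Y1 (clearly) and Y4 (marginally) give p-values above .05.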

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y1

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 81.82/100/100 which satisfies the empirical rule

# and there is a validation that the curve is symmetric

# using the histogram skewness and kurtosis.

# Therefore the result about the actual empirical coverage


# should be considered

# because it has already been established that the curve is nearly symmetric.

# Findings: the Empirical Normality rule is satisfied

# Decision: The result of Empirical rule is valid

# since the conditions are satisfied and

# the shape of the distribution is nearly symmetric.
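
# Since this empirical-rule check is repeated for Y2, Y3, and Y4
# below, it can be wrapped in a small helper. A sketch; the function
# name empirical_coverage is my own label, not from any package:

empirical_coverage <- function(x) {
  within <- sapply(1:3, function(k)
    round(sum(abs(x - mean(x)) < k * sd(x)) * (100 / length(x)), 2))
  paste0("Actual Percentage: ", paste(within, collapse = "/"))
}

empirical_coverage(Anscombe$Y1) # same output as the step-by-step version above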

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: residuals seem to fall along the straight line.

# This means the error terms are normally distributed.

# Final Conclusion about normality of error terms:

# Assumption is satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot


plot(model,3)

# Findings: There is constant variance of Y for each X value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X1, Y1)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X1, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit. Its width stays roughly the same

# from the smallest to the largest value of x.

# This suggests that the variance of Y is constant along the values

# of x.

# Conclusion: Assumption 4 is satisfied
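
# A formal counterpart to the scale-location plot is the
# Breusch-Pagan test for non-constant error variance. A minimal
# sketch, again assuming the lmtest package:

library(lmtest)

bptest(model) # H0: constant error variance; a large p-value supports Assumption 4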

# ---------------------------------------------------------

# Final Conclusion: All 4 assumptions are satisfied.

# Therefore the simple linear regression model is


# appropriate for the data. This means that we do not have

# to look for other methods to improve the model.

# Validation of the assumptions showed that the Simple linear regression

# model is valid to represent the causal relationship

# between the X1 variable and the Y1 variable.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 12: Build the Simple Linear Regression Model for (X2, Y2).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y2 ~ X2, data = Anscombe) # create the linear regression model
plot(Anscombe$X2,Anscombe$Y2) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X2,Anscombe$Y2) # get correlation coefficient

# correlation = 0.8162365 - this value is high

# Findings for Assumption 1:

# Although the correlation is high, the scatter plot shows a

# clear curved (non-linear) pattern, so the linearity of the

# data is not satisfied for this pair.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The residuals versus fits plot shows a strong

# arched pattern, so the independence of error terms

# is not satisfied.

# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y2,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y2),col=2,lwd = 3)

# Findings: the distribution appears left-skewed rather than normal

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y2) # skew = -1.316 < -1 indicating strong skewness

# on the left side

kurtosis(Anscombe$Y2) # kurtosis = 0.846 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is strongly left-skewed.

# Conclusion: The distribution of the response variable is not normal.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y2)

# W = 0.82837, p-value = 0.02222

# The p-value of the test turns out to be 0.02222.

# Since this value is less than .05, we reject the null hypothesis:

# the sample data do not appear to come from a population

# that is normally distributed.

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7


# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y2

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 100/100/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: the points appear to deviate systematically from the straight line.

# Decision: the error terms do not appear normally distributed.

# Final Conclusion about normality of error terms: Assumption is not satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: No constant variance of Y for each

# x value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X2, Y2)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X2, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit.

# Conclusion: Assumption 4 is not satisfied.

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X2, Y2);

# the curvature suggests the relationship is not linear.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 13: Build the Simple Linear Regression Model for (X3, Y3).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model


model <- lm(Y3 ~ X3, data = Anscombe) # create the linear regression model

plot(Anscombe$X3,Anscombe$Y3) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X3,Anscombe$Y3) # get correlation coefficient

# correlation = 0.8162867 - this value is high

# Findings for Assumption 1:

# The scatter plot shows the points lying almost exactly on a

# straight line except for a single outlier, so linearity holds

# for most of the data but is distorted by that outlier.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The residuals show a systematic pattern around one

# large outlier, so the independence of error terms is questionable.

# Step 4: Verify Assumption 3: Normality of Error terms.

# For simple linear regression

# we can check the normality of the response variable


# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y3,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y3),col=2,lwd = 3)

# Findings: the distribution appears right-skewed because of the outlier

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y3) # skew = 1.855 > 1 indicating strong skewness

# on the right side

kurtosis(Anscombe$Y3) # kurtosis = 4.384 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is strongly right-skewed.

# Conclusion: The distribution of the response variable is not normal.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y3)

# W = 0.83361, p-value = 0.02604

# The p-value of the test turns out to be 0.02604.

# Since this value is less than .05, we reject the null hypothesis:

# the sample data do not appear to come from a population

# that is normally distributed.
# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y3

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)


qqnorm(model$residuals)

qqline(model$residuals)

# Findings: most points fall near the line, but the outlier departs far from it.

# Decision: the error terms do not appear normally distributed.

# Final Conclusion about normality of error terms: Assumption is not satisfied.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: No constant variance of Y for each

# x value.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)

ggplot(augment_model, aes(X3, Y3)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X3, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the

# standard error of the fit.

# Conclusion: Assumption 4 is not satisfied.

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X3, Y3);

# the outlier should be investigated first.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Problem 14: Build the Simple Linear Regression Model for (X4, Y4).

# Verify the assumptions of this model.

# Step 1: Load data set

Anscombe <- read.csv("Anscombe.csv")

# Step 2: Build the Simple Linear Regression Model

model <- lm(Y4 ~ X4, data = Anscombe) # create the linear regression model

plot(Anscombe$X4,Anscombe$Y4) # scatter plot

abline(model,col = "red",lwd = 3) # plot the regression line

model # display regression coefficients

cor(Anscombe$X4,Anscombe$Y4) # get correlation coefficient

# correlation = 0.8165214 - this value is high

# Findings for Assumption 1:

# The scatter plot shows that all but one observation share the

# same X4 value; the apparent linear relationship is produced by a

# single high-leverage point, so linearity is not genuinely supported.

# Step 3: Verify Assumption 2: Independence of Error terms

plot(model,1)

plot(model$fitted.values, model$residuals)


# Conclusion: The fitted values take only two distinct values and

# the plot is dominated by the single high-leverage point, so the

# independence of error terms cannot be meaningfully assessed.

# Step 4: Verify Assumption 3: Normality of Error terms.


# For simple linear regression

# we can check the normality of the response variable

# We can use hist() to check whether the dependent variable

# follows a normal distribution. We also use a test for normality,

# the QQ plot, and the Empirical Normality Rule.

hist(Anscombe$Y4,probability=T, main="Histogram",xlab="Raw data")

lines(density(Anscombe$Y4),col=2,lwd = 3)

# Findings: the distribution appears right-skewed because of one extreme value

# We can confirm this by computing the skewness and kurtosis coefficients

library(datawizard)

skewness(Anscombe$Y4) # skew = 1.507 > 1 indicating strong skewness

# on the right side

kurtosis(Anscombe$Y4) # kurtosis = 3.151 > 0 indicating that the curve

# is higher than normal

# Findings: The distribution of the response variable is right-skewed,

# driven by one extreme value.

# Conclusion: The distribution does not look normal from the histogram.

# Perform Shapiro-Wilk test for Normality

shapiro.test(Anscombe$Y4)
# W = 0.87536, p-value = 0.09081

# The p-value of the test turns out to be 0.09081.

# Since this value is greater than .05, we fail to reject the null

# hypothesis, although with only 11 observations the test has

# little power to detect the visible skew.

# Perform the Empirical Normality rule: (This requires that the curve is symmetric)

# Percentage of area covered under normality rule: 68/95/99.7

# Area covered within 1 SD is 68%

# Area covered within 2 SD is 95%

# Area covered within 3 SD is 99.7%

# Compute the actual percentage

data1 <- Anscombe$Y4

within1sd <- round(sum(abs(data1 - mean(data1)) < 1*sd(data1))*(100/length(data1)),2)

within2sd <- round(sum(abs(data1 - mean(data1)) < 2*sd(data1))*(100/length(data1)),2)

within3sd <- round(sum(abs(data1 - mean(data1)) < 3*sd(data1))*(100/length(data1)),2)

percentage <- paste0(within1sd,'/',within2sd,'/',within3sd)

(normsfreq <- paste0("Actual Percentage: ", percentage))

# "Actual Percentage: 90.91/90.91/100" which satisfies the empirical rule

# Findings:

# Decision:

# Here is another way to establish normality

# create Quantile-Quantile plot for residuals (the points

# should fall along a straight line)

qqnorm(model$residuals)

qqline(model$residuals)

# Findings: the residuals fall fairly close to the straight line.

# Decision: the QQ plot does not clearly contradict normality of the error terms.

# Final Conclusion about normality of error terms: the evidence is

# mixed (Shapiro-Wilk does not reject, but the response is visibly

# skewed), so the assumption is doubtful.

# Step 5: Verify Assumption 4: Constant Variance of Y for each x

# The assumption can be checked by examining the scale-location

# plot, which is also known as the spread-location plot

plot(model,3)

# Findings: the spread is driven by the single high-leverage point,

# so constant variance cannot be confirmed.

# Another view on how to verify constancy of variance

# Use the augment() function to automatically attach fitted values

# and residuals to the data

library(broom)

augment_model <- augment(model)

library(ggplot2)
ggplot(augment_model, aes(X4, Y4)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE) +
  geom_segment(aes(xend = X4, yend = .fitted), color = "red", linewidth = 0.3)

# The shaded band around the fitted line represents the standard error of the fit.

# Conclusion: Assumption 4 is not satisfied

# ---------------------------------------------------------

# Final Conclusion: Not all assumptions are satisfied, so the

# simple linear regression model is not appropriate for (X4, Y4):

# the fit depends entirely on one high-leverage point.

# ---------------------------------------------------------

# Instead of evaluating the response variable, we can also

# investigate the residuals (actual value - fitted value).

# Taking a look at the residuals, we have

hist(model$residuals,probability=T, main="Histogram of Residuals",xlab="Residuals")

lines(density(model$residuals),col=2,lwd = 3)

# ---------------------------------------------------------

# Finally, which pair is suitable for Simple Linear Regression Analysis?

# Short Answer: Only the pair (X1, Y1) satisfies all four assumptions,

# so it is the only pair suitable for Simple Linear Regression Analysis.

# ---------------------------------------------------------
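
# To see why the assumption checks matter, fit all four models side
# by side: the estimated coefficients are practically identical
# (intercept close to 3, slope close to 0.5) even though only the
# first pair passes the checks. A short base-R sketch:

for (i in 1:4) {
  m <- lm(Anscombe[[paste0("Y", i)]] ~ Anscombe[[paste0("X", i)]])
  cat("Set ", i, ": intercept = ", round(coef(m)[1], 3),
      ", slope = ", round(coef(m)[2], 3), "\n", sep = "")
}

# Identical-looking fits with very different underlying data:
# the coefficients alone cannot identify the suitable pair.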
