STA2050 Assignment 2

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1of 10

STA2050 Assignment 2

Mark Bilahi M’rabu

2023-11-28

Import the Necessary Packages

library(imputeTS)

## Warning: package ’imputeTS’ was built under R version 4.3.2

## Registered S3 method overwritten by ’quantmod’:


## method from
## as.zoo.data.frame zoo

library(VIM)

## Warning: package ’VIM’ was built under R version 4.3.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

##
## Attaching package: ’VIM’

## The following object is masked from ’package:datasets’:


##
## sleep

library(nnet)
library(readxl)
library(csv)
library(dplyr)

##
## Attaching package: ’dplyr’

1
## The following objects are masked from ’package:stats’:
##
## filter, lag

## The following objects are masked from ’package:base’:


##
## intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package ’tidyverse’ was built under R version 4.3.2

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --


## v forcats 1.0.0 v readr 2.1.4
## v ggplot2 3.4.3 v stringr 1.5.0
## v lubridate 1.9.3 v tibble 3.2.1
## v purrr 1.0.2 v tidyr 1.3.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --


## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(finalfit)

## Warning: package ’finalfit’ was built under R version 4.3.2

library(lubridate)
library(tidyr)
library(ggplot2)
library(psych)

## Warning: package ’psych’ was built under R version 4.3.2

##
## Attaching package: ’psych’
##
## The following objects are masked from ’package:ggplot2’:
##
## %+%, alpha

library(lme4)

## Loading required package: Matrix


##
## Attaching package: ’Matrix’
##
## The following objects are masked from ’package:tidyr’:
##
## expand, pack, unpack

2
library(mice)

##
## Attaching package: ’mice’
##
## The following object is masked from ’package:stats’:
##
## filter
##
## The following objects are masked from ’package:base’:
##
## cbind, rbind

library(tibble)

Question 1

# Create the DataSet


Tree_no = c(1:20)
Diameter_X = c(12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3, 5.7, 8, 10.3, 12, 9.2, 8.5, 7, 10.7,
Age_Y = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168, 61, 80, 114, 147, 122, 106, 82, 88, 97, 99)
Treedata = data.frame(cbind(Tree_no, Diameter_X, Age_Y))
head(Treedata)

## Tree_no Diameter_X Age_Y


## 1 1 12.0 125
## 2 2 11.4 119
## 3 3 7.9 83
## 4 4 10.5 85
## 5 5 7.9 99
## 6 6 9.0 117

#View(Treedata)
str(Treedata)

## ’data.frame’: 20 obs. of 3 variables:


## $ Tree_no : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...

a) Draw a scatterplot of y vs x

Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() + labs(Treedata, title = 'Sca


Treeplot

3
Scatterplot of Y (Age) against X (Diameter)

150

125
Tree Age

100

75

6 8 10 12
Tree Diameter

"\n"

## [1] "\n"

b) Estimate the Population Mean age of trees in the stand using ratio estimation and give an approximate
standard error for your estimate

Diameter = Auxilliary = x
Age = Dependent = y
n
X
tx = xi
i=1
n
X
ty = yi
i=1

tx µx
=
ty µy
ty
P opulationM eanAgeµy = µx ∗
tx
StanDev
StandardError = √
nof age

4
#Estimating Population Mean Age
meanx = 10.3
totalx = sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")

## Diameter Sample Total = 188.1

totaly = sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")

## Age Sample Total = 2148

meany = meanx * (totaly / totalx)


cat("Population Mean Age =", meany, "\n")

## Population Mean Age = 117.6204

#Approximating Standard Error for above Estimation


StanError = sd(Age_Y) / (sqrt(length(Age_Y)))
cat("Standard Error=", StanError, "\n")

## Standard Error= 6.40904

cat("Therefore, the estimated population mean age of trees is", meany, "while the standard error for sai

## Therefore, the estimated population mean age of trees is 117.6204 while the standard error for said e

c) Repeat b) using Regression Estimation

P P P
n x∗y− x∗ y
bi = P 2 P 2
n ∗ x − ( x)
y = b0 + b1 x
#Creating a Subset of the Treedata
Regdata = Treedata
Regdata[4] = Regdata[2] * Regdata[3]
Regdata[5] = Regdata[2]ˆ2
Regdata[6] = Regdata[3]ˆ2
colnames(Regdata) = c("Tree_no", 'Diameter_X', "Age_Y", "XY", "Xsqr", "Ysqr")
Regdata = Regdata %>%
mutate_at(vars(1), as.character)
Totals = Regdata %>%
summarize(Tree_no = "Totals",
Diameter_X = sum(Regdata$Diameter_X),
Age_Y = sum(Regdata$Age_Y),
XY = sum(Regdata$XY),
Xsqr = sum(Regdata$Xsqr),
Ysqr = sum(Regdata$Ysqr))
Regdata = bind_rows(Regdata, Totals)
#Regdata = Regdata %>%
# mutate(Regdata$Tree_no[21], "Totals")
str(Regdata)

5
## ’data.frame’: 21 obs. of 6 variables:
## $ Tree_no : chr "1" "2" "3" "4" ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
## $ XY : num 1500 1357 656 892 782 ...
## $ Xsqr : num 144 130 62.4 110.2 62.4 ...
## $ Ysqr : num 15625 14161 6889 7225 9801 ...

head(Regdata)

## Tree_no Diameter_X Age_Y XY Xsqr Ysqr


## 1 1 12.0 125 1500.0 144.00 15625
## 2 2 11.4 119 1356.6 129.96 14161
## 3 3 7.9 83 655.7 62.41 6889
## 4 4 10.5 85 892.5 110.25 7225
## 5 5 7.9 99 782.1 62.41 9801
## 6 6 9.0 117 1053.0 81.00 13689

#Fitting the Linear Regression Model


Treemodel = lm(Age_Y ~ Diameter_X, data = Regdata)
summary(Treemodel)

##
## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16

Intercept = coef(Treemodel)[1]
Slope = coef(Treemodel)[2]
cat("The slope of the model is", Slope, "while the intercept is", Intercept, "\n")

## The slope of the model is 11.42115 while the intercept is -0.03030842

Estimate = Intercept + (Slope *meanx)

The Regression is given by:


age = −7.63 + 12.23diameter

6
#Approximating Standard Error for the above Estimation
Residuals = resid(Treemodel)
ResStanDev = sd(Residuals)
SE = ResStanDev * sqrt(1 + 1/length(Age_Y) + ((meanx - mean(Diameter_X))ˆ2 / sum((Diameter_X - mean(Diam
cat("The Regression Estimate of the Population Mean Age is", Estimate, "while the Standard Error for the

## The Regression Estimate of the Population Mean Age is 117.6075 while the Standard Error for the above

d) Label your Estimates on your graph. How do they compare?

Ratio = c(10.3, meany)


Regression = c(10.3, Estimate)
Est = data.frame(rbind(Ratio, Regression))
Est

## V1 X.Intercept.
## Ratio 10.3 117.6204
## Regression 10.3 117.6075

colnames(Est) = c("X", "Y")


X = Est$X
Y = Est$Y
EstGraph = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y), color = "blue", shape = 15) + geom_point() +
EstGraph

Scatterplot of Y (Age) against X (Diameter)


with Estimates & Regression Line

150

125
Tree Age

100

75

6 8 10 12
Tree Diameter

7
"\n"

## [1] "\n"

From the above graph it is clear to see that both the Ratio Estimation & The Regression Estimation of the
Population Mean Age are very close in their value however the Standard Error for the methods vary greatly

Question 2

The new candy Green Goobules is being test-marketed in an area of upstate NY. The market research firm
decided to sample 6 cities from the 45 in the area & then to sample supermarkets within the cities, wanting
to know the no. of Green Gobules cases sold

a) Obtain summary statistics for each cluster

# Create the Data Sets


cluster_1 = c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 = c(99, 101, 52, 121)
cluster_3 = c(199, 179, 98, 63, 126, 87, 62)
cluster_4 = c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 = c(12, 23)
cluster_6 = c(87, 43, 59)

#Cluster 1
summary(cluster_1)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 72.0 147.5 175.5 177.3 184.8 361.0

sd1 = sd(cluster_1)
var1 = var(cluster_1)
cat("The Standard Deviation of Cluster 1 is", sd1, "while the Variance is", var1, "\n", "\n")

## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is 6988.9


##

#Cluster 2
summary(cluster_2)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 52.00 87.25 100.00 93.25 106.00 121.00

sd2 = sd(cluster_2)
var2 = var(cluster_2)
cat("The Standard Deviation of Cluster 2 is", sd2, "while the Variance is", var2, "\n", "\n")

## The Standard Deviation of Cluster 2 is 29.23896 while the Variance is 854.9167


##

8
#Cluster 3
summary(cluster_3)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 62.0 75.0 98.0 116.3 152.5 199.0

sd3 = sd(cluster_3)
var3 = var(cluster_3)
cat("The Standard Deviation of Cluster 3 is", sd3, "while the Variance is", var3, "\n", "\n")

## The Standard Deviation of Cluster 3 is 54.53963 while the Variance is 2974.571


##

#Cluster 4
summary(cluster_4)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 43.00 54.25 85.50 104.62 138.00 226.00

sd4 = sd(cluster_4)
var4 = var(cluster_4)
cat("The Standard Deviation of Cluster 4 is", sd4, "while the Variance is", var4, "\n", "\n")

## The Standard Deviation of Cluster 4 is 64.59309 while the Variance is 4172.268


##

#Cluster 5
summary(cluster_5)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 12.00 14.75 17.50 17.50 20.25 23.00

sd5 = sd(cluster_5)
var5 = var(cluster_5)
cat("The Standard Deviation of Cluster 5 is", sd5, "while the Variance is", var5, "\n", "\n")

## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5


##

#Cluster 6
summary(cluster_6)

## Min. 1st Qu. Median Mean 3rd Qu. Max.


## 43 51 59 63 73 87

sd6 = sd(cluster_6)
var6 = var(cluster_6)
cat("The Standard Deviation of Cluster 1 is", sd6, "while the Variance is", var6, "\n", "\n")

9
## The Standard Deviation of Cluster 1 is 22.27106 while the Variance is 496
##

b) Estimate the total number of cases sold, and the average number sold per supermarket, along with the
standard errors of your Estimates

# Create the Data Set


City = c(1, 2, 3, 4, 5, 6)
Supermarket_No = c(52, 19, 37, 39, 8, 14)
"cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
cluster_2 = I(list(c(99, 101, 52, 121)))
cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
cluster_5 = I(list(c(12, 23)))
cluster_6 = I(list(c(87, 43, 59)))"

## [1] "cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))\ncluster_2 = I(list(c(99,

Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4", "Cluster_5", "C


#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_2", Value = cluster_2),
# tibble(Cluster = "cluster_3", Value = cluster_3),
# tibble(Cluster = "cluster_4", Value = cluster_4),
# tibble(Cluster = "cluster_5", Value = cluster_5),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#)
#Cases_Sold
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6)
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6))
View(Cases_Sold)
Green_Gobules = cbind(City, Supermarket_No, Cases_Sold[2])
View(Green_Gobules)
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
mean(Green_Gobules[1, 3])

## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or


## logical: returning NA

## [1] NA

10

You might also like