STA2050 Assignment 2

STA2050 Assignment 2
Mark Bilahi M’rabu
2023-11-28
Import the Necessary Packages
library(imputeTS)
## Warning: package ’imputeTS’ was built under R version 4.3.2
## Registered S3 method overwritten by ’quantmod’:

## method from
## as.zoo.data.frame zoo
library(VIM)
## Warning: package ’VIM’ was built under R version 4.3.2
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: ’VIM’
## The following object is masked from ’package:datasets’:

##
## sleep
library(nnet)
library(readxl)
library(csv)
library(dplyr)
##
## Attaching package: ’dplyr’
1
## The following objects are masked from ’package:stats’:
##
## filter, lag
## The following objects are masked from ’package:base’:

##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package ’tidyverse’ was built under R version 4.3.2
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --

## v forcats 1.0.0 v readr 2.1.4
## v ggplot2 3.4.3 v stringr 1.5.0
## v lubridate 1.9.3 v tibble 3.2.1
## v purrr 1.0.2 v tidyr 1.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --

## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(finalfit)
## Warning: package ’finalfit’ was built under R version 4.3.2
library(lubridate)
library(tidyr)
library(ggplot2)
library(psych)
## Warning: package ’psych’ was built under R version 4.3.2
##
## Attaching package: ’psych’
##
## The following objects are masked from ’package:ggplot2’:
##
## %+%, alpha
library(lme4)
## Loading required package: Matrix

##
## Attaching package: ’Matrix’
##
## The following objects are masked from ’package:tidyr’:
##
## expand, pack, unpack
2
library(mice)
##
## Attaching package: ’mice’
##
## The following object is masked from ’package:stats’:
##
## filter
##
## The following objects are masked from ’package:base’:
##
## cbind, rbind
library(tibble)
Question 1
# Create the DataSet

Tree_no = c(1:20)
Diameter_X = c(12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3, 5.7, 8, 10.3, 12, 9.2, 8.5, 7, 10.7,
Age_Y = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168, 61, 80, 114, 147, 122, 106, 82, 88, 97, 99)
Treedata = data.frame(cbind(Tree_no, Diameter_X, Age_Y))
head(Treedata)
## Tree_no Diameter_X Age_Y

## 1 1 12.0 125
## 2 2 11.4 119
## 3 3 7.9 83
## 4 4 10.5 85
## 5 5 7.9 99
## 6 6 9.0 117
#View(Treedata)
str(Treedata)
## ’data.frame’: 20 obs. of 3 variables:

## $ Tree_no : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
a) Draw a scatterplot of y vs x
Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() + labs(Treedata, title = 'Sca

Treeplot
3
Scatterplot of Y (Age) against X (Diameter)
150
125
Tree Age
100
75
6 8 10 12
Tree Diameter
"\n"
## [1] "\n"
b) Estimate the Population Mean age of trees in the stand using ratio estimation and give an approximate
standard error for your estimate
Diameter = Auxilliary = x
Age = Dependent = y
n
X
tx = xi
i=1
n
X
ty = yi
i=1
tx µx
=
ty µy
ty
P opulationM eanAgeµy = µx ∗
tx
StanDev
StandardError = √
nof age
4
#Estimating Population Mean Age
meanx = 10.3
totalx = sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")
## Diameter Sample Total = 188.1
totaly = sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")
## Age Sample Total = 2148
meany = meanx * (totaly / totalx)

cat("Population Mean Age =", meany, "\n")
## Population Mean Age = 117.6204
#Approximating Standard Error for above Estimation

StanError = sd(Age_Y) / (sqrt(length(Age_Y)))
cat("Standard Error=", StanError, "\n")
## Standard Error= 6.40904
cat("Therefore, the estimated population mean age of trees is", meany, "while the standard error for sai
## Therefore, the estimated population mean age of trees is 117.6204 while the standard error for said e
c) Repeat b) using Regression Estimation
P P P
n x∗y− x∗ y
bi = P 2 P 2
n ∗ x − ( x)
y = b0 + b1 x
#Creating a Subset of the Treedata
Regdata = Treedata
Regdata[4] = Regdata[2] * Regdata[3]
Regdata[5] = Regdata[2]ˆ2
Regdata[6] = Regdata[3]ˆ2
colnames(Regdata) = c("Tree_no", 'Diameter_X', "Age_Y", "XY", "Xsqr", "Ysqr")
Regdata = Regdata %>%
mutate_at(vars(1), as.character)
Totals = Regdata %>%
summarize(Tree_no = "Totals",
Diameter_X = sum(Regdata$Diameter_X),
Age_Y = sum(Regdata$Age_Y),
XY = sum(Regdata$XY),
Xsqr = sum(Regdata$Xsqr),
Ysqr = sum(Regdata$Ysqr))
Regdata = bind_rows(Regdata, Totals)
#Regdata = Regdata %>%
# mutate(Regdata$Tree_no[21], "Totals")
str(Regdata)
5
## ’data.frame’: 21 obs. of 6 variables:
## $ Tree_no : chr "1" "2" "3" "4" ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
## $ XY : num 1500 1357 656 892 782 ...
## $ Xsqr : num 144 130 62.4 110.2 62.4 ...
## $ Ysqr : num 15625 14161 6889 7225 9801 ...
head(Regdata)
## Tree_no Diameter_X Age_Y XY Xsqr Ysqr

## 1 1 12.0 125 1500.0 144.00 15625
## 2 2 11.4 119 1356.6 129.96 14161
## 3 3 7.9 83 655.7 62.41 6889
## 4 4 10.5 85 892.5 110.25 7225
## 5 5 7.9 99 782.1 62.41 9801
## 6 6 9.0 117 1053.0 81.00 13689
#Fitting the Linear Regression Model

Treemodel = lm(Age_Y ~ Diameter_X, data = Regdata)
summary(Treemodel)
##
## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16
Intercept = coef(Treemodel)[1]
Slope = coef(Treemodel)[2]
cat("The slope of the model is", Slope, "while the intercept is", Intercept, "\n")
## The slope of the model is 11.42115 while the intercept is -0.03030842
Estimate = Intercept + (Slope *meanx)
The Regression is given by:

age = −7.63 + 12.23diameter
6
#Approximating Standard Error for the above Estimation
Residuals = resid(Treemodel)
ResStanDev = sd(Residuals)
SE = ResStanDev * sqrt(1 + 1/length(Age_Y) + ((meanx - mean(Diameter_X))ˆ2 / sum((Diameter_X - mean(Diam
cat("The Regression Estimate of the Population Mean Age is", Estimate, "while the Standard Error for the
## The Regression Estimate of the Population Mean Age is 117.6075 while the Standard Error for the above
d) Label your Estimates on your graph. How do they compare?
Ratio = c(10.3, meany)

Regression = c(10.3, Estimate)
Est = data.frame(rbind(Ratio, Regression))
Est
## V1 X.Intercept.
## Ratio 10.3 117.6204
## Regression 10.3 117.6075
colnames(Est) = c("X", "Y")

X = Est$X
Y = Est$Y
EstGraph = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y), color = "blue", shape = 15) + geom_point() +
EstGraph
Scatterplot of Y (Age) against X (Diameter)

with Estimates & Regression Line
150
125
Tree Age
100
75
6 8 10 12
Tree Diameter
7
"\n"
## [1] "\n"
From the above graph it is clear to see that both the Ratio Estimation & The Regression Estimation of the
Population Mean Age are very close in their value however the Standard Error for the methods vary greatly
Question 2
The new candy Green Goobules is being test-marketed in an area of upstate NY. The market research firm
decided to sample 6 cities from the 45 in the area & then to sample supermarkets within the cities, wanting
to know the no. of Green Gobules cases sold
a) Obtain summary statistics for each cluster
# Create the Data Sets

cluster_1 = c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 = c(99, 101, 52, 121)
cluster_3 = c(199, 179, 98, 63, 126, 87, 62)
cluster_4 = c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 = c(12, 23)
cluster_6 = c(87, 43, 59)
#Cluster 1
summary(cluster_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 72.0 147.5 175.5 177.3 184.8 361.0
sd1 = sd(cluster_1)
var1 = var(cluster_1)
cat("The Standard Deviation of Cluster 1 is", sd1, "while the Variance is", var1, "\n", "\n")
## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is 6988.9

##
#Cluster 2
summary(cluster_2)

## 52.00 87.25 100.00 93.25 106.00 121.00
sd2 = sd(cluster_2)

##
8
#Cluster 3
summary(cluster_3)

## 62.0 75.0 98.0 116.3 152.5 199.0
sd3 = sd(cluster_3)

##
#Cluster 4
summary(cluster_4)

## 43.00 54.25 85.50 104.62 138.00 226.00
sd4 = sd(cluster_4)

##
#Cluster 5
summary(cluster_5)

## 12.00 14.75 17.50 17.50 20.25 23.00
sd5 = sd(cluster_5)

##
#Cluster 6
summary(cluster_6)

## 43 51 59 63 73 87
sd6 = sd(cluster_6)
9
## The Standard Deviation of Cluster 1 is 22.27106 while the Variance is 496
##
b) Estimate the total number of cases sold, and the average number sold per supermarket, along with the
standard errors of your Estimates
# Create the Data Set

City = c(1, 2, 3, 4, 5, 6)
Supermarket_No = c(52, 19, 37, 39, 8, 14)
"cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
cluster_2 = I(list(c(99, 101, 52, 121)))
cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
cluster_5 = I(list(c(12, 23)))
cluster_6 = I(list(c(87, 43, 59)))"
## [1] "cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))\ncluster_2 = I(list(c(99,
Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4", "Cluster_5", "C

#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#)
#Cases_Sold
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6)
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6))
View(Cases_Sold)
Green_Gobules = cbind(City, Supermarket_No, Cases_Sold[2])
View(Green_Gobules)
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
mean(Green_Gobules[1, 3])
## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or

## logical: returning NA
## [1] NA
10

STA2050 Assignment 2

Uploaded by

Copyright:

Available Formats

You might also like

STA2050 Assignment 2

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

STA2050 Assignment 2

Uploaded by

Copyright:

Available Formats

STA2050 Assignment 2

Mark Bilahi M’rabu

Import the Necessary Packages

## Warning: package ’imputeTS’ was built under R version 4.3.2

## Registered S3 method overwritten by ’quantmod’:

## Warning: package ’VIM’ was built under R version 4.3.2

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## The following object is masked from ’package:datasets’:

## The following objects are masked from ’package:base’:

## Warning: package ’tidyverse’ was built under R version 4.3.2

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --

## Warning: package ’finalfit’ was built under R version 4.3.2

## Warning: package ’psych’ was built under R version 4.3.2

## Loading required package: Matrix

# Create the DataSet

## Tree_no Diameter_X Age_Y

## ’data.frame’: 20 obs. of 3 variables:

Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() + labs(Treedata, title = 'Sca

## Diameter Sample Total = 188.1

## Age Sample Total = 2148

meany = meanx * (totaly / totalx)

## Population Mean Age = 117.6204

#Approximating Standard Error for above Estimation

## Standard Error= 6.40904

c) Repeat b) using Regression Estimation

## Tree_no Diameter_X Age_Y XY Xsqr Ysqr

#Fitting the Linear Regression Model

## The slope of the model is 11.42115 while the intercept is -0.03030842

Estimate = Intercept + (Slope *meanx)

The Regression is given by:

d) Label your Estimates on your graph. How do they compare?

Ratio = c(10.3, meany)

colnames(Est) = c("X", "Y")

Scatterplot of Y (Age) against X (Diameter)

a) Obtain summary statistics for each cluster

# Create the Data Sets

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is 6988.9

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 2 is 29.23896 while the Variance is 854.9167

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 3 is 54.53963 while the Variance is 2974.571

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 4 is 64.59309 while the Variance is 4172.268

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5

## Min. 1st Qu. Median Mean 3rd Qu. Max.

# Create the Data Set

Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4", "Cluster_5", "C

## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or

You might also like