STA2050 Assignment 2

Mark Bilahi M’rabu


Import the Necessary Packages


Question 1

# Create the DataSet

Tree_no = c(1:20)
Diameter_X = c(12, 11.4, 7.9, 10.5, 7.9, 9, 7.3, 10.2, 11.7, 11.3, 5.7, 8, 10.3, 12, 9.2, 8.5, 7, 10.7,
Age_Y = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168, 61, 80, 114, 147, 122, 106, 82, 88, 97, 99)
Treedata = data.frame(cbind(Tree_no, Diameter_X, Age_Y))

## Tree_no Diameter_X Age_Y

## 1 1 12.0 125
## 2 2 11.4 119
## 3 3 7.9 83
## 4 4 10.5 85
## 5 5 7.9 99
## 6 6 9.0 117


## ’data.frame’: 20 obs. of 3 variables:

## $ Tree_no : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...

a) Draw a scatterplot of y vs x

Treeplot = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y)) + geom_point() + labs(Treedata, title = 'Sca


Scatterplot of Y (Age) against X (Diameter)


Tree Age



6 8 10 12
Tree Diameter


## [1] "\n"

b) Estimate the Population Mean age of trees in the stand using ratio estimation and give an approximate
standard error for your estimate

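Diameter = Auxiliary = x
Age = Dependent = y
$t_x = \sum x_i$
$t_y = \sum y_i$

$\dfrac{t_x}{t_y} = \dfrac{\mu_x}{\mu_y}$

$\text{Population Mean Age: } \hat{\mu}_y = \mu_x \cdot \dfrac{t_y}{t_x}$

$\text{Standard Error} = \dfrac{s}{\sqrt{n}} \text{ of age}$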

# Ratio estimate of the population mean age.
# meanx is the mean of the auxiliary variable (diameter) plugged into the
# ratio formula -- presumably the known population mean diameter from the
# problem statement; confirm.
meanx <- 10.3

# Sample total of the auxiliary variable
totalx <- sum(Diameter_X)
cat("Diameter Sample Total =", totalx, "\n")

## Diameter Sample Total = 188.1

# Sample total of the study variable
totaly <- sum(Age_Y)
cat("Age Sample Total =", totaly, "\n")

## Age Sample Total = 2148

# Scale the known mean of x by the sample ratio of totals (t_y / t_x)
sample_ratio <- totaly / totalx
meany <- meanx * sample_ratio

cat("Population Mean Age =", meany, "\n")

## Population Mean Age = 117.6204

#Approximating Standard Error for above Estimation
# sd(Age_Y) / sqrt(n) is the standard error of the plain sample mean of age;
# it ignores the diameter information that the ratio estimator relies on.
# The ratio estimator's standard error is driven by the residuals about the
# ratio line y = B * x, scaled by (mean of x used in the estimate) / (sample
# mean of x).
n_trees <- length(Age_Y)
ratio_hat <- sum(Age_Y) / sum(Diameter_X)
ratio_resid <- Age_Y - ratio_hat * Diameter_X
# The finite population correction sqrt(1 - n / N) is left out because the
# population size N is not defined in the visible part of this script;
# multiply it in once N is available.
StanError <- (meanx / mean(Diameter_X)) * sqrt(var(ratio_resid) / n_trees)
# NOTE: re-knit the document so the printed value below reflects this formula.
cat("Standard Error=", StanError, "\n")

## Standard Error= 6.40904

cat("Therefore, the estimated population mean age of trees is", meany, "while the standard error for sai

## Therefore, the estimated population mean age of trees is 117.6204 while the standard error for said e

c) Repeat b) using Regression Estimation

$b_1 = \dfrac{n \sum xy - \sum x \sum y}{n \sum x^2 - \left(\sum x\right)^2}$

$\hat{y} = b_0 + b_1 x$
#Creating a Subset of the Treedata
# Working table for the hand computation of the least-squares slope:
# adds the x*y, x^2 and y^2 columns and a final "Totals" row of column sums.
Regdata <- Treedata
Regdata$XY <- Regdata$Diameter_X * Regdata$Age_Y
# `^` is the power operator; the look-alike modifier circumflex is not valid R.
Regdata$Xsqr <- Regdata$Diameter_X^2
Regdata$Ysqr <- Regdata$Age_Y^2
# Tree_no has to be character so the label "Totals" can share the column.
Regdata$Tree_no <- as.character(Regdata$Tree_no)
# One-row summary holding the sum of every numeric column.
Totals <- Regdata %>%
  summarize(
    Tree_no = "Totals",
    Diameter_X = sum(Diameter_X),
    Age_Y = sum(Age_Y),
    XY = sum(XY),
    Xsqr = sum(Xsqr),
    Ysqr = sum(Ysqr)
  )
# After this line Regdata has 21 rows: the 20 trees plus the totals row.
Regdata <- bind_rows(Regdata, Totals)
#Regdata = Regdata %>%
# mutate(Regdata$Tree_no[21], "Totals")

## ’data.frame’: 21 obs. of 6 variables:
## $ Tree_no : chr "1" "2" "3" "4" ...
## $ Diameter_X: num 12 11.4 7.9 10.5 7.9 9 7.3 10.2 11.7 11.3 ...
## $ Age_Y : num 125 119 83 85 99 117 69 133 154 168 ...
## $ XY : num 1500 1357 656 892 782 ...
## $ Xsqr : num 144 130 62.4 110.2 62.4 ...
## $ Ysqr : num 15625 14161 6889 7225 9801 ...


## Tree_no Diameter_X Age_Y XY Xsqr Ysqr

## 1 1 12.0 125 1500.0 144.00 15625
## 2 2 11.4 119 1356.6 129.96 14161
## 3 3 7.9 83 655.7 62.41 6889
## 4 4 10.5 85 892.5 110.25 7225
## 5 5 7.9 99 782.1 62.41 9801
## 6 6 9.0 117 1053.0 81.00 13689

#Fitting the Linear Regression Model
# Fit on the 20 sampled trees only. Regdata carries an appended "Totals" row
# (the sum of every column); passing it to lm() makes those sums a 21st,
# extremely high-leverage observation and distorts the slope, intercept,
# residual degrees of freedom and R-squared.
# NOTE: re-knit the document so the printed summary below reflects this fit.
Treemodel <- lm(Age_Y ~ Diameter_X, data = Treedata)

## Call:
## lm(formula = Age_Y ~ Diameter_X, data = Regdata)
## Residuals:
## Min 1Q Median 3Q Max
## -34.892 -11.171 -0.288 9.977 38.971
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03031 4.33635 -0.007 0.994
## Diameter_X 11.42115 0.10301 110.874 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
## Residual standard error: 17.98 on 19 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9984
## F-statistic: 1.229e+04 on 1 and 19 DF, p-value: < 2.2e-16

# Pull the fitted coefficients out of the model
# (position 1 = intercept, position 2 = slope on Diameter_X).
model_coefs <- coef(Treemodel)
Intercept <- model_coefs[1]
Slope <- model_coefs[2]
cat("The slope of the model is", Slope, "while the intercept is", Intercept, "\n")

## The slope of the model is 11.42115 while the intercept is -0.03030842

# Regression estimate: the fitted line evaluated at meanx.
Estimate <- Intercept + (Slope * meanx)

The Regression is given by:

age = −7.63 + 12.23diameter

#Approximating Standard Error for the above Estimation
Residuals = resid(Treemodel)
ResStanDev = sd(Residuals)
SE = ResStanDev * sqrt(1 + 1/length(Age_Y) + ((meanx - mean(Diameter_X))ˆ2 / sum((Diameter_X - mean(Diam
cat("The Regression Estimate of the Population Mean Age is", Estimate, "while the Standard Error for the

## The Regression Estimate of the Population Mean Age is 117.6075 while the Standard Error for the above

d) Label your Estimates on your graph. How do they compare?

# Each estimate becomes an (x, y) point: x is the diameter mean used in both
# estimators, y is the estimated mean age. meanx is reused instead of
# repeating the literal 10.3 so the points stay in sync with the estimates.
Ratio <- c(meanx, meany)

Regression <- c(meanx, Estimate)
Est <- data.frame(rbind(Ratio, Regression))

##            V1 X.Intercept.
## Ratio      10.3 117.6204
## Regression 10.3 117.6075

# Replace the auto-generated column names with plotting-friendly ones.
colnames(Est) <- c("X", "Y")

# Stand-alone coordinate vectors for the two estimate points.
X <- Est$X
Y <- Est$Y
EstGraph = ggplot(Treedata, aes(x = Diameter_X, y = Age_Y), color = "blue", shape = 15) + geom_point() +

Scatterplot of Y (Age) against X (Diameter)

with Estimates & Regression Line


Tree Age



6 8 10 12
Tree Diameter


## [1] "\n"

From the above graph, it is clear that the Ratio Estimate and the Regression Estimate of the
Population Mean Age are very close in value; however, the Standard Errors of the two methods differ greatly.

Question 2

The new candy Green Gobules is being test-marketed in an area of upstate NY. The market research firm
decided to sample 6 cities from the 45 in the area and then to sample supermarkets within those cities, wanting
to know the number of Green Gobules cases sold.

a) Obtain summary statistics for each cluster

# Create the Data Sets
# One numeric vector per cluster; each value is one sampled observation
# (presumably cases sold at one sampled supermarket in that city).

cluster_1 <- c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)
cluster_2 <- c(99, 101, 52, 121)
cluster_3 <- c(199, 179, 98, 63, 126, 87, 62)
cluster_4 <- c(226, 129, 57, 46, 86, 43, 85, 165)
cluster_5 <- c(12, 23)
cluster_6 <- c(87, 43, 59)

#Cluster 1

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 72.0 147.5 175.5 177.3 184.8 361.0

# Spread of cluster 1: sample variance, and its square root as the SD.
var1 <- var(cluster_1)
sd1 <- sqrt(var1)
cat("The Standard Deviation of Cluster 1 is", sd1, "while the Variance is", var1, "\n", "\n")

## The Standard Deviation of Cluster 1 is 83.59964 while the Variance is 6988.9


#Cluster 2

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 52.00 87.25 100.00 93.25 106.00 121.00

# Spread of cluster 2: sample variance, and its square root as the SD.
var2 <- var(cluster_2)
sd2 <- sqrt(var2)
cat("The Standard Deviation of Cluster 2 is", sd2, "while the Variance is", var2, "\n", "\n")

## The Standard Deviation of Cluster 2 is 29.23896 while the Variance is 854.9167


#Cluster 3

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 62.0 75.0 98.0 116.3 152.5 199.0

# Spread of cluster 3: sample variance, and its square root as the SD.
var3 <- var(cluster_3)
sd3 <- sqrt(var3)
cat("The Standard Deviation of Cluster 3 is", sd3, "while the Variance is", var3, "\n", "\n")

## The Standard Deviation of Cluster 3 is 54.53963 while the Variance is 2974.571


#Cluster 4

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 43.00 54.25 85.50 104.62 138.00 226.00

# Spread of cluster 4: sample variance, and its square root as the SD.
var4 <- var(cluster_4)
sd4 <- sqrt(var4)
cat("The Standard Deviation of Cluster 4 is", sd4, "while the Variance is", var4, "\n", "\n")

## The Standard Deviation of Cluster 4 is 64.59309 while the Variance is 4172.268


#Cluster 5

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 12.00 14.75 17.50 17.50 20.25 23.00

# Spread of cluster 5: sample variance, and its square root as the SD.
var5 <- var(cluster_5)
sd5 <- sqrt(var5)
cat("The Standard Deviation of Cluster 5 is", sd5, "while the Variance is", var5, "\n", "\n")

## The Standard Deviation of Cluster 5 is 7.778175 while the Variance is 60.5


#Cluster 6

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 43 51 59 63 73 87

# Spread of cluster 6: sample standard deviation and sample variance.
sd6 <- sd(cluster_6)
var6 <- var(cluster_6)
# The message previously said "Cluster 1" although it reports cluster 6.
cat("The Standard Deviation of Cluster 6 is", sd6, "while the Variance is", var6, "\n", "\n")

## The Standard Deviation of Cluster 6 is 22.27106 while the Variance is 496

b) Estimate the total number of cases sold, and the average number sold per supermarket, along with the
standard errors of your Estimates

# Create the Data Set

# City: id of each sampled city. Supermarket_No: presumably the number of
# supermarkets in that city -- TODO confirm against the problem statement.
City <- c(1, 2, 3, 4, 5, 6)
Supermarket_No <- c(52, 19, 37, 39, 8, 14)
# Abandoned alternative, kept for reference only. It used to be wrapped in a
# bare string literal, which R evaluates and echoes into the knitted output;
# real comments keep it out of the output.
# cluster_1 = I(list(c(146, 180, 251, 152, 72, 181, 171, 361, 73, 186)))
# cluster_2 = I(list(c(99, 101, 52, 121)))
# cluster_3 = I(list(c(199, 179, 98, 63, 126, 87, 62)))
# cluster_4 = I(list(c(226, 129, 57, 46, 86, 43, 85, 165)))
# cluster_5 = I(list(c(12, 23)))
# cluster_6 = I(list(c(87, 43, 59)))

Cases_Sold = data.frame(Clusters = c("Cluster_1", "Cluster_2", "Cluster_3", "Cluster_4", "Cluster_5", "C

#Cases_Sold = bind_rows(
# tibble(Cluster = "cluster_1", Value = cluster_1),
# tibble(Cluster = "cluster_2", Value = cluster_2),
# tibble(Cluster = "cluster_3", Value = cluster_3),
# tibble(Cluster = "cluster_4", Value = cluster_4),
# tibble(Cluster = "cluster_5", Value = cluster_5),
# tibble(Cluster = "cluster_6", Value = cluster_6)
#Cases_Sold = tidyr::gather(data.frame(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6)
#Cases_Sold = data.frame(var1 = rbind(cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6))
# Combine the city ids, the supermarket counts and the second column of
# Cases_Sold into one table.
Green_Gobules = cbind(City, Supermarket_No, Cases_Sold[2])
#Green_Gobules = Green_Gobules %>%
# mutate_at(vars(3), as.numeric)
# NOTE(review): this call returns NA with the warning "argument is not numeric
# or logical" (see the output below), so row 1 / column 3 is not a plain
# numeric value. Cases_Sold is built elsewhere -- presumably its second column
# holds one entry of sales figures per city, in which case that entry has to be
# unpacked (e.g. with unlist()) before mean() can work; confirm against the
# Cases_Sold definition.
mean(Green_Gobules[1, 3])

## Warning in mean.default(Green_Gobules[1, 3]): argument is not numeric or

## logical: returning NA

## [1] NA


