Download as pdf or txt
Download as pdf or txt
You are on page 1of 17

TUGAS MINGGU KE-8

MODEL STATISTIKA LINIER

Oleh:
Akhmad Safrin Sadad Khan
191810101014

JURUSAN MATEMATIKA
FAKULTAS MATEMATIKA DAN ILMU PENGETAHUAN ALAM
UNIVERSITAS JEMBER
2021
Tugas_Praktikum_8
Akhmad Safrin Sadad Khan

11/1/2021

Read Data
# Load the retail transactions from the CSV file and preview the first rows.
retails <- read.csv(file = "retail.csv")
head(retails)

## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID


## 1 1 CA-2016-152156 11/8/16 11/11/16 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/16 11/11/16 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/16 6/16/16 Second Class DV-13045
## 4 4 US-2015-108966 10/11/15 10/18/15 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/15 10/18/15 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/14 6/14/14 Standard Class BH-11710
## Segment Product.ID Category Sub.Category
## 1 Consumer FUR-BO-10001798 Furniture Bookcases
## 2 Consumer FUR-CH-10000454 Furniture Chairs
## 3 Corporate OFF-LA-10000240 Office Supplies Labels
## 4 Consumer FUR-TA-10000577 Furniture Tables
## 5 Consumer OFF-ST-10000760 Office Supplies Storage
## 6 Consumer FUR-FU-10001487 Furniture Furnishings
## Product.Name
Sales
## 1 Bush Somerset Collection Bookcase
261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back
731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal
14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table
957.5775
## 5 Eldon Fold 'N Roll Cart System
22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood
48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694

str(retails)
## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688"
"US-2015-108966" ...
## $ Order.Date : chr "11/8/16" "11/8/16" "6/12/16" "10/11/15" ...
## $ Ship.Date : chr "11/11/16" "11/11/16" "6/16/16" "10/18/15" ...
## $ Ship.Mode : chr "Second Class" "Second Class" "Second Class"
"Standard Class" ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : chr "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-
10000240" "FUR-TA-10000577" ...
## $ Category : chr "Furniture" "Furniture" "Office Supplies"
"Furniture" ...
## $ Sub.Category: chr "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe
Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address
Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular
Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...

library(lubridate)

##
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':


##
## date, intersect, setdiff, union

Konversi Data
# Parse the month/day/two-digit-year strings into Date objects.
date_format <- "%m/%d/%y"
retails$Order.Date <- as.Date(retails$Order.Date, format = date_format)
retails$Ship.Date <- as.Date(retails$Ship.Date, format = date_format)

# Turn the categorical text columns into factors.
factor_columns <- c("Ship.Mode", "Segment", "Category", "Sub.Category")
retails[factor_columns] <- lapply(retails[factor_columns], as.factor)

str(retails)

## 'data.frame': 9994 obs. of 15 variables:


## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688"
"US-2015-108966" ...
## $ Order.Date : Date, format: "2016-11-08" "2016-11-08" ...
## $ Ship.Date : Date, format: "2016-11-11" "2016-11-11" ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4
...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1
1 1 1 1 ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-
10000240" "FUR-TA-10000577" ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1
2 1 2 1 2 3 2 2 ...
## $ Sub.Category: Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3
14 4 2 ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe
Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address
Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular
Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...

Visualisasi
# Distribution of Sales: histogram with 90 breaks, then a boxplot.
hist(retails$Sales, breaks = 90)
boxplot(retails$Sales)

# Upper outlier fence for Sales: third quartile plus 1.5 times the IQR.
sales_q3 <- quantile(retails$Sales, probs = 0.75)
out_sales <- sales_q3 + 1.5 * IQR(retails$Sales)

Cleaning Outlier
# Keep only the rows whose Sales value lies strictly below the upper fence,
# then re-draw the boxplot and histogram on the reduced data.
is_regular_sale <- retails$Sales < out_sales
sales_without_outlier <- retails[is_regular_sale, ]
boxplot(sales_without_outlier$Sales)
hist(sales_without_outlier$Sales)

# Distribution of Discount: histogram with 20 breaks, then a boxplot.
hist(retails$Discount, breaks = 20)
boxplot(retails$Discount)

# Upper fence for Discount (third quartile plus 1.5 times the IQR); only rows
# strictly below it are kept before re-drawing the boxplot.
discount_q3 <- quantile(retails$Discount, probs = 0.75)
out <- discount_q3 + 1.5 * IQR(retails$Discount)
is_regular_discount <- retails$Discount < out
discount_without_outlier <- retails[is_regular_discount, ]
boxplot(discount_without_outlier$Discount)

# Distribution of Profit: histogram with 20 breaks, then a boxplot that
# explicitly keeps the outlying points drawn.
hist(retails$Profit, breaks = 20)
boxplot(retails$Profit, outline = TRUE)

Korelasi
cor(retails$Sales,retails$Profit)
## [1] 0.4790643

qqplot(retails$Sales,retails$Profit)

# Scatterplot of Profit (y-axis) against Sales (x-axis). The axis labels name
# the variables that are actually plotted.
plot(retails$Sales, retails$Profit,
  main = "Scatterplot Example",
  xlab = "Sales", ylab = "Profit", pch = 19
)
library(car)

## Loading required package: carData

# Sales (response) against Profit using car::scatterplot. The formula uses
# bare column names, which are looked up in `data`.
scatterplot(Sales ~ Profit, data = retails)


# Profit (y-axis) against Sales (x-axis) with a least-squares line and a
# lowess smoother. The fitted model must have the same orientation as the
# plot (response = y-axis variable) so that abline() draws the matching line.
plot(retails$Sales, retails$Profit,
  main = "Profit vs. Sales",
  xlab = "Sales", ylab = "Profit",
  pch = 19, frame = FALSE
)
abline(lm(Profit ~ Sales, data = retails), col = "blue")
lines(lowess(retails$Sales, retails$Profit), col = "blue")
cor(retails$Profit,retails$Discount)

## [1] -0.2194875

cor(retails$Profit,retails$Sales)

## [1] 0.4790643

cor(discount_without_outlier$Profit,discount_without_outlier$Discount)

## [1] -0.1577858

cor(sales_without_outlier$Profit,sales_without_outlier$Sales)

## [1] 0.1964482

Build Model Linier Regresi 1. Pembentukan Model linier regresi “Profit ~ Sales” tanpa
Outlier
# Simple linear regression of Profit on Sales, fitted to the rows that remain
# after the Sales outliers were removed.
ols_without_outliers <- lm(
  formula = Profit ~ Sales,
  data = sales_without_outlier
)
class(ols_without_outliers)

## [1] "lm"

summary(ols_without_outliers)

##
## Call:
## lm(formula = Profit ~ Sales, data = sales_without_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1223.79 -3.62 0.97 10.36 196.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.349824 0.660415 5.072 4.01e-07 ***
## Sales 0.084519 0.004491 18.821 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.11 on 8825 degrees of freedom
## Multiple R-squared: 0.03859, Adjusted R-squared: 0.03848
## F-statistic: 354.2 on 1 and 8825 DF, p-value: < 2.2e-16

ols_without_outliers$coefficients

## (Intercept) Sales
## 3.34982400 0.08451873

2. Pembentukan Model linier regresi “Profit ~ Sales” dengan Outlier


# Simple linear regression of Profit on Sales, fitted to the full data set
# (outliers included).
ols_with_outliers <- lm(
  formula = Profit ~ Sales,
  data = retails
)
class(ols_with_outliers)

## [1] "lm"

summary(ols_with_outliers)

##
## Call:
## lm(formula = Profit ~ Sales, data = retails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7397.5 2.6 14.6 21.7 5261.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.732867 2.192459 -5.808 6.53e-09 ***
## Sales 0.180067 0.003301 54.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 205.6 on 9992 degrees of freedom
## Multiple R-squared: 0.2295, Adjusted R-squared: 0.2294
## F-statistic: 2976 on 1 and 9992 DF, p-value: < 2.2e-16

ols_with_outliers$coefficients
## (Intercept) Sales
## -12.7328671 0.1800667

3. Pembentukan Model linier regresi “Profit ~ Discount” tanpa Outlier


# Simple linear regression of Profit on Discount, fitted to the rows that
# remain after the Discount outliers were removed.
ols_without_outliers_2 <- lm(
  formula = Profit ~ Discount,
  data = discount_without_outlier
)
class(ols_without_outliers_2)

## [1] "lm"

summary(ols_without_outliers_2)

##
## Call:
## lm(formula = Profit ~ Discount, data = discount_without_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1817.0 -59.5 -13.3 2.2 8328.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.689 2.901 24.71 <2e-16 ***
## Discount -292.435 19.217 -15.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 206 on 9070 degrees of freedom
## Multiple R-squared: 0.0249, Adjusted R-squared: 0.02479
## F-statistic: 231.6 on 1 and 9070 DF, p-value: < 2.2e-16

ols_without_outliers_2$coefficients

## (Intercept) Discount
## 71.68931 -292.43549

4. Pembentukan Model linier regresi “Profit ~ Discount” dengan Outlier


# Simple linear regression of Profit on Discount, fitted to the full data set
# (outliers included).
ols_with_outliers_2 <- lm(
  formula = Profit ~ Discount,
  data = retails
)
class(ols_with_outliers_2)

## [1] "lm"

summary(ols_with_outliers_2)

##
## Call:
## lm(formula = Profit ~ Discount, data = retails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6493.2 -54.9 -15.9 9.4 8332.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.559 2.867 23.57 <2e-16 ***
## Discount -249.051 11.075 -22.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 228.6 on 9992 degrees of freedom
## Multiple R-squared: 0.04817, Adjusted R-squared: 0.04808
## F-statistic: 505.7 on 1 and 9992 DF, p-value: < 2.2e-16

ols_with_outliers_2$coefficients

## (Intercept) Discount
## 67.55941 -249.05142

1. Model Profit~Sales tanpa Outlier


# Spot-check the outlier-free Profit ~ Sales model on three rows of the
# outlier-free data. R indexing is 1-based and a 0 index is silently dropped,
# so only the rows that are actually selected are listed.
idx_tes <- c(4, 8, 20)
tes <- sales_without_outlier$Sales[idx_tes]
profit_sesungguhnya <- sales_without_outlier$Profit[idx_tes]
data_tes <- data.frame(Sales = tes) # check whether the model predicts well
prediksi_profit <- predict(ols_without_outliers, data_tes)

# Side-by-side view: input Sales, predicted Profit, actual Profit.
data.frame(data_tes, prediksi_profit, profit_sesungguhnya)

## Sales prediksi_profit profit_sesungguhnya


## 1 48.860 7.479409 14.1694
## 2 15.552 4.664259 5.4432
## 3 90.570 11.004686 11.7741

2.Model Profit~Sales dengan Outlier


# Spot-check the Profit ~ Sales model fitted WITH outliers on the same three
# rows of the outlier-free data. R indexing is 1-based and a 0 index is
# silently dropped, so only the rows that are actually selected are listed.
idx_tes <- c(4, 8, 20)
tes <- sales_without_outlier$Sales[idx_tes]
profit_sesungguhnya <- sales_without_outlier$Profit[idx_tes]
data_tes <- data.frame(Sales = tes) # check whether the model predicts well
prediksi_profit <- predict(ols_with_outliers, data_tes)

# Side-by-side view: input Sales, predicted Profit, actual Profit.
data.frame(data_tes, prediksi_profit, profit_sesungguhnya)

## Sales prediksi_profit profit_sesungguhnya


## 1 48.860 -3.934810 14.1694
## 2 15.552 -9.932470 5.4432
## 3 90.570 3.575771 11.7741

3.Model Profit~Discount tanpa Outlier


# Spot-check the outlier-free Profit ~ Discount model on three rows of the
# outlier-free data. R indexing is 1-based and a 0 index is silently dropped,
# so only the rows that are actually selected are listed.
idx_tes <- c(4, 8, 20)
tes_discount <- discount_without_outlier$Discount[idx_tes]
profit_Sesungguhnya <- discount_without_outlier$Profit[idx_tes]
# Predictor values to feed the model; the column name must match the model's
# term, Discount.
transaction_today <- data.frame(Discount = tes_discount)
Profit_Prediksi <- predict(ols_without_outliers_2, transaction_today)
data.frame(tes_discount, Profit_Prediksi, profit_Sesungguhnya)

## tes_discount Profit_Prediksi profit_Sesungguhnya


## 1 0.45 -59.90666 -383.0310
## 2 0.20 13.20221 90.7152
## 3 0.00 71.68931 5.0596

4. Model Profit~Discount dengan Outlier


# Spot-check the Profit ~ Discount model fitted WITH outliers on three rows
# of the outlier-free data. R indexing is 1-based and a 0 index is silently
# dropped, so only the rows that are actually selected are listed.
idx_tes <- c(4, 8, 20)
tes_discount <- discount_without_outlier$Discount[idx_tes]
profit_Sesungguhnya <- discount_without_outlier$Profit[idx_tes]
# Check whether the model predicts well; the column name must match the
# model's term, Discount.
transaction_today <- data.frame(Discount = tes_discount)
Profit_Prediksi <- predict(ols_with_outliers_2, transaction_today)
data.frame(tes_discount, Profit_Prediksi, profit_Sesungguhnya)

## tes_discount Profit_Prediksi profit_Sesungguhnya


## 1 0.45 -44.51373 -383.0310
## 2 0.20 17.74912 90.7152
## 3 0.00 67.55941 5.0596

## KESIMPULAN: Dari hasil prediksi model di atas, model yang tidak memuat outlier
adalah model yang paling baik. Hal ini terlihat dari hasil prediksi model
tanpa outlier yang lebih mendekati nilai aslinya dibandingkan hasil prediksi
model yang memuat outlier.

You might also like