Professional Documents
Culture Documents
Lista 4 - Mineração de Dados
Resolução:
Questão 1
item a
# Load modeling libraries and the Credit data set (shipped with ISLR).
library(ISLR)
library(tidyverse)
library(caret)
library(glmnet)
# Convert to a tibble for stricter subsetting and nicer printing.
Credit <- as_tibble(Credit)
head(Credit)
## # A tibble: 6 x 12
## ID Income Limit Rating Cards Age Education Gender Student Married
## <int> <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct>
## 1 1 14.9 3606 283 2 34 11 " Mal~ No Yes
## 2 2 106. 6645 483 3 82 15 "Fema~ Yes Yes
## 3 3 105. 7075 514 4 71 11 " Mal~ No No
## 4 4 149. 9504 681 3 36 11 "Fema~ No No
## 5 5 55.9 4897 357 2 68 16 " Mal~ No Yes
## 6 6 80.2 8047 569 4 77 10 " Mal~ No No
## # ... with 2 more variables: Ethnicity <fct>, Balance <int>
str(Credit)
1
# Drop the ID column (the first column); it carries no predictive information.
Credit <- select(Credit, -ID)
head(Credit)
## # A tibble: 6 x 11
## Income Limit Rating Cards Age Education Gender Student Married Ethnicity
## <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct> <fct>
## 1 14.9 3606 283 2 34 11 " Mal~ No Yes Caucasian
## 2 106. 6645 483 3 82 15 "Fema~ Yes Yes Asian
## 3 105. 7075 514 4 71 11 " Mal~ No No Asian
## 4 149. 9504 681 3 36 11 "Fema~ No No Asian
## 5 55.9 4897 357 2 68 16 " Mal~ No Yes Caucasian
## 6 80.2 8047 569 4 77 10 " Mal~ No No Caucasian
## # ... with 1 more variable: Balance <int>
str(Credit)
# Exercise 1-a): 60/40 train/test split; fixed seed for reproducibility.
set.seed(1)
treinamento <- sample.int(nrow(Credit), nrow(Credit)*0.6)
treino <- Credit[treinamento,]
teste <- Credit[-treinamento,]
item b
# Item b: build numeric design matrices (column 11 is Balance, the response).
x <- treino[,-11]
# NOTE(review): cbind(1, ...) prepends an explicit intercept column, but glmnet
# fits its own intercept by default, so this column is redundant — confirm intent.
x <- data.matrix(cbind(1,x))
x_teste <- teste[,-11]
x_teste <- data.matrix(cbind(1,x_teste))
# Cross-validated lasso (alpha = 1) to select the penalty lambda.
lasso <- cv.glmnet(x,treino$Balance,alpha=1)
lasso
##
## Call: cv.glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Measure: Mean-Squared Error
##
2
## Lambda Measure SE Nonzero
## min 1.406 9370 549.4 9
## 1se 6.231 9843 542.0 7
# Lasso fit over the whole lambda path (no cross-validation), for inspection.
ajuste_lasso <- glmnet(x, treino$Balance, alpha=1)
ajuste_lasso
##
## Call: glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00 410.00
## 2 1 12.95 373.60
## 3 1 23.70 340.40
## 4 1 32.63 310.10
## 5 1 40.04 282.60
## 6 1 46.19 257.50
## 7 1 51.30 234.60
## 8 1 55.54 213.80
## 9 2 59.06 194.80
## 10 2 61.99 177.50
## 11 2 64.42 161.70
## 12 2 66.43 147.30
## 13 2 68.11 134.30
## 14 2 69.50 122.30
## 15 2 70.65 111.50
## 16 3 72.52 101.60
## 17 3 74.20 92.53
## 18 3 75.60 84.31
## 19 3 76.76 76.82
## 20 3 77.72 70.00
## 21 3 78.52 63.78
## 22 3 79.19 58.11
## 23 4 79.93 52.95
## 24 4 82.61 48.25
## 25 4 84.83 43.96
## 26 5 86.68 40.06
## 27 5 88.22 36.50
## 28 5 89.50 33.25
## 29 5 90.56 30.30
## 30 5 91.44 27.61
## 31 6 92.19 25.16
## 32 6 92.82 22.92
## 33 6 93.35 20.89
## 34 6 93.79 19.03
## 35 6 94.16 17.34
## 36 6 94.46 15.80
## 37 6 94.72 14.40
## 38 6 94.94 13.12
## 39 6 95.12 11.95
## 40 6 95.27 10.89
3
## 41 6 95.39 9.92
## 42 6 95.50 9.04
## 43 6 95.58 8.24
## 44 6 95.65 7.51
## 45 6 95.71 6.84
## 46 7 95.77 6.23
## 47 7 95.81 5.68
## 48 7 95.85 5.17
## 49 7 95.88 4.71
## 50 7 95.90 4.30
## 51 7 95.93 3.91
## 52 7 95.94 3.57
## 53 7 95.96 3.25
## 54 8 95.97 2.96
## 55 8 95.98 2.70
## 56 8 95.99 2.46
## 57 8 96.00 2.24
## 58 8 96.01 2.04
## 59 8 96.01 1.86
## 60 8 96.02 1.69
## 61 8 96.02 1.54
## 62 9 96.02 1.41
## 63 9 96.03 1.28
## 64 9 96.03 1.17
## 65 9 96.03 1.06
## 66 9 96.03 0.97
## 67 9 96.03 0.88
## 68 9 96.04 0.80
## 69 10 96.04 0.73
## 70 10 96.04 0.67
# Best lambda: re-print the cross-validated fit (lambda.min and lambda.1se).
lasso
##
## Call: cv.glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 1.406 9370 549.4 9
## 1se 6.231 9843 542.0 7
plot(lasso)
4
250000 10 9 9 8 8 7 7 6 6 6 5 5 3 3 2 2 1 1
Mean−Squared Error
150000
50000
0
0 1 2 3 4 5 6
Log(λ)
lasso$lambda.min
## [1] 1.406431
item c
5
# Item c: coefficients at the MSE-minimizing lambda.
predito_coef_lasso <- predict(lasso,s=lasso$lambda.min, type="coef")
predito_coef_lasso
item d
# Item d: estimated risks (mean squared prediction error on the test set).
# NOTE(review): predito_mmq and predito_lasso are not defined in this excerpt —
# presumably computed earlier via predict() on x_teste; confirm against the
# full script.
riscos<-list()
riscos$minimos.quadrados<-
(predito_mmq - teste$Balance)^2 %>% mean()
riscos$minimos.quadrados
## [1] 11130.05
riscos$lasso<-
(predito_lasso - teste$Balance)^2 %>% mean()
riscos$lasso
## [1] 11080.59
riscos
## $minimos.quadrados
## [1] 11130.05
##
## $lasso
## [1] 11080.59
lasso$lambda.min
## [1] 1.406431
Exercício 2
6
library(MLmetrics)
# Exercise 2: simulate 30 (X, Y) pairs from a noisy tanh curve.
set.seed(1)
X <- runif(30,min = 8, max = 18)
Y <- rnorm(30, mean = 45*tanh(X/(1.9)-7)+57, sd = 16)
# Degree-1 design matrix with an explicit intercept column.
X_<-cbind(1,X)
# All-zero predictor matrix (intercept-only design), printed below.
X0<-cbind(1,rep(0,30))
X0
## [,1] [,2]
## [1,] 1 0
## [2,] 1 0
## [3,] 1 0
## [4,] 1 0
## [5,] 1 0
## [6,] 1 0
## [7,] 1 0
## [8,] 1 0
## [9,] 1 0
## [10,] 1 0
## [11,] 1 0
## [12,] 1 0
## [13,] 1 0
## [14,] 1 0
## [15,] 1 0
## [16,] 1 0
## [17,] 1 0
## [18,] 1 0
## [19,] 1 0
## [20,] 1 0
## [21,] 1 0
## [22,] 1 0
## [23,] 1 0
## [24,] 1 0
## [25,] 1 0
## [26,] 1 0
## [27,] 1 0
## [28,] 1 0
## [29,] 1 0
## [30,] 1 0
# Polynomial design matrices of degree 20, 10, 5 and 2 (raw powers of X).
X20<-cbind(1,X,X^2,X^3,X^4,X^5,
X^6,X^7,X^8,X^9,X^10,
X^11,X^12,X^13,X^14,X^15,
X^16,X^17,X^18,X^19,X^20)
X10<-cbind(1,X,X^2,X^3,X^4,X^5,
X^6,X^7,X^8,X^9,X^10)
X5<-cbind(1,X,X^2,X^3,X^4,X^5)
X2<-cbind(1,X,X^2)
7
# Lambda path of the degree-1 CV fit, kept for the comparison plot later.
a<-as.vector(lasso$lambda)
a
# Degree-1 risks on the simulated data (MSE of predictions vs Y).
# NOTE(review): predito_mmq / predito_lasso here refer to Exercise-2 fits that
# are not shown in this excerpt — confirm against the full script.
riscos<-list()
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)
riscos$lasso<-
MSE(predito_lasso,Y)
riscos
## $minimos.quadrados
## [1] 184.12
##
## $lasso
## [1] 184.12
8
# Degree 2
# NOTE(review): lasso2 (presumably cv.glmnet(X2, Y, alpha = 1)) and the
# predito_* objects are fit in code not shown in this excerpt.
lasso2
##
## Call: cv.glmnet(x = X2, y = Y, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 0.233 188.8 36.83 1
## 1se 5.501 220.8 21.48 1
# Degree-2 risks (MSE against the simulated Y).
riscos<-list()
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)
riscos$lasso<-
MSE(predito_lasso,Y)
riscos
## $minimos.quadrados
## [1] 165.9805
##
## $lasso
## [1] 165.9805
9
## 4 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -26.4896455
## .
## X .
## 0.4553384
# Degree 5
# NOTE(review): predito_mmq / predito_lasso are recomputed for the degree-5
# design in code not shown in this excerpt.
riscos<-list()
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)
riscos$lasso<-
MSE(predito_lasso,Y)
riscos
## $minimos.quadrados
## [1] 165.9167
##
## $lasso
## [1] 166.2808
10
# Least-squares coefficients recovered from the degree-5 lasso path at s = 0
# (zero penalty reduces the lasso to ordinary least squares).
predito_coef_mmq <- predict(lasso5, s=0, type="coef")
predito_coef_mmq
# Degree 10: reset the risk accumulator (computation continues below).
riscos<-list()
11
# Degree-10 risks (MSE against the simulated Y); uses the riscos list
# initialized just above — the intervening number is a PDF page artifact.
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)
riscos$lasso<-
MSE(predito_lasso,Y)
riscos
## $minimos.quadrados
## [1] 134.2554
##
## $lasso
## [1] 139.201
12
# Degree 20
# NOTE(review): predito_* recomputed for the degree-20 design in code not
# shown in this excerpt.
riscos<-list()
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)
riscos$lasso<-
MSE(predito_lasso,Y)
riscos
## $minimos.quadrados
## [1] 140.9794
##
## $lasso
## [1] 146.7015
13
## .
## .
## .
## .
## -1.124252e-18
## -4.570775e-20
## -9.429254e-19
## -1.069807e-19
## -8.177216e-21
## -5.532420e-22
## -3.757787e-24
## .
# Assemble lambda paths (x) and CV errors (y) for all five polynomial degrees.
# NOTE(review): b, c, d, e are not defined in this excerpt — presumably the
# lambda vectors of lasso2/lasso5/lasso10/lasso20; confirm against the full
# script (as written, c and d would resolve to base functions and error).
dtX<-data.frame(x=a,x2=b,x5=c,x10=d,x20=e)
# NOTE(review): subsampling 55 CV errors and force-sorting them merely aligns
# vector lengths across fits — the plotted degree-10/20 curves are therefore
# not the true CV curves.
dtY<-data.frame(y=as.vector(lasso$cvm),
y2=as.vector(lasso2$cvm),
y5=as.vector(lasso5$cvm),
y10=(as.vector(lasso10$cvm) %>% sample(55) %>% sort(decreasing = TRUE)),
y20=(as.vector(lasso20$cvm) %>% sample(55) %>% sort(decreasing = TRUE)))
library(ggplot2)
14
# Plot CV error vs log(lambda) for every degree.
# NOTE(review): aes() references value, value.1 and key, which do not exist in
# dt as built here — a gather()/pivot_longer() step appears to be missing from
# this excerpt; confirm against the full script.
dt<-data.frame(dtX,dtY)
ggplot(dt,aes(x=log(value),y=value.1,color=key))+
geom_point()+
labs(color = "Graus")+
scale_color_discrete(labels=c("Grau 1", "Grau 10","Grau 2","Grau 20", "Grau 5"))+
xlab(label="log(lambda)")+
ylab(label = "EQM")
1600
1200
Graus
Grau 1
Grau 10
EQM
800 Grau 2
Grau 20
Grau 5
400
−4 −2 0 2
log(lambda)
Como podemos observar pelos resultados obtidos e pelo gráfico, o valor de lambda aumenta conforme
o grau polinomial cresce. Além disso, o lasso apresentou piora de desempenho à medida que o grau
polinomial aumentou e, mesmo nessa situação, mostrou-se inferior ao modelo de mínimos quadrados.
15