Download as pdf or txt
Download as pdf or txt
You are on page 1of 15

LISTA 4 - MINERAÇÃO DE DADOS

Autor: Gustavo Ramos e Vinicius Lucidio

Resolução:

Questão 1

item a

library(ISLR)
library(tidyverse)
library(caret)
library(glmnet)
Credit <- as_tibble(Credit)
head(Credit)

## # A tibble: 6 x 12
## ID Income Limit Rating Cards Age Education Gender Student Married
## <int> <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct>
## 1 1 14.9 3606 283 2 34 11 " Mal~ No Yes
## 2 2 106. 6645 483 3 82 15 "Fema~ Yes Yes
## 3 3 105. 7075 514 4 71 11 " Mal~ No No
## 4 4 149. 9504 681 3 36 11 "Fema~ No No
## 5 5 55.9 4897 357 2 68 16 " Mal~ No Yes
## 6 6 80.2 8047 569 4 77 10 " Mal~ No No
## # ... with 2 more variables: Ethnicity <fct>, Balance <int>

str(Credit)

## tibble [400 x 12] (S3: tbl_df/tbl/data.frame)


## $ ID : int [1:400] 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num [1:400] 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int [1:400] 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int [1:400] 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int [1:400] 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int [1:400] 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int [1:400] 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int [1:400] 333 903 580 964 331 1151 203 872 279 1350 ...

1
Credit <- Credit[,-1]
head(Credit)

## # A tibble: 6 x 11
## Income Limit Rating Cards Age Education Gender Student Married Ethnicity
## <dbl> <int> <int> <int> <int> <int> <fct> <fct> <fct> <fct>
## 1 14.9 3606 283 2 34 11 " Mal~ No Yes Caucasian
## 2 106. 6645 483 3 82 15 "Fema~ Yes Yes Asian
## 3 105. 7075 514 4 71 11 " Mal~ No No Asian
## 4 149. 9504 681 3 36 11 "Fema~ No No Asian
## 5 55.9 4897 357 2 68 16 " Mal~ No Yes Caucasian
## 6 80.2 8047 569 4 77 10 " Mal~ No No Caucasian
## # ... with 1 more variable: Balance <int>

str(Credit)

## tibble [400 x 11] (S3: tbl_df/tbl/data.frame)


## $ Income : num [1:400] 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int [1:400] 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int [1:400] 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int [1:400] 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int [1:400] 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int [1:400] 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int [1:400] 333 903 580 964 331 1151 203 872 279 1350 ...

# Exercício 1-a)

set.seed(1)
treinamento <- sample.int(nrow(Credit), nrow(Credit)*0.6)
treino <- Credit[treinamento,]
teste <- Credit[-treinamento,]

item b

x <- treino[,-11]
x <- data.matrix(cbind(1,x))
x_teste <- teste[,-11]
x_teste <- data.matrix(cbind(1,x_teste))
lasso <- cv.glmnet(x,treino$Balance,alpha=1)
lasso

##
## Call: cv.glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Measure: Mean-Squared Error
##

2
## Lambda Measure SE Nonzero
## min 1.406 9370 549.4 9
## 1se 6.231 9843 542.0 7

predito_mmq <- predict(lasso, s=0, newx=x_teste)

# Lasso
ajuste_lasso <- glmnet(x, treino$Balance, alpha=1)
ajuste_lasso

##
## Call: glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Df %Dev Lambda
## 1 0 0.00 410.00
## 2 1 12.95 373.60
## 3 1 23.70 340.40
## 4 1 32.63 310.10
## 5 1 40.04 282.60
## 6 1 46.19 257.50
## 7 1 51.30 234.60
## 8 1 55.54 213.80
## 9 2 59.06 194.80
## 10 2 61.99 177.50
## 11 2 64.42 161.70
## 12 2 66.43 147.30
## 13 2 68.11 134.30
## 14 2 69.50 122.30
## 15 2 70.65 111.50
## 16 3 72.52 101.60
## 17 3 74.20 92.53
## 18 3 75.60 84.31
## 19 3 76.76 76.82
## 20 3 77.72 70.00
## 21 3 78.52 63.78
## 22 3 79.19 58.11
## 23 4 79.93 52.95
## 24 4 82.61 48.25
## 25 4 84.83 43.96
## 26 5 86.68 40.06
## 27 5 88.22 36.50
## 28 5 89.50 33.25
## 29 5 90.56 30.30
## 30 5 91.44 27.61
## 31 6 92.19 25.16
## 32 6 92.82 22.92
## 33 6 93.35 20.89
## 34 6 93.79 19.03
## 35 6 94.16 17.34
## 36 6 94.46 15.80
## 37 6 94.72 14.40
## 38 6 94.94 13.12
## 39 6 95.12 11.95
## 40 6 95.27 10.89

3
## 41 6 95.39 9.92
## 42 6 95.50 9.04
## 43 6 95.58 8.24
## 44 6 95.65 7.51
## 45 6 95.71 6.84
## 46 7 95.77 6.23
## 47 7 95.81 5.68
## 48 7 95.85 5.17
## 49 7 95.88 4.71
## 50 7 95.90 4.30
## 51 7 95.93 3.91
## 52 7 95.94 3.57
## 53 7 95.96 3.25
## 54 8 95.97 2.96
## 55 8 95.98 2.70
## 56 8 95.99 2.46
## 57 8 96.00 2.24
## 58 8 96.01 2.04
## 59 8 96.01 1.86
## 60 8 96.02 1.69
## 61 8 96.02 1.54
## 62 9 96.02 1.41
## 63 9 96.03 1.28
## 64 9 96.03 1.17
## 65 9 96.03 1.06
## 66 9 96.03 0.97
## 67 9 96.03 0.88
## 68 9 96.04 0.80
## 69 10 96.04 0.73
## 70 10 96.04 0.67

predito_lasso <- predict(lasso, s=lasso$lambda.min,newx=x_teste)

# Melhor lambda
lasso

##
## Call: cv.glmnet(x = x, y = treino$Balance, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 1.406 9370 549.4 9
## 1se 6.231 9843 542.0 7

plot(lasso)

4
250000 10 9 9 8 8 7 7 6 6 6 5 5 3 3 2 2 1 1
Mean−Squared Error

150000
50000
0

0 1 2 3 4 5 6

Log(λ)

lasso$lambda.min

## [1] 1.406431

item c

predito_coef_mmq <- predict(lasso, s=0, type="coef")


predito_coef_mmq

## 12 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -879.7376910
## 1 .
## Income -7.6513860
## Limit 0.2209456
## Rating 0.7141844
## Cards 18.7152202
## Age -0.8810690
## Education 0.7656821
## Gender -14.5242825
## Student 410.8427954
## Married -1.8244088
## Ethnicity -0.1799812

5
predito_coef_lasso <- predict(lasso,s=lasso$lambda.min, type="coef")
predito_coef_lasso

## 12 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -877.2391140
## 1 .
## Income -7.5441009
## Limit 0.2158057
## Rating 0.7652443
## Cards 18.0479864
## Age -0.8574199
## Education 0.5374613
## Gender -12.7213601
## Student 408.1087035
## Married -0.2547043
## Ethnicity .

item d

riscos<-list()

riscos$minimos.quadrados<-
(predito_mmq - teste$Balance)^2 %>% mean()
riscos$minimos.quadrados

## [1] 11130.05

riscos$lasso<-
(predito_lasso - teste$Balance)^2 %>% mean()
riscos$lasso

## [1] 11080.59

riscos

## $minimos.quadrados
## [1] 11130.05
##
## $lasso
## [1] 11080.59

lasso$lambda.min

## [1] 1.406431

Exercício 2

6
library(MLmetrics)
# Exercicio 2

set.seed(1)
X <- runif(30,min = 8, max = 18)
Y <- rnorm(30, mean = 45*tanh(X/(1.9)-7)+57, sd = 16)
X_<-cbind(1,X)
X0<-cbind(1,rep(0,30))
X0

## [,1] [,2]
## [1,] 1 0
## [2,] 1 0
## [3,] 1 0
## [4,] 1 0
## [5,] 1 0
## [6,] 1 0
## [7,] 1 0
## [8,] 1 0
## [9,] 1 0
## [10,] 1 0
## [11,] 1 0
## [12,] 1 0
## [13,] 1 0
## [14,] 1 0
## [15,] 1 0
## [16,] 1 0
## [17,] 1 0
## [18,] 1 0
## [19,] 1 0
## [20,] 1 0
## [21,] 1 0
## [22,] 1 0
## [23,] 1 0
## [24,] 1 0
## [25,] 1 0
## [26,] 1 0
## [27,] 1 0
## [28,] 1 0
## [29,] 1 0
## [30,] 1 0

X20<-cbind(1,X,X^2,X^3,X^4,X^5,
X^6,X^7,X^8,X^9,X^10,
X^11,X^12,X^13,X^14,X^15,
X^16,X^17,X^18,X^19,X^20)
X10<-cbind(1,X,X^2,X^3,X^4,X^5,
X^6,X^7,X^8,X^9,X^10)
X5<-cbind(1,X,X^2,X^3,X^4,X^5)
X2<-cbind(1,X,X^2)

lasso <- cv.glmnet(X_,Y,alpha=1)

7
a<-as.vector(lasso$lambda)
a

## [1] 35.1008492 31.9825865 29.1413416 26.5525052 24.1936538 22.0443563


## [7] 20.0859964 18.3016119 16.6757471 15.1943197 13.8444982 12.6145911
## [13] 11.4939456 10.4728552 9.5424756 8.6947484 7.9223309 7.2185328
## [19] 6.5772583 5.9929528 5.4605554 4.9754547 4.5334490 4.1307099
## [25] 3.7637490 3.4293879 3.1247305 2.8471381 2.5942062 2.3637441
## [31] 2.1537556 1.9624219 1.7880857 1.6292371 1.4845002 1.3526213
## [37] 1.2324581 1.1229699 1.0232084 0.9323094 0.8494856 0.7740196
## [43] 0.7052578 0.6426047 0.5855175 0.5335017 0.4861069 0.4429225
## [49] 0.4035745 0.3677220 0.3350546 0.3052893 0.2781682 0.2534565
## [55] 0.2309401

predito_mmq <- predict(lasso, s=0, newx=X_)

predito_lasso <- predict(lasso, s=lasso$lambda.min,newx=X_)

riscos<-list()

riscos$minimos.quadrados<-
MSE(predito_mmq,Y)

riscos$lasso<-
MSE(predito_lasso,Y)

riscos

## $minimos.quadrados
## [1] 184.12
##
## $lasso
## [1] 184.12

predito_coef_mmq <- predict(lasso, s=0, type="coef")


predito_coef_mmq

## 3 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -101.85468
## .
## X 12.01098

predito_coef_lasso <- predict(lasso, s=lasso$lambda.min, type="coef")


predito_coef_lasso

## 3 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -101.85468
## .
## X 12.01098

8
#Grau 2

lasso2 <- cv.glmnet(X2,Y,alpha=1)


b<-as.vector(lasso2$lambda)
b

## [1] 35.3583067 32.2171722 29.3550874 26.7472623 24.3711093 22.2060471


## [7] 20.2333231 18.4358504 16.7980603 15.3057669 13.9460448 12.7071166
## [13] 11.5782514 10.5496714 9.6124677 8.7585226 7.9804396 7.2714793
## [19] 6.6255011 6.0369099 5.5006074 5.0119486 4.5667009 4.1610078
## [25] 3.7913553 3.4545418 3.1476498 2.8680213 2.6132342 2.3810816
## [31] 2.1695529 1.9768158 1.8012009 1.6411872 1.4953887 1.3625425
## [37] 1.2414979 1.1312067 1.0307134 0.9391477 0.8557164 0.7796969
## [43] 0.7104308 0.6473181 0.5898121 0.5374148 0.4896724 0.4461712
## [49] 0.4065346 0.3704192 0.3375122 0.3075285 0.2802085 0.2553156
## [55] 0.2326340

lasso2

##
## Call: cv.glmnet(x = X2, y = Y, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Measure SE Nonzero
## min 0.233 188.8 36.83 1
## 1se 5.501 220.8 21.48 1

predito_mmq <- predict(lasso2, s=0, newx=X2)

predito_lasso <- predict(lasso2, s=lasso2$lambda.min,newx=X2)

riscos<-list()

riscos$minimos.quadrados<-
MSE(predito_mmq,Y)

riscos$lasso<-
MSE(predito_lasso,Y)

riscos

## $minimos.quadrados
## [1] 165.9805
##
## $lasso
## [1] 165.9805

predito_coef_mmq <- predict(lasso2, s=0, type="coef")


predito_coef_mmq

9
## 4 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -26.4896455
## .
## X .
## 0.4553384

predito_coef_lasso <- predict(lasso2, s=lasso2$lambda.min, type="coef")


predito_coef_lasso

## 4 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -26.4896455
## .
## X .
## 0.4553384

#Grau 5

lasso5 <- cv.glmnet(X5,Y,alpha=1)


lasso5$lambda %>% sample(55) %>% sort(decreasing = TRUE) -> c
c

## [1] 35.3583067 32.2171722 29.3550874 26.7472623 24.3711093 22.2060471


## [7] 20.2333231 18.4358504 16.7980603 15.3057669 13.9460448 12.7071166
## [13] 11.5782514 10.5496714 9.6124677 8.7585226 7.9804396 7.2714793
## [19] 6.6255011 6.0369099 5.5006074 5.0119486 4.5667009 4.1610078
## [25] 3.7913553 3.4545418 3.1476498 2.8680213 2.6132342 2.3810816
## [31] 2.1695529 1.9768158 1.8012009 1.6411872 1.4953887 1.3625425
## [37] 1.2414979 1.1312067 1.0307134 0.9391477 0.8557164 0.7796969
## [43] 0.7104308 0.6473181 0.5898121 0.5374148 0.4896724 0.4461712
## [49] 0.4065346 0.3704192 0.3375122 0.3075285 0.2802085 0.2553156
## [55] 0.2326340

predito_mmq <- predict(lasso5, s=0, newx=X5)

predito_lasso <- predict(lasso5, s=lasso5$lambda.min,newx=X5)

riscos<-list()

riscos$minimos.quadrados<-
MSE(predito_mmq,Y)

riscos$lasso<-
MSE(predito_lasso,Y)

riscos

## $minimos.quadrados
## [1] 165.9167
##
## $lasso
## [1] 166.2808

10
predito_coef_mmq <- predict(lasso5, s=0, type="coef")
predito_coef_mmq

## 7 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -25.323108867
## .
## X .
## 0.433607922
## 0.001063628
## .
## .

predito_coef_lasso <- predict(lasso5, s=lasso5$lambda.min, type="coef")


predito_coef_lasso

## 7 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -2.461845e+01
## .
## X .
## 4.330171e-01
## 8.316267e-04
## .
## .

#Grau 10

lasso10 <- cv.glmnet(X10,Y,alpha=1)


lasso10$lambda %>% sample(55) %>% sort(decreasing = TRUE) -> d
d

## [1] 35.358306675 32.217172165 29.355087386 26.747262331 24.371109267


## [6] 12.707116578 10.549671425 9.612467693 8.758522557 7.980439553
## [11] 6.625501119 6.036909861 4.566700923 4.161007800 3.454541777
## [16] 3.147649807 2.868021274 2.613234168 2.169552917 1.801200947
## [21] 1.641187219 1.241497945 1.131206690 1.030713405 0.855716377
## [26] 0.779696892 0.710430769 0.647318058 0.589812106 0.370419192
## [31] 0.337512172 0.280208535 0.232634048 0.211967481 0.146100962
## [36] 0.133121756 0.121295586 0.091755662 0.047841478 0.043591373
## [41] 0.039718836 0.036190324 0.030045843 0.027376653 0.024944587
## [46] 0.022728578 0.020709434 0.018869665 0.017193336 0.013006128
## [51] 0.011850700 0.010797916 0.009838659 0.007442585 0.006781406

predito_mmq <- predict(lasso10, s=0, newx=X10)

predito_lasso <- predict(lasso10, s=lasso10$lambda.min,newx=X10)

riscos<-list()

11
riscos$minimos.quadrados<-
MSE(predito_mmq,Y)

riscos$lasso<-
MSE(predito_lasso,Y)

riscos

## $minimos.quadrados
## [1] 134.2554
##
## $lasso
## [1] 139.201

predito_coef_mmq <- predict(lasso10, s=0, type="coef")


predito_coef_mmq

## 12 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) 5.985702e+01
## .
## X -7.457601e+00
## 9.766357e-03
## 4.499580e-03
## 2.736312e-03
## .
## .
## -9.294104e-12
## -1.785122e-12
## -3.076998e-11
## -3.695991e-11

predito_coef_lasso <- predict(lasso10, s=lasso10$lambda.min, type="coef")


predito_coef_lasso

## 12 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) 1.257521e+01
## .
## X -1.516424e+00
## .
## .
## 2.084011e-03
## .
## .
## .
## .
## .
## -2.664632e-11

12
#Grau 20

lasso20 <- cv.glmnet(X20,Y,alpha=1)


lasso20$lambda %>% sample(55) %>% sort(decreasing = TRUE) -> e
e

## [1] 35.35830668 32.21717216 29.35508739 26.74726233 24.37110927 22.20604709


## [7] 20.23332307 16.79806028 11.57825136 10.54967143 9.61246769 8.75852256
## [13] 7.98043955 7.27147930 6.62550112 6.03690986 5.50060743 5.01194862
## [19] 4.56670092 4.16100780 3.79135534 3.14764981 2.61323417 2.38108165
## [25] 2.16955292 1.97681582 1.80120095 1.64118722 1.49538867 1.36254246
## [31] 1.24149795 1.13120669 1.03071341 0.93914767 0.85571638 0.77969689
## [37] 0.71043077 0.64731806 0.58981211 0.53741482 0.48967237 0.44617123
## [43] 0.40653461 0.37041919 0.33751217 0.30752852 0.28020853 0.25531558
## [49] 0.21196748 0.19313687 0.14610096 0.12129559 0.10070173 0.09175566
## [55] 0.07617716

predito_mmq <- predict(lasso20, s=0, newx=X20)

predito_lasso <- predict(lasso20, s=lasso20$lambda.min,newx=X20)

riscos<-list()

riscos$minimos.quadrados<-
MSE(predito_mmq,Y)

riscos$lasso<-
MSE(predito_lasso,Y)

riscos

## $minimos.quadrados
## [1] 140.9794
##
## $lasso
## [1] 146.7015

predito_coef_mmq <- predict(lasso20, s=0, type="coef")


predito_coef_mmq

## 22 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) 1.162865e+00
## .
## X .
## .
## 3.777640e-03
## 1.350195e-03
## .
## .
## .
## .

13
## .
## .
## .
## .
## -1.124252e-18
## -4.570775e-20
## -9.429254e-19
## -1.069807e-19
## -8.177216e-21
## -5.532420e-22
## -3.757787e-24
## .

predito_coef_lasso <- predict(lasso20, s=lasso20$lambda.min, type="coef")


predito_coef_lasso

## 22 x 1 sparse Matrix of class "dgCMatrix"


## 1
## (Intercept) -6.343196e+00
## .
## X .
## .
## 2.521416e-02
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
## -2.508982e-24

dtX<-data.frame(x=a,x2=b,x5=c,x10=d,x20=e)
dtY<-data.frame(y=as.vector(lasso$cvm),
y2=as.vector(lasso2$cvm),
y5=as.vector(lasso5$cvm),
y10=(as.vector(lasso10$cvm) %>% sample(55) %>% sort(decreasing = TRUE)),
y20=(as.vector(lasso20$cvm) %>% sample(55) %>% sort(decreasing = TRUE)))

library(ggplot2)

dtX %>% gather() -> dtX


dtY %>% gather() -> dtY

14
dt<-data.frame(dtX,dtY)

ggplot(dt,aes(x=log(value),y=value.1,color=key))+
geom_point()+
labs(color = "Graus")+
scale_color_discrete(labels=c("Grau 1", "Grau 10","Grau 2","Grau 20", "Grau 5"))+
xlab(label="log(lambda)")+
ylab(label = "EQM")

1600

1200

Graus
Grau 1
Grau 10
EQM

800 Grau 2
Grau 20
Grau 5

400

−4 −2 0 2
log(lambda)

Como podemos observar pelos resultados obtidos e pelo gráfico, o valor de lambda aumenta conforme
maior é o grau polinomial. Além disso, o lasso apresentou piora de desempenho conforme cresceu o grau
polinomial, e ainda nessa situação, o modelo se mostrou inferior aos de mínimos quadrados.

15

You might also like