Professional Documents
Culture Documents
Data Science Comp 2
Data Science Comp 2
r
Tobias Dons
2020-11-05
#Compulsory activity 2
## ── Attaching packages
──────────────────────────────────────────────────────────────────────────────────
──────────── tidyverse 1.3.0 ──
## ── Conflicts
──────────────────────────────────────────────────────────────────────────────────
─────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(glmnet)
##
## Attaching package: 'Matrix'
library(quanteda)
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
library(knitr)
## [1] 441 7
# Report the size of the document-feature matrix; the recorded output below
# shows 441 rows and 46574 columns.
dim(speeches_dfm)
## [1] 441 46574
# Print the top-left 6 x 6 corner of the matrix as a quick sanity check.
speeches_dfm[1:6, 1:6]
#Ridge
# Ridge-penalised (alpha = 0) logistic regression of `is_us` on the training
# document-feature matrix, with lambda tuned by 5-fold cross-validation on the
# misclassification rate (type.measure = "class").
#
# NOTE: the argument was previously misspelled `parralel`, so it never matched
# cv.glmnet()'s `parallel` parameter and parallel fold fitting was not enabled.
# With the correct spelling the folds are dispatched through foreach; register
# a backend beforehand (e.g. doParallel::registerDoParallel()), otherwise
# foreach warns once and runs the folds sequentially.
ridge <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 0, nfolds = 5,
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")
summary(ridge)
# Lambda giving the lowest cross-validated error.
ridge$lambda.min
## [1] 7.713741
# Largest lambda whose error is within one standard error of that minimum.
ridge$lambda.1se
## [1] 27.08442
# Cross-validation curve for the ridge fit.
plot(ridge)
#Lasso
# Lasso-penalised logistic regression of `is_us` on the training
# document-feature matrix, with lambda tuned by 5-fold cross-validation on the
# misclassification rate (type.measure = "class").
#
# alpha = 1 selects the lasso (L1) penalty, which can shrink coefficients
# exactly to zero and therefore performs feature selection.
#
# NOTE: the argument was previously misspelled `parralel`, so it never matched
# cv.glmnet()'s `parallel` parameter and parallel fold fitting was not enabled.
# With the correct spelling the folds are dispatched through foreach; register
# a backend beforehand (e.g. doParallel::registerDoParallel()), otherwise
# foreach warns once and runs the folds sequentially.
lasso <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 1, nfolds = 5,
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")
summary(lasso)
# Lambda giving the lowest cross-validated error.
lasso$lambda.min
## [1] 0.005316782
# Largest lambda whose error is within one standard error of that minimum.
lasso$lambda.1se
## [1] 0.01781974
# Cross-validation curve for the lasso fit.
plot(lasso)
##
## preds_lasso 0 1
## 0 80 3
## 1 1 5
# Print `results`; the object is built in a chunk that is not part of this
# excerpt.
results
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.944
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.955
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 1
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.833
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.375
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.375
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.625
## [1] 0.5454545
## [1] 0.9700599
## [1] 0.7142857
## [1] 0.9756098
#_________________________________________________________
#Using the lasso-model to produce top predictive terms for US
# Position of lambda.1se on the lasso lambda path. Taking the closest value
# rather than testing with `==` avoids depending on exact floating-point
# equality and always yields exactly one index; when an exact match exists
# (the normal case) the result is the same as before.
best.lambda <- which.min(abs(lasso$lambda - lasso$lambda.1se))
best.lambda
## [1] 56
## [1] 46574
# Count the coefficients that are exactly zero (`beta` is created in a chunk
# that is not part of this excerpt).
sum(beta == 0)
## [1] 46544
# Pair every coefficient in `beta` with its feature name in a plain data frame
# (`beta` is created outside this excerpt). Strings are kept as character so
# the terms are not converted to factors on R versions before 4.0.
df <- data.frame(
  coef = as.numeric(beta),
  word = names(beta),
  stringsAsFactors = FALSE
)
## coef word
## 1 1.1618740 peac_endur
## 2 1.0319813 ask_congress
## 3 0.8189179 control_agenda
## 4 0.6292725 liberti
## 5 0.5605610 today_unit
## 6 0.5211970 world_democraci
## 7 0.4708195 full_scope
## 8 0.4265196 terribl_weapon
## 9 0.4186460 everi_nation
## 10 0.3610864 grievanc
## coef word
## 1 0 privileg
## 2 0 extend
## 3 0 warm
## 4 0 congratul
## 5 0 unit
## 6 0 state
## 7 0 deleg
## 8 0 elect
## 9 0 presid
## 10 0 twenti
## [1] 0
#_________________________________________________________
#Using the ridge-model to produce top predictive terms for US
# Position of lambda.1se on the ridge lambda path. Taking the closest value
# rather than testing with `==` avoids depending on exact floating-point
# equality and always yields exactly one index; when an exact match exists
# (the normal case) the result is the same as before.
best.lambda1 <- which.min(abs(ridge$lambda - ridge$lambda.1se))
best.lambda1
## [1] 47
# Count the coefficients that are exactly zero (`beta1` is created in a chunk
# that is not part of this excerpt).
sum(beta1 == 0)
## [1] 152
## coef word
## 1 0.02154828 intern_landscap
## 2 0.02154798 emerg_within
## 3 0.02154761 hard_earn
## 4 0.02154760 involv_question
## 5 0.02053956 intern_oversight
## 6 0.01941480 slowli_sure
## 7 0.01941431 institut_secur
## 8 0.01871609 food_unit
## 9 0.01871432 caus_second
## 10 0.01871428 nation_victim
## coef word
## 1 -0.002699953 term_need
## 2 -0.002699920 improv_infrastructur
## 3 -0.002699918 bring_uniqu
## 4 -0.002699918 govern_right
## 5 -0.002699905 govern_move
## 6 -0.002698543 way_give
## 7 -0.002698541 secur_architectur
## 8 -0.002698525 press_ahead
## 9 -0.002671338 mean_creat
## 10 -0.002671309 process_east
#Converting dfm to df
# Convert the document-feature matrix to a data frame so that each feature
# becomes a column that can be summed with `$`.
speeches_df <- convert(speeches_dfm, to = "data.frame")
# NOTE(review): the call that produced the next output line is not shown in
# this excerpt; presumably it is the count for the first lasso term - confirm.
## [1] 3
# Column totals for the remaining top-10 lasso terms listed earlier, i.e. how
# often each term occurs over all rows of speeches_df.
sum(speeches_df$ask_congress)
## [1] 4
sum(speeches_df$control_agenda)
## [1] 4
sum(speeches_df$liberti)
## [1] 92
sum(speeches_df$today_unit)
## [1] 20
sum(speeches_df$world_democraci)
## [1] 3
sum(speeches_df$full_scope)
## [1] 3
sum(speeches_df$terribl_weapon)
## [1] 4
sum(speeches_df$everi_nation)
## [1] 74
sum(speeches_df$grievanc)
## [1] 11
# NOTE(review): the call that produced the next output line is not shown in
# this excerpt; presumably it is the count for the first ridge term - confirm.
## [1] 3
# Column totals for the remaining top-10 ridge terms listed earlier.
sum(speeches_df$emerg_within)
## [1] 3
sum(speeches_df$hard_earn)
## [1] 4
sum(speeches_df$involv_question)
## [1] 4
sum(speeches_df$intern_oversight)
## [1] 4
sum(speeches_df$slowli_sure)
## [1] 5
sum(speeches_df$institut_secur)
## [1] 3
sum(speeches_df$food_unit)
## [1] 3
sum(speeches_df$caus_second)
## [1] 3
sum(speeches_df$nation_victim)
## [1] 3