
Comp2.r

Tobias Dons

2020-11-05
#Compulsory activity 2

#Loading packages used throughout the assignment


library(tidyverse)

## ── Attaching packages ──────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(glmnet)

## Loading required package: Matrix

##
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
##
##     expand, pack, unpack

## Loaded glmnet 4.0-2

library(quanteda)

## Package version: 2.1.1

## Parallel computing: 2 of 8 threads used.

## See https://quanteda.io for tutorials and examples.


##
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
##
##     View

library(yardstick)

## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.

##
## Attaching package: 'yardstick'

## The following object is masked from 'package:readr':
##
##     spec

library(knitr)

#Loading the data frame

load("./Data/ungd.RData")
dim(ungd_sample)

## [1] 441 7
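#A quick look at the structure (not part of the original output; only the
#text, row_id and is_us columns are actually used below):

glimpse(ungd_sample)   #dplyr's compact alternative to str()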

#Transforming and cleaning the text data through tokenization

speeches_token <- tokens(ungd_sample$text, what = 'word',
                         remove_numbers = TRUE, remove_punct = TRUE,
                         remove_symbols = TRUE, split_hyphens = TRUE)

speeches_token <- tokens_tolower(speeches_token)


speeches_token <- tokens_select(speeches_token, stopwords(), selection = 'remove')
speeches_token <- tokens_wordstem(speeches_token, language = 'english')

speeches_token_ngrams <- tokens_ngrams(speeches_token, n = 1:2)
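#A minimal sketch (not in the original run) of what the pipeline produces;
#indexing a quanteda tokens object with [[ returns one document's tokens as
#a character vector:

head(speeches_token_ngrams[[1]], 10)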

#Creating a document-feature matrix

speeches_dfm <- dfm(speeches_token_ngrams)

docvars(speeches_dfm) <- data.frame(id = ungd_sample$row_id,
                                    is_us = ungd_sample$is_us)

speeches_dfm <- dfm_trim(speeches_dfm, min_docfreq = 3)

dim(speeches_dfm)

## [1] 441 46574

speeches_dfm[1:6, 1:6]

## Document-feature matrix of: 6 documents, 6 features (44.4% sparse) and 2 docvars.
## features
## docs privileg extend warm congratul unit state
## text1 1 1 1 1 91 48
## text2 0 1 0 1 43 30
## text3 0 0 0 0 43 33
## text4 0 0 0 1 24 20
## text5 0 0 0 0 28 25
## text6 0 2 0 0 59 46
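#As a quick check of the trimmed matrix, quanteda's topfeatures() lists the
#most frequent features (a sketch, not part of the original output):

topfeatures(speeches_dfm, 10)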

#Defining test and training set:

set.seed(202020)
uid <- unique(ungd_sample$row_id)
training <- sample(1:length(uid), floor(.80 * length(uid)))
test <- (1:length(uid))[!(1:length(uid) %in% training)]
train_id <- uid[training]
test_id <- uid[test]
train_dfm <- dfm_subset(speeches_dfm, id %in% train_id)
test_dfm <- dfm_subset(speeches_dfm, id %in% test_id)
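#A sanity check worth adding (not in the original run): is_us is heavily
#imbalanced, so confirm that both splits actually contain US speeches:

table(docvars(train_dfm, "is_us"))
table(docvars(test_dfm, "is_us"))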

#Ridge
ridge <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 0, nfolds = 5,
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")
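#Note: parallel = TRUE in cv.glmnet only has an effect once a foreach
#backend is registered; a minimal sketch using doParallel (an assumption —
#any registered foreach backend will do):

library(doParallel)
registerDoParallel(cores = 2)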

summary(ridge)

## Length Class Mode


## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 9 -none- call
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric

ridge$lambda.min

## [1] 7.713741

ridge$lambda.1se

## [1] 27.08442

plot(ridge)

#Lasso
lasso <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 1, nfolds = 5, #alpha = 1 gives the lasso (L1) penalty
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")

summary(lasso)

## Length Class Mode


## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 9 -none- call
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric

lasso$lambda.min

## [1] 0.005316782

lasso$lambda.1se

## [1] 0.01781974

plot(lasso)

#Predictions for is_us

preds_ridge <- predict(ridge, test_dfm, type = "class")
table(preds_ridge, docvars(test_dfm, "is_us"))
##
## preds_ridge 0 1
## 0 81 5
## 1 0 3

preds_lasso <- predict(lasso, test_dfm, type = "class")
table(preds_lasso, docvars(test_dfm, "is_us"))

##
## preds_lasso 0 1
## 0 80 3
## 1 1 5

#Comparing predictions and accuracy

results <- data.frame(obs = factor(docvars(test_dfm, "is_us")),
                      p_ridge = factor(preds_ridge),
                      p_lasso = factor(preds_lasso))

results

##    obs p_ridge p_lasso
## 1 1 0 0
## 2 1 1 1
## 3 1 0 0
## 4 1 0 1
## 5 1 0 1
## 6 1 0 0
## 7 1 1 1
## 8 1 1 1
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
## 15 0 0 0
## 16 0 0 0
## 17 0 0 0
## 18 0 0 0
## 19 0 0 0
## 20 0 0 0
## 21 0 0 0
## 22 0 0 0
## 23 0 0 0
## 24 0 0 0
## 25 0 0 0
## 26 0 0 0
## 27 0 0 0
## 28 0 0 0
## 29 0 0 1
## 30 0 0 0
## 31 0 0 0
## 32 0 0 0
## 33 0 0 0
## 34 0 0 0
## 35 0 0 0
## 36 0 0 0
## 37 0 0 0
## 38 0 0 0
## 39 0 0 0
## 40 0 0 0
## 41 0 0 0
## 42 0 0 0
## 43 0 0 0
## 44 0 0 0
## 45 0 0 0
## 46 0 0 0
## 47 0 0 0
## 48 0 0 0
## 49 0 0 0
## 50 0 0 0
## 51 0 0 0
## 52 0 0 0
## 53 0 0 0
## 54 0 0 0
## 55 0 0 0
## 56 0 0 0
## 57 0 0 0
## 58 0 0 0
## 59 0 0 0
## 60 0 0 0
## 61 0 0 0
## 62 0 0 0
## 63 0 0 0
## 64 0 0 0
## 65 0 0 0
## 66 0 0 0
## 67 0 0 0
## 68 0 0 0
## 69 0 0 0
## 70 0 0 0
## 71 0 0 0
## 72 0 0 0
## 73 0 0 0
## 74 0 0 0
## 75 0 0 0
## 76 0 0 0
## 77 0 0 0
## 78 0 0 0
## 79 0 0 0
## 80 0 0 0
## 81 0 0 0
## 82 0 0 0
## 83 0 0 0
## 84 0 0 0
## 85 0 0 0
## 86 0 0 0
## 87 0 0 0
## 88 0 0 0
## 89 0 0 0

accuracy(results, obs, p_ridge)

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.944

accuracy(results, obs, p_lasso)

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.955

precision(results, obs, p_ridge, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 1

precision(results, obs, p_lasso, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.833

recall(results, obs, p_ridge, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.375

recall(results, obs, p_lasso, event_level = "second")


## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.625

f_meas_vec(results$obs, results$p_ridge, event_level = "second")

## [1] 0.5454545

f_meas_vec(results$obs, results$p_ridge, event_level = "first")

## [1] 0.9700599

f_meas_vec(results$obs, results$p_lasso, event_level = "second")

## [1] 0.7142857

f_meas_vec(results$obs, results$p_lasso, event_level = "first")

## [1] 0.9756098
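#All of the metrics above can also be read off one confusion matrix; a
#minimal sketch using yardstick's conf_mat() (not part of the original
#assignment):

cm <- conf_mat(results, truth = obs, estimate = p_lasso)
summary(cm, event_level = "second")   #accuracy, precision, recall, f_meas, ...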

#_________________________________________________________
#Using the lasso model to produce top predictive terms for the US
best.lambda <- which(lasso$lambda == lasso$lambda.1se)
best.lambda

## [1] 56

beta <- lasso$glmnet.fit$beta[, best.lambda]

length(beta)

## [1] 46574

sum(beta == 0)

## [1] 46544

df <- data.frame(coef = as.numeric(beta),
                 word = names(beta),
                 stringsAsFactors = FALSE)

#Top predictive terms to predict whether a speech is American (lasso)

arrange(df, desc(coef)) %>%
  head(n = 10)

## coef word
## 1 1.1618740 peac_endur
## 2 1.0319813 ask_congress
## 3 0.8189179 control_agenda
## 4 0.6292725 liberti
## 5 0.5605610 today_unit
## 6 0.5211970 world_democraci
## 7 0.4708195 full_scope
## 8 0.4265196 terribl_weapon
## 9 0.4186460 everi_nation
## 10 0.3610864 grievanc

#Note that the smallest coefficients are all exactly zero: at lambda.1se the
#lasso has shrunk 46544 of the 46574 coefficients to zero, so no terms act
#as negative (non-US) predictors.

arrange(df, coef) %>%
  head(n = 10)

## coef word
## 1 0 privileg
## 2 0 extend
## 3 0 warm
## 4 0 congratul
## 5 0 unit
## 6 0 state
## 7 0 deleg
## 8 0 elect
## 9 0 presid
## 10 0 twenti

#Add number of entries
#Note: on a quanteda dfm, `$` accesses document variables rather than
#features, so the call below returns NULL and the sum is 0; the feature
#counts are redone correctly on the converted data frame further down.

sum(speeches_dfm$liberti)

## [1] 0
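#A minimal sketch of the intended count: subsetting the dfm by feature name
#with [, "feature"] keeps the sparse-matrix semantics and sums correctly:

sum(speeches_dfm[, "liberti"])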
#_________________________________________________________
#Using the ridge model to produce top predictive terms for the US
best.lambda1 <- which(ridge$lambda == ridge$lambda.1se)
best.lambda1

## [1] 47

beta1 <- ridge$glmnet.fit$beta[, best.lambda1]

length(beta1)

## [1] 46574

sum(beta1 == 0)

## [1] 152

df1 <- data.frame(coef = as.numeric(beta1),
                  word = names(beta1),
                  stringsAsFactors = FALSE)

#Top predictive terms to predict whether a speech is American (ridge)

arrange(df1, desc(coef)) %>%
  head(n = 10)

## coef word
## 1 0.02154828 intern_landscap
## 2 0.02154798 emerg_within
## 3 0.02154761 hard_earn
## 4 0.02154760 involv_question
## 5 0.02053956 intern_oversight
## 6 0.01941480 slowli_sure
## 7 0.01941431 institut_secur
## 8 0.01871609 food_unit
## 9 0.01871432 caus_second
## 10 0.01871428 nation_victim

arrange(df1, coef) %>%
  head(n = 10)

## coef word
## 1 -0.002699953 term_need
## 2 -0.002699920 improv_infrastructur
## 3 -0.002699918 bring_uniqu
## 4 -0.002699918 govern_right
## 5 -0.002699905 govern_move
## 6 -0.002698543 way_give
## 7 -0.002698541 secur_architectur
## 8 -0.002698525 press_ahead
## 9 -0.002671338 mean_creat
## 10 -0.002671309 process_east

#Converting dfm to df
speeches_df <- convert(speeches_dfm, to = "data.frame")
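#The individual sums below can equivalently be computed in one call; a
#minimal sketch using colSums() on the converted data frame (feature names
#taken from the lasso table above):

top_lasso <- c("peac_endur", "ask_congress", "control_agenda", "liberti",
               "today_unit")
colSums(speeches_df[, top_lasso])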

#Frequency of best predictors (lasso)

sum(speeches_df$peac_endur)

## [1] 3

sum(speeches_df$ask_congress)

## [1] 4

sum(speeches_df$control_agenda)

## [1] 4

sum(speeches_df$liberti)

## [1] 92

sum(speeches_df$today_unit)

## [1] 20

sum(speeches_df$world_democraci)

## [1] 3

sum(speeches_df$full_scope)

## [1] 3

sum(speeches_df$terribl_weapon)

## [1] 4

sum(speeches_df$everi_nation)

## [1] 74

sum(speeches_df$grievanc)

## [1] 11

#Frequency of best predictors (ridge)

sum(speeches_df$intern_landscap)

## [1] 3

sum(speeches_df$emerg_within)

## [1] 3

sum(speeches_df$hard_earn)

## [1] 4

sum(speeches_df$involv_question)

## [1] 4

sum(speeches_df$intern_oversight)

## [1] 4

sum(speeches_df$slowli_sure)

## [1] 5

sum(speeches_df$institut_secur)

## [1] 3

sum(speeches_df$food_unit)

## [1] 3

sum(speeches_df$caus_second)

## [1] 3

sum(speeches_df$nation_victim)

## [1] 3
