
Comp2.r

Tobias Dons

2020-11-05
#Compulsory activity 2

#Loading packages used throughout the assignment


library(tidyverse)

## ── Attaching packages ──────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(glmnet)

## Loading required package: Matrix

##
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
##
##     expand, pack, unpack

## Loaded glmnet 4.0-2

library(quanteda)

## Package version: 2.1.1

## Parallel computing: 2 of 8 threads used.

## See https://quanteda.io for tutorials and examples.


##
## Attaching package: 'quanteda'

## The following object is masked from 'package:utils':
##
##     View

library(yardstick)

## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.

##
## Attaching package: 'yardstick'

## The following object is masked from 'package:readr':
##
##     spec

library(knitr)

#Loading the data frame

load("./Data/ungd.RData")
dim(ungd_sample)

## [1] 441 7
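#A quick look at the structure (not part of the original output; only the
#text, row_id and is_us columns are actually used below):

glimpse(ungd_sample)   #dplyr's compact alternative to str()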

#Transforming and cleaning the text data through tokenization

speeches_token <- tokens(ungd_sample$text, what = 'word',
                         remove_numbers = TRUE, remove_punct = TRUE,
                         remove_symbols = TRUE, split_hyphens = TRUE)

speeches_token <- tokens_tolower(speeches_token)


speeches_token <- tokens_select(speeches_token, stopwords(), selection = 'remove')
speeches_token <- tokens_wordstem(speeches_token, language = 'english')

speeches_token_ngrams <- tokens_ngrams(speeches_token, n = 1:2)
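#A minimal sketch (not in the original run) of what the pipeline produces;
#indexing a quanteda tokens object with [[ returns one document's tokens as
#a character vector:

head(speeches_token_ngrams[[1]], 10)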

#Creating a document-feature matrix

speeches_dfm <- dfm(speeches_token_ngrams)

docvars(speeches_dfm) <- data.frame(id = ungd_sample$row_id,
                                    is_us = ungd_sample$is_us)

speeches_dfm <- dfm_trim(speeches_dfm, min_docfreq = 3)

dim(speeches_dfm)

## [1] 441 46574

speeches_dfm[1:6, 1:6]

## Document-feature matrix of: 6 documents, 6 features (44.4% sparse) and 2 docvars.
## features
## docs privileg extend warm congratul unit state
## text1 1 1 1 1 91 48
## text2 0 1 0 1 43 30
## text3 0 0 0 0 43 33
## text4 0 0 0 1 24 20
## text5 0 0 0 0 28 25
## text6 0 2 0 0 59 46
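#As a quick check of the trimmed matrix, quanteda's topfeatures() lists the
#most frequent features (a sketch, not part of the original output):

topfeatures(speeches_dfm, 10)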

#Defining test and training set:

set.seed(202020)
uid <- unique(ungd_sample$row_id)
training <- sample(1:length(uid), floor(.80 * length(uid)))
test <- (1:length(uid))[!(1:length(uid) %in% training)]
train_id <- uid[training]
test_id <- uid[test]
train_dfm <- dfm_subset(speeches_dfm, id %in% train_id)
test_dfm <- dfm_subset(speeches_dfm, id %in% test_id)
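#A sanity check worth adding (not in the original run): is_us is heavily
#imbalanced, so confirm that both splits actually contain US speeches:

table(docvars(train_dfm, "is_us"))
table(docvars(test_dfm, "is_us"))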

#Ridge
ridge <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 0, nfolds = 5,
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")
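#Note: parallel = TRUE in cv.glmnet only has an effect once a foreach
#backend is registered; a minimal sketch using doParallel (an assumption —
#any registered foreach backend will do):

library(doParallel)
registerDoParallel(cores = 2)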

summary(ridge)

## Length Class Mode


## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 9 -none- call
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric

ridge$lambda.min

## [1] 7.713741

ridge$lambda.1se

## [1] 27.08442

plot(ridge)

#Lasso
lasso <- cv.glmnet(train_dfm, docvars(train_dfm, "is_us"),
                   family = "binomial", alpha = 1, nfolds = 5, #alpha = 1 gives the lasso (L1) penalty
                   parallel = TRUE, intercept = TRUE,
                   type.measure = "class")

summary(lasso)

## Length Class Mode


## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 9 -none- call
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric

lasso$lambda.min

## [1] 0.005316782

lasso$lambda.1se

## [1] 0.01781974

plot(lasso)

#Predictions for is_us

preds_ridge <- predict(ridge, test_dfm, type = "class")
table(preds_ridge, docvars(test_dfm, "is_us"))
##
## preds_ridge 0 1
## 0 81 5
## 1 0 3

preds_lasso <- predict(lasso, test_dfm, type = "class")
table(preds_lasso, docvars(test_dfm, "is_us"))

##
## preds_lasso 0 1
## 0 80 3
## 1 1 5

#Comparing predictions and accuracy

results <- data.frame(obs = factor(docvars(test_dfm, "is_us")),
                      p_ridge = factor(preds_ridge),
                      p_lasso = factor(preds_lasso))

results

##    obs p_ridge p_lasso
## 1 1 0 0
## 2 1 1 1
## 3 1 0 0
## 4 1 0 1
## 5 1 0 1
## 6 1 0 0
## 7 1 1 1
## 8 1 1 1
## 9 0 0 0
## 10 0 0 0
## 11 0 0 0
## 12 0 0 0
## 13 0 0 0
## 14 0 0 0
## 15 0 0 0
## 16 0 0 0
## 17 0 0 0
## 18 0 0 0
## 19 0 0 0
## 20 0 0 0
## 21 0 0 0
## 22 0 0 0
## 23 0 0 0
## 24 0 0 0
## 25 0 0 0
## 26 0 0 0
## 27 0 0 0
## 28 0 0 0
## 29 0 0 1
## 30 0 0 0
## 31 0 0 0
## 32 0 0 0
## 33 0 0 0
## 34 0 0 0
## 35 0 0 0
## 36 0 0 0
## 37 0 0 0
## 38 0 0 0
## 39 0 0 0
## 40 0 0 0
## 41 0 0 0
## 42 0 0 0
## 43 0 0 0
## 44 0 0 0
## 45 0 0 0
## 46 0 0 0
## 47 0 0 0
## 48 0 0 0
## 49 0 0 0
## 50 0 0 0
## 51 0 0 0
## 52 0 0 0
## 53 0 0 0
## 54 0 0 0
## 55 0 0 0
## 56 0 0 0
## 57 0 0 0
## 58 0 0 0
## 59 0 0 0
## 60 0 0 0
## 61 0 0 0
## 62 0 0 0
## 63 0 0 0
## 64 0 0 0
## 65 0 0 0
## 66 0 0 0
## 67 0 0 0
## 68 0 0 0
## 69 0 0 0
## 70 0 0 0
## 71 0 0 0
## 72 0 0 0
## 73 0 0 0
## 74 0 0 0
## 75 0 0 0
## 76 0 0 0
## 77 0 0 0
## 78 0 0 0
## 79 0 0 0
## 80 0 0 0
## 81 0 0 0
## 82 0 0 0
## 83 0 0 0
## 84 0 0 0
## 85 0 0 0
## 86 0 0 0
## 87 0 0 0
## 88 0 0 0
## 89 0 0 0

accuracy(results, obs, p_ridge)

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.944

accuracy(results, obs, p_lasso)

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.955

precision(results, obs, p_ridge, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 1

precision(results, obs, p_lasso, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 precision binary 0.833

recall(results, obs, p_ridge, event_level = "second")

## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.375

recall(results, obs, p_lasso, event_level = "second")


## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.625

f_meas_vec(results$obs, results$p_ridge, event_level = "second")

## [1] 0.5454545

f_meas_vec(results$obs, results$p_ridge, event_level = "first")

## [1] 0.9700599

f_meas_vec(results$obs, results$p_lasso, event_level = "second")

## [1] 0.7142857

f_meas_vec(results$obs, results$p_lasso, event_level = "first")

## [1] 0.9756098
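#All of the metrics above can also be read off one confusion matrix; a
#minimal sketch using yardstick's conf_mat() (not part of the original
#assignment):

cm <- conf_mat(results, truth = obs, estimate = p_lasso)
summary(cm, event_level = "second")   #accuracy, precision, recall, f_meas, ...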

#_________________________________________________________
#Using the lasso model to produce top predictive terms for the US
best.lambda <- which(lasso$lambda == lasso$lambda.1se)
best.lambda

## [1] 56

beta <- lasso$glmnet.fit$beta[, best.lambda]

length(beta)

## [1] 46574

sum(beta == 0)

## [1] 46544

df <- data.frame(coef = as.numeric(beta),
                 word = names(beta),
                 stringsAsFactors = FALSE)

#Top predictive terms to predict whether a speech is American (lasso)

arrange(df, desc(coef)) %>%
  head(n = 10)

## coef word
## 1 1.1618740 peac_endur
## 2 1.0319813 ask_congress
## 3 0.8189179 control_agenda
## 4 0.6292725 liberti
## 5 0.5605610 today_unit
## 6 0.5211970 world_democraci
## 7 0.4708195 full_scope
## 8 0.4265196 terribl_weapon
## 9 0.4186460 everi_nation
## 10 0.3610864 grievanc

#Note that the smallest coefficients are all exactly zero: at lambda.1se the
#lasso has shrunk 46544 of the 46574 coefficients to zero, so no terms act
#as negative (non-US) predictors.

arrange(df, coef) %>%
  head(n = 10)

## coef word
## 1 0 privileg
## 2 0 extend
## 3 0 warm
## 4 0 congratul
## 5 0 unit
## 6 0 state
## 7 0 deleg
## 8 0 elect
## 9 0 presid
## 10 0 twenti

#Add number of entries
#Note: on a quanteda dfm, `$` accesses document variables rather than
#features, so the call below returns NULL and the sum is 0; the feature
#counts are redone correctly on the converted data frame further down.

sum(speeches_dfm$liberti)

## [1] 0
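#A minimal sketch of the intended count: subsetting the dfm by feature name
#with [, "feature"] keeps the sparse-matrix semantics and sums correctly:

sum(speeches_dfm[, "liberti"])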
#_________________________________________________________
#Using the ridge model to produce top predictive terms for the US
best.lambda1 <- which(ridge$lambda == ridge$lambda.1se)
best.lambda1

## [1] 47

beta1 <- ridge$glmnet.fit$beta[, best.lambda1]

length(beta1)

## [1] 46574

sum(beta1 == 0)

## [1] 152

df1 <- data.frame(coef = as.numeric(beta1),
                  word = names(beta1),
                  stringsAsFactors = FALSE)

#Top predictive terms to predict whether a speech is American (ridge)

arrange(df1, desc(coef)) %>%
  head(n = 10)

## coef word
## 1 0.02154828 intern_landscap
## 2 0.02154798 emerg_within
## 3 0.02154761 hard_earn
## 4 0.02154760 involv_question
## 5 0.02053956 intern_oversight
## 6 0.01941480 slowli_sure
## 7 0.01941431 institut_secur
## 8 0.01871609 food_unit
## 9 0.01871432 caus_second
## 10 0.01871428 nation_victim

arrange(df1, coef) %>%
  head(n = 10)

## coef word
## 1 -0.002699953 term_need
## 2 -0.002699920 improv_infrastructur
## 3 -0.002699918 bring_uniqu
## 4 -0.002699918 govern_right
## 5 -0.002699905 govern_move
## 6 -0.002698543 way_give
## 7 -0.002698541 secur_architectur
## 8 -0.002698525 press_ahead
## 9 -0.002671338 mean_creat
## 10 -0.002671309 process_east

#Converting dfm to df
speeches_df <- convert(speeches_dfm, to = "data.frame")
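#The individual sums below can equivalently be computed in one call; a
#minimal sketch using colSums() on the converted data frame (feature names
#taken from the lasso table above):

top_lasso <- c("peac_endur", "ask_congress", "control_agenda", "liberti",
               "today_unit")
colSums(speeches_df[, top_lasso])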

#Frequency of best predictors (lasso)

sum(speeches_df$peac_endur)

## [1] 3

sum(speeches_df$ask_congress)

## [1] 4

sum(speeches_df$control_agenda)

## [1] 4

sum(speeches_df$liberti)

## [1] 92

sum(speeches_df$today_unit)

## [1] 20

sum(speeches_df$world_democraci)

## [1] 3

sum(speeches_df$full_scope)

## [1] 3

sum(speeches_df$terribl_weapon)

## [1] 4

sum(speeches_df$everi_nation)

## [1] 74

sum(speeches_df$grievanc)

## [1] 11

#Frequency of best predictors (ridge)

sum(speeches_df$intern_landscap)

## [1] 3

sum(speeches_df$emerg_within)

## [1] 3

sum(speeches_df$hard_earn)

## [1] 4

sum(speeches_df$involv_question)

## [1] 4

sum(speeches_df$intern_oversight)

## [1] 4

sum(speeches_df$slowli_sure)

## [1] 5

sum(speeches_df$institut_secur)

## [1] 3

sum(speeches_df$food_unit)

## [1] 3

sum(speeches_df$caus_second)

## [1] 3

sum(speeches_df$nation_victim)

## [1] 3
