# Professional Documents
# Culture Documents
# Codes - Basic
# Codes - Basic
# Install the packages this script loads, but only those that are not already
# present, so re-running the script does not trigger a fresh download each time.
# The list covers every package attached by the require()/library() calls that
# follow (including readtext and dplyr, and qdap under its correct name).
required_pkgs <- c(
  "readtext",
  "quanteda",
  "quanteda.textstats",
  "quanteda.textplots",
  "quanteda.textmodels",
  "dplyr",
  "tidytext",
  "topicmodels",
  "tm",
  "qdap",
  "ggplot2"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
require(readtext)
require(quanteda)
require(quanteda.textstats)
require(quanteda.textplots)
require(quanteda.textmodels)
require(dplyr)
require(tidytext)
require(topicmodels)
require(tm)
require(qdap)
require(ggplot2)
library(readtext)
library(quanteda.textstats)
# Read every file matching *.txt (relative to the working directory).
x <- readtext("*.txt")

#========Convert to Corpus========
corp <- corpus(x)
corp
summary(corp)
textstat_summary(corp)

# Readability scores per document. With no `measure` argument the default
# measure is used; the help page lists the alternatives.
# Flesch Reading Ease scores usually fall roughly between 0 and 100, with
# higher scores indicating easier text.
?textstat_readability
textstat_readability(corp)
# Split the corpus into tokens and lower-case every token.
t <- tokens(corp)
t <- tokens_tolower(t)

# Drop punctuation, symbols, numbers, URLs and separators from the tokens.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
t <- tokens(
  t,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
t
#Clean corpus created by OCR/Scanner
# NOTE(review): the call previously had no `pattern` argument and no closing
# parenthesis, so it could not be parsed. The pattern used here -- tokens that
# contain no letters at all -- is an assumption about the intended OCR
# clean-up; confirm it against the original material and adjust as needed.
t <- tokens_select(
  t,
  pattern = "^[^\\p{L}]+$",
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)
#Custom stopwords
# Words to drop from the tokens in addition to the clean-up above. Each word is
# listed once; a repeated entry has no effect on the removal.
mywords <- c("can", "go", "even", "think", "soon", "will", "now")
t <- tokens_remove(t, pattern = mywords)