# Text analysis with quanteda: package setup, text import, corpus creation,
# readability statistics and token cleaning.
# (Extracted from page 1 of 3 of the original document.)

# ---- Install required packages ----
# One-time setup. The list covers every package this script attaches,
# including readtext and dplyr, and uses the correct name "qdap".
required_pkgs <- c(
  "readtext",
  "quanteda",
  "quanteda.textstats",
  "quanteda.textplots",
  "quanteda.textmodels",
  "dplyr",
  "tidytext",
  "topicmodels",
  "tm",
  "qdap",
  "ggplot2"
)

# system.file() returns "" for a package that is not installed, so only the
# missing packages are downloaded and re-running the script stays cheap.
is_installed <- vapply(
  required_pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)

if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# ---- Attach packages ----
# library() stops with a clear error when a package is missing, whereas
# require() only warns and returns FALSE, which lets the script carry on and
# fail later at the first call into the missing package.
# Each package is attached once, in the original order.
library(readtext)            # readtext(): import text files
library(quanteda)            # corpus(), tokens(), tokens_*()
library(quanteda.textstats)  # textstat_summary(), textstat_readability()
library(quanteda.textplots)
library(quanteda.textmodels)
library(dplyr)
library(tidytext)
library(topicmodels)
library(tm)
library(qdap)
library(ggplot2)

# ---- Import text files ----
# Read every file matching "*.txt" (the glob is relative to the working
# directory).
x <- readtext("*.txt")

# ---- Convert to corpus ----
corp <- corpus(x)

# Inspect the corpus: default print, per-document summary, and the summary
# statistics from quanteda.textstats.
corp
summary(corp)
textstat_summary(corp)

# ---- Readability ----
# Readability measures how complex a printed text is, based on the number of
# words per sentence and the number of letters or syllables per word.
#
# The Flesch Reading Ease score typically lies between 1 and 100, with 100
# being the most readable. A score of 70 to 80 corresponds roughly to school
# grade level 8. The lower the score, the harder the text is to read.

# Open the help page for the readability function.
help("textstat_readability")

# Readability score for each document in the corpus.
# NOTE(review): Flesch appears to be the default measure -- confirm against the
# installed quanteda.textstats version.
textstat_readability(corp)

# ---- Corpus to tokens, and text cleaning ----
# The result is kept in `t` because the following steps refer to that name
# (note that it shares its name with base::t()).

# Tokenise the corpus and convert every token to lower case.
t <- tokens_tolower(tokens(corp))

# Second pass over the tokens object: strip punctuation, symbols, numbers,
# URLs and separators.
t <- tokens(
  t,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)

# Drop the built-in English stopwords.
t <- tokens_remove(t, pattern = stopwords("english"))

# Inspect the cleaned tokens.
t
# ---- Clean tokens from a corpus created by OCR / a scanner ----
# Remove every token that matches at least one of these regular expressions:
#   "[\\d-]"       contains a digit or a hyphen
#   "[[:punct:]]"  contains a punctuation character
#   "^.{1,2}$"     is only one or two characters long
# verbose = TRUE makes quanteda report what was removed.
t <- tokens_select(
  t,
  pattern = c("[\\d-]", "[[:punct:]]", "^.{1,2}$"),
  selection = "remove",
  valuetype = "regex",
  verbose = TRUE
)

# ---- Custom stopwords ----
# Project-specific words to remove in addition to the standard stopword list.
# Each word is listed once.
mywords <- c(
  "can", "go", "even", "think",
  "soon", "will", "now"
)

# Drop the custom stopwords from the token set.
t <- tokens_remove(t, pattern = mywords)

# (End of the extracted page; the original document continues.)