# Professional Documents
# Culture Documents
# Rutina Datos No Estructurados
# Rutina Datos No Estructurados
# Rutina Datos No Estructurados
library(tm)
library(pdftools)
library(wordcloud)
# Corpus loading and clean-up ----
#
# Builds a tm corpus from every PDF in `ruta` and normalises its text.
#
# Changes with respect to the previous version of this section:
#   * The clean-up steps used to run once BEFORE `docs` was created (and one
#     of them called `removePunctuation(tolower)`, i.e. passed a function
#     where text is expected). That draft pass is dropped; the corpus is now
#     created first and cleaned exactly once.
#   * `ruta = setwd(...)` stored the PREVIOUS working directory, because
#     `setwd()` returns the old directory, so `DirSource()` looked in the
#     wrong folder. The path is now kept in `ruta` explicitly.
#   * `pattern` is a regular expression, not a glob, so "*.pdf" is replaced
#     by an anchored regex that only matches names ending in ".pdf".
#
# An earlier draft pointed at "~/Downloads/Apuntes/Texto"; edit `ruta` to
# read a different folder.
ruta <- "~/Documents/agronegocios expertos"

# Retained so the script still switches the working directory as it did
# before; the corpus itself no longer depends on it.
setwd(ruta)

archivo <- DirSource(ruta, pattern = "\\.pdf$")
docs <- VCorpus(
  archivo,
  readerControl = list(reader = readPDF, language = "spa")
)
inspect(docs)

# Corpus clean-up (order matters: lower-case first so stop words match).
docs <- tm_map(docs, content_transformer(tolower))        # to lower case
docs <- tm_map(docs, removePunctuation)                   # punctuation
docs <- tm_map(docs, removeWords, stopwords("spanish"))   # Spanish stop words
docs <- tm_map(docs, removeWords, stopwords("english"))   # English stop words
docs <- tm_map(docs, stripWhitespace)                     # collapse whitespace
# Word clouds ----
#
# Draws three clouds from the same corpus, each after one more clean-up step:
#   1. the corpus as cleaned so far,
#   2. after removing numbers,
#   3. after removing a hand-picked list of uninformative words.
# `docs` is modified in place by steps 2 and 3.
paleta <- brewer.pal(8, "Dark2")

wordcloud(docs, scale = c(2, 0.2), max.words = 30, rot.per = 0.25,
          colors = paleta)

docs <- tm_map(docs, removeNumbers)
wordcloud(docs, scale = c(3, 0.2), max.words = 40, rot.per = 0.25,
          colors = paleta)

# The last entry used to be split across two lines inside the string literal
# ("perm<newline>ite"), so the word "permite" was never actually removed.
palabras_excluidas <- c(
  "gran", "agronegocios", "nivel", "manejo", "requiere", "manera",
  "frente", "conocimientos", "competencia", "competencias", "permite"
)
docs <- tm_map(docs, removeWords, palabras_excluidas)
wordcloud(docs, scale = c(2, 0.1), max.words = 40, rot.per = 0.25,
          colors = paleta)
# Term frequencies ----
#
# Builds the document-term matrix, sums each term's count over all documents
# and shows the most frequent terms as a table and as a bar chart.
dtm.docs <- DocumentTermMatrix(docs)
matmdt <- as.matrix(dtm.docs)   # dense copy: documents in rows, terms in columns
dim(matmdt)

frecuencia <- sort(colSums(matmdt), decreasing = TRUE)

# head() instead of [1:15]: with fewer than 15 distinct terms, [1:15] would
# pad the result with NA entries that then show up in the chart.
frecuencia_top <- head(frecuencia, 15)
frecuencia_top

library(lattice)
barchart(frecuencia_top)
# Term associations ----
# Terms whose correlation with the given word reaches the stated limit.
findAssocs(dtm.docs, "diseño", 0.90)
findAssocs(dtm.docs, "gestión", 0.95)

# Correlation graph ----
# Install Rgraphviz from Bioconductor only when it is missing, instead of
# reinstalling it on every run of the script.
if (!requireNamespace("Rgraphviz", quietly = TRUE)) {
  BiocManager::install("Rgraphviz")
}
library(Rgraphviz)

# NOTE(review): these positions are hard-coded and only valid for the exact
# corpus this script was written against; presumably they repair labels that
# were mangled on extraction. Confirm the indices (and that the matrix has at
# least 3913 terms) before running on other documents. Also note that the
# findAssocs() calls above run BEFORE these labels are rewritten.
dtm.docs$dimnames$Terms[1845] <- "gestión"
dtm.docs$dimnames$Terms[3913] <- "diseño"

# Terms that occur at least 50 times (the old comment said 65, which did not
# match the value actually passed).
frec <- findFreqTerms(dtm.docs, 50)

# `terms` is spelled out in full (the previous `term =` relied on partial
# argument matching) and FALSE replaces the reassignable shorthand F.
plot(
  dtm.docs,
  terms = frec,
  corThreshold = 0.60,
  weighting = FALSE,
  attrs = list(node = list(width = 10, fontsize = 28, fontcolor = "blue",
                           shape = "ellipse", fixedsize = FALSE))
)
# Correlation graph with edge labels ----
#
# Rebuilds the thresholded term-correlation matrix by hand so that each edge
# can be labelled with its correlation value, then passes those labels to
# the plot of the document-term matrix via `edgeAttrs`.

# Correlation threshold. It was previously used without ever being defined;
# 0.60 matches the corThreshold of the unlabelled plot of the same terms.
umbralcorre <- 0.60

m <- as.matrix(dtm.docs[, frec])

# Named `correl` rather than `c` to avoid shadowing base::c.
correl <- cor(m)
correl[correl < umbralcorre] <- 0   # drop weak correlations
correl[is.na(correl)] <- 0          # constant columns give NA correlations
diag(correl) <- 0                   # no self-loops

# igraph is called through its namespace: the script never attaches it, and
# graph_from_adjacency_matrix() is the current name of the deprecated
# graph.adjacency().
ig <- igraph::graph_from_adjacency_matrix(correl, mode = "undirected",
                                          weighted = TRUE)
g1 <- igraph::as_graphnel(ig)

# Edge labels: one weight per edge, skipping the edges Rgraphviz reports as
# removed, named by edge so they can be matched when plotting.
ew <- as.character(unlist(edgeWeights(g1)))
ew <- ew[setdiff(seq_along(ew), Rgraphviz::removedEdges(g1))]
names(ew) <- edgeNames(g1)

elabs <- paste(" ", round(as.numeric(ew), 2))
names(elabs) <- names(ew)

fontsizes <- rep(6, length(elabs))   # font size of the correlation labels
names(fontsizes) <- names(ew)

eAttrs <- list(label = elabs, fontsize = fontsizes)

plot(
  dtm.docs,
  terms = frec,
  corThreshold = umbralcorre,
  weighting = FALSE,
  attrs = list(node = list(height = 3, width = 10, fontsize = 10,
                           fontcolor = "blue", shape = "ellipse",
                           fixedsize = FALSE)),
  edgeAttrs = eAttrs
)
# Tidy text-mining packages ----
#
# Installs whatever is missing BEFORE loading anything. Previously gridExtra
# was loaded before its install.packages() call, and every package was
# reinstalled on each run of the script.
paquetes <- c("tidyverse", "tidytext", "gridExtra", "tidyr", "scales")
instalado <- vapply(paquetes, requireNamespace, logical(1), quietly = TRUE)
if (any(!instalado)) {
  install.packages(paquetes[!instalado])
}

# Same load order as before, so masking between packages is unchanged.
library(tidyverse)
library(tidytext)
library(gridExtra)
library(scales)