Professional Documents
Culture Documents
Text Mining and Classification: Karianne Bergen
Text Mining and Classification: Karianne Bergen
Karianne
Bergen
kbergen@stanford.edu
Institute
for
Computational
and
Mathematical
Engineering,
Stanford
University
http://www.theshedonline.org.au/activities/activity/scam-email-examples
≈ +
+
+
+
+ + + +
# install.packages("NMF")  # non-negative matrix factorization package
library(NMF)

# Number of latent topics/factors to extract. V is the non-negative
# term-document count matrix built earlier in the analysis.
# NOTE(review): k = 20 is a modeling choice, not derived from the data.
k <- 20
res <- nmf(V, k)
# Build one tm corpus per author from plain-text files on disk.
# Each file in the directory becomes one document in the corpus.

# Shakespeare documents
s.dir <- "shakespeare"
s.Docs <- Corpus(DirSource(directory = s.dir,
                           encoding = "UTF-8"))

# Frost documents
f.dir <- "frost"
f.Docs <- Corpus(DirSource(directory = f.dir,
                           encoding = "UTF-8"))
# NOTE(review): tail of a text-preprocessing function whose header is
# outside this view; `corpus` is presumably its tm Corpus argument —
# confirm against the full source.
# apply stemming
corpus <-tm_map(corpus, stemDocument, lazy=TRUE)
# remove punctuation
corpus.tmp <- tm_map(corpus,removePunctuation)
# return the stemmed, punctuation-free corpus to the caller
return(corpus.tmp)
}
# Label each document by author: rows of s.tdm are Shakespeare
# documents, rows of f.tdm are Frost documents, so repeat each class
# name once per corresponding row.
class.names <- c('shakespeare', 'frost')
d.class <- c(rep(class.names[1], nrow(s.tdm)),
             rep(class.names[2], nrow(f.tdm)))
# Factor labels are what the classifiers below expect.
d.class <- as.factor(d.class)
> levels(d.class)
[1] "frost"       "shakespeare"
> d.tdm.test
<<DocumentTermMatrix (documents: 105, terms: 518)>>
Non-/sparse entries : 4578/49812
Sparsity : 92%
Maximal term length : 9
Weighting : term frequency (tf)
> summary(treefit)
Variables actually used in tree construction:
[1] doth eyes green grow let thee which
# Draw the fitted classification tree with uniform vertical spacing
# between levels, and annotate each node with its class counts.
plot(treefit, uniform = TRUE)
text(treefit, use.n = TRUE)  # TRUE, not T: T is a reassignable binding
# Turn the matrix of per-class scores into predicted labels: for each
# document (row of predclass), take the name of the highest-scoring
# column.
colNames <- colnames(predclass)
d.class.pred <-
  as.factor(colNames[max.col(predclass)])

# Confusion matrix: rows = predicted class, columns = actual class.
tree.table <- table(d.class.pred, d.class.test)
> tree.table
actual
predicted frost shakespeare
frost 55 12
shakespeare 1 37
# Compute the misclassification rate from a confusion matrix.
#
# Generalized from the original hard-coded 2x2 indexing: correct
# predictions lie on the diagonal of a (predicted x actual) table, so
# error = 1 - sum(diag) / sum(all), which works for any number of
# classes and gives the identical result for the 2x2 case
# ((FP + FN) / (TP + TN + FP + FN)).
#
# @param table A square contingency table or matrix of counts with
#   correct classifications on the diagonal.
# @return The fraction of observations that were misclassified.
errorRate <- function(table) {
  counts <- as.matrix(table)
  1 - sum(diag(counts)) / sum(counts)
}
> errorRate(tree.table)
[1] 0.1238095
> errorRate(knn.table)
[1] 0.3142857
> errorRate(res.table)
[1] 0.1619048
> names(Auto)
[1] "mpg"          "cylinders"    "displacement" "horsepower"
[5] "weight" "acceleration" "year" "origin"
[9] "name"