Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1of 4

###Association Rule

install.packages("arules")
install.packages("arulesViz")
install.packages("tidyverse")
install.packages("plyr")
install.packages("readxl")
install.packages("lubridate")
install.packages("ggplot2")
install.packages("knitr")

library(arules)
library(arulesViz)
library(tidyverse)
library(plyr)
library(readxl)
library(lubridate)
library(ggplot2)
library(knitr)

# Read the raw grocery data; cells containing "0" are treated as missing (NA).
# FIX: the path string literal was broken across wrapped lines in the original
# (a syntax error); reassembled onto one line. Also spelled out `na.strings`
# explicitly -- `na = "0"` relied on the argument being matched while being
# forwarded through read.csv's `...`, where only exact tag matching applies.
grocery <- read.csv(
  "E:/D Drive/Analytics Consulting using ML/BA_Slides_shubho/groceries-apriori.csv",
  na.strings = "0"
)

## Convert into transactional data: read the same file directly as
## transactions in "basket" format (one comma-separated basket per row).
tr <- read.transactions(
  "E:/D Drive/Analytics Consulting using ML/BA_Slides_shubho/groceries-apriori.csv",
  format = "basket", sep = ","
)
# or to read from as transactions from imported data
# NOTE(review): coercing a data.frame produced by read.csv treats each
# column as a variable rather than a basket of items, which is usually not
# the intended market-basket representation -- hence the author's own
# "#avoid" remark. Prefer the `tr` object built with read.transactions().
trObj<-as(grocery,"transactions")#avoid

# Overview of the transaction set: counts, density, most frequent items,
# and the basket-size distribution.
summary(tr)
# Top-20 items by absolute count (data.frame-coerced object) ...
itemFrequencyPlot(trObj,topN=20,type="absolute")
# ... and by relative support (basket-format object).
itemFrequencyPlot(tr,topN=20,type="relative")

# Mine association rules: minimum support 5%, minimum confidence 20%.
grocery_rules1 <- apriori(tr, parameter = list(support = 0.05, confidence = 0.2))


# Rule-set summary, the 3 most confident rules, then the full rule set.
grocery_rules1
inspect(head(sort(grocery_rules1, by = "confidence"), 3))
inspect(grocery_rules1)

# For a particular item: rules whose right-hand side is restricted to
# "whole milk".
# FIX: added `default = "lhs"` so that all other items may still appear on
# the left-hand side while the RHS is limited to the named item -- the
# documented pattern for RHS-restricted mining (see arules APappearance).
wholemilk_rules <- apriori(
  tr,
  parameter = list(supp = 0.01, conf = 0.1),
  appearance = list(default = "lhs", rhs = "whole milk")
)
# All matching rules, then the 3 with the highest lift.
inspect(wholemilk_rules)
inspect(head(sort(wholemilk_rules, by = "lift"), 3))

library(arules)

# Second walk-through on a local groceries.csv (read from the working
# directory).
Grocery <- read.csv("groceries.csv")
View(Grocery)
str(Grocery)

## Convert into transactional data.
# NOTE(review): as() on a data.frame treats columns as variables, not
# comma-separated baskets -- verify this is the intended representation.
transactional_data <- as(Grocery, "transactions")
str(transactional_data)
inspect(head(transactional_data, 2))

# Rule mining at three support thresholds, confidence fixed at 0.5.
grocery_rules <- apriori(transactional_data, parameter = list(support = 0.01,
confidence = 0.5))
grocery_rules1 <- apriori(transactional_data, parameter = list(support = 0.1,
confidence = 0.5))
grocery_rules
inspect(head(sort(grocery_rules, by = "confidence"), 3))
inspect(grocery_rules1)
grocery_rules2 <- apriori(transactional_data, parameter = list(support = 0.02,
confidence = 0.5))
inspect(head(sort(grocery_rules2, by = "confidence"), 3))

# Rules predicting "whole milk".
# FIX: the original called apriori(data = Groceries, ...), but `Groceries`
# (the arules built-in dataset) is never loaded anywhere in this script;
# use the transactions object built above instead. `default = "lhs"` keeps
# the RHS restricted to "whole milk" only.
wholemilk_rules <- apriori(
  data = transactional_data,
  parameter = list(supp = 0.001, conf = 0.08),
  appearance = list(default = "lhs", rhs = "whole milk")
)
inspect(head(sort(wholemilk_rules, by = "confidence"), 3))

## FA and cluster analysis

# Assemble the working data set.
# NOTE(review): the second assignment overwrites the first, so
# AP_IMPOVERISHMENT is effectively unused here -- confirm which source
# (AP_IMPOVERISHMENT vs DP_final columns 2:41) is intended. Both objects
# must already exist in the session (loaded elsewhere, outside this view).
data<-as.data.frame(AP_IMPOVERISHMENT)
data<-DP_final[,2:41]

# Coerce every column to numeric (via character, so factor columns convert
# by their labels rather than their internal codes).
data_2 <- data.frame(lapply(data, function(x) as.numeric(as.character(x))))


# NOTE(review): data_2 is never used below (the analyses run on `data`);
# the line dropping columns 16 and 30 is also commented out -- verify.
#data_2<-data_2[,-c(16,30)]

# Principal component analysis on the correlation matrix.
fit <- princomp(data, cor=TRUE)


summary(fit)
plot(fit,type="lines") # scree plot
# Factor extraction with rotation.
# FIX: principal() comes from the psych package, so psych must be attached
# before the first principal() call -- the original loaded psych only
# AFTER fit2 was computed. GPArotation supplies the oblique ("promax")
# rotation machinery that psych delegates to.
install.packages("GPArotation")
library(GPArotation)
library(psych)

# Six factors with an oblique (promax) rotation.
fit2 <- principal(data, nfactors = 6, rotate = "promax")
rotated_matrix2 <- fit2$loadings[, 1:6]

# Six factors with an orthogonal (varimax) rotation.
# Other options: rotate = "none", "varimax", "quartimax", "promax".
fit <- principal(data, nfactors = 6, rotate = "varimax")
rotated_matrix <- fit$loadings[, 1:6]  # rotated loading matrix
communality <- fit$communality         # per-item communalities
var_acc <- fit$Vaccounted              # variance accounted for per factor

# Persist the results for reporting.
write.csv(rotated_matrix,"D:/Ph.D/rot-matrix-class.csv")
write.csv(communality,"D:/Ph.D/Varimax-communality.csv")
write.csv(var_acc,"D:/Ph.D/Varimax-var-accounted.csv")

# Reliability
# Pairwise Pearson correlations (with significance tests) among all items.
corr_matrix<-corr.test(data,use = "pairwise", method = "pearson")
item_item_corr<-corr_matrix$r

install.packages("psy")
library(psy)

# Item groupings for the multitrait-multimethod matrix: each vector lists
# the item names belonging to one hypothesised scale.
L<-list(c("VD-1","VD-2.1","VD-2.2","VD-2.3","VD-2.4","VD-2.5","VD-3","VD-4"),
c("VL-1.1","VL-1.2","VL-1.3","VL-1.4","VL-1.5","VL-1.6","VL-1.7"),
c("VH-2","VH-3","VH-4","VH-5"),
c("VO-1","VO-2","VO-3"))

# FIX: dropped the attach(data) call -- mtmm() receives `data` explicitly
# and `L` holds item names as strings, so attaching the data frame to the
# search path was unnecessary and is a well-known source of masking bugs.
mtmm(data,L)

# Cronbach's alpha for items 12-14; check.keys = TRUE automatically
# reverses negatively-keyed items before computing alpha.
cr_alpha1<-psych::alpha(data[,12:14],check.keys = TRUE)
cr_alpha1$total$raw_alpha
cr_alpha1$total$std.alpha#0.2429168
cr_alpha1$alpha.drop$std.alpha

## CLUSTER ANALYSIS

# Select the clustering variables.
# NOTE(review): the second assignment overwrites the first, so AP_final is
# effectively unused -- confirm whether AP_final[,42:51] or IMP_AP[,42:49]
# is intended. Both objects must already exist in the session.
data<-AP_final[,42:51]
data<-IMP_AP[,42:49]
# Replace missing values with 0 before standardizing.
data[is.na(data)]<-0

my_data<-scale(data)# standardize variables

# Partitioning method.
install.packages("factoextra")
install.packages("cluster")
install.packages("magrittr")
install.packages("rlang")
install.packages("ggplot2")
library(factoextra)
library("cluster")
library(magrittr)
library(rlang)
library(ggplot2)

# Average-silhouette method for choosing the number of clusters k.
fviz_nbclust(my_data, kmeans, method = "silhouette")

# k-means with k = 3, Hartigan-Wong algorithm, 25 random starts.
km.res <- kmeans(my_data,3,algorithm = "Hartigan-Wong", nstart = 25)


#2 clusters; F=1108/4904 (betSS/WithinSS)
#3 clusters; F=1809/4203
## Visualize
# Cluster scatter plot (observations projected onto the first two
# principal components), convex hulls per cluster.
fviz_cluster(km.res, data = my_data,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
# Cluster memberships and centroids from the k-means fit.
cluster_mem<-km.res$cluster
centroid<-km.res$centers
# Silhouette widths: how well each observation sits in its cluster.
sil<-silhouette(km.res$cluster,dist(my_data))
plot(sil)
# Persist memberships and centroids.
write.csv(cluster_mem,"E:/Research/Latha maam/full data/IMP_AP_mem-3.csv")
write.csv(centroid,"E:/Research/Latha maam/full data/IMP_AP_centroid-3.csv")

## Hierarchical
# NOTE(review): row 1 is dropped and only rows 2:100 are kept -- confirm
# this subsetting (e.g. skipping a header-like row, or limiting the
# dendrogram size) is intentional.
my_data_2<-my_data[2:100,]
# Agglomerative clustering with average linkage on Euclidean distances.
clusters <- hclust(dist(my_data_2), method = 'average')#complete, single,ward.D2
plot(clusters)
# Cut the dendrogram into 10 clusters and tabulate cluster sizes.
clusterCut <- cutree(clusters, 10)
table(clusterCut)

Assume the data have been clustered via any technique, such as k-means, into k
clusters. For each datum i, let a(i) be the average dissimilarity of i with all
other data within the same cluster. Any measure of dissimilarity can be used,
but distance measures are the most common.

We can interpret a(i) as how well i is assigned to its cluster (the smaller the
value, the better the assignment). We then define the average dissimilarity
d(i, C) of point i to a cluster C as the average of the distance from i to all
points in C. Let b(i) be the lowest average dissimilarity of i to any other
cluster of which i is not a member. The cluster with this lowest average
dissimilarity is said to be the "neighbouring cluster" of i because it is the
next best fit cluster for point i. We now define:

    s(i) = (b(i) - a(i)) / max{a(i), b(i)}

For s(i) to be close to 1 we require a(i) << b(i). As a(i) is a measure of how
dissimilar i is to its own cluster, a small value means it is well matched.
Furthermore, a large b(i) implies that i is badly matched to its neighbouring
cluster. Thus an s(i) close to one means that the datum is appropriately
clustered. If s(i) is close to negative one, then by the same logic we see
that i would be more appropriately placed if it were clustered in its
neighbouring cluster. An s(i) near zero means that the datum is on the border
of two natural clusters.
The average s(i) over all data of a cluster is a measure of how tightly
grouped all the data in the cluster are. Thus the average s(i) over all data
of the entire dataset is a measure of how appropriately the data has been
clustered. If there are too many or too few clusters, as may occur when a poor
choice of k is used in the k-means algorithm, some of the clusters will
typically display much narrower silhouettes than the rest. Thus silhouette
plots and averages may be used to determine the natural number of clusters
within a dataset.

You might also like