Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1of 4

###Association Rule

install.packages("arules")
install.packages("arulesViz")
install.packages("tidyverse")
install.packages("plyr")
install.packages("readxl")
install.packages("lubridate")
install.packages("ggplot2")
install.packages("knitr")

library(arules)
library(arulesViz)
library(tidyverse)
library(plyr)
library(readxl)
library(lubridate)
library(ggplot2)
library(knitr)

# Read the raw grocery data; cells containing "0" are treated as missing (NA).
# FIX: the path string literal was broken across wrapped lines in the original
# (a syntax error); reassembled onto one line. Also spelled out `na.strings`
# explicitly -- `na = "0"` relied on the argument being matched while being
# forwarded through read.csv's `...`, where only exact tag matching applies.
grocery <- read.csv(
  "E:/D Drive/Analytics Consulting using ML/BA_Slides_shubho/groceries-apriori.csv",
  na.strings = "0"
)

## Convert into transactional data: read the same file directly as
## transactions in "basket" format (one comma-separated basket per row).
tr <- read.transactions(
  "E:/D Drive/Analytics Consulting using ML/BA_Slides_shubho/groceries-apriori.csv",
  format = "basket", sep = ","
)
# or to read from as transactions from imported data
# NOTE(review): coercing a data.frame produced by read.csv treats each
# column as a variable rather than a basket of items, which is usually not
# the intended market-basket representation -- hence the author's own
# "#avoid" remark. Prefer the `tr` object built with read.transactions().
trObj<-as(grocery,"transactions")#avoid

# Overview of the transaction set: counts, density, most frequent items,
# and the basket-size distribution.
summary(tr)
# Top-20 items by absolute count (data.frame-coerced object) ...
itemFrequencyPlot(trObj,topN=20,type="absolute")
# ... and by relative support (basket-format object).
itemFrequencyPlot(tr,topN=20,type="relative")

# Mine association rules: minimum support 5%, minimum confidence 20%.
grocery_rules1 <- apriori(tr, parameter = list(support = 0.05, confidence = 0.2))


# Rule-set summary, the 3 most confident rules, then the full rule set.
grocery_rules1
inspect(head(sort(grocery_rules1, by = "confidence"), 3))
inspect(grocery_rules1)

# For a particular item: rules whose right-hand side is restricted to
# "whole milk".
# FIX: added `default = "lhs"` so that all other items may still appear on
# the left-hand side while the RHS is limited to the named item -- the
# documented pattern for RHS-restricted mining (see arules APappearance).
wholemilk_rules <- apriori(
  tr,
  parameter = list(supp = 0.01, conf = 0.1),
  appearance = list(default = "lhs", rhs = "whole milk")
)
# All matching rules, then the 3 with the highest lift.
inspect(wholemilk_rules)
inspect(head(sort(wholemilk_rules, by = "lift"), 3))

library(arules)

# Second walk-through on a local groceries.csv (read from the working
# directory).
Grocery <- read.csv("groceries.csv")
View(Grocery)
str(Grocery)

## Convert into transactional data.
# NOTE(review): as() on a data.frame treats columns as variables, not
# comma-separated baskets -- verify this is the intended representation.
transactional_data <- as(Grocery, "transactions")
str(transactional_data)
inspect(head(transactional_data, 2))

# Rule mining at three support thresholds, confidence fixed at 0.5.
grocery_rules <- apriori(transactional_data, parameter = list(support = 0.01,
confidence = 0.5))
grocery_rules1 <- apriori(transactional_data, parameter = list(support = 0.1,
confidence = 0.5))
grocery_rules
inspect(head(sort(grocery_rules, by = "confidence"), 3))
inspect(grocery_rules1)
grocery_rules2 <- apriori(transactional_data, parameter = list(support = 0.02,
confidence = 0.5))
inspect(head(sort(grocery_rules2, by = "confidence"), 3))

# Rules predicting "whole milk".
# FIX: the original called apriori(data = Groceries, ...), but `Groceries`
# (the arules built-in dataset) is never loaded anywhere in this script;
# use the transactions object built above instead. `default = "lhs"` keeps
# the RHS restricted to "whole milk" only.
wholemilk_rules <- apriori(
  data = transactional_data,
  parameter = list(supp = 0.001, conf = 0.08),
  appearance = list(default = "lhs", rhs = "whole milk")
)
inspect(head(sort(wholemilk_rules, by = "confidence"), 3))

## FA and cluster analysis

# Assemble the working data set.
# NOTE(review): the second assignment overwrites the first, so
# AP_IMPOVERISHMENT is effectively unused here -- confirm which source
# (AP_IMPOVERISHMENT vs DP_final columns 2:41) is intended. Both objects
# must already exist in the session (loaded elsewhere, outside this view).
data<-as.data.frame(AP_IMPOVERISHMENT)
data<-DP_final[,2:41]

# Coerce every column to numeric (via character, so factor columns convert
# by their labels rather than their internal codes).
data_2 <- data.frame(lapply(data, function(x) as.numeric(as.character(x))))


# NOTE(review): data_2 is never used below (the analyses run on `data`);
# the line dropping columns 16 and 30 is also commented out -- verify.
#data_2<-data_2[,-c(16,30)]

# Principal component analysis on the correlation matrix.
fit <- princomp(data, cor=TRUE)


summary(fit)
plot(fit,type="lines") # scree plot
# Factor extraction with rotation.
# FIX: principal() comes from the psych package, so psych must be attached
# before the first principal() call -- the original loaded psych only
# AFTER fit2 was computed. GPArotation supplies the oblique ("promax")
# rotation machinery that psych delegates to.
install.packages("GPArotation")
library(GPArotation)
library(psych)

# Six factors with an oblique (promax) rotation.
fit2 <- principal(data, nfactors = 6, rotate = "promax")
rotated_matrix2 <- fit2$loadings[, 1:6]

# Six factors with an orthogonal (varimax) rotation.
# Other options: rotate = "none", "varimax", "quartimax", "promax".
fit <- principal(data, nfactors = 6, rotate = "varimax")
rotated_matrix <- fit$loadings[, 1:6]  # rotated loading matrix
communality <- fit$communality         # per-item communalities
var_acc <- fit$Vaccounted              # variance accounted for per factor

# Persist the results for reporting.
write.csv(rotated_matrix,"D:/Ph.D/rot-matrix-class.csv")
write.csv(communality,"D:/Ph.D/Varimax-communality.csv")
write.csv(var_acc,"D:/Ph.D/Varimax-var-accounted.csv")

# Reliability
# Pairwise Pearson correlations (with significance tests) among all items.
corr_matrix<-corr.test(data,use = "pairwise", method = "pearson")
item_item_corr<-corr_matrix$r

install.packages("psy")
library(psy)

# Item groupings for the multitrait-multimethod matrix: each vector lists
# the item names belonging to one hypothesised scale.
L<-list(c("VD-1","VD-2.1","VD-2.2","VD-2.3","VD-2.4","VD-2.5","VD-3","VD-4"),
c("VL-1.1","VL-1.2","VL-1.3","VL-1.4","VL-1.5","VL-1.6","VL-1.7"),
c("VH-2","VH-3","VH-4","VH-5"),
c("VO-1","VO-2","VO-3"))

# FIX: dropped the attach(data) call -- mtmm() receives `data` explicitly
# and `L` holds item names as strings, so attaching the data frame to the
# search path was unnecessary and is a well-known source of masking bugs.
mtmm(data,L)

# Cronbach's alpha for items 12-14; check.keys = TRUE automatically
# reverses negatively-keyed items before computing alpha.
cr_alpha1<-psych::alpha(data[,12:14],check.keys = TRUE)
cr_alpha1$total$raw_alpha
cr_alpha1$total$std.alpha#0.2429168
cr_alpha1$alpha.drop$std.alpha

## CLUSTER ANALYSIS

# Select the clustering variables.
# NOTE(review): the second assignment overwrites the first, so AP_final is
# effectively unused -- confirm whether AP_final[,42:51] or IMP_AP[,42:49]
# is intended. Both objects must already exist in the session.
data<-AP_final[,42:51]
data<-IMP_AP[,42:49]
# Replace missing values with 0 before standardizing.
data[is.na(data)]<-0

my_data<-scale(data)# standardize variables

# Partitioning method.
install.packages("factoextra")
install.packages("cluster")
install.packages("magrittr")
install.packages("rlang")
install.packages("ggplot2")
library(factoextra)
library("cluster")
library(magrittr)
library(rlang)
library(ggplot2)

# Average-silhouette method for choosing the number of clusters k.
fviz_nbclust(my_data, kmeans, method = "silhouette")

# k-means with k = 3, Hartigan-Wong algorithm, 25 random starts.
km.res <- kmeans(my_data,3,algorithm = "Hartigan-Wong", nstart = 25)


#2 clusters; F=1108/4904 (betSS/WithinSS)
#3 clusters; F=1809/4203
## Visualize
# Cluster scatter plot (observations projected onto the first two
# principal components), convex hulls per cluster.
fviz_cluster(km.res, data = my_data,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
# Cluster memberships and centroids from the k-means fit.
cluster_mem<-km.res$cluster
centroid<-km.res$centers
# Silhouette widths: how well each observation sits in its cluster.
sil<-silhouette(km.res$cluster,dist(my_data))
plot(sil)
# Persist memberships and centroids.
write.csv(cluster_mem,"E:/Research/Latha maam/full data/IMP_AP_mem-3.csv")
write.csv(centroid,"E:/Research/Latha maam/full data/IMP_AP_centroid-3.csv")

## Hierarchical
# NOTE(review): row 1 is dropped and only rows 2:100 are kept -- confirm
# this subsetting (e.g. skipping a header-like row, or limiting the
# dendrogram size) is intentional.
my_data_2<-my_data[2:100,]
# Agglomerative clustering with average linkage on Euclidean distances.
clusters <- hclust(dist(my_data_2), method = 'average')#complete, single,ward.D2
plot(clusters)
# Cut the dendrogram into 10 clusters and tabulate cluster sizes.
clusterCut <- cutree(clusters, 10)
table(clusterCut)

Assume the data have been clustered via any technique, such as k-means, into k
clusters. For each datum i, let a(i) be the average dissimilarity of i with all
other data within the same cluster. Any measure of dissimilarity can be used,
but distance measures are the most common.

We can interpret a(i) as how well i is assigned to its cluster (the smaller the
value, the better the assignment). We then define the average dissimilarity
d(i, C) of point i to a cluster C as the average of the distance from i to all
points in C. Let b(i) be the lowest average dissimilarity of i to any other
cluster of which i is not a member. The cluster with this lowest average
dissimilarity is said to be the "neighbouring cluster" of i because it is the
next best fit cluster for point i. We now define:

    s(i) = (b(i) - a(i)) / max{a(i), b(i)}

For s(i) to be close to 1 we require a(i) << b(i). As a(i) is a measure of how
dissimilar i is to its own cluster, a small value means it is well matched.
Furthermore, a large b(i) implies that i is badly matched to its neighbouring
cluster. Thus an s(i) close to one means that the datum is appropriately
clustered. If s(i) is close to negative one, then by the same logic we see
that i would be more appropriately placed if it were clustered in its
neighbouring cluster. An s(i) near zero means that the datum is on the border
of two natural clusters.
The average s(i) over all data of a cluster is a measure of how tightly
grouped all the data in the cluster are. Thus the average s(i) over all data
of the entire dataset is a measure of how appropriately the data has been
clustered. If there are too many or too few clusters, as may occur when a poor
choice of k is used in the k-means algorithm, some of the clusters will
typically display much narrower silhouettes than the rest. Thus silhouette
plots and averages may be used to determine the natural number of clusters
within a dataset.

You might also like