Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 5

Appendix

Association R

#import needed packages


library(arules)
library(arulesViz)
library(RColorBrewer)
# Load the raw grocery data from the Excel workbook
dataset <- readxl::read_excel(
  "C:/Users/khouloud.bennour/Desktop/GroceryStoreDataSet.xlsx"
)
View(dataset)
# Structure
str(dataset)
# Drop the ID column
dataset$ID <- NULL
# Rename the remaining column to "items"
colnames(dataset) <- c("items")
# Import the transactions in basket format from the CSV copy of the data.
# NOTE(review): the original comment mentioned an export step, but nothing
# writes `dataset` to disk here -- the CSV is assumed to exist already; confirm.
# The path is kept on a single line: a line break inside the string literal
# would become part of the file name and the file would not be found.
tr <- read.transactions(
  "C:/Users/khouloud.bennour/Desktop/R code/GroceryStoreDataSet.csv",
  format = "basket", sep = ","
)
View(tr)
summary(tr)
# Absolute frequency of the 20 most frequent items
itemFrequencyPlot(tr, topN = 20, type = "absolute",
                  col = brewer.pal(8, "Pastel2"),
                  main = "Absolute Item Frequency Plot")
# Mine association rules (min support 0.001, min confidence 0.8, max 10 items)
association.rules <- apriori(
  tr,
  parameter = list(supp = 0.001, conf = 0.8, maxlen = 10)
)
# Summary of the mined rules
summary(association.rules)
# Inspect up to the first 10 rules; head() does not fail when fewer than
# 10 rules were found, unlike indexing with [1:10]
inspect(head(association.rules, n = 10))

Clustering R
#*****************Clustering******************************
#import needed packages
library(dplyr)
library(FactoMineR)
library(ggplot2)
library(funModeling)
library(psych)
# Import the mall customers data
cluster <- read.csv(
  "C:/Users/khouloud.bennour/Desktop/R code/Mall_Customers.csv",
  sep = ","
)
View(cluster)
# Data cleaning
str(cluster)
# Convert gender to a factor
cluster$Gender <- as.factor(cluster$Gender)
# Count missing values per column
colSums(is.na(cluster))
summary(cluster)

# Frequency tables/plots produced by funModeling::freq()
freq(cluster)

# Customer distribution by age
hist(cluster$Age,
     col = "orange",
     main = "Histogram to Show Count of Age Class",
     xlab = "Age Class",
     ylab = "Frequency",
     labels = TRUE)

# Density and histogram for income
plot(density(cluster$Annual_Income))
hist(cluster$Annual_Income,
     col = "orange",
     main = "Histogram to Show the distribution of income",
     xlab = "Income",
     ylab = "Value",
     labels = TRUE)

# Density and histogram for spending score.
# The density now uses Spending_Score (it previously re-plotted Annual_Income)
# and the histogram reads from `cluster` (it previously referenced an
# undefined object `cust`).
plot(density(cluster$Spending_Score))
hist(cluster$Spending_Score,
     col = "orange",
     main = "Histogram to Show the distribution of Spending_Score",
     xlab = "Spending_Score",
     ylab = "Frequency",
     labels = TRUE)

# Looking for outliers
boxplot(cluster$Annual_Income, main = "Boxplot for income variable")
boxplot(cluster$Age, main = "Boxplot for age variable")
boxplot(cluster$Spending_Score, main = "Boxplot for spending variable")

# Standardisation of columns 4 and 5
# (presumably Annual_Income and Spending_Score -- confirm the column order)
scaled <- scale(cluster[, c(4, 5)])

#choice of number of clusters


#using Elbow Method
# Plot the elbow curve: total within-cluster sum of squares against the
# number of k-means clusters.
#
# Arguments:
#   data       numeric matrix or data frame, one row per observation.
#   maxCluster largest number of clusters to evaluate (default 10, minimum 1).
#
# Returns (invisibly) the numeric vector of within-cluster sums of squares,
# where element k corresponds to k clusters. kmeans() uses random starts, so
# call set.seed() beforehand for a reproducible curve.
wss <- function(data, maxCluster = 10) {
  stopifnot(is.numeric(maxCluster), length(maxCluster) == 1, maxCluster >= 1)
  SSw <- numeric(maxCluster)
  # k = 1: the within-cluster sum of squares equals the total sum of squares.
  # The previous code computed this value and then discarded it, leaving the
  # first point of the curve as NA.
  SSw[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(...)[-1] is empty when maxCluster == 1, unlike 2:maxCluster
  for (i in seq_len(maxCluster)[-1]) {
    SSw[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(maxCluster), SSw, type = "o", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares", pch = 19)
  invisible(SSw)
}
# Seed the RNG so the elbow plot and the k-means fit below are reproducible
set.seed(100)
wss(scaled)

# Fit k-means with 5 clusters on the standardised columns
cust.KM <- kmeans(scaled, centers = 5)
cust.KM

# Store each row's cluster assignment in a new 'Cluster' column
cluster$Cluster <- cust.KM$cluster
cluster

# Scatter plot of the two clustering columns, coloured by assigned cluster
c_Clust <- cluster[, 4:5]
ggplot(c_Clust, aes(x = Annual_Income, y = Spending_Score)) +
  geom_point(aes(color = factor(cust.KM$cluster)), stat = "identity") +
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5"),
    labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5")
  ) +
  labs(title = "Customer Cluster", x = "Annual Income", y = "Spending Score")

Classification R
#Import needed libraries
library(ggplot2)
library(gridExtra)
library(psych)
library(dplyr)
library(FactoMineR)
library(ggplot2)
library(funModeling)
library(caret)
library(rpart)
library(rpart.plot)
library("caret")

# Load the score data
# NOTE(review): the original comment said "credit data", but the file is
# Score.xlsx and the columns used below are Sex, Age, shoeSize,
# Total_Learning_Hours and Grade.
data <- readxl::read_excel(
  "C:/Users/khouloud.bennour/Desktop/Dataset/Score.xlsx"
)
# Harmonise the capitalisation of the Sex category
data$Sex[data$Sex == "female"] <- "Female"
# Convert columns 1, 3 and 6 to factors
# (presumably the categorical columns, including Grade -- confirm)
data[, c(1, 3, 6)] <- lapply(data[, c(1, 3, 6)], factor)
# Count missing values per column
colSums(is.na(data))
freq(data)
summary(data)

# Histogram for Age
hist(data$Age,
     col = "orange",
     xlab = "Age",
     ylab = "Frequency",
     labels = TRUE)

# Histogram for shoe size
hist(data$shoeSize,
     col = "orange",
     xlab = "Shoesize",
     ylab = "Frequency",
     labels = TRUE)

# Histogram for learning hours
# (the x-axis label previously said "Shoesize", copied from the plot above)
hist(data$Total_Learning_Hours,
     col = "orange",
     xlab = "Total learning hours",
     ylab = "Frequency",
     labels = TRUE)
# Box plots of each numeric variable, split by the dependent variable (Grade).
# The trailing `+` must end the line: a line that starts with `+guides(...)`
# is parsed as a separate statement and the guide is never added to the plot.
# guides(fill = "none") replaces the deprecated guides(fill = FALSE).
y1 <- qplot(x = data$Grade, y = data$Age, fill = data$Grade,
            geom = "boxplot") +
  guides(fill = "none")
y2 <- qplot(x = data$Grade, y = data$shoeSize, fill = data$Grade,
            geom = "boxplot") +
  guides(fill = "none")
y3 <- qplot(x = data$Grade, y = data$Total_Learning_Hours, fill = data$Grade,
            geom = "boxplot") +
  guides(fill = "none")
# Show the three box plots side by side
grid.arrange(y1, y2, y3, nrow = 1)

# Correlation matrix of the columns that were not converted to factors
library(corrplot)
mcor <- cor(data[-c(1, 3, 6)])
mcor
corrplot(mcor, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

# Hold out 20% of the rows for testing; the remaining 80% are used to train
n_rows <- nrow(data)
smp_size <- floor(0.8 * n_rows)

# Fixed seed so the same rows are drawn on every run
set.seed(123)
train_ind <- sample.int(n_rows, size = smp_size)

# Rows drawn above form the training set, all other rows the test set
train <- data[train_ind, ]
test <- data[-train_ind, ]

# Logistic regression of Grade on all other columns
logit <- glm(Grade ~ ., data = train, family = "binomial")
summary(logit)

# Test and estimation.
# With a factor response, glm() treats the first level as "failure" and
# models the probability of the other level, so the predicted class is derived
# from the factor levels instead of hard-coding "F" as 1 (which is only
# correct when "F" is not the first level).
# Assumes Grade has exactly two levels -- TODO confirm.
grade_levels <- levels(as.factor(train$Grade))
predicted <- predict(logit, test, type = "response")
predicted_class <- factor(
  ifelse(predicted > 0.5, grade_levels[2], grade_levels[1]),
  levels = grade_levels
)

# Confusion matrix (predictions first, true labels second); test$Grade is
# left unmodified so later models can reuse the same test set
confusionMatrix(predicted_class, factor(test$Grade, levels = grade_levels))

# k-nearest neighbours via caret. The call train(...) resolves to the caret
# function even though a data frame named `train` also exists, because R skips
# non-function objects when looking up a function name.
model_knn <- train(Grade ~ ., data = train, method = "knn")

# Rebuild the held-out set from the original data, then predict its classes
test <- data[-train_ind, ]
predicted <- predict(model_knn, newdata = test)

# Confusion matrix: predictions against the true grades
confusionMatrix(data = predicted, reference = test$Grade)

# Decision tree (classification) of Grade on all other columns
fit <- rpart(Grade ~ ., data = train, method = "class")

# Tree graph
rpart.plot(fit, extra = 106)

# Predicted class for each row of the test set
predicted <- predict(fit, test, type = "class")

# Confusion matrix: predictions first, true labels second. The arguments were
# previously swapped, which transposes the table and exchanges the per-class
# statistics (e.g. sensitivity with positive predictive value).
confusionMatrix(data = predicted, reference = test$Grade)

You might also like