Data Science Programming Lab Assessment-6: Importing The Packages and Loading The Dataset

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 16

19MID0056

E.UTHESH GANAPATHY

DATA SCIENCE PROGRAMMING

LAB ASSESSMENT-6

QUESTION

1. WRITE AN R CODE TO PERFORM BEHAVIOUR ANALYSIS OF


CUSTOMERS FOR ANY ONLINE PURCHASE MODEL. CONSIDER
ANY DATA SET

STEP 1 - IMPORTING THE PACKAGES AND LOADING THE DATASET

# Read the customer data set and take a first look at it.
A <- read.csv(file = "C:/Users/Hariharan/Documents/CSV/CUSTOMER.csv")
head(A)

# Report the number of missing values per column, then drop incomplete rows.
vapply(A, function(column) sum(is.na(column)), integer(1))
A <- na.omit(A)

# Inspect the cleaned structure and the distinct MONTH values.
str(A)
unique(A$MONTH)
19MID0056
E.UTHESH GANAPATHY

STEP 2 – FIXING THE STRUCTURE OF DATA AND USING


DESCRIPTIVE ANALYSIS

# Recode the TRUE/FALSE flag columns as the text values "1"/"0".
A[c("REVENUE", "WEEKEND")] <- lapply(
  A[c("REVENUE", "WEEKEND")],
  function(flag) gsub(TRUE, 1, gsub(FALSE, 0, flag))
)

# MONTH is an ordered factor following the calendar order of the listed months.
A$MONTH <- ordered(
  A$MONTH,
  levels = c("Feb", "Mar", "May", "June", "Jul", "Aug",
             "Sep", "Oct", "Nov", "Dec")
)

# Every remaining categorical column becomes an unordered factor.
factor_cols <- c("OS", "BROWSER", "REGION", "TRAFFIC_TYPES",
                 "VISITOR_TYPE", "REVENUE", "WEEKEND")
A[factor_cols] <- lapply(A[factor_cols], factor)

str(A)
19MID0056
E.UTHESH GANAPATHY

# Five-number summary of the first two columns.
summary(A[, 1:2])

# Frequency table for every categorical column.
table(A$REVENUE)
table(A$WEEKEND)
table(A$VISITOR_TYPE)
# The column is TRAFFIC_TYPES; spell it out instead of relying on the partial
# name matching that `$` performs on data frames.
table(A$TRAFFIC_TYPES)
table(A$REGION)
table(A$BROWSER)
table(A$OS)
table(A$MONTH)
19MID0056
E.UTHESH GANAPATHY

STEP 3 – VISUALIZING

library(corrplot)

# Pairwise correlations of the first five columns, drawn as the lower
# triangle (diagonal included) of coloured squares.
correlation <- cor(A[, seq_len(5)])
corrplot(corr = correlation, type = "lower", method = "square", diag = TRUE)

library(ggplot2)
options(repr.plot.width = 8, repr.plot.height = 5)

# Scatter plot of bounce rate against exit rate, coloured by REVENUE, with a
# smoothed trend line and the correlation printed on the panel.
#
# The correlation label is computed from the data instead of being typed in,
# so it stays correct if the data set changes.
bounce_exit_cor <- cor(A$BOUNCE_RATES, A$EXIT_RATES)

ggplot(A, mapping = aes(x = BOUNCE_RATES, y = EXIT_RATES)) +
  geom_point(mapping = aes(color = REVENUE)) +
  geom_smooth(se = TRUE, alpha = 0.5) +
  theme_light() +
  ggtitle("RELATION BW BOUNCE AND EXIT RATES") +
  xlab("BOUNCE RATES") +
  ylab("EXIT RATES") +
  # annotate() draws the label once; geom_text() with constant aesthetics
  # would redraw it for every row of A.
  annotate(
    "text",
    x = 0.15,
    y = 0.05,
    label = sprintf("Correlation = %.2f", bounce_exit_cor)
  )
19MID0056
E.UTHESH GANAPATHY

library(ggplot2)
# grid.arrange() lives in gridExtra, which must be attached before it is called.
library(gridExtra)

# Cross-tabulation of revenue status against visitor type.
table(A$REVENUE, A$VISITOR_TYPE)

options(repr.plot.width = 10, repr.plot.height = 6)

# Sessions per revenue status, stacked by visitor type.
p1 <- ggplot(A, mapping = aes(x = REVENUE)) +
  geom_bar(mapping = aes(fill = VISITOR_TYPE)) +
  theme_light() +
  ggtitle("REVENUE BASED VISITOR TYPE") +
  xlab("REVENUE STATUS(0/1)") +
  ylab("VISITORS") +
  theme(legend.position = "bottom")

# Sessions per revenue status, stacked by weekend flag.
p2 <- ggplot(A, mapping = aes(x = REVENUE)) +
  geom_bar(mapping = aes(fill = WEEKEND)) +
  theme_light() +
  ggtitle("REVENUE BASED WEEKEND STATUS") +
  xlab("REVENUE STATUS (0/1)") +
  ylab("VISITORS") +
  theme(legend.position = "bottom")

# Show both charts side by side.
grid.arrange(p1, p2, nrow = 1)
19MID0056
E.UTHESH GANAPATHY

options(repr.plot.width = 8, repr.plot.height = 5)

# Sessions per revenue status, stacked by operating system.
p1 <- ggplot(A, mapping = aes(x = REVENUE)) +
  geom_bar(mapping = aes(fill = OS)) +
  theme_light() +
  ggtitle("RELATIONSHIP BW OS AND REVENUE") +
  xlab("REVENUE") +
  ylab("OS") +
  theme(legend.position = "bottom")
# Draw the chart now: p1 is reused for the next plot, so without an explicit
# print this one would be overwritten without ever being displayed.
print(p1)

# Sessions per region, stacked by revenue status.
p1 <- ggplot(A, mapping = aes(x = REGION)) +
  geom_bar(mapping = aes(fill = REVENUE)) +
  theme_light() +
  ggtitle("RELATIONSHIP BW REGION AND REVENUE") +
  xlab("REGION") +
  ylab("REVENUE") +
  # legend.position is case sensitive; "BOTTOM" is not a valid value.
  theme(legend.position = "bottom")
print(p1)
19MID0056
E.UTHESH GANAPATHY

2. WRITE AN R CODE TO PERFORM AGRICULTURAL DATA


ANALYSIS FOR YIELD PREDICTION AND CROP SELECTION ON
INDIAN TERRAIN DATA SET.

STEP 1 – IMPORTING THE DATASET

library(dplyr)
library(ggplot2)
library(xlsx)
library(reshape2)
library(corrplot)

# A holds the agriculture table, B the crops table.
A <- read.csv(file = "C:/Users/Hariharan/Documents/CSV/AGRICULTURE.csv")
B <- read.csv(file = "C:/Users/Hariharan/Documents/CSV/CROPS.csv")

# Quick look at the structure of A and the first rows of B.
str(A)
head(B)
19MID0056
E.UTHESH GANAPATHY

STEP 2 – RENAMING THE DATA

# Give columns 2 to 36 short crop names, addressed by column position.
A <- rename(
  A,
  rice = 2,
  jowar = 3,
  bajra = 4,
  maize = 5,
  ragi = 6,
  millets = 7,
  wheat = 8,
  barley = 9,
  tcereals = 10,
  gram = 11,
  tur = 12,
  otherpulses = 13,
  totalpulses = 14,
  totalgrains = 15,
  gnuts = 16,
  sesame = 17,
  mustard = 18,
  linseed = 19,
  castor = 20,
  totaloilseeds = 21,
  cotton = 22,
  jute = 23,
  mesta = 24,
  tea = 25,
  coffee = 26,
  rubber = 27,
  banana = 28,
  sugarcane = 29,
  tobacco = 30,
  potatoes = 31,
  pepper = 32,
  chilles = 33,
  ginger = 34,
  coconut = 35,
  turmeric = 36
)

# Correlation matrix of columns 2 to 15 of A, rounded to two decimals.
cormat <- round(cor(as.matrix(A[, 2:15])), digits = 2)


# Blank out the entries above the main diagonal of a matrix.
#
# cormat: a matrix (here, a correlation matrix).
# Returns the same matrix with every strictly-upper-triangle entry set to NA;
# the diagonal and the lower triangle are left untouched.
get_lower_tri <- function(cormat) {
  above_diagonal <- upper.tri(cormat)
  replace(cormat, above_diagonal, NA)
}
# Keep the lower triangle only, reshape it to long form (one row per matrix
# cell, NA cells dropped), and order the rows by ascending correlation value.
lowertri <- get_lower_tri(cormat)
melted.cormat <- melt(lowertri, na.rm = TRUE)
sorted.cormat <- melted.cormat[with(melted.cormat, order(value)), ]

STEP 3 – VISUALIZING THE DATA

# The first six rows of the ascending ordering, i.e. the smallest values.
neg.cormat <- head(sorted.cormat, n = 6)

# Tile plot of those pairs, filled by correlation value.
ggplot(neg.cormat, mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "red")
19MID0056
E.UTHESH GANAPATHY

# Rows 50 to 80 of the ascending ordering.
# NOTE(review): this is a fixed slice from the middle of the ordering, not the
# largest values (those sit at the tail) - confirm this is the intended range.
pos.cormat <- sorted.cormat[50:80, ]

# Tile plot of those pairs, filled by correlation value.
ggplot(pos.cormat, mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "red")

STEP 4 – COMBINING 2 DATASETS AND VISUALIZING THE DATA

# Pair the two tables row by row: both get default row names, are merged on
# those row names (keeping unmatched rows from either side), and the
# leading row-name key column is then dropped.
AB <- merge(
  x = data.frame(A, row.names = NULL),
  y = data.frame(B, row.names = NULL),
  by = "row.names",
  all = TRUE
)
AB <- AB[-1]

# Long form of rice and maize production keyed by ANN, so both series can
# share one scatter plot.
C <- melt(AB, id.vars = "ANN", measure.vars = c("rice", "maize"))

ggplot(C, mapping = aes(x = ANN, y = value, color = variable)) +
  geom_point() +
  xlab("ANNUAL RAINFALL") +
  ylab("GRAIN PRODUCTION")
19MID0056
E.UTHESH GANAPATHY

# Mesta production against ANN.
ggplot(AB, mapping = aes(x = ANN, y = mesta)) +
  geom_point() +
  xlab("ANNUAL RAINFALL") +
  ylab("CROP PRODUCTION")

# Maize production against ANN.
ggplot(AB, mapping = aes(x = ANN, y = maize)) +
  geom_point() +
  xlab("ANNUAL RAINFALL") +
  ylab("GRAIN PRODUCTION")
19MID0056
E.UTHESH GANAPATHY

3. WRITE AN R CODE TO DEVELOP A RECOMMENDER SYSTEM FOR


ANY REAL-WORLD PROBLEM (WHEN A USER QUERIES TO FIND
THE UNIVERSITY THAT OFFERS PYTHON, THE SYSTEM SHOULD
DISPLAY RANK WISE LIST OF THE UNIVERSITY BASED ON THE
REVIEW GIVEN BY THE CUSTOMERS)

STEP 1 - IMPORTING THE PACKAGES AND LOADING THE DATASET

library(DT)
# Read the course/university table and inspect its first rows and structure.
A <- read.csv(file = "C:/Users/Hariharan/Documents/CSV/COURSE.csv")
head(A)
str(A)
19MID0056
E.UTHESH GANAPATHY

STEP 2 – VISUALIZING THE DATA

# Number of rows per country, most frequent first.
height <- sort(table(A$COUNTRY), decreasing = TRUE)

# Bar chart of the five most frequent countries (las = 3: vertical labels).
barplot(
  height[seq_len(5)],
  main = "TOP COUNTRIES RANKING IN 2014",
  las = 3
)

# Scatter plot of RANK against FACULTY_QUALITY with a least-squares line.
plot(
  x = A$FACULTY_QUALITY,
  y = A$RANK,
  main = "FACULTY QUALITY VS RANK",
  xlab = "FACULTY QUALITY",
  ylab = "RANK"
)
rank_fit <- lm(RANK ~ FACULTY_QUALITY, data = A)
abline(rank_fit)
19MID0056
E.UTHESH GANAPATHY

# Scatter plot of RANK against INFLUENCE with a least-squares line.
plot(
  x = A$INFLUENCE,
  y = A$RANK,
  main = "INFLUENCE VS RANK",
  xlab = "INFLUENCE",
  ylab = "RANK"
)
rank_fit <- lm(RANK ~ INFLUENCE, data = A)
abline(rank_fit)

# Scatter plot of RANK against CITATIONS with a least-squares line.
plot(
  x = A$CITATIONS,
  y = A$RANK,
  main = "CITATIONS VS RANK",
  xlab = "CITATIONS",
  ylab = "RANK"
)
rank_fit <- lm(RANK ~ CITATIONS, data = A)
abline(rank_fit)
19MID0056
E.UTHESH GANAPATHY

STEP 3 – USING APRIORI ALGORITHM

# select() comes from dplyr; the "transactions" class and apriori() come from
# arules, which must be attached before either is used.
library(dplyr)
library(arules)

# Reload the course table and keep at most the first 1000 rows.
# head() is used rather than A[1:1000, ] so that a shorter file does not get
# padded with all-NA rows.
A <- read.csv("C:/Users/Hariharan/Documents/CSV/COURSE.csv")
A <- head(A, 1000)
A <- select(A, COUNTRY, NATIONAL_RANK)
head(A, 5)
dim(A)

# One transaction per NATIONAL_RANK value, whose items are the countries that
# appear with that rank. Items are de-duplicated because a transaction may not
# contain the same item twice.
countries_by_rank <- lapply(split(A[, "COUNTRY"], A[, "NATIONAL_RANK"]), unique)
matrix1 <- as(countries_by_rank, "transactions")
matrix1

# Mine rules of at most two items with support >= 0.01 and confidence >= 0.75.
ruleParameters <- list(supp = 0.01, conf = 0.75, maxlen = 2)
associationRules <- apriori(matrix1, parameter = ruleParameters)
summary(associationRules)
19MID0056
E.UTHESH GANAPATHY

# The graph method for plotting rules is provided by arulesViz.
library(arulesViz)

# Fix the RNG seed, then draw the rules as a graph sized by support and
# shaded by lift. The two statements must be on separate lines:
# `set.seed(240) plot(...)` on one line is a syntax error.
set.seed(240)
plot(
  associationRules,
  method = "graph",
  measure = "support",
  shading = "lift",
  main = "Association Rules Graph"
)

library(dplyr)
library(stringr)

# apriori() returns an S4 rules object, which has no `$rules` column. Convert
# it to a data frame first: one row per rule, with the rule text in `rules`
# next to its quality measures.
associationRules <- as(associationRules, "data.frame")

# Rule text looks like "{lhs} => {rhs}". Drop the braces and split on the
# arrow. Surrounding whitespace is trimmed, but spaces inside an item are kept
# so that multi-word country names still match A$COUNTRY.
rule_text <- gsub("[{}]", "", as.character(associationRules$rules))
rules <- str_split(rule_text, fixed("=>"))

# The items are country names, so both sides stay character; coercing them
# with as.numeric() would turn every value into NA.
associationRules$movieLeftSide <- trimws(
  vapply(rules, function(parts) parts[[1]], character(1))
)
associationRules$movieRightSide <- trimws(
  vapply(rules, function(parts) parts[[2]], character(1))
)
associationRules$rules <- NULL

# Attach the rows of A whose COUNTRY equals the rule's left-hand side.
# NOTE(review): A holds many rows per country, so this join repeats each rule
# once per matching row - confirm that fan-out is wanted.
associationRules <- associationRules %>%
  left_join(A, by = c("movieLeftSide" = "COUNTRY"))

# Drop the left-hand-side key, prefix selected columns with "Left_", then
# join A again on the right-hand side of each rule.
associationRules$movieLeftSide <- NULL


columnNames <- colnames(associationRules)
# NOTE(review): columns are addressed by fixed position (5 and 7:25). A was cut
# down to COUNTRY and NATIONAL_RANK by the earlier select() call, so the joined
# table looks too narrow for positions up to 25 - confirm against the actual
# column count.
columnNames[5] <- str_c("Left_", columnNames[5])
columnNames[7:25] <- str_c("Left_", columnNames[7:25])
colnames(associationRules) <- columnNames
# NOTE(review): the right-hand side of a rule is matched against
# NATIONAL_RANK here, while the left-hand side was matched against COUNTRY -
# confirm which column the rule items actually correspond to.
associationRules <- associationRules %>% left_join(A, by = c("movieRightSide" =
"NATIONAL_RANK"))
19MID0056
E.UTHESH GANAPATHY

# Drop the right-hand-side key, prefix columns 26 to 45 with "Right_", and
# print the resulting column names.
associationRules$movieRightSide <- NULL


columnNames <- colnames(associationRules)
# NOTE(review): positions 26:45 are fixed; verify the table really has 45
# columns at this point, otherwise the name vector and the table disagree.
columnNames[26:45] <- str_c("Right_", columnNames[26:45])
colnames(associationRules) <- columnNames
colnames(associationRules)

# Order the rules from highest to lowest lift, keep the title/year columns of
# both sides plus the rule quality measures, and show the first rows.
associationRules <- arrange(associationRules, desc(lift))


# NOTE(review): Left_title, Left_year, Right_title and Right_year are only
# present if the joined table had `title` and `year` columns; the table joined
# earlier was reduced to COUNTRY and NATIONAL_RANK - confirm these names exist.
associationRules <- select(associationRules, Left_title, Left_year, Right_title,
Right_year, support, confidence, lift)
head(associationRules)

You might also like