Digital Assignment-6: Read The Data

DIGITAL ASSIGNMENT-6
NAME: K.V.PAVAN KARTHIKEYA

REG NO: 19MID0063
1. Write an R code to perform behaviour analysis of customers for

any online purchase model. Consider any data set.
Read the Data
# Read the online-shoppers CSV into `data` and print its structure.
data <- read.csv("D:\\Onlineshoppersintention.csv")
str(data)
Missing Value Analysis
# Keep only complete rows (na.omit drops any row containing an NA), then
# re-check the structure and list the distinct Month values that remain.
data <- na.omit(data)
str(data)
unique(data$Month)
Fix Structure of Data
# Recode the two boolean flags as 0/1. as.logical() accepts TRUE/FALSE whether
# read.csv() delivered them as logical values or as text, so no regex
# substitution with a logical pattern is needed.
data[c("Revenue", "Weekend")] <- lapply(
  data[c("Revenue", "Weekend")],
  function(flag) as.integer(as.logical(flag))
)

# Month is ordinal; the levels are listed in calendar order. Any value that
# is not one of these levels becomes NA.
data$Month <- factor(
  data$Month,
  levels = c(
    "Feb", "Mar", "May", "June", "Jul",
    "Aug", "Sep", "Oct", "Nov", "Dec"
  ),
  ordered = TRUE
)

# The remaining coded columns (and the two 0/1 flags) become unordered factors.
factor_cols <- c(
  "OperatingSystems", "Browser", "Region", "TrafficType",
  "VisitorType", "Revenue", "Weekend"
)
data[factor_cols] <- lapply(data[factor_cols], factor)
Descriptive Analysis
# Summary statistics for the first ten columns of `data`.
summary(data[, 1:10])

# Frequency counts for each of the factor columns.
table(data$Revenue)
table(data$Weekend)
table(data$VisitorType)
table(data$TrafficType)
table(data$Region)
table(data$Browser)
table(data$OperatingSystems)
table(data$Month)
Correlation
# Correlation matrix of the first ten columns, drawn as the lower triangle
# (diagonal included) with square glyphs.
library(corrplot)  # console reported: corrplot 0.90 loaded
correlation <- cor(data[, 1:10])
corrplot(correlation, method = "square", type = "lower", diag = TRUE)
Relationship between Exit Rates and Bounce Rates
library(ggplot2)  # console warned: ggplot2 was built under R version 4.0.5

options(repr.plot.width = 8, repr.plot.height = 5)

# Scatter of exit rate against bounce rate, coloured by Revenue, with a
# smoothed trend line and its confidence band.
ggplot(data = data, mapping = aes(x = BounceRates, y = ExitRates)) +
  geom_point(mapping = aes(color = Revenue)) +
  geom_smooth(se = TRUE, alpha = 0.5) +
  theme_light() +
  ggtitle("Relationship between Exit Rates and Bounce Rates") +
  xlab("Bounce Rates") +
  ylab("Exit Rates") +
  # annotate() draws the label once; geom_text() with constant aesthetics
  # would redraw it for every row of `data`. The coefficient is computed
  # from the data rather than typed in by hand.
  annotate(
    "text",
    x = 0.15, y = 0.05,
    label = sprintf(
      "Correlation = %.3f",
      cor(data$BounceRates, data$ExitRates)
    )
  )
# Bar charts of session counts per revenue status, filled by visitor type
# (p1) and by weekend flag (p2), shown side by side.
# NOTE(review): the heading that used to sit here described month trend
# lines; that plot is drawn further below from the `trend` table.
p1 <- ggplot(data = data, mapping = aes(x = Revenue)) +
  geom_bar(mapping = aes(fill = VisitorType)) +
  theme_light() +
  ggtitle("Revenue based on visitor type") +
  xlab("Revenue status (0/1)") +
  ylab("Visitors") +
  theme(legend.position = "bottom")

p2 <- ggplot(data = data, mapping = aes(x = Revenue)) +
  geom_bar(mapping = aes(fill = Weekend)) +
  theme_light() +
  ggtitle("Revenue based on weekend status") +
  xlab("Revenue status (0/1)") +
  ylab("Visitors") +
  theme(legend.position = "bottom")

# grid.arrange() comes from gridExtra, which is not attached anywhere in the
# visible script, so it is called through its namespace.
gridExtra::grid.arrange(p1, p2, nrow = 1)
>
>
>
>
# Count sessions for every (month, revenue status) pair in long form.
trend <- setNames(
  data.frame(table(data$Month, data$Revenue)),
  c("Months", "Revenue", "Frequency")
)

# One line per revenue status across the months, with a point at each count.
ggplot(trend, aes(x = Months, y = Frequency)) +
  geom_line(aes(color = Revenue, group = Revenue), lwd = 1) +
  geom_point(
    aes(color = Revenue, group = Revenue, size = 0.1),
    show.legend = FALSE
  ) +
  theme_light() +
  scale_y_continuous(breaks = seq(from = 0, to = 4000, by = 500)) +
  labs(
    title = "Trend line for revenue status based on months",
    x = "Months",
    y = "Visitors"
  )
>
Split the training and testing set
# caret attaches lattice as a dependency; the console also warned that caret
# was built under R version 4.0.5.
library(caret)

# 80/20 partition on Revenue; rows selected by createDataPartition() form
# the training set and the remainder the test set.
set.seed(777)
split <- createDataPartition(data$Revenue, p = 0.8, list = FALSE)
train <- data[split, ]
test <- data[-split, ]
Model Decision Tree
library(rpart)
library(rpart.plot)  # console warned: built under R version 4.0.5

# Fit a classification tree for Revenue using every other column as a
# predictor. The training partition is `train`; the name used here before,
# `sample_train`, is not defined anywhere in the visible script.
set.seed(1)
model4_decision <- rpart(Revenue ~ ., data = train, method = "class")

# Draw the fitted tree, then list the variable-importance scores.
rpart.plot(model4_decision, box.palette = "RdYlGn", shadow.col = "darkgray")
data.frame(model4_decision$variable.importance)
#metrics
# Print binary-classification metrics for a 2x2 confusion table.
#
# x: a 2x2 table or matrix read in column-major order as
#    x[1] = TN, x[2] = FN, x[3] = FP, x[4] = TP,
#    i.e. rows are the actual class and columns the predicted class,
#    negative class first.
#
# Called for its printed output.
metrics <- function(x) {
  # Normalise by the number of observations actually tabulated, so the
  # rates are right whichever data set the table was built from.
  total <- sum(x)
  Accuracy <- (x[4] + x[1]) / total
  ErrorRate <- (x[3] + x[2]) / total
  TPR_Recall <- x[4] / (x[2] + x[4])
  FPR <- x[3] / (x[3] + x[1])
  TNR_Specificity <- x[1] / (x[1] + x[3])
  Precision <- x[4] / (x[3] + x[4])
  F1score <- (2 * Precision * TPR_Recall) / (Precision + TPR_Recall)
  cat(
    "Accuracy = ", Accuracy, "\n",
    "Error Rate = ", ErrorRate, "\n",
    "True Positive Rate (Recall) = ", TPR_Recall, "\n",
    "False Positive Rate = ", FPR, "\n",
    "True Negative Rate (Specificity) = ", TNR_Specificity, "\n",
    "Precision = ", Precision, "\n",
    "F1Score = ", F1score
  )
}
#prediction
cat("Model4: Decision Tree Classifier\n")

cat("Fitness level\n")
# Predicted class for every test row, and the share predicted correctly.
prediction <- predict(model4_decision, test, type = "class")
mean(prediction == test$Revenue)

cat("\nEvaluation on test set\n")
# Rows = actual, columns = predicted. This is the orientation the index
# arithmetic in metrics() assumes (x[2] as FN, x[3] as FP); with the
# arguments the other way round, recall and precision swap places.
evaluate <- table(test$Revenue, prediction)
evaluate
metrics(evaluate)
2. Write an R code to perform Agricultural data analysis for yield

prediction and crop selection on Indian terrain data set.
Importing Packages
>library(dplyr)
>library(ggplot2)
>library(xlsx)
>library(reshape2)
>library(corrplot)
Loading Dataset
# Crop production table (CSV) and rainfall table (first sheet of the
# workbook). The workbook path is a single string; a line break inside the
# literal would become part of the file name.
grains <- read.csv("D:\\Table_8.3-All_India_1.csv")
rain <- read.xlsx(
  "D:\\All_India_Area_Weighted_Monthly_Seasonal_And_Annual_Rainfall.xls",
  sheetIndex = 1
)
str(grains)

# Give columns 2-36 short names, addressed by position.
grains <- grains %>%
  rename(
    rice = 2, jowar = 3, bajra = 4, maize = 5, ragi = 6, millets = 7,
    wheat = 8, barley = 9, tcereals = 10, gram = 11, tur = 12,
    otherpulses = 13, totalpulses = 14, totalgrains = 15, gnuts = 16,
    sesame = 17, mustard = 18, linseed = 19, castor = 20,
    totaloilseeds = 21, cotton = 22, jute = 23, mesta = 24, tea = 25,
    coffee = 26, rubber = 27, banana = 28, sugarcane = 29, tobacco = 30,
    potatoes = 31, pepper = 32, chilles = 33, ginger = 34, coconut = 35,
    turmeric = 36
  )

# Pairwise correlations between the 35 renamed columns, to two decimals.
cormat <- round(cor(as.matrix(grains[, 2:36])), 2)
# Blank out the upper triangle of a matrix.
#
# cormat: a matrix; every entry strictly above the diagonal is set to NA.
# Returns the modified matrix; the diagonal and lower triangle are unchanged.
get_lower_tri <- function(cormat) {
  cormat[upper.tri(cormat)] <- NA
  cormat
}
# Long-form list of the lower-triangle correlations, sorted ascending.
lowertri <- get_lower_tri(cormat)
melted.cormat <- melt(lowertri, na.rm = TRUE)
sorted.cormat <- melted.cormat[order(melted.cormat$value), ]

# The six smallest correlations, drawn as tiles.
neg.cormat <- head(sorted.cormat, 6)
ggplot(neg.cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white")

# Rows 140-170 of the ascending list, drawn as tiles.
# NOTE(review): the row window is fixed by hand; confirm it still selects the
# intended pairs if the set of columns changes.
pos.cormat <- sorted.cormat[140:170, ]
ggplot(pos.cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white")
EDA Rainfall
# Preview the first rows of the rainfall table read from the workbook.
>head(rain)
Combined EDA
# Keep rainfall from 2001 onwards and bind it column-wise to the first 14
# rows of the crop table.
# NOTE(review): cbind() pairs rows purely by position; this presumes both
# tables cover the same years in the same order — confirm.
rain.new <- filter(rain, YEAR >= 2001)
grains.new <- grains[1:14, ]
grainsrain <- cbind(grains.new, rain.new)

# Long format: one row per (ANN value, crop) for rice and maize.
grains1 <- melt(
  grainsrain,
  id.vars = "ANN",
  measure.vars = c("rice", "maize")
)
A plot showing the grain production as a function of annual rainfall
# Rice and maize values against ANN, one colour per crop.
ggplot(grains1, aes(x = ANN, y = value, color = variable)) +
  geom_point() +
  labs(
    x = "Annual Rainfall in mm",
    y = "Grain Production (1000s tonnes)"
  )
Mesta crop production vs annual rainfall
# Mesta values against ANN. The x-axis label is kept on one line so that no
# line break ends up inside the string.
ggplot(grainsrain, aes(x = ANN, y = mesta)) +
  geom_point() +
  xlab("Annual Rainfall in mm") +
  ylab("Crop Production (1000s bales)")
>
Maize crop production vs annual rainfall
# Maize values against ANN. The x-axis label is kept on one line so that no
# line break ends up inside the string.
ggplot(grainsrain, aes(x = ANN, y = maize)) +
  geom_point() +
  xlab("Annual Rainfall in mm") +
  ylab("Grain Production (1000s tonnes)")
3. Write an R code to develop a recommender system for any real-world

problem (when a user queries to find the university that offers Python,
the system should display rank wise list of the university based on the
review given by the customers)
library(DT)

# Load the rankings and keep only the rows for 2015.
data <- read.csv("E:\\cwurData.csv")
data <- subset(data, year == 2015)
str(data)

# Row counts per country, largest first; chart the first ten.
height <- sort(table(data$country), decreasing = TRUE)
barplot(
  height[1:10],
  las = 3,
  main = "Top Countries in University Rankings 2015"
)

# Rows for the USA, shown as an interactive table.
usa <- subset(data, country == "USA")
datatable(usa)
# National rank against one predictor at a time: scatter plot, fitted
# least-squares line, and (for the first two) the model summary. The scratch
# model is called `fit` so that it does not shadow base::c().
plot(
  usa$quality_of_faculty, usa$national_rank,
  xlab = "Quality of Faculty", ylab = "National Rank",
  main = "Quality of Faculty vs National Rank"
)
fit <- lm(national_rank ~ quality_of_faculty, data = usa)
abline(fit)
summary(fit)

plot(
  usa$influence, usa$national_rank,
  xlab = "Influence", ylab = "National Rank",
  main = "Influence vs National Rank"
)
fit <- lm(national_rank ~ influence, data = usa)
abline(fit)
summary(fit)

plot(
  usa$citations, usa$national_rank,
  xlab = "Citations", ylab = "National Rank",
  main = "Citations vs National Rank"
)
fit <- lm(national_rank ~ citations, data = usa)
abline(fit)

# All three predictors together.
regline <- lm(
  national_rank ~ quality_of_faculty + influence + citations,
  data = usa
)
summary(regline)
# Order the US rows by alumni_employment and number them 1..n. The count
# comes from the data, so it stays correct if the number of US rows is not
# 229.
usaEmployment <- usa[order(usa$alumni_employment), ]
usaEmploymentRank <- seq_len(nrow(usaEmployment))
usaEmployment <- cbind(usaEmployment, usaEmploymentRank)

uTable <- data.frame(
  usaEmployment$institution,
  usaEmployment$usaEmploymentRank,
  usaEmployment$national_rank
)
datatable(
  uTable,
  colnames = c("Institution", "National Employment Rank", "National Rank")
)

# Employment rank against one predictor at a time, each with its fitted
# line. The scratch model is called `fit` so it does not shadow base::c().
string <- "Employment Rank"

plot(
  usaEmployment$national_rank, usaEmployment$usaEmploymentRank,
  xlab = "National Rank", ylab = string,
  main = "National Rank vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ national_rank, data = usaEmployment)
abline(fit)

plot(
  usaEmployment$quality_of_education, usaEmployment$usaEmploymentRank,
  xlab = "Quality of Education ", ylab = string,
  main = "Quality of Education vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ quality_of_education, data = usaEmployment)
abline(fit)

plot(
  usaEmployment$quality_of_faculty, usaEmployment$usaEmploymentRank,
  xlab = "Quality of Faculty", ylab = string,
  main = "Quality of Faculty vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ quality_of_faculty, data = usaEmployment)
abline(fit)
summary(fit)

# All three predictors together.
linReg <- lm(
  usaEmploymentRank ~ national_rank + quality_of_education +
    quality_of_faculty,
  data = usaEmployment
)
summary(linReg)

Digital Assignment-6: Read The Data

Uploaded by

Copyright:

Available Formats

You might also like

Digital Assignment-6: Read The Data

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Digital Assignment-6: Read The Data

Uploaded by

Copyright:

Available Formats

DIGITAL ASSIGNMENT-6

NAME: K.V.PAVAN KARTHIKEYA

1. Write an R code to perform behaviour analysis of customers for

Read the Data

Missing Value Analysis

>data$Revenue<- gsub(FALSE, 0, data$Revenue)

Relationship between Exit Rates and Bounce Rates

>options(repr.plot.width = 10, repr.plot.height = 6)

Model Decision Tree

>cat("Model4: Decision Tree Classifier\n")

2. Write an R code to perform Agricultural data analysis for yield

>melted.cormat<- melt(lowertri, na.rm = TRUE)

>ggplot(data = neg.cormat, aes(x = Var1, y = Var2, fill = value)) + geom_tile(color =

>ggplot(data = pos.cormat, aes(x = Var1, y = Var2, fill = value)) + geom_tile(color =

>rain.new<- rain %>% filter(YEAR >=2001)

>grainsrain<- cbind(grains.new, rain.new)

> grains1 <- melt(grainsrain, id = "ANN", measure = c("rice", "maize"))

A plot showing the grain production as a function of annual rainfall

>ggplot(data = grains1, aes(x = ANN, y = value, color = variable)) + geom_point() +

>ggplot(grainsrain, aes(x = ANN, y = mesta)) + geom_point()+ xlab("Annual Rainfall in

>ggplot(grainsrain, aes(x = ANN, y = maize)) + geom_point()+ xlab("Annual Rainfall in

3.Write an R code to Develop a recommender system for any real-world

barplot(height[1:10], las = 3, main = "Top Countries in University Rankings 2015")

plot (usa$quality_of_faculty, usa$national_rank, xlab ="Quality of Faculty", ylab =

plot (usa$influence, usa$national_rank, xlab ="Influence", ylab="National Rank", main

c <- lm(national_rank ~ influence, data = usa)

plot (usa$citations, usa$national_rank, xlab = "Citations", ylab = "National Rank", main

c <- lm(national_rank ~ citations, data = usa)

regline <- lm(national_rank ~ quality_of_faculty + influence + citations, data = usa)

usaEmploymentRank<-c(1:229) #229 because that is the # of US schools on the list

uTable <- data.frame(usaEmployment$institution,

datatable(uTable, colnames = c("Institution", "National Employment Rank", "National

c<-lm(usaEmploymentRank ~ national_rank, data = usaEmployment)

c<-lm(usaEmploymentRank ~ quality_of_education, data = usaEmployment)

c<-lm(usaEmploymentRank ~ quality_of_faculty, data = usaEmployment)

You might also like