Digital Assignment-6: Read The Data

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 30

DIGITAL ASSIGNMENT-6

NAME: K.V.PAVAN KARTHIKEYA


REG NO: 19MID0063

1. Write R code to perform behaviour analysis of customers for any online purchase model. Consider any data set.

Read the Data

# Load the online-shoppers CSV and inspect the resulting column types.
data <- read.csv(file = "D:\\Onlineshoppersintention.csv")
str(object = data)

Missing Value Analysis

# Remove every row that contains a missing value, then re-inspect the data.
data <- na.omit(object = data)
str(object = data)
# Show the distinct month labels that remain after cleaning.
unique(x = data$Month)
Fix Structure of Data

# Recode the TRUE/FALSE flag columns as the text values "1"/"0".
flag_cols <- c("Revenue", "Weekend")
data[flag_cols] <- lapply(
  data[flag_cols],
  function(flag) gsub("TRUE", "1", gsub("FALSE", "0", flag))
)

# Months become an ordered factor so tables and plots follow calendar order.
data$Month <- factor(
  data$Month,
  levels = c(
    "Feb", "Mar", "May", "June", "Jul",
    "Aug", "Sep", "Oct", "Nov", "Dec"
  ),
  ordered = TRUE
)

# Every remaining categorical column becomes an (unordered) factor.
categorical_cols <- c(
  "OperatingSystems", "Browser", "Region", "TrafficType",
  "VisitorType", "Revenue", "Weekend"
)
data[categorical_cols] <- lapply(data[categorical_cols], factor)
Descriptive Analysis

# Summary statistics for the first ten columns.
summary(data[, 1:10])

# Frequency table for each categorical column, in the original order.
for (column in c(
  "Revenue", "Weekend", "VisitorType", "TrafficType",
  "Region", "Browser", "OperatingSystems", "Month"
)) {
  print(table(data[[column]]))
}
Correlation

library(corrplot)
# (console output at load time: corrplot 0.90 loaded)

# Pairwise correlations of the first ten columns, drawn as a lower-triangle
# grid of squares including the diagonal.
correlation <- cor(x = data[, 1:10])
corrplot(corr = correlation, method = "square", type = "lower", diag = TRUE)

Relationship between Exit Rates and Bounce Rates

library(ggplot2)
# (console warning at load time: package 'ggplot2' was built under R
# version 4.0.5)

options(repr.plot.width = 8, repr.plot.height = 5)

# Compute the correlation shown on the plot from the data instead of
# hard-coding it, so the label cannot drift from the dataset.
bounce_exit_cor <- cor(data$BounceRates, data$ExitRates)

# Scatter of exit rate against bounce rate, coloured by purchase outcome,
# with a smoothed trend and its confidence band.
ggplot(data = data, mapping = aes(x = BounceRates, y = ExitRates)) +
  geom_point(mapping = aes(color = Revenue)) +
  geom_smooth(se = TRUE, alpha = 0.5) +
  theme_light() +
  ggtitle("Relationship between Exit Rates and Bounce Rates") +
  xlab("Bounce Rates") +
  ylab("Exit Rates") +
  # annotate() draws the label once; geom_text() with constant aesthetics
  # would draw one overlapping copy per data row.
  annotate(
    "text",
    x = 0.15,
    y = 0.05,
    label = sprintf("Correlation = %.3f", bounce_exit_cor)
  )
# Trend line for revenue status based on months and trend line for visitor type based on months

options(repr.plot.width = 10, repr.plot.height = 6)

# Stacked bar chart: sessions per revenue status, split by visitor type.
p1 <- ggplot(data = data, mapping = aes(x = Revenue)) +
  geom_bar(mapping = aes(fill = VisitorType)) +
  theme_light() +
  ggtitle("Revenue based on visitor type") +
  xlab("Revenue status (0/1)") +
  ylab("Visitors") +
  theme(legend.position = "bottom")

# Stacked bar chart: sessions per revenue status, split by weekend flag.
p2 <- ggplot(data = data, mapping = aes(x = Revenue)) +
  geom_bar(mapping = aes(fill = Weekend)) +
  theme_light() +
  ggtitle("Revenue based on weekend status") +
  xlab("Revenue status (0/1)") +
  ylab("Visitors") +
  theme(legend.position = "bottom")

# grid.arrange() belongs to gridExtra, which this script never attaches;
# calling it through its namespace avoids a "could not find function" error.
gridExtra::grid.arrange(p1, p2, nrow = 1)
>
>
>
options(repr.plot.width = 8, repr.plot.height = 5)

# Long-format counts of sessions for every Month x Revenue combination.
trend <- setNames(
  data.frame(table(data$Month, data$Revenue)),
  c("Months", "Revenue", "Frequency")
)

# One line (plus points) per revenue status across the months.
ggplot(data = trend, mapping = aes(x = Months, y = Frequency)) +
  geom_line(mapping = aes(color = Revenue, group = Revenue), lwd = 1) +
  geom_point(
    mapping = aes(color = Revenue, group = Revenue, size = 0.1),
    show.legend = FALSE
  ) +
  theme_light() +
  scale_y_continuous(breaks = seq(from = 0, to = 4000, by = 500)) +
  labs(
    title = "Trend line for revenue status based on months",
    x = "Months",
    y = "Visitors"
  )
>
Split the training and testing set

library(caret)
# (console output at load time: "Loading required package: lattice" and a
# warning that package 'caret' was built under R version 4.0.5)

# Fixed seed so the partition is reproducible.
set.seed(777)

# Row indices for an 80% training partition drawn on the Revenue outcome;
# the remaining rows form the test set.
split <- createDataPartition(y = data$Revenue, p = 0.8, list = FALSE)
train <- data[split, ]
test <- data[-split, ]

Model Decision Tree

# Decision tree classifier (no feature scaling is performed in this step).

library(rpart)
library(rpart.plot)
# (console warning at load time: package 'rpart.plot' was built under R
# version 4.0.5)

set.seed(1)

# Fit on the `train` partition created by the split step. The earlier
# version referenced `sample_train`, which is never defined in this script.
model4_decision <- rpart(Revenue ~ ., data = train, method = "class")

options(repr.plot.width = 10, repr.plot.height = 10)
rpart.plot(model4_decision, box.palette = "RdYlGn", shadow.col = "darkgray")

# Variable importance scores reported by the fitted tree.
data.frame(model4_decision$variable.importance)
#metrics

# Print, and invisibly return, classification metrics for a 2x2 confusion
# matrix.
#
# x: a 2x2 table or matrix of counts with ACTUAL classes in rows and
#    PREDICTED classes in columns, negative class first, i.e.
#    table(actual, predicted). In column-major order the cells are
#    x[1] = TN, x[2] = FN, x[3] = FP, x[4] = TP.
#
# Returns (invisibly) a named numeric vector with elements Accuracy,
# ErrorRate, TPR_Recall, FPR, TNR_Specificity, Precision and F1score.
# A metric whose denominator is zero comes out as NaN.
metrics <- function(x) {
  stopifnot(length(x) == 4)

  counts <- as.numeric(x)
  tn <- counts[1]
  fn <- counts[2]
  fp <- counts[3]
  tp <- counts[4]

  # Normalise by the observations actually in the matrix. Dividing by the
  # size of the global training set gave wrong rates for test-set matrices.
  total <- sum(counts)

  recall <- tp / (fn + tp)
  precision <- tp / (fp + tp)

  result <- c(
    Accuracy = (tp + tn) / total,
    ErrorRate = (fp + fn) / total,
    TPR_Recall = recall,
    FPR = fp / (fp + tn),
    TNR_Specificity = tn / (tn + fp),
    Precision = precision,
    F1score = (2 * precision * recall) / (precision + recall)
  )

  cat(
    "Accuracy = ", result[["Accuracy"]], "\n",
    "Error Rate = ", result[["ErrorRate"]], "\n",
    "True Positive Rate (Recall) = ", result[["TPR_Recall"]], "\n",
    "False Positive Rate = ", result[["FPR"]], "\n",
    "True Negative Rate (Specificity) = ", result[["TNR_Specificity"]], "\n",
    "Precision = ", result[["Precision"]], "\n",
    "F1Score = ", result[["F1score"]]
  )

  invisible(result)
}
#prediction

cat("Model4: Decision Tree Classifier\n")

cat("Fitness level\n")
# Predicted class for every held-out row, and the share predicted correctly.
prediction <- predict(model4_decision, test, type = "class")
mean(prediction == test$Revenue)

cat("\nEvaluation on test set\n")
# metrics() reads the cells as actual-in-rows / predicted-in-columns, so
# the actual labels must be the first argument. With the arguments the
# other way round, recall and precision are swapped and the false positive
# rate and specificity are computed from the wrong cells.
evaluate <- table(Actual = test$Revenue, Predicted = prediction)
evaluate
metrics(evaluate)

2. Write R code to perform agricultural data analysis for yield prediction and crop selection on an Indian terrain data set.

Importing Packages

>library(dplyr)

>library(ggplot2)

>library(xlsx)

>library(reshape2)

>library(corrplot)
Loading Dataset

# Crop production table (CSV).
grains <- read.csv("D:\\Table_8.3-All_India_1.csv")

# Rainfall workbook, first sheet. The path must be one unbroken string:
# a line break inside the literal becomes part of the file name and the
# file is then not found.
rain <- read.xlsx(
  "D:\\All_India_Area_Weighted_Monthly_Seasonal_And_Annual_Rainfall.xls",
  sheetIndex = 1
)

str(grains)

# Give columns 2..36 short crop names, addressed by position.
grains <- grains %>%
  rename(
    rice = 2, jowar = 3, bajra = 4, maize = 5, ragi = 6, millets = 7,
    wheat = 8, barley = 9, tcereals = 10, gram = 11, tur = 12,
    otherpulses = 13, totalpulses = 14, totalgrains = 15, gnuts = 16,
    sesame = 17, mustard = 18, linseed = 19, castor = 20,
    totaloilseeds = 21, cotton = 22, jute = 23, mesta = 24, tea = 25,
    coffee = 26, rubber = 27, banana = 28, sugarcane = 29, tobacco = 30,
    potatoes = 31, pepper = 32, chilles = 33, ginger = 34, coconut = 35,
    turmeric = 36
  )

# Correlation matrix of the renamed crop columns, rounded to two decimals.
crop_matrix <- as.matrix(grains[, 2:36])
cormat <- round(cor(crop_matrix), 2)

# Return `cormat` with every cell strictly above the diagonal set to NA,
# leaving the lower triangle and the diagonal untouched.
get_lower_tri <- function(cormat) {
  replace(cormat, upper.tri(cormat), NA)
}

# Keep the lower triangle only, then reshape to one row per variable pair.
lowertri <- get_lower_tri(cormat)
melted.cormat <- melt(lowertri, na.rm = TRUE)

# Order the pairs from most negative to most positive correlation.
sorted.cormat <- melted.cormat[with(melted.cormat, order(value)), ]

# The six lowest correlations.
neg.cormat <- head(sorted.cormat, n = 6)
ggplot(data = neg.cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white")

# NOTE(review): rows 140-170 of the ascending ordering are a fixed slice;
# confirm this is the intended "positive" range for this dataset.
pos.cormat <- sorted.cormat[140:170, ]
ggplot(data = pos.cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white")
EDA Rainfall

# Preview the first six rows of the rainfall table.
head(rain, n = 6L)

Combined EDA

# Rainfall rows from 2001 onwards.
rain.new <- filter(rain, YEAR >= 2001)

# First fourteen rows of the crop table.
grains.new <- grains[seq_len(14), ]

# Place the two tables side by side, matched purely by row position.
grainsrain <- cbind(grains.new, rain.new)

# Long format: one row per (annual rainfall, crop) for rice and maize.
grains1 <- melt(
  grainsrain,
  id.vars = "ANN",
  measure.vars = c("rice", "maize")
)

A plot showing the grain production as a function of annual rainfall

# Rice and maize production against annual rainfall, one colour per crop.
ggplot(data = grains1, aes(x = ANN, y = value, color = variable)) +
  geom_point() +
  labs(
    x = "Annual Rainfall in mm",
    y = "Grain Production (1000s tonnes)"
  )
Mesta crop production vs annual rainfall

# Mesta production against annual rainfall. The x-axis label is kept as a
# single-line string; a line break inside the literal would be drawn on
# the axis.
ggplot(grainsrain, aes(x = ANN, y = mesta)) +
  geom_point() +
  xlab("Annual Rainfall in mm") +
  ylab("Crop Production (1000s bales)")

>
Maize crop production vs annual rainfall

# Maize production against annual rainfall. The x-axis label is kept as a
# single-line string; a line break inside the literal would be drawn on
# the axis.
ggplot(grainsrain, aes(x = ANN, y = maize)) +
  geom_point() +
  xlab("Annual Rainfall in mm") +
  ylab("Grain Production (1000s tonnes)")

3. Write R code to develop a recommender system for any real-world problem (for example, when a user queries for universities that offer Python, the system should display a rank-wise list of universities based on the reviews given by customers).

library(DT)

# Load the CWUR rankings and keep only the 2015 rows.
data <- read.csv(file = "E:\\cwurData.csv")
data <- data[which(data$year == 2015), ]
str(data)

# Number of ranked universities per country, largest first; plot the top ten.
height <- sort(table(data$country), decreasing = TRUE)
barplot(
  height[1:10],
  las = 3,
  main = "Top Countries in University Rankings 2015"
)

# United States rows only, shown as an interactive table.
usa <- data[which(data$country == "USA"), ]
datatable(usa)

# Each section plots national rank against one predictor and overlays the
# fitted simple regression line. The model is stored in `fit` rather than
# `c`, so the variable no longer masks base::c().

# Quality of faculty
plot(
  usa$quality_of_faculty, usa$national_rank,
  xlab = "Quality of Faculty", ylab = "National Rank",
  main = "Quality of Faculty vs National Rank"
)
fit <- lm(national_rank ~ quality_of_faculty, data = usa)
abline(fit)
summary(fit)

# Influence (plot title spelling corrected from "Influnce")
plot(
  usa$influence, usa$national_rank,
  xlab = "Influence", ylab = "National Rank",
  main = "Influence vs National Rank"
)
fit <- lm(national_rank ~ influence, data = usa)
abline(fit)
summary(fit)

# Citations
plot(
  usa$citations, usa$national_rank,
  xlab = "Citations", ylab = "National Rank",
  main = "Citations vs National Rank"
)
fit <- lm(national_rank ~ citations, data = usa)
abline(fit)

# Multiple regression on all three predictors together.
regline <- lm(
  national_rank ~ quality_of_faculty + influence + citations,
  data = usa
)
summary(regline)
# Order the US schools by their alumni_employment value (ascending).
usaEmployment <- usa[order(usa$alumni_employment), ]

# Rank 1..n over however many US schools are present. A hard-coded 1:229
# breaks cbind() whenever the row count differs from 229.
usaEmploymentRank <- seq_len(nrow(usaEmployment))
usaEmployment <- cbind(usaEmployment, usaEmploymentRank)

# Interactive table of institution, employment rank and national rank.
# The last column label is one unbroken string so that no line break
# appears in the rendered header.
uTable <- data.frame(
  usaEmployment$institution,
  usaEmployment$usaEmploymentRank,
  usaEmployment$national_rank
)
datatable(
  uTable,
  colnames = c("Institution", "National Employment Rank", "National Rank")
)
# Shared y-axis label for the employment-rank plots.
string <- "Employment Rank"

# Each section plots employment rank against one predictor and overlays
# the fitted simple regression line. The model is stored in `fit` rather
# than `c`, so the variable no longer masks base::c().

# National rank
plot(
  usaEmployment$national_rank, usaEmployment$usaEmploymentRank,
  xlab = "National Rank", ylab = string,
  main = "National Rank vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ national_rank, data = usaEmployment)
abline(fit)

# Quality of education (title kept on one line; a line break inside the
# literal would be drawn in the plot title)
plot(
  usaEmployment$quality_of_education, usaEmployment$usaEmploymentRank,
  xlab = "Quality of Education ", ylab = string,
  main = "Quality of Education vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ quality_of_education, data = usaEmployment)
abline(fit)

# Quality of faculty
plot(
  usaEmployment$quality_of_faculty, usaEmployment$usaEmploymentRank,
  xlab = "Quality of Faculty", ylab = string,
  main = "Quality of Faculty vs Employment Rank"
)
fit <- lm(usaEmploymentRank ~ quality_of_faculty, data = usaEmployment)
abline(fit)
summary(fit)

# Multiple regression on all three predictors together.
linReg <- lm(
  usaEmploymentRank ~ national_rank + quality_of_education +
    quality_of_faculty,
  data = usaEmployment
)
summary(linReg)

You might also like