R Studio

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 13

# Start from a clean workspace (interactive teaching script).
rm(list = ls())

# ---- Basic data types ----------------------------------------------------

# Assign a variable with an integer value
a <- 10L
a
is.integer(a) # check whether the value is an integer or not

# Character type
# NOTE: the variable `str` shadows the name of base::str(); calls such as
# str(obj) still work because R skips non-function objects when looking up
# a function.
str <- "R programming"
str
s <- "cse3505 -"
s
class(s)

# Some useful string functions
paste(s, str)
sprintf("%s has scored %d marks", "Sita", 90)
substr(str, start = 5, stop = 10)
sub("e", "C", str)
str
print(str)

# ---- Creating vectors ----------------------------------------------------

x <- 1:20 # `:` always creates an integer vector
x

# Class and length of a vector
class(x)
length(x)

# Using c()
x <- c(0.1, 0.2) # numeric vector
x <- c(TRUE, FALSE) # logical vector
# The abbreviations T and F are ordinary variables that can be reassigned,
# so the full constants are used here as well.
x <- c(TRUE, FALSE) # logical vector
x <- c("A", "B", "C") # character vector
x <- c(1L, 2L, 15L, 27L) # integer vector
x
x <- c(1 + 2i, 3) # complex vector

# Using vector(): with no arguments it returns an empty logical vector
x <- vector()
x
length(x)
class(x)

# ---- Complex type ----------------------------------------------------------
cmp <- 21 + 10i
sqrt(-1) # -1 is a real number here, so this yields NaN with a warning
sqrt(-1 + 0i)
sqrt(as.complex(-1)) # explicit type conversion

# ---- Logical type ----------------------------------------------------------
lg <- TRUE
p <- TRUE
q <- FALSE
p & q
p | q
!p

# ---- Class and type of a variable ------------------------------------------
# `a` and `str` are the integer and character variables created in the
# basic-types section of this script.
class(a)
typeof(a)
class(str)
typeof(str)
class(cmp)
typeof(cmp)
class(lg)
typeof(lg)

# ---- Special numbers -------------------------------------------------------
# Inf represents infinity
1 / 0
1 / Inf
log(0) # natural log

# The base can be given as the second argument
log(10, 2) # base 2
log(10, 10) # base 10

# NaN represents an undefined value (it also counts as a missing value)
0 / 0

# ---- vector(mode, length) --------------------------------------------------
x <- vector("character", length = 10)
x

# ---- Implicit type coercion - mixed objects --------------------------------
y <- c(1.5, "a") # character
y
y <- c(1.5, TRUE) # numeric
y
y <- c(TRUE, "a") # character
y

# ---- Explicit type coercion ------------------------------------------------
x <- 2.5
class(x)
as.integer(x)
x

x <- -1:5
x
class(x)
as.numeric(x)
as.logical(x)
as.character(x)
as.complex(x)

# Non-sensical coercion results in NAs
x <- c("a", "b", "c")
x
as.numeric(x)
as.logical(x)

# ---- Vector arithmetic -----------------------------------------------------
x <- c(1, 3, 5)
y <- c(2, 4, 6)
x + y
x - y
x * y
x / y

help(options)
?options

# ---- Missing values --------------------------------------------------------
x <- c(1, 2, NA, 5, NaN, 6)
is.na(x) # TRUE for both NA and NaN
is.nan(x) # TRUE only for NaN
# Print numbers with 2 significant digits from here on.
options(digits = 2)

# ---- Recycling rule --------------------------------------------------------
# `x` is set again here so the demonstration does not depend on whatever an
# earlier section last assigned to it. The lengths (3 and 5) are not
# multiples of each other, so R recycles `x` and emits a warning.
x <- c(1, 3, 5)
y <- c(2, 4, 6, 8, 10)
x + y

# ---- Creating matrices -----------------------------------------------------
m <- matrix()
m

m <- matrix(nrow = 3, ncol = 2)
m
attributes(m)
dim(m)

m <- matrix()
m <- matrix(1:6, nrow = 3, ncol = 2) # constructed column-wise (the default)
m <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE) # constructed row-wise
m

# Constructing from a vector by assigning dimensions
m <- 1:6
dim(m) <- c(3, 2)
m

# Constructing using column-binding
x <- 1:3
x
y <- 10:12
y
cbind(x, y)

# Constructing using row-binding
rbind(x, y)

# Matrix multiplication
x <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
y <- matrix(c(10, 10, 10, 10), nrow = 2, ncol = 2)
x
y
x * y # element-wise multiplication
x %*% y # matrix multiplication
# There is no matrix-division operator: x / y divides element-wise and
# x %/% y is element-wise integer division. Multiply by an inverse instead,
# e.g. x %*% solve(z) for an invertible z.

x
t(x) # transpose of a matrix
solve(x) # inverse of a matrix
det(x) # determinant of a matrix

# Data frame -----------------------------------------------------------------
rm(list = ls())

# A table with the same type within a column and different types between
# columns, defined with the data.frame() function.
id <- c(1, 2, 3)
name <- c("a", "b", "c")
marks <- c(50, 0, 25)
sample_df <- data.frame(id, name, marks)
sample_df

my_df <- data.frame(
  id = c(1, 2, 3),
  name = c("Ramu", "Raju", "Ravi"),
  marks = c(50, 40, 25)
)
my_df

# Dimension of the data frame
dim(my_df)

# Columns of the data frame
names(my_df)

# Structure of the data frame
str(my_df)

# Summary statistics of the data frame
summary(my_df)

head(my_df) # top 6 rows in the data frame
tail(my_df) # bottom 6 rows in the data frame

################ ADDING/Removing columns

# Ways to add a column
my_df

# Initialize with 0
my_df$name
my_df$perf <- 0
my_df

my_df$perf <- c("very good", "good", "needs to improve")
my_df

# Can use [[]], [], [,]
my_df[["perf"]] <- c("very good", "good", "needs to improve")
my_df["perf"] <- c("very good", "good", "needs to improve")
my_df[, "perf"] <- c("very good", "good", "needs to improve")

# Assigning by position past the last column appends a new one; with four
# columns present it becomes the fifth and gets the default name V5.
my_df[5] <- 0
my_df

# ---- Creating a list -------------------------------------------------------
x <- list(1, "a", TRUE, 1 + 3i, 6.7, c(10, 20, 30))
x

# ---- Factors ---------------------------------------------------------------
x <- factor(c("male", "female"))
x
x <- factor(c("low", "medium", "high", "low"))
table(x)
unclass(x)

# ---- Ways to remove a column -----------------------------------------------
# Assigning NULL drops a column. The filler column added by position (V5)
# is removed from my_df itself.
my_df[5] <- NULL
my_df$V5 <- NULL # same removal by name; a no-op once the column is gone
my_df

# The equivalent forms for `perf` are demonstrated on a scratch copy: the
# steps below still need `perf` in my_df (rowSums() picks columns 3 and 5,
# and the appended average row supplies a `perf` value).
scratch_df <- my_df
scratch_df$perf <- NULL
scratch_df <- my_df
scratch_df[["perf"]] <- NULL
scratch_df <- my_df
scratch_df["perf"] <- NULL
scratch_df

# ---- Subsetting ------------------------------------------------------------
df1 <- subset(my_df, select = c(id, marks))
df1
df1 <- subset(my_df, select = -marks)
df1
View(df1)

my_df$mark2 <- c(30, 20, 10)
my_df

# Sum of all marks
sum(my_df$mark2)

# Row sum: columns 3 and 5 are marks and mark2
my_df$total <- rowSums(my_df[c(3, 5)])
my_df

# Max
max(my_df$total)

# Index at which the max value is present
which.max(my_df$total)

# Name of the student who got the max mark
my_df[["name"]][which.max(my_df$total)]
my_df$name[which.max(my_df$total)]
my_df[which.max(my_df$total), 2]

my_df
my_df <- rbind(
  my_df,
  data.frame(
    id = 4,
    name = "avgscore",
    marks = mean(my_df$marks),
    perf = "meanperf",
    mark2 = mean(my_df$mark2),
    total = mean(my_df$total)
  )
)

getwd()
write.csv(my_df, "marks1.csv")
write.csv(my_df, "marks.csv", row.names = FALSE)

stu_marks <- read.csv("marks.csv")
str(stu_marks)
stu_marks
stu_marks$mark2
stu_marks[4]
stu_marks[3, 3]
stu_marks[3, 5]
stu_marks
stu_marks[c(1, 3), c(2, 5)]

# Drop columns 2 and 4 (name and perf) by negative index
stu_temp <- stu_marks[c(-2, -4)]
stu_temp

row.names(stu_temp)
row.names(stu_temp) <- stu_marks$name
stu_temp

# Boolean indexing: names of the rows whose mark2 exceeds 10
boolv <- stu_temp["mark2"] > 10
boolv
row.names(stu_temp)[boolv]

#--------------------------------------------------------------------
library(help = datasets)

data(mtcars) # Loading mtcars data set
cars <- mtcars # Save the data into workspace

# Viewing data set
mtcars # Total data set in console
View(mtcars) # Viewing dataset in spreadsheet
head(mtcars) # Viewing top-6 observations (default: top-6)
tail(mtcars) # Viewing bottom 6 observations
str(mtcars) # Viewing data dictionary
names(mtcars) # Viewing column names

v1 <- mtcars$mpg # Assigning single variable from mtcars data to v1
v2 <- mtcars$cyl
v3 <- mtcars$disp
v4 <- mtcars$hp
newvar <- mtcars$disp + mtcars$hp

mtcars1 <- rbind(v1, v2, v3, v4) # Combined as rows #Horizontal joins
mtcars1
mtcars2 <- cbind(v1, v2, v3, v4) # Combined as columns # Vertical joins
mtcars2

# Create a variable obs_subset and have rows 4 to 10 in mtcars
obs_subset <- mtcars[4:10, ]
obs_subset

# Create a variable var_subset and have only the columns 1, 5, 9
var_subset <- mtcars[, c(1, 5, 9)]
var_subset

#R datasets
#subsetting
# Create a variable subset1 and have only mpg and cyl variables of mtcars
# using indexing
subset1 <- mtcars[, c(1, 2)]
head(subset1, 3)

# using subset()
subset2 <- subset(mtcars, select = c(mpg, cyl))
subset2

# Create a variable subset3 and have only the rows where mpg > 18
subset3 <- subset(mtcars, mpg > 18)
subset3

# Create a variable subset4 and have only the rows where mpg > 18 and cyl > 5
subset4 <- subset(mtcars, mpg > 18 & cyl > 5)
subset4

# Exclude mpg and cyl columns
subset4 <- subset(mtcars, mpg > 18 & cyl > 5, select = c(-mpg, -cyl))
subset4
#install.packages("M #tbl_df(cars)
ASS") library(MASS) as_tibble(cars)
data("survey")
glimpse(cars)
#clear workspace
rm(list=ls()) ############ Subsetting Rows (Observations) #################

#filtering based on single


loan <- read.csv("loans data.csv") loan <- condition filter(cars, mpg>25)
read.table("loans data.csv",header = TRUE,sep
= ",") #filtering based on multiple
condition filter(cars, mpg>25&
dim(loan) hp >90)
str(loan)
head(loan,3) #Remove duplicate rows
tail(loan,2) distinct(cars)
summary(loan)
any(is.na(loan)) #Randomly select fraction of rows
sum(is.na(loan)) sample_frac(cars,0.2)

loan_cln <- #Randomly select no. of rows


na.omit(loan) sample_n(cars,5)
nrow(loan)
nrow(loan_cln) #selecting rows by position
slice(cars,11:15)
loan_cln2 <-
loan[complete.cases(loan),] slice_sample(cars,n=5)
nrow(loan_cln2)
temp <-
#Loading the dplyr package
library(dplyr) filter(cars,mpg>25)

#loading slice_sample(temp,n
data
data("mtca =2)
rs") cars <-
mtcars cars %>%
filter(mpg>25) %>%
#dimension of the data
slice_sample(n=2)
dim(cars)
#unique values in a column
#structure of the data unique(cars$cyl)
str(cars)
#no. of values under each unique category
#is.na(cars) #NA or NaN table(cars$cyl)
#checking for missing
#grouping
values any(is.na(cars))
cars %>%
sum(is.na(cars))
group_by(cyl)%
#################### Viewing data ########################
>%
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows


############ Subsetting Columns (variables) #################
tail(cars)
#selecting single column
#viewing data
dplyr::select(cars,mpg)
View(cars)
cars %>%
#summary
dplyr::select(mpg)%>%
summary(cars) head(3)
#slice_sample(n=3)
cars
#selecting multiple columns dplyr::select(mpg,disp,hp,newv
dplyr::select(cars,mpg,cyl,gear) ar2)%>% sample_n(2)
dplyr::select(cars,c("mpg","cyl","ge
ar")) ############ summarizing data
names(cars) #################
#select all columns between a range of columns #Always group_by is used along with summarise. It is applied on categorical value
(inclusive) dplyr::select(cars,hp:am) cars %>% group_by(cyl) %>%
summarize(cnt=n()) #count of unique
cyl values
#combining filter and select- using pipe operator table(cars$cyl)
cars %>%
filter(mpg>18)%>% #computing max, min and standard dev cars %>% group_by(cyl) %>%
dplyr::select(mpg,cyl)%>% summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean(
head(3) mpg),md=median(mpg))

names(cars) #clear workspace


#selecting columns starting with 'd' rm(list=ls())
dplyr::select(cars,starts_with('d'))
loan <- read.csv("loans data.csv") loan <-
#selecting columns ending with 't'
read.table("loans data.csv",header = TRUE,sep
dplyr::select(cars,ends_with('t'))
= ",")
#selecting columns
dim(loan)
containing 'g'
str(loan)
dplyr::select(cars,contains('g
head(loan,3)
')) #selecting columns
matching regular expression tail(loan,2)
dplyr::select(cars,matches('.. summary(loan)
a.')) any(is.na(loan))
sum(is.na(loan))
#Excluding certain columns
select(cars,c(-mpg,-cyl)) loan_cln <-
na.omit(loan)
############ Arranging data nrow(loan)
################# nrow(loan_cln)

#arrange the data in ascending order of mpg loan_cln2 <-


arrange(cars,mpg) loan[complete.cases(loan),]
nrow(loan_cln2)
#arrange the data in descending order of mpg
arrange(cars,desc(mpg)) #Loading the dplyr package
library(dplyr)
#arrange the data in order based on more than
one column arrange(cars, mpg,disp) #loading
data
arrange(cars,mpg,desc(disp)) data("mtca
rs") cars <-
############ Making new variables ################# mtcars

#creating a new column #dimension of the data


mutate(cars,newvar=disp-hp ) dim(cars)

#combining functions
#create a new variable that sum up disp and hp #structure of the data
and filter only str(cars)
#the rows where mpg>25 & #is.na(cars) #NA or NaN
disp>90 #and select only mpg, #checking for missing values
disp, hp, newvar cars %>%
mutate(newvar2=disp+hp)%>% any(is.na(cars))
filter(mpg>25,disp>90)%>% sum(is.na(cars))
#################### Viewing data ########################
#fetching top 6 rows slice_sample(n
head(cars) =2)

#fetching last 6 rows


tail(cars) ############ Subsetting Columns (variables) #################

#viewing data #selecting single column


View(cars) dplyr::select(cars,mpg)

#summary cars %>%


summary(cars) dplyr::select(mpg)%>%
head(3)
cars #slice_sample(n=3)

#tbl_df(cars) #selecting multiple columns


as_tibble(cars) dplyr::select(cars,mpg,cyl,gear)
dplyr::select(cars,c("mpg","cyl","ge
glimpse(cars) ar"))
############ Subsetting Rows (Observations) ################# names(cars)
#filtering based on single #select all columns between a range of columns
condition filter(cars, mpg>25) (inclusive) dplyr::select(cars,hp:am)

#filtering based on multiple


condition filter(cars, mpg>25& #combining filter and select- using pipe operator
hp >90)
cars %>%
#Remove duplicate rows filter(mpg>18)%>%
distinct(cars) dplyr::select(mpg,cyl)%>%
head(3)
#Randomly select fraction of rows names(cars)
sample_frac(cars,0.2)
#selecting columns starting with 'd'
#Randomly select no. of rows dplyr::select(cars,starts_with('d'))
sample_n(cars,5)
#selecting columns ending with 't'
#selecting rows by position dplyr::select(cars,ends_with('t'))
slice(cars,11:15)
#selecting columns containing 'g'
slice_sample(cars,n=5) dplyr::select(cars,contains('g'))

#selecting columns matching regular expression


temp <-
dplyr::select(cars,matches('..a.'))
filter(cars,mpg>25) #Excluding certain columns
select(cars,c(-mpg,-cyl))
slice_sample(temp,n
############ Arranging data
=2) #################

cars %>% #arrange the data in ascending order of mpg


filter(mpg>25) %>% arrange(cars,mpg)
slice_sample(n=2) #arrange the data in descending order of mpg
#unique values in a column arrange(cars,desc(mpg))
unique(cars$cyl) #arrange the data in order based on more than
#no. of values under each unique category one column arrange(cars, mpg,disp)
table(cars$cyl) arrange(cars,mpg,desc(disp))
#grouping
cars %>% ############ Making new variables #################
group_by(cyl)%
>%
#creating a new column M1 <- A1[,,1]
mutate(cars,newvar=disp-hp ) M2 <- A1[,,2]
M3 <- M1+M2
#combining functions
M3
#create a new variable that sum up disp and hp
and filter only M1
#the rows where mpg>25 & #Aggregation on array
disp>90 #and select only mpg, elements apply(M1,1,sum)
disp, hp, newvar cars %>% #1- along row
mutate(newvar2=disp+hp)%>% apply(M2,2,sum) #2 -along
filter(mpg>25,disp>90)%>% column A1
dplyr::select(mpg,disp,hp,newv apply(A1,1,sum)
ar2)%>% sample_n(2) apply(A1,2,mean)

rm(list=ls())
############ summarizing data
################# #To create date / To
#Always group_by is used along with summarise. It is applied on categorical value represent date d <- date()
cars %>% group_by(cyl) %>% d class(d)
summarize(cnt=n()) #count of unique
#as.Date(d)
cyl values
#to convert date string to date class d <-
table(cars$cyl) as.Date("2022-8-25") #default format -year-
month-day class(d)
#computing max, min and standard dev cars %>% group_by(cyl) %>% d as.Date("2022-8-25
summarize(mx_mpg=max(mpg),mi_mpg=min(mpg),std_mpg=sd(mpg),mn=mean( 10:44:22")
mpg),md=median(mpg)) as.Date("2022-8-25
21:15")
rm(list=ls())

#creating array from #to see the internal representation


vectors v1 <- c(1,2,3) unclass(d)
v2 <- c(4,5,6,7,8,9)
#to represent both date and
A1 <- array(c(v1,v2),dim = c(3,3,2)) time as.POSIXct("2022-8-25")
A1 pd <- as.POSIXct("2022-8-25
21:15") pd
#naming columns and rows class(pd)
rname <- c("r1","r2","r3") cname unclass(pd)
<- c("c1","c2","c3") mname <-
pd <- as.POSIXlt("2022-8-
c("mat1","mat2") dimnames(A1)
25") pd
<- list(rname,cname,mname)
class(pd)
A1 <- array(c(v1,v2),dim = c(3,3,2),dimnames = list(rname,cname,mname))
A1 #getting meta using
unclass() unclass(pd)
#printing the second row of second matrix names(unclass(pd))
A1[2,,2]
pd <- as.POSIXlt("2022-8-17
A1["r2",,"mat2"]
21:15:30") pd$sec pd$hour
#printing the second column of first matrix
pd$min pd$mday pd$year
A1[,2,1] unlist(pd)
A1[,"c2","mat1"] #if format is different
#printing the element in the 2nd row and 3rd column of second matrix as.Date("25/8/2022",format="%d/%
A1[2,3,2] m/%Y") date() as.Date("August
25,2022",format="%B %d,%Y")
as.Date("25Aug22",format="%d%b%
#printing the second matrix y")
A1[,,2]
A1[,,"mat2"] #Checking the class
class(as.Date("2022-8-25 21:15"))
#Manipulating array elements class(as.POSIXct("2022-8-25
21:15")) class(as.POSIXlt("2022-8- # dir.create("data")
25 21:15"))
#fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
#download.file(fileurl,destfile = "E:/sweetlin-personal/coursera/data/camera.csv")
#Getting date, time and #list.files("E:/sweetlin-personal/coursera/data")
zone p <- Sys.Date() #only
current date class(p) #dateofdownload <- date()
Sys.time() #current date, time and timezone #dateofdownload
Sys.timezone() rm(list=ls())
#Reading flat file using read.table() loan <-
#difference in dates Sys.Date()-
read.table("loans data.csv",header = TRUE,sep
as.Date("1979-03-21")
= ",") str(loan) head(loan,2)
difftime(Sys.Date(),as.Date("1979-03-
21"),units = "weeks") #getting weekdays and #Reading flat file using read.csv() loan1 <-
basic arithmetic d <- as.Date("2022-8-17") d
#to find weekday of the date read.csv("loans data.csv") str(loan1) df <-
weekdays(d) read.table("tabsepfile.txt",header =

#add or subtract to create new date(s) FALSE,sep = "\t") str(df)


d+1 d+1:5
weekdays(d+ df <- read.table("slashsepfile.txt",header = FALSE,sep="/",strip.white = TRUE,na.strings = "EMPTY")
1:5) str(df)

#check for seq #Reading Excel file


and rep #using #you need to import xlsx package
sequence d #install.packages("xlsx")
dt <- seq(d,by="2 #library(xlsx)
months",length.out = 6) dt #loan <- read.xlsx("loan.xls",sheetIndex=1,
header=TRUE)
#getting month and
quarter months(d) #install.packages("XLConnect")
months(dt) #library(XLConnect)
quarters(dt)
library(readxl)
#lubridate::today() #lubridate package #excel_sheets('E:/sweetlin-official/FALL 2020 -2021/CSE3505/R
#ISOdate(2021,8,25) programs/loans data.xlsx') excel_sheets("loans data.xlsx") df <-
read_excel("loans data.xlsx",sheet="sample")
?strptime
help("strptime") str(df)

datestring<-"August 17, 2022 04:20"


convertedForm<- #XLConnect, XLSX, readxl
strptime(datestring,"%B %d, %Y %H:%M")
class(convertedForm) convertedForm #Reading XML file
#You need to install XML
x <- as.Date("2020-01-01") y <- strptime("25 Aug package
2020 09:00:00", "%d %b %Y %H:%M:%S") x-y install.packages("XML")
class(x) library(XML)
#library(methods)
class(y)
#install.packages("RCurl") #library(RCurl)
x <- as.POSIXlt(x) library(httr) fileurl <-
x-y "https://www.w3schools.com/xml/simpl
e.xml" xmldata <- GET(fileurl) doc <-
#different time zones x <- xmlTreeParse(xmldata,useInternal=TRUE)
as.POSIXct("2021-08-25
08:00:00") x root <-
xmlRoot(doc)
xgmt<-as.POSIXct("2021-08-25 08:00:00", root
tz="GMT") xgmt xmlName(root)
names(root)
xgmt-x
#Accessing parts of xml file in the same
#if(!file.exists("data")) way as list root[[1]] #accessing 1st
food root[[1]][[1]] #accessing name of #view of the data in a table
the 1st food View(loan)

#Extracting parts of XML file- value of all nodes


xmlSApply(root,xmlValue) #fetching top 6 rows
head(loan)
root <- xmlSApply(root,function(x) xmlSApply(x,xmlValue))
#fetching last 6 rows
root tail(loan)
#Extracting individual nodes of #summary of the data
XML file summary(loan)
xpathSApply(root,"//name",xmlVal
ue) ############ Cleaning data
xpathSApply(root,"//price",xmlVal #################
ue) #checking for missing values in the data
any(is.na(loan)) #NA NaN
xml_df <- data.frame(t(root),row.names =
NULL) str(xml_df)
#checking for the total no. of missing values in
the data sum(is.na(loan))
#Reading JSON file #cleaning NA values
#Loading jsonlite loan_clean <-
package na.omit(loan)
library(jsonlite) jdata <-
fromJSON("https://api.github.com/users/jtleek sum(is.na(loan_clean)) str(loan_clean)
/repos") names(jdata) loan_clean1 <- loan[complete.cases(loan),]
class(jdata) str(jdata) #boolean indexing sum(is.na(loan_clean1))
head(jdata,2)
#Extracting nested #imputation - filling the missing values
objects #cleaning Amount.Requested Column
names(jdata$owner) #checking for the total no. of missing values in a particular column
jdata$owner$login
sum(is.na(loan$Amount.Requested)
)

unique(loan$Amount.Requested)
#writing to json file
data(iris) str(iris) #changing to numeric types
head(iris,2) jfile <- loan$Amount.Requested <-
toJSON(iris,pretty = TRUE) as.integer(loan$Amount.Requested)
cat(jfile) str(loan)

#reading json file #unique values in a column


irisdata <- unique(loan$Amount.Requested)
fromJSON(jfile)
mean(loan$Amount.Requested,na.rm = TRUE)
head(irisdata)
median(loan$Amount.Requested,na.rm = TRUE)
#clear workspace
#library(dplyr)
rm(list=ls())
library(tidyverse)
############ Reading data #Decide whether to impute with mean or median loan %>%
################# summarize(avg=mean(Amount.Requested,na.rm =
#using read.table() loan_data <- read.table("loans TRUE),med=median(Amount.Requested,na.rm = TRUE))
data.csv",header = TRUE,sep = ",") loan <- loan <- loan %>%
read.csv("loans data.csv")
mutate(Amount.Requested=replace(Amount.Requested,is.na(Amount.Requested),median(Amount.Req
#dimension of the data uested,na.rm = TRUE)))
dim(loan)
sum(is.na(loan$Amount.Requested)
#structure of the data )
str(loan)
# ---- Rename a column -------------------------------------------------------
# The renamed frame is only printed, not assigned, so `loan` keeps the
# column name Amount.Requested.
loan %>%
  rename(Amt_Req = Amount.Requested)
names(loan)
str(loan)

# ---- Cleaning Amount.Funded.By.Investors column ----------------------------
sum(is.na(loan$Amount.Funded.By.Investors))
unique(loan$Amount.Funded.By.Investors)
str(loan)

loan <- loan %>%
  rename(Amt_fund = Amount.Funded.By.Investors)

# Convert the type to numeric
loan$Amt_fund <- as.numeric(loan$Amt_fund)

# Checking for NA values
sum(is.na(loan$Amt_fund))

# Check whether to impute with mean or median.
# Columns are referenced by bare name inside summarize() so the summary is
# computed on the piped data rather than on the global `loan` object.
loan %>%
  summarize(
    avg = mean(Amt_fund, na.rm = TRUE),
    md = median(Amt_fund, na.rm = TRUE)
  )

loan <- loan %>%
  mutate(
    Amt_fund = replace(Amt_fund, is.na(Amt_fund), median(Amt_fund, na.rm = TRUE))
  )
sum(is.na(loan$Amt_fund))
str(loan)

# ---- Cleaning Interest.Rate column -----------------------------------------
sum(is.na(loan$Interest.Rate))

# Cleaning unwanted substring in a chr column
loan <- loan %>%
  mutate(Interest.Rate = gsub("%", "", Interest.Rate))
head(loan$Interest.Rate, 2)

# Converted once; repeating as.numeric() on an already numeric column
# changes nothing.
loan$Interest.Rate <- as.numeric(loan$Interest.Rate)
head(loan$Interest.Rate, 2)
str(loan)

# ---- Cleaning Loan.Length column -------------------------------------------
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

loan <- loan %>%
  mutate(Loan.Length = gsub(" months", "", Loan.Length))
loan$Loan.Length <- as.integer(loan$Loan.Length)
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# Filtering the rows with NA values.
# Inspect only: assigning this result back to `loan` would keep nothing but
# the NA rows, and the drop_na() below would then leave an empty data frame.
loan %>%
  filter(is.na(Loan.Length))

# Drop the rows with NA values
loan <- loan %>%
  drop_na(Loan.Length)

# Checking
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# ---- Cleaning Employment.Length column -------------------------------------
sum(is.na(loan$Employment.Length))
unique(loan$Employment.Length)

# Strip " year"/" years", "< " and "+" so only the number remains.
# " years?" removes the optional trailing "s" together with the word, so the
# result does not depend on how the regex engine orders alternatives.
loan <- loan %>%
  mutate(Employment.Length = gsub(" years?|< |\\+", "", Employment.Length))
# Anything that is still not a whole number becomes NA (with a warning).
loan$Employment.Length <- as.integer(loan$Employment.Length)

# Checking
unique(loan$Employment.Length)
sum(is.na(loan$Employment.Length))
table(loan$Employment.Length)
mean(table(loan$Employment.Length))

# Impute the missing employment lengths with 2
loan <- loan %>%
  mutate(
    Employment.Length = replace(Employment.Length, is.na(Employment.Length), 2)
  )

# Checking
sum(is.na(loan$Employment.Length))
unique(loan$Employment.Length)

# ---- Cleaning FICO.Range column --------------------------------------------
head(loan$FICO.Range, 2)

# Split the range into two columns; the new names contain a hyphen, so they
# must be quoted with backticks when used with `$`.
loan <- loan %>%
  separate(FICO.Range, c("fico-low", "fico-high"))
str(loan)
names(loan)

loan$`fico-high` <- as.integer(loan$`fico-high`)
loan$`fico-low` <- as.integer(loan$`fico-low`)
str(loan)

sum(is.na(loan$`fico-high`))
sum(is.na(loan$`fico-low`))
unique(loan$`fico-high`)
unique(loan$`fico-low`)

# ---- Statistical analysis - Numerical measure ------------------------------
str(faithful) # faithful - built-in data
head(faithful)

# Central tendency measure
mean(faithful$eruptions)
#median
median(faithful$eruptions) #relative frequency relfreq
<-
#Measure of dispersion Interval_freq/nrow(faithful)
range(faithful$eruptions) old=options(digits = 2)
max(faithful$eruptions)- cbind(Interval_freq,relfreq)
min(faithful$eruptions)
#quartile #cumulative frequency
quantile(faithful$eruption cumfreq <-
s) cumsum(table(interval))
cumfreq cbind(cumfreq)
#Inter-quartile range
IQR(faithful$eruptions) rm(list=ls())
library(help=graphi
#percentile cs)
quantile(faithful$eruptions,c(.27,.3 data("airquality")
5,.65)) str(airquality)
#variance #to set the margin
var(faithful$eruptions) par(mar=c(2,2,2,2))
#standard deviation #1D scatter plot
sd(faithful$eruptions) plot(airquality$Ozone)
#covariance #2D scatter plot
cov(faithful$eruptions,faithful$wai
ting) plot(airquality$Ozone,airquality$W
ind)
#correlation
cor(faithful$eruptions,faithful$wai ?plot
ting)
#type argument in plot
#moment -third central moment plot(airquality$Ozone,type="l")
# the second central moment of a population
#title and axis labels arguments
is its variance library(e1071)
plot(airquality$Ozone,main = "ozone levels",xlab =
moment(faithful$eruptions,3, center = TRUE)
"index",ylab = "ozone")
#skewness
skewness(faithful$eruptions)
#histogram
#kurtosis hist(airquality$Solar.R)
kurtosis(faithful$eruptions)
#boxplot
#frequency summary(airquality$Ozone)
distributio #step1 - boxplot(airquality$Ozone)
find range
range(faithful$erupti #multiple boxplot
ons) boxplot(airquality[,1:4],main="multiple
box plots")
#step2 - Break the range into non-overlapping sub-intervals by defining a sequence of
equal distance break points. breaks <- seq(1.5,5.5,by=0.5) breaks
#pie chart
#step3- Classify the eruption durations according to the half-unit-length sub- unique(airquality$Wind)
intervals with cut. interval <- cut(faithful$eruptions,breaks,right=FALSE) table(airquality$Wind)

#step 4 - Compute the frequency of eruptions in each sub-interval with the table function. wind_freq <- table(airquality$Wind)
Interval_freq = wind_above8 <- wind_freq>8
table(interval) wind_freq wind_above8
Interval_freq wind_above8data <-
cbind(Interval_freq) wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data,radius=1) #scatter plot - multiple variables through both color and shape
par(mar=c(1,1,1,1)) ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
size=1.5)
#grid of charts
par(mfrow=c(2,3),mar=c(2,2,2,1),las=0, bty="n") ggplot(mtcars,aes(x=wt,y=mpg,shape=gear_factor))+geom_point(aes(color=cyl_factor,size=4))+geom_point(color="grey",
plot(airquality$Ozone) size=1.5)
plot(airquality$Ozone,airquality$Wind)
plot(airquality$Ozone,type ='l') #scatter plot- adding best fit line ggplot(mtcars,
barplot(airquality$Ozone, main = 'Ozone levels', ylab aes(x=wt,y=mpg))+geom_point()+geom_smooth(method="
= 'ozone value') hist(airquality$Solar.R) lm")
boxplot(airquality$Ozone)
###########bar plot ########### ggplot(mtcars,
aes(x=gear_factor))+geom_bar() ggplot(mtcars,
#lattice graph aes(x=gear_factor,fill=gear_factor,color="red"))+geom_bar() +ggtitle("frquency
library(lattice) plot of gear")
#density plot #flipping the bar direction ggplot(mtcars,
densityplot(airquality$Ozone) aes(x=gear_factor))+geom_bar()+coord_flip()
#scatter plot matrix #bar plot for 2 variables ggplot(mtcars,
splom(airquality[c(1,3,4)]) aes(x=cyl_factor,fill=gear_factor))+geom_bar(position='sta
ck')
#scatter plot depicting the combination
#################### pie chart ############ ggplot(mtcars,
of 2 variables data("mtcars") df <-
aes(x="",y=mpg,fill=cyl_factor))+geom_bar(width =
mtcars
1,stat='identity')+coord_polar("y",start = 0)
str(df)
par(mar=c(4,4.5,1, #################### histogram ###########
1)) ggplot(mtcars,aes(x=hp))+geom_histogram()+labs(title = "Distribution of
plot(df$wt,df$mpg) hp",y='frequency')

unique(df$cyl) cyl_factor <- factor(df$cyl,levels = #setting bin size ggplot(mtcars,aes(x=hp))+geom_histogram(bins =


c(4,6,8),labels = c("4cyl","6cyl","8cyl")) 3)+labs(title = "Distribution of hp",y='frequency')

unique(df$gear) gear_factor <- factor(df$gear,levels = #setting bin width ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =


c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) 30)+labs(title = "Distribution of hp",y='frequency')

xyplot(df$mpg~df$wt|cyl_factor*gear_factor,main="scatter plots: Cylinders and Gears",xlab = "weight of #with border and fill color ggplot(mtcars,aes(x=hp))+geom_histogram(binwidth =
car",ylab = "miles per gallon") gear_factor 30,color='green',fill='yellow')+labs(title = "Distribution of hp",y='frequency')

freq_gear <- table(gear_factor) freq_gear #facets ggplot(mtcars,aes(x=hp))+geom_histogram(color="white",fill="blue")+labs(title = "Distribution of


barplot(freq_gear,col=c("red","green","blue")) hp",y='frequency')+facet_wrap(cyl_factor,ncol=1)
pie(freq_gear,labels=c("3 gears","4 gears", "5
gears"),col=c("red","green","blue"),radius=1)
################ Kernel density curve ############ ggplot(mtcars,
rm(list=ls()) aes(x=hp))+geom_density()+labs(title="Distribution of hp",x="horse
data("mtcars") power",y='density')

#install.packages("ggplot2") library(ggplot2) #with fill color ggplot(mtcars,


head(mtcars,2) #scatter plot ggplot(data=mtcars, aes(x=hp))+geom_density(fill='blue',color='red')+labs(title="Distribution of
mapping=aes(x=wt,y=mpg))+geom_point() hp",x="horse power",y='density')
unique(mtcars$cyl) cyl_factor <-
############## Line plot ###############
factor(mtcars$cyl,levels = c(4,6,8),labels =
library(dplyr) d <-
c("4cyl","6cyl","8cyl"))
sample_n(mtcars,10)
unique(mtcars$gear) gear_factor <- factor(mtcars$gear,levels = ggplot(d,aes(x=wt,y=drat))+ge
c(3,4,5),labels = c("3 gears","4 gears", "5 gears")) om_line() d

#scatter plot - multiple variables through color #with varied thickness and color points
ggplot(mtcars,aes(x=wt,y=mpg,color=gear_factor))+geom_point() ggplot(d,aes(x=wt,y=drat))+geom_line(aes(size=2,color='red'))+geom_point(aes(size=2,color='blue'))

#scatter plot - multiple variables through size ################# box plot


ggplot(mtcars,aes(x=wt,y=mpg,size=qsec))+geo ################ ggplot(mtcars,
m_point() aes(x=mpg))+geom_boxplot()
#multiple box plots ggplot(mtcars,
aes(x=cyl_factor,y=mpg))+geom_boxplot()

You might also like