Professional Documents
Culture Documents
R Studio
R Studio
R Studio
# ---------------------------------------------------------------------------
# R basics: logical values, classes/types, special numbers, vector
# arithmetic, recycling, matrices, coercion, missing values, data frames.
#
# This section was recovered from a two-column extraction in which the
# statements of both columns were interleaved; it is restored here in
# reading order (left column first, then right column).
# ---------------------------------------------------------------------------

# logical type
lg <- TRUE
p <- TRUE
q <- FALSE
p & q
p | q
!p

# Obtain the class and type of the variable
# NOTE(review): `a` and `cmp` are not defined anywhere in this excerpt --
# presumably they (and a character variable named `str`) are created
# earlier in the notes; confirm before running this part on its own.
class(a)
typeof(a)
class(str)
typeof(str)
class(cmp)
typeof(cmp)
class(lg)
typeof(lg)

# special number Inf representing infinity
1 / 0
1 / Inf
log(0) # find natural log.

# you can represent base value as 2nd argument
log(10, 2) # base 2
log(10, 10) # base 10

# NaN represents an undefined value (also indicates a missing value)
0 / 0

# vector arithmetics
x <- c(1, 3, 5)
y <- c(2, 4, 6)
x + y
x - y
x * y
x / y

help(options)
?options
options(digits = 2)

# recycling rule
# x has length 3 and y has length 5, so x is recycled; R warns because
# 5 is not a multiple of 3.
y <- c(2, 4, 6, 8, 10)
x + y

# create matrices
m <- matrix()
m

m <- matrix(nrow = 3, ncol = 2)
m
attributes(m)
dim(m)

m <- matrix()
m <- matrix(1:6, nrow = 3, ncol = 2) # constructed column-wise
m
m <- matrix(1:6, nrow = 3, ncol = 2, byrow = TRUE) # constructed row-wise
m

# mixing logical and character in c() gives a character vector
y <- c(TRUE, "a") # character
y

# Explicit type coercion
x <- 2.5
class(x)
as.integer(x)
x

x <- -1:5
x
class(x)
as.numeric(x)
as.logical(x)
as.character(x)
as.complex(x)

# Non-sensical coercion results in NAs
x <- c("a", "b", "c")
x
as.numeric(x)
as.logical(x)

# missing values
x <- c(1, 2, NA, 5, NaN, 6)
is.na(x)
is.nan(x)

# Data frame ----------------------------------------------------------------
rm(list = ls())

# table with the same type within a column and different types between
# columns
# defined with a data.frame() function
id <- c(1, 2, 3)
name <- c("a", "b", "c")
marks <- c(50, 0, 25)
sample_df <- data.frame(id, name, marks)
sample_df

my_df <- data.frame(
  id = c(1, 2, 3),
  name = c("Ramu", "Raju", "Ravi"),
  marks = c(50, 40, 25)
)
my_df

# dimension of the data frame
dim(my_df)

# columns of the data frame
names(my_df)

# structure of the data frame
str(my_df)
# ---------------------------------------------------------------------------
# Inspecting the built-in mtcars data and basic dplyr verbs.
#
# Recovered from an interleaved two-column extraction; statements are
# restored in reading order and a duplicated copy of the inspection
# section has been dropped.
# NOTE(review): the pipelines below use %>%, filter(), slice_sample(),
# group_by(), mutate() and select(); dplyr is presumably attached earlier
# in the notes -- no library() call is visible in this excerpt.
# ---------------------------------------------------------------------------

# loading data
data("mtcars")
cars <- mtcars

# dimension of the data
dim(cars)

# structure of the data
str(cars)

# is.na(cars) # NA or NaN
# checking for missing values
any(is.na(cars))
sum(is.na(cars))

#################### Viewing data ########################
# fetching top 6 rows
head(cars)

# NOTE(review): `temp` is not defined in this excerpt; this call looks like
# the tail of an example that starts on an earlier page.
slice_sample(temp, n = 2)

cars %>%
  filter(mpg > 25) %>%
  slice_sample(n = 2)

# unique values in a column
unique(cars$cyl)

# no. of values under each unique category
table(cars$cyl)

# grouping
cars %>%
  group_by(cyl) %>%
  slice_sample(n = 2)

# combining functions
# create a new variable that sum up disp and hp and filter only
# the rows where mpg>25 & disp>90
# and select only mpg, disp, hp, newvar
# NOTE(review): the extracted text stops after filter() with a dangling
# pipe; the select() step is reconstructed from the comment above (using
# the column name actually created, `newvar2`) -- confirm against the
# original notes.
cars %>%
  mutate(newvar2 = disp + hp) %>%
  filter(mpg > 25, disp > 90) %>%
  select(mpg, disp, hp, newvar2)
############ summarizing data #################
# Recovered from an interleaved two-column extraction; the grouped
# summaries (left column) come first, then the date examples (right
# column).
# NOTE(review): needs dplyr attached and `cars` (a copy of mtcars) from the
# previous section.

# Always group_by is used along with summarise. It is applied on
# categorical value
cars %>%
  group_by(cyl) %>%
  summarize(cnt = n()) # count of unique cyl values

table(cars$cyl)

# computing max, min and standard dev
cars %>%
  group_by(cyl) %>%
  summarize(
    mx_mpg = max(mpg),
    mi_mpg = min(mpg),
    std_mpg = sd(mpg),
    mn = mean(mpg),
    md = median(mpg)
  )

# Clearing the workspace is placed here, before the date examples: the
# summaries above still need `cars`.
rm(list = ls())

# To create date / To represent date
d <- date()
d
class(d)
# as.Date(d)

# to convert date string to date class
d <- as.Date("2022-8-25") # default format -year-month-day
class(d)
d

# Strings that carry a time of day: as.Date() ignores the trailing text
# after the date part.
as.Date("2022-8-25 10:44:22")
as.Date("2022-8-25 21:15")
# ---------------------------------------------------------------------------
# Writing a data frame to JSON, cleaning columns of the `loan` data, and
# the start of the numerical-measures section.
#
# Recovered from an interleaved multi-column extraction; statements are
# regrouped by topic in their most plausible reading order.
# ---------------------------------------------------------------------------

rm(list = ls())

# writing to json file
# NOTE(review): toJSON() is not a base R function; the `pretty = TRUE`
# argument suggests jsonlite, presumably attached earlier -- confirm.
data(iris)
str(iris)
head(iris, 2)
jfile <- toJSON(iris, pretty = TRUE)
cat(jfile)

# NOTE(review): `loan` is not created in this excerpt (presumably read from
# a file on an earlier page); dplyr must be attached for %>%, rename() and
# mutate().
unique(loan$Amount.Requested)

# changing to numeric types
loan$Amount.Requested <- as.integer(loan$Amount.Requested)
str(loan)

unique(loan$Amount.Funded.By.Investors)
str(loan)
loan <- loan %>%
  rename(Amt_fund = Amount.Funded.By.Investors)

# convert the type to numeric
loan$Amt_fund <- as.numeric(loan$Amt_fund)

# cleaning Loan.Length column
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# strip the " months" suffix so the remaining text can be read as a number
loan <- loan %>%
  mutate(Loan.Length = gsub(" months", "", Loan.Length))

loan$Loan.Length <- as.integer(loan$Loan.Length)

sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# filtering the rows with NA values
# NOTE(review): the statement that belongs under this heading is missing
# from the extracted text; restore it from the original notes.

# checking
sum(is.na(loan$Loan.Length))
unique(loan$Loan.Length)

# cleaning Employment.Length column
sum(is.na(loan$Employment.Length))

loan$`fico-high` <- as.integer(loan$`fico-high`)
loan$`fico-low` <- as.integer(loan$`fico-low`)
str(loan)

sum(is.na(loan$`fico-high`))
sum(is.na(loan$`fico-low`))
unique(loan$`fico-high`)
unique(loan$`fico-low`)

# statistical analysis - Numerical measure
str(faithful) # faithful - built-in data
head(faithful)

# Central tendency measure
mean(faithful$eruptions)

# median
median(faithful$eruptions)

# ---------------------------------------------------------------------------
# Numerical measures and frequency distribution of faithful$eruptions,
# followed by base-graphics examples on airquality.
#
# Recovered from an interleaved two-column extraction; statements are
# restored in dependency order (the frequency table is built before the
# relative / cumulative frequencies that use it).
# ---------------------------------------------------------------------------

# Measure of dispersion
range(faithful$eruptions)
max(faithful$eruptions) - min(faithful$eruptions)

# quartile
quantile(faithful$eruptions)

# Inter-quartile range
IQR(faithful$eruptions)

# percentile
quantile(faithful$eruptions, c(.27, .35, .65))

# variance
var(faithful$eruptions)

# standard deviation
sd(faithful$eruptions)

# covariance
cov(faithful$eruptions, faithful$waiting)

# correlation
cor(faithful$eruptions, faithful$waiting)

# moment - third central moment
# the second central moment of a population is its variance
library(e1071)
moment(faithful$eruptions, 3, center = TRUE)

# skewness
skewness(faithful$eruptions)

# kurtosis
kurtosis(faithful$eruptions)

# frequency distribution
# step1 - find range
range(faithful$eruptions)

# step2 - Break the range into non-overlapping sub-intervals by defining a
# sequence of equal distance break points.
breaks <- seq(1.5, 5.5, by = 0.5)
breaks

# step3 - Classify the eruption durations according to the half-unit-length
# sub-intervals with cut.
interval <- cut(faithful$eruptions, breaks, right = FALSE)

# step 4 - Compute the frequency of eruptions in each sub-interval with the
# table function.
Interval_freq <- table(interval)
Interval_freq
cbind(Interval_freq)

# relative frequency
relfreq <- Interval_freq / nrow(faithful)
# print with 2 significant digits; keep the previous setting so it can be
# put back once the table has been shown
old <- options(digits = 2)
cbind(Interval_freq, relfreq)
options(old)

# cumulative frequency
cumfreq <- cumsum(table(interval))
cumfreq
cbind(cumfreq)

rm(list = ls())

library(help = graphics)
data("airquality")
str(airquality)

# to set the margin
par(mar = c(2, 2, 2, 2))

# 1D scatter plot
plot(airquality$Ozone)

# 2D scatter plot
plot(airquality$Ozone, airquality$Wind)

?plot

# type argument in plot
plot(airquality$Ozone, type = "l")

# title and axis labels arguments
plot(
  airquality$Ozone,
  main = "ozone levels",
  xlab = "index",
  ylab = "ozone"
)

# histogram
hist(airquality$Solar.R)

# boxplot
summary(airquality$Ozone)
boxplot(airquality$Ozone)

# multiple boxplot
boxplot(airquality[, 1:4], main = "multiple box plots")

# pie chart
unique(airquality$Wind)
table(airquality$Wind)
wind_freq <- table(airquality$Wind)
# logical mask: wind values that occur more than 8 times
wind_above8 <- wind_freq > 8
wind_freq
wind_above8
wind_above8data <- wind_freq[wind_above8]
wind_above8data
table(wind_above8)
pie(wind_above8data, radius = 1)
par(mar = c(1, 1, 1, 1))

# ---------------------------------------------------------------------------
# Grid of base charts, lattice graphs and ggplot2 charts.
#
# Recovered from an interleaved two-column extraction; statements are
# restored in reading order and one verbatim-duplicated ggplot call has
# been dropped.
# NOTE(review): `cyl_factor`, `gear_factor` and `d` are not defined in this
# excerpt, and no library(ggplot2) call is visible -- presumably both come
# from a page that is not part of this text; confirm before running.
# ---------------------------------------------------------------------------

# grid of charts
par(mfrow = c(2, 3), mar = c(2, 2, 2, 1), las = 0, bty = "n")
plot(airquality$Ozone)
plot(airquality$Ozone, airquality$Wind)
plot(airquality$Ozone, type = 'l')
barplot(airquality$Ozone, main = 'Ozone levels', ylab = 'ozone value')
hist(airquality$Solar.R)
boxplot(airquality$Ozone)

# lattice graph
library(lattice)

# density plot
densityplot(airquality$Ozone)

# scatter plot matrix
splom(airquality[c(1, 3, 4)])

# scatter plot depicting the combination of 2 variables
data("mtcars")
df <- mtcars
str(df)
par(mar = c(4, 4.5, 1, 1))
plot(df$wt, df$mpg)
xyplot(
  df$mpg ~ df$wt | cyl_factor * gear_factor,
  main = "scatter plots: Cylinders and Gears",
  xlab = "weight of car",
  ylab = "miles per gallon"
)
gear_factor

# scatter plot - multiple variables through color
ggplot(mtcars, aes(x = wt, y = mpg, color = gear_factor)) +
  geom_point()

# scatter plot - multiple variables through both color and shape
# Constant sizes are set outside aes(); inside aes() a constant is treated
# as data and mapped through a scale instead of being used as-is.
ggplot(mtcars, aes(x = wt, y = mpg, shape = gear_factor)) +
  geom_point(aes(color = cyl_factor), size = 4) +
  geom_point(color = "grey", size = 1.5)

# scatter plot - adding best fit line
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

########### bar plot ###########
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar()

# the red outline is a fixed setting, so it goes on the layer, not in aes()
ggplot(mtcars, aes(x = gear_factor, fill = gear_factor)) +
  geom_bar(color = "red") +
  ggtitle("frquency plot of gear")

# flipping the bar direction
ggplot(mtcars, aes(x = gear_factor)) +
  geom_bar() +
  coord_flip()

# bar plot for 2 variables
ggplot(mtcars, aes(x = cyl_factor, fill = gear_factor)) +
  geom_bar(position = 'stack')

#################### pie chart ############
ggplot(mtcars, aes(x = "", y = mpg, fill = cyl_factor)) +
  geom_bar(width = 1, stat = 'identity') +
  coord_polar("y", start = 0)

#################### histogram ###########
ggplot(mtcars, aes(x = hp)) +
  geom_histogram() +
  labs(title = "Distribution of hp", y = 'frequency')

# with border and fill color
ggplot(mtcars, aes(x = hp)) +
  geom_histogram(binwidth = 30, color = 'green', fill = 'yellow') +
  labs(title = "Distribution of hp", y = 'frequency')

# with varied thickness and color points
# fixed size/colour are set on each layer so the line is actually red and
# the points actually blue
ggplot(d, aes(x = wt, y = drat)) +
  geom_line(size = 2, color = 'red') +
  geom_point(size = 2, color = 'blue')