Professional Documents
Culture Documents
Basic of R 2
Basic of R 2
About R
• Open source software exclusively meant for statistics, contains > 18K packages
• Domain specific packages - Finance [ts, timeseries, quantmod, TTR etc]
• – Questionnaire based [psych] | SEM [lavaan, semTools, semPlot] etc.
• High level of reliability in terms of statistics output
• Great community support
• Everything is an object in R
• Assignment, written LHS <- RHS, is the basic operation in R
• Own Functions, interface with other software
airquality [1:5,] # built-in R dataset for practice; it is stored as a data frame, which looks like a spreadsheet
## [1] "data.frame"
• Before starting the data analysis (DA), it is better to understand the data and its types using functions such as
head, tail, str, dim and names.
mt <- mtcars
head(mt) # display first 6 rows by default
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
tail(mt,7) # display last 7 rows (the default is 6)
##                 mpg cyl  disp  hp drat    wt qsec vs am gear carb
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.9  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.7  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.9  1  1    5    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.5  0  1    5    4
## Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.5  0  1    5    6
## Maserati Bora  15.0   8 301.0 335 3.54 3.570 14.6  0  1    5    8
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.6  1  1    4    2
str(mt) # structure of the data set
dim(mt) # dimensions of the data set: number of rows and columns
## [1] 32 11
summary(mt) # basic statistics except sd
2
#       mpg             cyl             disp             hp
#  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0
#  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5
#  Median :19.20   Median :6.000   Median :196.3   Median :123.0
#  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7
#  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0
#  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0
#       drat             wt             qsec             vs
#  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000
#  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000
#  Median :3.695   Median :3.325   Median :17.71   Median :0.0000
#  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375
#  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000
#  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000
#        am              gear            carb
#  Min.   :0.0000   Min.   :3.000   Min.   :1.000
#  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000
#  Median :0.0000   Median :4.000   Median :2.000
#  Mean   :0.4062   Mean   :3.688   Mean   :2.812
#  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000
#  Max.   :1.0000   Max.   :5.000   Max.   :8.000
names(mt) #display only the var. names of data set
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
• Data understanding
• Data Exploration - using graphs and summaries
• Data Preparation - recode, merge, sort and aggregate etc.,
• Data Analysis - Univariate, Bivariate and Multivariate techniques
• Data Reporting - Tables, Diagrams, Charts and Graphs
Subsetting
• Selecting required rows or col
• splitting the file
• Keep or drop variables and cases
3
• filtering, subset etc.
• Single brackets index as [Row, Col]; double brackets "[[" extract the selected element itself
mt [1:5 ,1:5] #first 5 rows and 5 col
##                    mpg cyl disp  hp drat
## Mazda RX4         21.0   6  160 110 3.90
## Mazda RX4 Wag     21.0   6  160 110 3.90
## Datsun 710        22.8   4  108  93 3.85
## Hornet 4 Drive    21.4   6  258 110 3.08
## Hornet Sportabout 18.7   8  360 175 3.15
mt[c(1,5,7,9),c(1,3,5)] # choosing selected rows and col using c function
##                    mpg  disp drat
## Mazda RX4         21.0 160.0 3.90
## Hornet Sportabout 18.7 360.0 3.15
## Duster 360        14.3 360.0 3.21
## Merc 230          22.8 140.8 3.92
mt[c(1,5,7,9),-c(1,3,5)] # keep selected rows but drop selected col using c function and " - "
##                   cyl  hp   wt  qsec vs am gear carb
## Mazda RX4           6 110 2.62 16.46  0  1    4    4
## Hornet Sportabout   8 175 3.44 17.02  0  0    3    2
## Duster 360          8 245 3.57 15.84  0  0    3    4
## Merc 230            4  95 3.15 22.90  1  0    4    2
subset(mt,cyl==4) # filtering based on 1 variable
4
# 5 .1 7 70 65 1
# Fiat X1-9 27. 4 79 6 4. 1.9 18.9 1 1 4 1
# 3 .0 6 08 35 0
# Porsche 914-2 26. 4 120 9 4. 2.1 16.7 0 1 5 2
# 0 .3 1 43 40 0
# Lotus Europa 30. 4 95 1 3. 1.5 16.9 1 1 5 2
# 4 .1 1 77 13 0
3
# Volvo 142E 21. 4 121 1 4. 2.7 18.6 1 1 4 2
# 4 .0 0 11 80 0
9
subset(mt,cyl==6 & am == 0) # filtering based on 2 variable using AND condition
5
# Duster 360 14 8 360 24 3.2 3.57 15.8 0 0 3 4
# .3 .0 5 1 0 4
# Merc 240D 24 4 146 62 3.6 3.19 20.0 1 0 4 2
# .4 .7 9 0 0
# Merc 230 22 4 140 95 3.9 3.15 22.9 1 0 4 2
# .8 .8 2 0 0
# Merc 280 19 6 167 12 3.9 3.44 18.3 1 0 4 4
# .2 .6 3 2 0 0
# Merc 280C 17 6 167 12 3.9 3.44 18.9 1 0 4 4
# .8 .6 3 2 0 0
# Merc 450SE 16 8 275 18 3.0 4.07 17.4 0 0 3 3
# .4 .8 0 7 0 0
# Merc 450SL 17 8 275 18 3.0 3.73 17.6 0 0 3 3
# .3 .8 0 7 0 0
# Merc 450SLC 15 8 275 18 3.0 3.78 18.0 0 0 3 3
# .2 .8 0 7 0 0
# Cadillac Fleetwood 10 8 472 20 2.9 5.25 17.9 0 0 3 4
# .4 .0 5 3 0 8
# Lincoln 10 8 460 21 3.0 5.42 17.8 0 0 3 4
# Continental .4 .0 5 0 4 2
# Chrysler Imperial 14 8 440 23 3.2 5.34 17.4 0 0 3 4
# .7 .0 0 3 5 2
# Toyota Corona 21 4 120 97 3.7 2.46 20.0 1 0 3 1
# .5 .1 0 5 1
# Dodge Challenger 15 8 318 15 2.7 3.52 16.8 0 0 3 2
# .5 .0 0 6 0 7
# AMC Javelin 15 8 304 15 3.1 3.43 17.3 0 0 3 2
# .2 .0 0 5 5 0
# Camaro Z28 13 8 350 24 3.7 3.84 15.4 0 0 3 4
# .3 .0 5 3 0 1
# Pontiac Firebird 19 8 400 17 3.0 3.84 17.0 0 0 3 2
# .2 .0 5 8 5 5
# Ferrari Dino 19 6 145 17 3.6 2.77 15.5 0 1 5 6
# .7 .0 5 2 0 0
splitfile <- split(mt,mt$cyl) # split into file based on cylinder
splitfile # displaying on required filter
# $`4`
#
#                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
# Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
# Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
# Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
# Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
# Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
# Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
# Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
# Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
# Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
# Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
# Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
6
#
#
# $`6`
#
#                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
# Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
# Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
# Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
# Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
# Merc 280       19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
# Merc 280C      17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
# Ferrari Dino   19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
#
#
# $`8`
#
#                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
# Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
# Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
# Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
# Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
# Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
# Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
# Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
# Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
7
# Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
# AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
# Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
# Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
# Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
# Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
8
# 3
]
## [5] "DailyRate" "Department"
## [7] "DistanceFromHome" "Education"
## [9] "EducationField" "EmployeeCount"
## [11] "EmployeeNumber" "EnvironmentSatisfaction"
## [13] "Gender" "HourlyRate"
## [15] "JobInvolvement" "JobLevel"
## [17] "JobRole" "JobSatisfaction"
## [19] "MaritalStatus" "MonthlyIncome"
## [21] "MonthlyRate" "NumCompaniesWorked"
## [23] "Over18" "OverTime"
## [25] "PercentSalaryHike" "PerformanceRating"
## [27] "RelationshipSatisfaction" "StandardHours"
9
## [29] "StockOptionLevel" "TotalWorkingYears"
## [31] "TrainingTimesLastYear" "WorkLifeBalance"
## [33] "YearsAtCompany" "YearsInCurrentRole"
## [35] "YearsSinceLastPromotion" "YearsWithCurrManager"
table(hrat$Attrition) # to compute count for 1 var
##
## No Yes
## 1233 237
table(hrat$Attrition,hrat$Department) # to compute count for 2 var
##
## Human Resources Research & Development Sales
# No 51 828 354
#
# Yes 12 133 92
#
table(hrat$Attrition,hrat$Department,hrat$Gender) # crosstabs by dim.
## , , = Female
##
##
## Human Resources Research & Development Sales
## No 14 336 151
## Yes 6 43 38
##
## , , = Male
##
##
## Human Resources Research & Development Sales
## No 37 492 203
## Yes 6 90 54
tb1 <- table(hrat$Attrition)
prop.table(tb1) # overall proportions (multiply by 100 for percent)
##
## No Yes
## 0.8387755 0.1612245
10