Professional Documents
Culture Documents
R Workshop STIS
R Workshop STIS
R Workshop STIS
Setia Pramana
May 23, 2014
What is R?
R Installation
3
>
>
>
>
>
>
[1] 7
>
>
>
>
>
>
>
## vector ##
vc1 <- c(2,5,5,3,3,6,2,3,5,6)
# we can also create an object using assign function :#
assign ("vc2",seq(from=1, to=100, by=10))
vc1
1
[1] 2 5 5 3 3 6 2 3 5 6
> vc2
[1]
1 11 21 31 41 51 61 71 81 91
> length(vc2)
[1] 10
> ## vector multiplication #
>
> vc1*vc2
[1]   2  55 105  93 123 306 122 213 405 546
[1] High
Medium Low
High
Medium Low
High
Medium Low
High
[11] Medium Low
High
Medium Low
High
Medium Low
High
Medium
[21] Low
High
Medium Low
High
Medium Low
High
Medium Low
Levels: High Low Medium
> grade <- rep(c("Grade1","Grade2","Grade3", "Grade4"),each=5)
> grade <-factor(grade)
> grade
[1] Grade1 Grade1 Grade1 Grade1 Grade1 Grade2 Grade2 Grade2 Grade2 Grade2
[11] Grade3 Grade3 Grade3 Grade3 Grade3 Grade4 Grade4 Grade4 Grade4 Grade4
Levels: Grade1 Grade2 Grade3 Grade4
> ## Matrix ##
>
> mat <- matrix(c(2,3,1,5,4,5,6,7,2,3,1,5,4,5,6,7),nrow=4,ncol=4)
> mat
[1,]
[2,]
[3,]
[4,]
[1,]
[2,]
[3,]
[4,]
> dim(mat2)
[1] 4 5
> ## column binding ##
> cbind(mat2,mat)
[1,]
[2,]
[3,]
[4,]
[1,]
[2,]
[3,]
[4,]
[5,]
> ## Transpose #
>
> t(mat2)
[1,]
[2,]
[3,]
[4,]
[5,]
[1,]
[2,]
[3,]
[4,]
> diag(mat)
[1] 2 5 1 7
>
>
>
>
## Matrix multiplication #
m1 <- matrix(c(6,2,4,5), 2,2)
m2 <- matrix(c(2,4,1,2), 2,2)
m1
     [,1] [,2]
[1,]    6    4
[2,]    2    5
> m2
     [,1] [,2]
[1,]    2    1
[2,]    4    2
> m1*m2
     [,1] [,2]
[1,]   12    4
[2,]    8   10
> m1%*%m2
     [,1] [,2]
[1,]   28   14
[2,]   24   12
> ## List ##
>
> myList <- list(vc1, vc2, 5,6,"seven", mat,mat2)
> myList
[[1]]
[1] 2 5 5 3 3 6 2 3 5 6
[[2]]
[1] 1 11 21 31 41 51 61 71 81 91
[[3]]
[1] 5
[[4]]
[1] 6
[[5]]
[1] "seven"
[[6]]
     [,1] [,2] [,3] [,4]
[1,]    2    4    2    4
[2,]    3    5    3    5
[3,]    1    6    1    6
[4,]    5    7    5    7
[[7]]
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6
> class (myList)
[1] "list"
> ## Data Frame ##
>
> Data1 <- data.frame( X=c( vc1, vc2), grade, sex=rep(c("male","female"),each=10))
> head(Data1)
1
2
3
4
5
6
X
2
5
5
3
3
6
grade
Grade1
Grade1
Grade1
Grade1
Grade1
Grade2
sex
male
male
male
male
male
male
> tail(Data1)
15
16
17
18
19
20
X
41
51
61
71
81
91
grade
Grade3
Grade4
Grade4
Grade4
Grade4
Grade4
sex
female
female
female
female
female
female
5
1
2
3
4
5
6
7
8
9
10
X
2
5
5
3
3
6
2
3
5
6
grade
Grade1
Grade1
Grade1
Grade1
Grade1
Grade2
Grade2
Grade2
Grade2
Grade2
sex
male
male
male
male
male
male
male
male
male
male
## data Extraction ##
vc2[3:10]
vc2>10
vc3 <- vc2[vc2>10]
vc3
mat2[2:3,3:5]
mat2[2:3,3:5]*3
mat2 [,1]
mat2 [,-1]
Data1 $sex
Data1 [1:4,-1]
myList[[1]]
myList[1:2]
Summary Statistics
Median
46.0
grade
Grade1:5
Grade2:5
Grade3:5
Grade4:5
sex
female:10
male :10
> mean(vc2)
[1] 46
> var(vc1)
[1] 2.444444
> length(vc2)
[1] 10
> tapply(Data1$X,Data1$grade,mean)
Grade1 Grade2 Grade3 Grade4
3.6
4.4
21.0
71.0
> table(Data1 $sex)
female
10
male
10
female
male
> ##
> apply(mat2, 1, mean)
[1] 26.0 31.8 18.4 23.8
7
Max.
91.0
1.500000
1.825742
21
56
86 NaN NaN
> rowMeans(mat2)
[1] 26.0 31.8 18.4 23.8
> colMeans(mat2)
[1] 16.00 56.00 44.75  4.25  4.00
16
25
36
49
64
81 100
>
> ## Delete all objects ##
>
> #rm(list=ls())
>
>
4.1
Read Data
5.1
5.2
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
feed
horsebean
horsebean
horsebean
horsebean
horsebean
horsebean
> summary(chickwts)
weight
Min.
:108.0
casein
feed
:12
1st Qu.:204.5
Median :258.0
Mean
:261.3
3rd Qu.:323.5
Max.
:423.0
>
>
>
>
>
>
>
horsebean:10
linseed :12
meatmeal :11
soybean :14
sunflower:12
5.3
1
2
3
4
5
6
X weight
feed
1
179 horsebean
2
160 horsebean
3
136 horsebean
4
227 horsebean
5
217 horsebean
6
168 horsebean
1
2
3
4
5
6
weight
179
160
136
227
217
168
feed
horsebean
horsebean
horsebean
horsebean
horsebean
horsebean
>
>
>
>
require(foreign)
# SPSS files
dat.spss <- read.spss("hsb2.sav",to.data.frame=TRUE)
head(dat.spss)
10
1
2
3
4
5
6
ID FEMALE RACE
SES SCHTYP
PROG READ WRITE MATH SCIENCE SOCST
70
male white
low public general
57
52
41
47
57
121 female white middle public vocation
68
59
53
63
61
86
male white
high public general
44
33
54
58
31
141
male white
high public vocation
63
44
47
53
56
172
male white middle public academic
47
52
57
53
61
113
male white middle public academic
44
52
51
63
61
1
2
3
4
5
6
id female race
ses schtyp
prog read write math science socst
70
male white
low public general
57
52
41
47
57
121 female white middle public vocation
68
59
53
63
61
86
male white
high public general
44
33
54
58
31
141
male white
high public vocation
63
44
47
53
56
172
male white middle public academic
47
52
57
53
61
113
male white middle public academic
44
52
51
63
61
1
2
3
4
5
6
admit
0
1
1
1
0
1
gre
380
660
800
640
520
760
gpa rank
3.61
3
3.67
3
4.00
1
3.19
4
2.93
4
3.00
2
>
>
Graphics
11
15
10
0
Frequency
20
Histogram of rnorm(100, 0, 1)
rnorm(100, 0, 1)
>
>
>
>
>
>
## Scatter Plot ##
n <- 1000
x1 <- matrix(rnorm(n, mean=0, sd=2),ncol = 2)
x2 <- matrix(rnorm(n, mean = 4, sd = 1.5), ncol = 2)
dtx <- rbind(x1, x2)
plot(dtx)
12
5
5
dtx[,2]
dtx[,1]
13
5
5
dtx[,2]
dtx[,1]
14
5
5
dtx[,2]
dtx[,1]
15
5
5
dtx[,2]
dtx[,1]
16
2
0
2
0
2
4
5
0
5
15
Series 3
25
1960
1970
1980
1990
2000
Time
> plot(density(chik2$weight))
17
2010
Series 2
6 4 2
Series 1
0.000
0.001
0.002
Density
0.003
0.004
density.default(x = chik2$weight)
100
200
300
400
500
N = 71 Bandwidth = 29.96
> stem(chik2$weight)
The decimal point is 2 digit(s) to the right of the |
1
1
2
2
3
3
4
|
|
|
|
|
|
|
124444
5566777889
00112223333444
5556666667778
0001222222333334444
5678899
02
18
400
350
300
250
200
150
100
casein
linseed
soybean
19
14
12
10
8
Frequency
6
4
2
0
casein
linseed
soybean
feed
20
7
7.1
7.2
t-test
>
> pairwise.t.test(chik2$weight, chik2$feed, p.adj="bonferroni", paired=F)
Pairwise comparisons using t tests with pooled SD
data:
horsebean
linseed
meatmeal
soybean
sunflower
casein
3.1e-08
0.00022
0.68350
0.00998
1.00000
horsebean
0.22833
0.00011
0.00487
1.2e-08
linseed
0.20218
1.00000
9.3e-05
meatmeal
1.00000
0.39653
250
300
weight
350
400
>
>
casein
linseed
soybean
feed
7.3
>
>
22
soybean
0.00447
1
2
3
4
5
6
>
weight
179
160
136
227
217
168
feed
horsebean
horsebean
horsebean
horsebean
horsebean
horsebean
summary(chickwts)
weight
Min.
:108.0
1st Qu.:204.5
Median :258.0
Mean
:261.3
3rd Qu.:323.5
Max.
:423.0
feed
casein
:12
horsebean:10
linseed :12
meatmeal :11
soybean :14
sunflower:12
diff
lwr
upr
p adj
horsebean-casein
-163.383333 -232.346876 -94.41979 0.0000000
linseed-casein
-104.833333 -170.587491 -39.07918 0.0002100
meatmeal-casein
-46.674242 -113.906207 20.55772 0.3324584
soybean-casein
-77.154762 -140.517054 -13.79247 0.0083653
sunflower-casein
5.333333 -60.420825 71.08749 0.9998902
linseed-horsebean
58.550000 -10.413543 127.51354 0.1413329
meatmeal-horsebean
116.709091
46.335105 187.08308 0.0001062
soybean-horsebean
86.228571
19.541684 152.91546 0.0042167
sunflower-horsebean 168.716667
99.753124 237.68021 0.0000000
meatmeal-linseed
58.159091
-9.072873 125.39106 0.1276965
soybean-linseed
27.678571 -35.683721 91.04086 0.7932853
sunflower-linseed
110.166667
44.412509 175.92082 0.0000884
soybean-meatmeal
-30.480519 -95.375109 34.41407 0.7391356
sunflower-meatmeal
52.007576 -15.224388 119.23954 0.2206962
sunflower-soybean
82.488095
19.125803 145.85039 0.0038845
> pairwise.t.test(chickwts$weight, chickwts$feed, p.adj="bonferroni", paired=F)
Pairwise comparisons using t tests with pooled SD
data:
horsebean
linseed
meatmeal
soybean
sunflower
casein
3.1e-08
0.00022
0.68350
0.00998
1.00000
horsebean
0.22833
0.00011
0.00487
1.2e-08
linseed
0.20218
1.00000
9.3e-05
meatmeal
1.00000
0.39653
24
soybean
0.00447
200
400
Berat
600
800
1000
casein
linseed
soybean
Pakan
>
>
>
>
>
## Two-way ANOVA
hsb2<-read.table("http://www.ats.ucla.edu/stat/data/hsb2.csv", sep=",", header=T)
attach(hsb2)
tapply(write, ses, mean)
1
2
3
50.61702 51.92632 55.91379
> tapply(write, ses, sd)
1
2
3
9.490391 9.106044 9.442874
> anova2 <- aov(write ~ ses + female)
> summary(anova2)
             Df Sum Sq Mean Sq F value   Pr(>F)
ses           1    770   769.8   9.683  0.00214 **
female        1   1449  1448.8  18.225 3.05e-05 ***
Residuals   197  15660    79.5
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
25
>
>
7.4
>
>
data(state)
head(state.x77)
Alabama
Alaska
Arizona
Arkansas
California
Colorado
>
>
>
# Correlation
cor(state.x77)
Population
Income Illiteracy
Life Exp
Population 1.00000000 0.2082276 0.10762237 -0.06805195
Income
0.20822756 1.0000000 -0.43707519 0.34025534
Illiteracy 0.10762237 -0.4370752 1.00000000 -0.58847793
Life Exp
-0.06805195 0.3402553 -0.58847793 1.00000000
Murder
0.34364275 -0.2300776 0.70297520 -0.78084575
HS Grad
-0.09848975 0.6199323 -0.65718861 0.58221620
Frost
-0.33215245 0.2262822 -0.67194697 0.26206801
Area
0.02254384 0.3633154 0.07726113 -0.10733194
HS Grad
Frost
Area
Population -0.09848975 -0.3321525 0.02254384
Income
0.61993232 0.2262822 0.36331544
Illiteracy -0.65718861 -0.6719470 0.07726113
Life Exp
0.58221620 0.2620680 -0.10733194
Murder
-0.48797102 -0.5388834 0.22839021
HS Grad
1.00000000 0.3667797 0.33354187
Frost
0.36677970 1.0000000 0.05922910
Area
0.33354187 0.0592291 1.00000000
>
>
pairs(state.x77[,2:6])
26
Murder
0.3436428
-0.2300776
0.7029752
-0.7808458
1.0000000
-0.4879710
-0.5388834
0.2283902
1.5
2.5
10 14
4500
6000
0.5
2.5
3000
Income
72
0.5
1.5
Illiteracy
10 14
68
70
Life Exp
40
HS Grad
50
60
Murder
3000
>
>
>
>
>
>
>
>
>
>
+
>
4500
6000
68
70
72
40
50
60
Call:
lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder +
HS.Grad + Frost + Area, data = st)
Residuals:
Min
1Q
Median
-1.48895 -0.51232 -0.02747
3Q
0.57002
Coefficients:
27
Max
1.49447
Call:
lm(formula = Life.Exp ~ Murder + HS.Grad + Frost, data = st)
Residuals:
Min
1Q
-1.5015 -0.5391
Median
0.1014
3Q
0.5921
Max
1.2268
Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 71.036379   0.983262  72.246  < 2e-16 ***
Murder      -0.283065   0.036731  -7.706 8.04e-10 ***
HS.Grad      0.049949   0.015201   3.286  0.00195 **
Frost       -0.006912   0.002447  -2.824  0.00699 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7427 on 46 degrees of freedom
Multiple R-squared: 0.7127,
Adjusted R-squared: 0.6939
F-statistic: 38.03 on 3 and 46 DF, p-value: 1.634e-12
>
>
## Prediction
predict(model2, list(Murder=10, HS.Grad=50, Frost=90))
1
70.08111
>
>
28
> require(car)
> outlierTest(model2) # Bonferonni p-value for most extreme obs
No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
rstudent unadjusted p-value Bonferonni p
Maine -2.17235
0.035136
NA
> qqPlot(model2, main="QQ Plot") #qq plot for studentized resid
>
1
0
1
2
Studentized Residuals(model2)
QQ Plot
t Quantiles
29
1.5
0.5
1.5
0.5
Life.Exp | others
1
0
1
2
3
Life.Exp | others
Leverage Plots
0.6
0.2
0.6
HS.Grad | others
1
0
1
Life.Exp | others
Murder | others
0.2
0.5
0.0
0.5
Frost | others
>
>
>
>
>
>
# Influential Observations
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(st)-length(model2$coefficients)-2))
plot(model2, which=4, cook.levels=cutoff)
30
Cook's distance
0.15
Washington
0.10
Nevada
0.00
0.05
Cook's distance
0.20
Hawaii
10
20
30
40
50
Obs. number
lm(Life.Exp ~ Murder + HS.Grad + Frost)
31
0
1
2
Studentized Residuals
Influence Plot
0.05
0.10
0.15
0.20
0.25
HatValues
Circle size is proportional to Cook's Distance
p = 0.8629874
-1.68051
32
Murder HS.Grad
Frost
1.633405 1.339236 1.437903
> sqrt(vif(model2)) > 2 # problem?
Murder HS.Grad
FALSE
FALSE
Frost
FALSE
>
>
>
>
>
>
# Evaluate Nonlinearity
# component + residual plot
crPlots(model2)
# Ceres plots
#ceresPlots(model2)
7.5
>
>
>
>
>
>
>
1
2
3
4
5
6
>
>
Logistic Regression
## Logistic Regression ##
## load data ##
bindata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(bindata)
admit
0
1
1
1
0
1
gre
380
660
800
640
520
760
gpa rank
3.61
3
3.67
3
4.00
1
3.19
4
2.93
4
3.00
2
admit
Min.
:0.0000
1st Qu.:0.0000
Median :0.0000
Mean
:0.3175
3rd Qu.:1.0000
Max.
:1.0000
>
gre
Min.
:220.0
1st Qu.:520.0
Median :580.0
Mean
:587.7
3rd Qu.:660.0
Max.
:800.0
gpa
Min.
:2.260
1st Qu.:3.130
Median :3.395
Mean
:3.390
3rd Qu.:3.670
Max.
:4.000
sapply(bindata, sd)
admit
gre
0.4660867 115.5165364
gpa
0.3805668
33
rank
0.9444602
rank
Min.
:1.000
1st Qu.:2.000
Median :2.000
Mean
:2.485
3rd Qu.:3.000
Max.
:4.000
>
>
>
>
>
>
Call:
glm(formula = admit ~ gre + gpa + rank, family = "binomial",
data = bindata)
Deviance Residuals:
Min
1Q
Median
-1.6268 -0.8662 -0.6388
3Q
1.1490
Max
2.0790
Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.989979   1.139951  -3.500 0.000465 ***
gre          0.002264   0.001094   2.070 0.038465 *
gpa          0.804038   0.331819   2.423 0.015388 *
rank2       -0.675443   0.316490  -2.134 0.032829 *
rank3       -1.340204   0.345306  -3.881 0.000104 ***
rank4       -1.551464   0.417832  -3.713 0.000205 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
    Null deviance: 499.98  on 399  degrees of freedom
Residual deviance: 458.52  on 394  degrees of freedom
AIC: 470.52
7.6
>
>
+
+
+
+
>
If-statement
w = 3
if( w < 5 ){
d=2
} else {
d=10
}
d
[1] 2
>
34
7.7
>
>
>
+
+
+
>
For-loop
[1]
10
20
30
40
50
60
70
35
80
90 100