Download as pdf or txt
Download as pdf or txt
You are on page 1of 5

Hierarchical clustering and experiment with cutting the dendrogram

#Load and display data


# Attach klaR and load the `countries` data set used throughout
library(klaR)
data("countries")

# Show the full table
print(countries)

## Country Popul PopDens GDPpp


## AFG Afghanistan 28717213 44.350908 661.6241
## DZ Algeria 32818500 13.779206 5295.7935
## RA Argentina 38740807 16.265758 10423.1179
## BD Bangladesh 138448210 961.445903 1720.4990
## BR Brazil 182032604 21.385497 7559.0854
## CDN Canada 32207113 3.225656 29002.9100
## VRC China 1286975468 134.102410 4653.5464
## CO Colombia 41662073 36.580654 6039.0658
## RDC Democratic Republic of the Congo 56625039 24.142917 600.4411
## ET Egypt 74718797 74.610612 3878.5421
## ETH Ethiopia 66557553 59.050624 729.1434
## F France 60180529 110.013215 25888.7721
## D Germany 82398326 230.794060 26214.1248
## IND India 1049700118 319.291675 2537.8677
## RI Indonesia 234893453 122.376033 3040.5275
## IR Iran 68278826 41.431326 6712.1834
## I Italy 57998353 192.538436 25086.9193
## J Japan 127214499 336.693263 28699.5588
## EAK Kenya 31639091 54.302053 1039.5368
## ROK Korea 48289037 490.343593 19497.1790
## MAL Malaysia 23092940 70.031660 8591.3703
## MEX Mexico 104907991 53.183945 8811.5309
## MA Morocco 31689265 70.964651 3843.5729
## MYA Myanmar 42510537 62.653702 1733.4526
## WAN Nigeria 133881703 144.930007 840.2941
## PK Pakistan 150694740 187.445257 1959.5906
## PE Peru 28409897 22.105085 4885.6214
## RP Philippines 84619974 282.066580 4487.1203
## PL Poland 38622660 123.519389 9662.7213
## RUS Russian Federation 144526278 8.464105 9749.0921
## SA South Africa 42768678 35.058822 10000.3091
## E Spain 40217413 79.672835 21152.5291
## SUD Sudan 38114160 15.210315 1387.9356
## THA Thailand 64265276 125.029720 6936.8721
## TR Turkey 68109469 87.254950 7189.8960
## UA Ukraine 48055439 79.601522 4536.4272
## GB United Kingdom 60094648 245.464619 25426.5571
## EAT United Rep. of Tanzania 35922454 38.009680 568.4467
## USA USA 290342554 30.152644 35991.9683
## UZB Uzbekistan 25981647 58.072523 2542.5640
## YV Venezuela 24654694 27.032174 5341.7820
## VN Viet Nam 81624716 247.677861 2251.7689
## LifeEx InfMor Illit
## AFG 46.97 14.248 64.0
## DZ 70.54 3.774 30.0
## RA 75.48 1.616 2.9
## BD 61.33 6.608 56.9
## BR 71.13 3.174 13.6
## CDN 79.83 0.488 3.0
## VRC 72.22 2.526 14.0
## CO 71.14 2.247 7.5
## RDC 48.93 9.656 34.5
## ET 70.41 3.526 42.3
## ETH 41.24 10.322 57.3
## F 79.28 0.437 1.0
## D 78.42 0.423 1.0
## IND 63.62 5.959 40.5
## RI 68.94 3.809 11.5
## IR 69.35 4.417 20.6
## I 79.40 0.619 1.4
## J 80.93 0.330 1.0
## EAK 45.22 6.336 14.9
## ROK 75.36 0.731 1.9
## MAL 71.67 1.900 11.1
## MEX 72.30 2.368 7.8
## MA 70.04 4.487 48.3
## MYA 55.79 7.035 16.9
## WAN 51.01 7.135 32.0
## PK 62.20 7.653 54.3
## PE 70.88 3.697 9.1
## RP 69.29 2.498 4.1
## PL 73.91 0.895 0.2
## RUS 67.66 1.951 0.4
## SA 46.56 6.084 13.6
## E 79.23 0.454 2.1
## SUD 57.73 6.559 38.9
## THA 71.24 2.183 4.0
## TR 71.80 4.420 13.5
## UA 66.50 2.087 0.3
## GB 78.16 0.528 1.0
## EAT 44.56 10.368 21.8
## USA 77.14 0.675 3.0
## UZB 64.00 7.151 0.7
## YV 73.81 2.379 6.6
## VN 70.05 3.083 6.0

# Keep only the six numeric indicators (columns 2 to 7, i.e. everything
# except the country name) and standardise them with scale() before
# clustering.
country_metrics <- scale(countries[, 2:7])

# Perform hierarchical clustering: complete linkage on the pairwise
# distances between the scaled country profiles
hc <- hclust(d = dist(country_metrics), method = "complete")

# Confirm the linkage method and distance measure
print(hc$method)

## [1] "complete"

# Distance measure recorded on the clustering object
print(hc[["dist.method"]])

## [1] "euclidean"

# Draw the full dendrogram; the vertical axis shows the height at which
# each pair of clusters was joined
plot(x = as.dendrogram(object = hc))

0
BD
AFG
ETH
VRC
IND
USA
CDN
F
E
ROK
J
I
D
GB
UZB
RP
VN
RI
BR
IR
TR
RA
PL
RUS
UA
PE
CO
YV
THA
MAL
MEX
SUD
MYA
WAN
RDC
EAT
EAK
SA
PK
DZ
ET
MA
hc$height

## [1] 0.1549853 0.2757010 0.3356801 0.3756963 0.4208425 0.4689604 0.5029623


## [8] 0.5170324 0.5299215 0.5603698 0.7039239 0.7393391 0.7964019 0.8208645
## [15] 0.8496731 0.9302615 0.9458651 0.9847847 1.0327015 1.0520642 1.0631944
## [22] 1.1033044 1.2238031 1.2766480 1.3634624 1.6847228 1.8058200 1.8465007
## [29] 1.8765357 1.8857972 1.9134200 2.2444036 2.2655848 2.4231736 3.3522237
## [36] 3.3579700 4.2298115 5.7583206 6.0609169 6.2431407 7.5222373

# Identify the two most similar countries.
# The lowest merge height is about 0.155 and the next about 0.276 (see
# hc$height), so cutting the tree at h = 0.2 leaves exactly one pair of
# countries sharing a cluster id; every other country is on its own.
print(cutree(tree = hc, h = 0.2))

##AFG DZ RA BD BR CDN VRC CO RDC ET ETH F D IND RI IR I J


## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
##EAK ROK MAL MEX MA MYA WAN PK PE RP PL RUS SA E SUD THA TR UA
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## GB EAT USA UZB YV VN
## 13 37 38 39 40 41

# Redraw the dendrogram with the height axis restricted to [0, 1] so the
# earliest (lowest) merges can be read off
plot(
  x = as.dendrogram(object = hc),
  ylim = c(0, 1)
)
0
2
4
6
0.0
0.2
0.4
0.6
0.8
1.0

BD BD
AFG AFG
ETH ETH
VRC VRC
IND IND
USA USA
CDN CDN

# Full-height dendrogram, used as the canvas for marking a cut height
plot(x = as.dendrogram(object = hc))
F F
E E

#Identify a potential cutpoint


ROK ROK
J J
I I
D D

# Overlay a horizontal reference line at height 3 (the candidate cut) on
# the current plot
abline(
  h = 3,
  lty = 2,
  col = "dodgerblue"
)
GB GB
UZB UZB
RP RP
VN VN
RI RI
BR BR
IR IR
TR TR
RA RA
PL PL
RUS RUS
UA UA
PE PE
CO CO
YV YV
THA THA
MAL MAL
MEX MEX
SUD SUD
MYA MYA
WAN WAN
RDC RDC
EAT EAT
EAK EAK
SA SA
PK PK
DZ DZ
ET ET
MA MA
# Cut the tree at height 3 and keep the resulting cluster id of every
# country as a named integer vector, then display it
my.cut <- cutree(tree = hc, h = 3)
print(my.cut)

##AFG DZ RA BD BR CDN VRC CO RDC ET ETH F D IND RI IR I J


## 1 2 3 4 3 5 6 3 7 2 1 5 8 6 3 3 8 8
##EAK ROK MAL MEX MA MYA WAN PK PE RP PL RUS SA E SUD THA TR UA
## 7 8 3 3 2 7 7 2 3 3 3 3 7 5 7 3 3 3
## GB EAT USA UZB YV VN
## 8 7 5 3 3 3

# Collect the country codes belonging to each cluster: element i of
# `country.groups` holds the names of the entries of `my.cut` whose
# cluster id equals i.
#
# lapply() over seq_len() replaces the preallocate-and-fill loop:
#   * seq_len(0) is empty, whereas 1:length(x) would iterate over c(1, 0);
#   * an empty `my.cut` yields an empty list instead of an error from
#     vector("list", max(integer(0)));
#   * a NULL result (unnamed `my.cut`) is kept as a list element, whereas
#     `x[[i]] <- NULL` inside a loop would delete the element.
n_groups <- if (length(my.cut) > 0) max(my.cut) else 0L
country.groups <- lapply(
  seq_len(n_groups),
  function(i) names(my.cut)[which(my.cut == i)]
)
country.groups

## [[1]]
## [1] "AFG" "ETH"
##
## [[2]]
## [1] "DZ" "ET" "MA" "PK"
##
## [[3]]
## [1] "RA" "BR" "CO" "RI" "IR" "MAL" "MEX" "PE" "RP" "PL" "RUS"
## [12] "THA" "TR" "UA" "UZB" "YV" "VN"
##
## [[4]]
## [1] "BD"
##
## [[5]]
## [1] "CDN" "F" "E" "USA"
##
## [[6]]
## [1] "VRC" "IND"
##
## [[7]]
## [1] "RDC" "EAK" "MYA" "WAN" "SA" "SUD" "EAT"
##
## [[8]]
## [1] "D" "I" "J" "ROK" "GB"

You might also like