# Basics of R - Session 19 - Cluster Analysis
# Dr Manohar Kapse
# Import the data.
# The workspace is intentionally NOT cleared with rm(list = ls()): that call
# silently deletes every object in the caller's global environment while
# leaving loaded packages and options untouched, so it gives no real
# isolation. Start a fresh R session when a clean state is needed.
# NOTE(review): the path below is an absolute location on the author's D:
# drive; point it at wherever Data1.csv lives on your machine.
cluster1.1 <- read.csv(
  "file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/Cluster Analysis/Cluster Analysis MDP 2018/Data1.csv"
)
# Show the structure (column names, types, first values) of the imported data.
str(cluster1.1)
## 'data.frame': 9 obs. of 3 variables:
## $ Name : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
## $ Physics : int 1 3 2 2 4 5 6 8 6
## $ Mathematics: int 5 5 6 4 7 9 6 8 8
# Drop every row that contains at least one missing value, then re-inspect
# the structure to confirm how many observations remain.
cluster1.1 <- na.omit(object = cluster1.1)
str(object = cluster1.1)
## 'data.frame': 9 obs. of 3 variables:
## $ Name : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
## $ Physics : int 1 3 2 2 4 5 6 8 6
## $ Mathematics: int 5 5 6 4 7 9 6 8 8
#fix(cluster1.1)
# Remove the label column (the first column, Name) so that only the
# clustering variables remain; any other extra variables, if present, would
# have to be dropped here as well.
# drop = FALSE keeps the result a data frame even when only one variable is
# left, instead of silently collapsing it to a bare vector.
cluster1.2 <- cluster1.1[, -1, drop = FALSE]
# Calculate the distance matrix using dist() or daisy()
# Euclidean (straight-line) distance between every pair of observations.
dist1 <- dist(x = cluster1.2, method = "euclidean")
dist1
## 1 2 3 4 5 6 7 8
## 2 2.000000
## 3 1.414214 1.414214
## 4 1.414214 1.414214 2.000000
## 5 3.605551 2.236068 2.236068 3.605551
## 6 5.656854 4.472136 4.242641 5.830952 2.236068
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
# Maximum distance: the largest absolute difference across the variables.
dist2 <- dist(x = cluster1.2, method = "maximum")
dist2
## 1 2 3 4 5 6 7 8
## 2 2
## 3 1 1
## 4 1 1 2
## 5 3 2 2 3
## 6 4 4 3 5 2
## 7 5 3 4 4 2 3
## 8 7 5 6 6 4 3 2
## 9 5 3 4 4 2 1 2 2
# Manhattan distance: the sum of absolute differences across the variables.
dist3 <- dist(x = cluster1.2, method = "manhattan")
dist3
## 1 2 3 4 5 6 7 8
## 2 2
## 3 2 2
## 4 2 2 2
## 5 5 3 3 5
## 6 8 6 6 8 3
## 7 6 4 4 6 3 4
## 8 10 8 8 10 5 4 4
## 9 8 6 6 8 3 2 2 2
# Minkowski distance. No power p is supplied, so dist() uses its default
# p = 2, which is why the result matches the Euclidean matrix dist1.
dist4 <- dist(x = cluster1.2, method = "minkowski")
dist4
## 1 2 3 4 5 6 7 8
## 2 2.000000
## 3 1.414214 1.414214
## 4 1.414214 1.414214 2.000000
## 5 3.605551 2.236068 2.236068 3.605551
## 6 5.656854 4.472136 4.242641 5.830952 2.236068
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
# Hierarchical cluster analysis: use any one of the distances and any one linkage method
# Draw one dendrogram per agglomeration method, all built on the Euclidean
# distance matrix dist1, so the linkage methods can be compared visually.
# NOTE(review): ?hclust states that "centroid" is typically meant for squared
# Euclidean distances (presumably "median" likewise), and that "ward.D"
# implements Ward's criterion only on squared distances - confirm whether
# dist1^2 was intended for those methods.
invisible(lapply(
  c("single", "complete", "average", "median", "centroid", "ward.D"),
  function(linkage) plot(hclust(dist1, method = linkage))
))
# The last fit (ward.D2) is the one kept in clusterout for the later steps.
clusterout <- hclust(dist1, method = "ward.D2")
plot(clusterout)
# Adding labels to the R object clusterout
# Use the first column of the imported data (the observation names) as the
# leaf labels, then redraw the dendrogram with those labels.
clusterout[["labels"]] <- cluster1.1[[1]]
plot(clusterout)
# Reduce (cut) the tree / dendrogram either on the basis of the
# number of clusters (k) or the height (h); generally h is not considered.
# Cut the existing tree into 3 clusters; the result gives the cluster number
# assigned to each observation.
groups <- cutree(tree = clusterout, k = 3)
groups
## A B C D E G H I J
## 1 1 1 1 2 2 2 3 2
# Draw the dendrogram with red borders around the 3 clusters
# Redraw the dendrogram, then outline the k = 3 cluster solution in red.
plot(clusterout)
rect.hclust(tree = clusterout, k = 3, border = "red")
# Print the short description of the fit (call, method, distance, size).
print(clusterout)
##
## Call:
## hclust(d = dist1, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 9
# Component-by-component overview of the hclust object.
summary(clusterout)
## Length Class Mode
## merge 16 -none- numeric
## height 8 -none- numeric
## order 9 -none- numeric
## labels 9 factor numeric
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
# Heights of the successive merges (one value per merge step).
clusterout[["height"]]
## [1] 1.414214 1.414214 1.414214 2.000000 2.236068 2.915476 3.535534 9.706813
# Centroids of the clusters
# Mean of every clustering variable within each of the cutree() groups.
aggregate(x = cluster1.2, by = list(groups), FUN = mean)
## Group.1 Physics Mathematics
## 1 1 2.00 5.0
## 2 2 5.25 7.5
## 3 3 8.00 8.0
# using centroid we can check the profiles of the clusters formed
# ---------------- Other Method ----------------
# Libraries for Cluster Analysis
library(NbClust)
library(mclust)
## Warning: package 'mclust' was built under R version 3.5.3
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.
library(fpc)
## Warning: package 'fpc' was built under R version 3.5.3
# Let NbClust compare candidate numbers of clusters (here 2 to 4) using
# Euclidean distance and single linkage.
# max.nc may be set higher as well, e.g. 10.
nb <- NbClust(
  cluster1.2,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 4,
  method = "single"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 11 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 6 proposed 4 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Print the complete NbClust result: all index values, the critical values,
# the best number of clusters per index, and the best partition.
nb
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman
## 2 36.1587 15.9034 3.3185 4.8583 41.2499 372 56.25 19.9000 61.1016
## 3 0.2211 11.4691 2.6786 2.5279 47.7833 405 56.25 13.5000 86.6250
## 4 0.2568 9.9603 4.3333 1.4017 57.0900 256 40.00 9.3333 145.0278
## Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky Ball
## 2 29.6985 0.3788 0.6614 0.4819 0.8173 0.6706 0.1676 0.5726 9.9500
## 3 43.7778 0.4711 0.6005 0.3245 0.5614 1.5625 0.5208 0.5059 4.5000
## 4 63.3214 0.3717 0.6415 0.4064 0.1875 4.3333 2.1667 0.4513 2.3333
## Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.7006 1.3500 0.5909 0.6202 0.0158 0.8562 1.3638 0.7935
## 3 0.6437 0.9198 0.9087 0.7071 0.0168 1.1765 1.0730 0.1482
## 4 0.5955 0.5167 1.2746 0.6325 0.0192 1.5557 0.8513 0.1061
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 -0.4219 -10.1102 0.8495
## 3 -0.5522 -5.6219 0.6295
## 4 -0.7431 -2.3458 0.3158
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 2.0000 2.0000 4.0000 2.0000 4.0000 3 4.00
## Value_Index 36.1587 15.9034 1.6548 4.8583 9.3067 -182 16.25
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 4.0000 3.0000 4.0000 3.0000 2.0000 2.0000
## Value_Index 2.2333 58.4028 5.4644 0.3717 0.6005 0.4819 0.8173
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters NA 2.0000 2.0000 3.00 2.0000 2.00 2.0000
## Value_Index NA 0.1676 0.5726 5.45 0.7006 1.35 0.5909
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 3.0000 0 2.0000 0 4.0000
## Value_Index 0.7071 0 0.8562 0 0.1061
##
## $Best.partition
## 1 2 3 4 5 6 7 8 9
## 1 1 1 1 1 2 2 2 2
# Index values for each candidate number of clusters.
nb[["All.index"]]
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman
## 2 36.1587 15.9034 3.3185 4.8583 41.2499 372 56.25 19.9000 61.1016
## 3 0.2211 11.4691 2.6786 2.5279 47.7833 405 56.25 13.5000 86.6250
## 4 0.2568 9.9603 4.3333 1.4017 57.0900 256 40.00 9.3333 145.0278
## Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky Ball
## 2 29.6985 0.3788 0.6614 0.4819 0.8173 0.6706 0.1676 0.5726 9.9500
## 3 43.7778 0.4711 0.6005 0.3245 0.5614 1.5625 0.5208 0.5059 4.5000
## 4 63.3214 0.3717 0.6415 0.4064 0.1875 4.3333 2.1667 0.4513 2.3333
## Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.7006 1.3500 0.5909 0.6202 0.0158 0.8562 1.3638 0.7935
## 3 0.6437 0.9198 0.9087 0.7071 0.0168 1.1765 1.0730 0.1482
## 4 0.5955 0.5167 1.2746 0.6325 0.0192 1.5557 0.8513 0.1061
# Cluster membership of each observation under the selected solution.
nb[["Best.partition"]]
## 1 2 3 4 5 6 7 8 9
## 1 1 1 1 1 2 2 2 2
# Alternatively, the number of clusters can be judged from a scree plot.
# If a specific number of clusters is required (say 2), fit k-means directly.
# kmeans() starts from randomly chosen centres, so the RNG seed is fixed to
# make the fit reproducible, and several random starts (nstart) are used so
# the reported solution is the best of many rather than one possibly poor
# local optimum.
set.seed(123)
nbfit <- kmeans(cluster1.2, centers = 2, nstart = 25)
# NOTE: the outputs shown below were recorded from the original unseeded,
# single-start run; the numbering of the two clusters (1 vs 2) and the
# iteration count may differ from what a new run prints.
# Cluster assigned to each observation.
nbfit$cluster
## 1 2 3 4 5 6 7 8 9
## 1 1 1 1 2 2 2 2 2
# Cluster centres (variable means per cluster).
nbfit$centers # for profiling or prediction
## Physics Mathematics
## 1 2.0 5.0
## 2 5.8 7.6
# Total sum of squares.
nbfit$totss
## [1] 65.11111
# Within-cluster sum of squares, one value per cluster.
nbfit$withinss
## [1] 4 14
# Total within-cluster sum of squares (sum of the values above).
nbfit$tot.withinss
## [1] 18
# Between-cluster sum of squares (totss minus tot.withinss).
nbfit$betweenss
## [1] 47.11111
# Number of observations in each cluster.
nbfit$size
## [1] 4 5
# Number of iterations the algorithm needed.
nbfit$iter
## [1] 1