# Monday, December 31, 2018
#
# Basics of R Session -19- Cluster Analysis

# NOTE(review): rm(list = ls()) in scripts is generally discouraged;
# kept here to preserve the original session behaviour.
rm(list = ls())

# Import the data (hard-coded local path; adjust to your machine)
cluster1.1 <- read.csv("file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/Cluster Analysis/Cluster Analysis MDP 2018/Data1.csv")

# Inspect the structure: one label column plus two numeric score columns
str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8

# Drop any rows with missing observations (none here, so str() is unchanged)
cluster1.1 <- na.omit(cluster1.1)
str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8
#fix(cluster1.1)

# Remove the label column (Name) so only the numeric variables remain
cluster1.2 <- cluster1.1[, -1]

# Calculate distances using dist() (or daisy() from the cluster package)

# Euclidean (straight-line) distance between every pair of observations
dist1 <- dist(cluster1.2, method = "euclidean")
dist1
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000

# Chebyshev distance: the maximum coordinate-wise difference
dist2 <- dist(cluster1.2, method = "maximum")
dist2
##   1 2 3 4 5 6 7 8
## 2 2              
## 3 1 1            
## 4 1 1 2          
## 5 3 2 2 3        
## 6 4 4 3 5 2      
## 7 5 3 4 4 2 3    
## 8 7 5 6 6 4 3 2  
## 9 5 3 4 4 2 1 2 2

# Manhattan (city-block) distance: sum of absolute differences
dist3 <- dist(cluster1.2, method = "manhattan")
dist3
##    1  2  3  4  5  6  7  8
## 2  2                     
## 3  2  2                  
## 4  2  2  2               
## 5  5  3  3  5            
## 6  8  6  6  8  3         
## 7  6  4  4  6  3  4      
## 8 10  8  8 10  5  4  4   
## 9  8  6  6  8  3  2  2  2

# Minkowski distance: with the default power p = 2 it equals Euclidean
dist4 <- dist(cluster1.2, method = "minkowski")
dist4
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000

# Hierarchical cluster analysis: use any one of the distances above
# together with any one linkage method

# Fit the tree with each linkage criterion in turn and plot the dendrogram.
# Each call overwrites clusterout, so the object left in the workspace
# afterwards is the last fit (ward.D2).
clusterout <- hclust(dist1, method = "single")
plot(clusterout)
clusterout <- hclust(dist1, method = "complete")
plot(clusterout)
clusterout <- hclust(dist1, method = "average")
plot(clusterout)
clusterout <- hclust(dist1, method = "median")
plot(clusterout)
clusterout <- hclust(dist1, method = "centroid")
plot(clusterout)
clusterout <- hclust(dist1, method = "ward.D")
plot(clusterout)
clusterout <- hclust(dist1, method = "ward.D2")
plot(clusterout)

# Label the dendrogram leaves with the students' names (first column)
clusterout$labels <- cluster1.1[, 1]
plot(clusterout)
# Reduce the tree (dendrogram) either on the basis of the
# number of clusters (k) or the height (h); generally h is not considered

# Cut the existing tree into 3 clusters; returns one group id per observation
groups <- cutree(clusterout, k = 3)
groups
## A B C D E G H I J 
## 1 1 1 1 2 2 2 3 2

# Draw the dendrogram with red borders around the 3 clusters

# Re-plot the dendrogram and outline the 3-cluster solution in red
plot(clusterout)
rect.hclust(clusterout, k = 3, border = "red")

# Basic description of the fitted tree (method, distance, object count)
print(clusterout)
## 
## Call:
## hclust(d = dist1, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 9

# Components stored on the hclust object
summary(clusterout)
##             Length Class  Mode     
## merge       16     -none- numeric  
## height       8     -none- numeric  
## order        9     -none- numeric  
## labels       9     factor numeric  
## method       1     -none- character
## call         3     -none- call     
## dist.method  1     -none- character

# Merge heights, in the order in which clusters were joined
clusterout$height
## [1] 1.414214 1.414214 1.414214 2.000000 2.236068 2.915476 3.535534 9.706813

# Cluster centroids: mean Physics / Mathematics score per group
aggregate(cluster1.2, by = list(groups), FUN = mean)
##   Group.1 Physics Mathematics
## 1       1    2.00         5.0
## 2       2    5.25         7.5
## 3       3    8.00         8.0
# Using the centroids we can check the profiles of the clusters formed

# ------------------------------------------------------------------
# Other method: choosing the number of clusters with NbClust / kmeans
# ------------------------------------------------------------------
# Libraries for cluster analysis
library(NbClust)
library(mclust)
## Warning: package 'mclust' was built under R version 3.5.3
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.
library(fpc)
## Warning: package 'fpc' was built under R version 3.5.3

# Let NbClust vote across its indices on the best number of clusters
# (max.nc can be larger, e.g. 10)
nb <- NbClust(cluster1.2, distance = "euclidean", min.nc = 2, max.nc = 4, method = "single")
## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 6 proposed 4 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Printing the NbClust result shows all index values, their critical
# values, the best k per index, and the winning partition.
nb
## $All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2        -0.4219           -10.1102       0.8495
## 3        -0.5522            -5.6219       0.6295
## 4        -0.7431            -2.3458       0.3158
## 
## $Best.nc
##                      KL      CH Hartigan    CCC  Scott Marriot TrCovW
## Number_clusters  2.0000  2.0000   4.0000 2.0000 4.0000       3   4.00
## Value_Index     36.1587 15.9034   1.6548 4.8583 9.3067    -182  16.25
##                 TraceW Friedman  Rubin Cindex     DB Silhouette   Duda
## Number_clusters 3.0000   4.0000 3.0000 4.0000 3.0000     2.0000 2.0000
## Value_Index     2.2333  58.4028 5.4644 0.3717 0.6005     0.4819 0.8173
##                 PseudoT2  Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters       NA 2.0000    2.0000 3.00     2.0000 2.00  2.0000
## Value_Index           NA 0.1676    0.5726 5.45     0.7006 1.35  0.5909
##                   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters 3.0000      0  2.0000      0 4.0000
## Value_Index     0.7071      0  0.8562      0 0.1061
## 
## $Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
# Individual components can also be extracted directly:
# all index values, one row per candidate k
nb$All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
# the winning 2-cluster membership vector
nb$Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
# Or we can check the scree plot as well.
# If we require a specific number of clusters (say 2), then:

# k-means starts from random centers, so fix the seed for reproducibility.
# (The original script had no seed, so the partition — and especially the
# cluster numbering — could differ between runs; the transcripts below were
# captured from one such run.)
set.seed(123)
# nstart = 25 restarts from 25 random initialisations and keeps the best
# solution, guarding against a poor local optimum.
nbfit <- kmeans(cluster1.2, centers = 2, nstart = 25)

# Cluster membership of each observation
nbfit$cluster
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 2 2 2 2 2
nbfit$centers  # cluster centroids, for profiling or prediction
##   Physics Mathematics
## 1     2.0         5.0
## 2     5.8         7.6
nbfit$totss         # total sum of squares
## [1] 65.11111
nbfit$withinss      # within-cluster sum of squares, one value per cluster
## [1]  4 14
nbfit$tot.withinss  # sum of the withinss values
## [1] 18
nbfit$betweenss     # totss - tot.withinss
## [1] 47.11111
nbfit$size          # number of observations in each cluster
## [1] 4 5
nbfit$iter          # iterations used by the algorithm
## [1] 1