Wednesday, February 6, 2019

Basics of R, Session 22: K-Nearest Neighbors

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Load the built-in iris data set and work on a copy of it.
data(iris)

data1 <- iris
# fix(data1)  # optional: open the copy in the interactive data editor
# Show the structure (column names and types) of the working copy.
str(object = data1)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#------------apply the Model

# Seed the RNG so the cross-validation folds, and therefore the resampled
# accuracies, are identical on every run.  The figures recorded below came
# from an unseeded run, so a fresh run may differ slightly from them.
set.seed(2019)

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# apply the knn on the whole data
# Predictors are centered and scaled before fitting; tuneLength = 10 asks
# caret to evaluate ten candidate values of k.
knnMod2 <- train(
  Species ~ .,
  data = data1,
  method = "knn",
  trControl = fitControl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

print(knnMod2)
## k-Nearest Neighbors 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9600000  0.9400000
##    7  0.9600000  0.9400000
##    9  0.9488889  0.9233333
##   11  0.9511111  0.9266667
##   13  0.9688889  0.9533333
##   15  0.9666667  0.9500000
##   17  0.9600000  0.9400000
##   19  0.9511111  0.9266667
##   21  0.9488889  0.9233333
##   23  0.9511111  0.9266667
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
# Plot the resampling results across the candidate values of k.
plot(knnMod2)

# Predict on the same rows the model was fitted to, so the confusion matrix
# below reports training-set (resubstitution) performance.
pred <- predict(knnMod2, newdata = data1)
confusionMatrix(data = pred, reference = data1[["Species"]])
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         3
##   virginica       0          2        47
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.9239, 0.9891)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.95            
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9600           0.9400
## Specificity                 1.0000            0.9700           0.9800
## Pos Pred Value              1.0000            0.9412           0.9592
## Neg Pred Value              1.0000            0.9798           0.9703
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3200           0.3133
## Detection Prevalence        0.3333            0.3400           0.3267
## Balanced Accuracy           1.0000            0.9650           0.9600

Example two: fruit_data_with_colors.txt

library(caret)

# import the txt file

# Read the tab-separated fruit measurements.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too
# (where the default became FALSE); confusionMatrix() further down needs a
# factor reference, and the recorded str() output shows fruit_name as a factor.
fruit <- read.table(
  "file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/KNN/case study- KNN/KNN-to-classify-fruits-master/fruit_data_with_colors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = TRUE
)

# fix(fruit)

# remove the first and third variable
# Selecting by name keeps the outcome (fruit_name) and the four numeric
# measurements regardless of the column order in the file.
fruit <- fruit[, c("fruit_name", "mass", "width", "height", "color_score")]
# fix(fruit)
str(fruit)
## 'data.frame':    59 obs. of  5 variables:
##  $ fruit_name : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
##  $ mass       : num  192 180 176 86 84 80 80 76 178 172 ...
##  $ width      : num  8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
##  $ height     : num  7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
##  $ color_score: num  0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
# Seed the RNG so the cross-validation folds, and therefore the resampled
# accuracies, are identical on every run.  The accuracy figures recorded
# further down came from an unseeded run, so a fresh run may differ slightly.
set.seed(2019)

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# apply the knn on the whole data

# Predictors are centered and scaled before fitting; tuneLength = 10 asks
# caret to evaluate ten candidate values of k.
knnMod2 <- train(
  fruit_name ~ .,
  data = fruit,
  method = "knn",
  trControl = fitControl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# summary() here describes the components of the fitted final model.
summary(knnMod2)
##             Length Class      Mode     
## learn       2      -none-     list     
## k           1      -none-     numeric  
## theDots     0      -none-     list     
## xNames      4      -none-     character
## problemType 1      -none-     character
## tuneValue   1      data.frame list     
## obsLevels   4      -none-     character
## param       0      -none-     list
print(knnMod2)
## k-Nearest Neighbors 
## 
## 59 samples
##  4 predictor
##  4 classes: 'apple', 'lemon', 'mandarin', 'orange' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 53, 53, 53, 52, 54, 53, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.9489683  0.92723064
##    7  0.9356349  0.90804204
##    9  0.8982540  0.85549704
##   11  0.8389683  0.77036974
##   13  0.7921429  0.69977445
##   15  0.6915079  0.55691944
##   17  0.6546032  0.49725470
##   19  0.5073810  0.27615439
##   21  0.3740476  0.07196162
##   23  0.3419841  0.02170672
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the resampling results across the candidate values of k.
plot(knnMod2)

# Predict on the rows used for fitting and tabulate the predictions against
# the observed fruit names (the first column of `fruit`).
pred <- predict(knnMod2, newdata = fruit)
confusionMatrix(data = pred, reference = fruit[[1]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction apple lemon mandarin orange
##   apple       18     0        0      0
##   lemon        0    16        0      0
##   mandarin     0     0        5      0
##   orange       1     0        0     19
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9831          
##                  95% CI : (0.9091, 0.9996)
##     No Information Rate : 0.322           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9762          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: apple Class: lemon Class: mandarin
## Sensitivity                0.9474       1.0000         1.00000
## Specificity                1.0000       1.0000         1.00000
## Pos Pred Value             1.0000       1.0000         1.00000
## Neg Pred Value             0.9756       1.0000         1.00000
## Prevalence                 0.3220       0.2712         0.08475
## Detection Rate             0.3051       0.2712         0.08475
## Detection Prevalence       0.3051       0.2712         0.08475
## Balanced Accuracy          0.9737       1.0000         1.00000
##                      Class: orange
## Sensitivity                 1.0000
## Specificity                 0.9750
## Pos Pred Value              0.9500
## Neg Pred Value              1.0000
## Prevalence                  0.3220
## Detection Rate              0.3220
## Detection Prevalence        0.3390
## Balanced Accuracy           0.9875

We can also split the data into training and test sets.

#again Import the data set fruit
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too
# (where the default became FALSE), matching the recorded str() output and
# what confusionMatrix() expects as a reference.
fruit <- read.table(
  "file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/KNN/case study- KNN/KNN-to-classify-fruits-master/fruit_data_with_colors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = TRUE
)

# split the data into training and test
# Seed the RNG so the same rows land in each partition on every run.
set.seed(2019)
# About 70% of the rows, sampled within each fruit_name class, go to training.
split <- createDataPartition(fruit$fruit_name, p = 0.7, list = FALSE)
# `split` and `train` share their names with functions; calls such as
# train(...) still work because R skips non-function objects when looking up
# a function name.
train <- fruit[split, ]
test <- fruit[-split, ]
# NOTE(review): the models fitted below in this section still use the full
# data set; `train` and `test` are not referenced by the code shown here.

#------------- convert categorical data to dummy variables

library(caret)
# Build an encoder that turns fruit_subtype into indicator (dummy) columns.
d1 <- dummyVars(formula = ~ fruit_subtype, data = fruit)

# Apply the encoder to every row to obtain the indicator columns.
dummies <- predict(object = d1, newdata = fruit)
# fix(dummies)

# Append the indicator columns to the original table.
fruit2 <- cbind(fruit, dummies)
# fix(fruit2)
#-----------------------------------#
# Drop the original fruit_subtype column (the third one) now that it has
# been encoded; fruit_label is still present in the result.
fruit3 <- fruit2[-3]
str(fruit3)
## 'data.frame':    59 obs. of  16 variables:
##  $ fruit_label                   : int  1 1 1 2 2 2 2 2 1 1 ...
##  $ fruit_name                    : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
##  $ mass                          : num  192 180 176 86 84 80 80 76 178 172 ...
##  $ width                         : num  8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
##  $ height                        : num  7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
##  $ color_score                   : num  0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
##  $ fruit_subtype.braeburn        : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ fruit_subtype.cripps_pink     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.golden_delicious: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.granny_smith    : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.mandarin        : num  0 0 0 1 1 1 1 1 0 0 ...
##  $ fruit_subtype.selected_seconds: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.spanish_belsan  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.spanish_jumbo   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.turkey_navel    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype.unknown         : num  0 0 0 0 0 0 0 0 0 0 ...
# Seed the RNG so the cross-validation folds, and therefore the resampled
# accuracies, are identical on every run.
set.seed(2019)

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# apply the knn on the whole data

# fruit_label appears to be an integer code for fruit_name (the two columns
# move in lockstep in the str() output above), so using it as a predictor
# leaks the outcome into the model.  It is excluded from the training data
# here; fruit3 itself is left unchanged so that positional indexing such as
# fruit3[, 2] further down keeps pointing at fruit_name.
knnMod2 <- train(
  fruit_name ~ .,
  data = fruit3[, setdiff(names(fruit3), "fruit_label")],
  method = "knn",
  trControl = fitControl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# NOTE: the output recorded in this section was captured while fruit_label
# was still among the predictors (15 of them) and from an unseeded run;
# re-run to refresh the figures.
summary(knnMod2)
##             Length Class      Mode     
## learn        2     -none-     list     
## k            1     -none-     numeric  
## theDots      0     -none-     list     
## xNames      15     -none-     character
## problemType  1     -none-     character
## tuneValue    1     data.frame list     
## obsLevels    4     -none-     character
## param        0     -none-     list
print(knnMod2)
## k-Nearest Neighbors 
## 
## 59 samples
## 15 predictors
##  4 classes: 'apple', 'lemon', 'mandarin', 'orange' 
## 
## Pre-processing: centered (15), scaled (15) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 52, 52, 54, 53, 54, 53, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9944444  0.9916667
##    7  0.9722222  0.9596154
##    9  0.8593651  0.7987543
##   11  0.7369841  0.6202357
##   13  0.7377778  0.6212243
##   15  0.6877778  0.5500664
##   17  0.6334921  0.4762837
##   19  0.5330159  0.3267684
##   21  0.4982540  0.2638573
##   23  0.4985714  0.2666597
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the resampling results across the candidate values of k.
plot(knnMod2)
# Hold-out predictions are only stored when trainControl() is asked to keep
# them (its savePredictions argument); fitControl above does not, hence NULL.
knnMod2$pred
## NULL
pred <- predict(knnMod2, newdata = fruit3)
# The reference can be picked by position (here) or by name (next call).
confusionMatrix(data = pred, reference = fruit3[, 2]) # any one can be used
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction apple lemon mandarin orange
##   apple       19     0        0      0
##   lemon        0    16        0      0
##   mandarin     0     0        5      0
##   orange       0     0        0     19
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9394, 1)
##     No Information Rate : 0.322      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: apple Class: lemon Class: mandarin
## Sensitivity                 1.000       1.0000         1.00000
## Specificity                 1.000       1.0000         1.00000
## Pos Pred Value              1.000       1.0000         1.00000
## Neg Pred Value              1.000       1.0000         1.00000
## Prevalence                  0.322       0.2712         0.08475
## Detection Rate              0.322       0.2712         0.08475
## Detection Prevalence        0.322       0.2712         0.08475
## Balanced Accuracy           1.000       1.0000         1.00000
##                      Class: orange
## Sensitivity                  1.000
## Specificity                  1.000
## Pos Pred Value               1.000
## Neg Pred Value               1.000
## Prevalence                   0.322
## Detection Rate               0.322
## Detection Prevalence         0.322
## Balanced Accuracy            1.000
# Same table as above, with the reference column selected by name.
confusionMatrix(data = pred, reference = fruit3[["fruit_name"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction apple lemon mandarin orange
##   apple       19     0        0      0
##   lemon        0    16        0      0
##   mandarin     0     0        5      0
##   orange       0     0        0     19
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9394, 1)
##     No Information Rate : 0.322      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: apple Class: lemon Class: mandarin
## Sensitivity                 1.000       1.0000         1.00000
## Specificity                 1.000       1.0000         1.00000
## Pos Pred Value              1.000       1.0000         1.00000
## Neg Pred Value              1.000       1.0000         1.00000
## Prevalence                  0.322       0.2712         0.08475
## Detection Rate              0.322       0.2712         0.08475
## Detection Prevalence        0.322       0.2712         0.08475
## Balanced Accuracy           1.000       1.0000         1.00000
##                      Class: orange
## Sensitivity                  1.000
## Specificity                  1.000
## Pos Pred Value               1.000
## Neg Pred Value               1.000
## Prevalence                   0.322
## Detection Rate               0.322
## Detection Prevalence         0.322
## Balanced Accuracy            1.000
#-----------------------------#

Using scaling and categorical variables in KNN

The fruit subtypes are also used as predictors.

# convert categorical data into dummy variables

library(fastDummies)

# Encode fruit_subtype as indicator columns, dropping the first indicator so
# that k levels yield k - 1 columns.
# Only this call is kept: the two simpler variants that used to precede it
# were overwritten immediately and had no effect on the result.
#   dummy_cols(fruit)                                    # all factor/character columns
#   dummy_cols(fruit, select_columns = "fruit_subtype")  # all subtype indicators
fruit4 <- dummy_cols(
  fruit,
  select_columns = "fruit_subtype",
  remove_first_dummy = TRUE
)

# remove the actual category
# Column 1 is fruit_label and column 3 is fruit_subtype; neither is kept as
# a predictor.
fruit5 <- fruit4[, c(-1, -3)]
# fix(fruit5)
str(fruit5)
## 'data.frame':    59 obs. of  14 variables:
##  $ fruit_name                    : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
##  $ mass                          : num  192 180 176 86 84 80 80 76 178 172 ...
##  $ width                         : num  8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
##  $ height                        : num  7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
##  $ color_score                   : num  0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
##  $ fruit_subtype_mandarin        : int  0 0 0 1 1 1 1 1 0 0 ...
##  $ fruit_subtype_braeburn        : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ fruit_subtype_golden_delicious: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_cripps_pink     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_spanish_jumbo   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_selected_seconds: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_turkey_navel    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_spanish_belsan  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit_subtype_unknown         : int  0 0 0 0 0 0 0 0 0 0 ...
# Seed the RNG so the cross-validation folds, and therefore the resampled
# accuracies, are identical on every run.  The figures recorded below came
# from an unseeded run, so a fresh run may differ slightly from them.
set.seed(2019)

# Resampling scheme: 10-fold cross-validation, repeated 3 times.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# apply the knn on the whole data
# fix(fruit5)

# Predictors are centered and scaled before fitting; tuneLength = 10 asks
# caret to evaluate ten candidate values of k.
knnMod2 <- train(
  fruit_name ~ .,
  data = fruit5,
  method = "knn",
  trControl = fitControl,
  preProcess = c("center", "scale"),
  tuneLength = 10
)

print(knnMod2)
## k-Nearest Neighbors 
## 
## 59 samples
## 13 predictors
##  4 classes: 'apple', 'lemon', 'mandarin', 'orange' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 54, 54, 53, 54, 54, 52, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9838095  0.9766204
##    7  0.9668254  0.9513889
##    9  0.8960317  0.8496546
##   11  0.7715079  0.6643729
##   13  0.7318254  0.6074530
##   15  0.6100000  0.4254632
##   17  0.5937302  0.4050109
##   19  0.5189683  0.2910614
##   21  0.4288095  0.1539879
##   23  0.4572222  0.2004943
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the resampling results across the candidate values of k.
plot(knnMod2)

# Predict on the rows used for fitting and tabulate the predictions against
# the observed fruit names.
pred <- predict(knnMod2, newdata = fruit5)
confusionMatrix(data = pred, reference = fruit5[["fruit_name"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction apple lemon mandarin orange
##   apple       19     0        0      0
##   lemon        0    16        0      0
##   mandarin     0     0        5      0
##   orange       0     0        0     19
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9394, 1)
##     No Information Rate : 0.322      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: apple Class: lemon Class: mandarin
## Sensitivity                 1.000       1.0000         1.00000
## Specificity                 1.000       1.0000         1.00000
## Pos Pred Value              1.000       1.0000         1.00000
## Neg Pred Value              1.000       1.0000         1.00000
## Prevalence                  0.322       0.2712         0.08475
## Detection Rate              0.322       0.2712         0.08475
## Detection Prevalence        0.322       0.2712         0.08475
## Balanced Accuracy           1.000       1.0000         1.00000
##                      Class: orange
## Sensitivity                  1.000
## Specificity                  1.000
## Pos Pred Value               1.000
## Neg Pred Value               1.000
## Prevalence                   0.322
## Detection Rate               0.322
## Detection Prevalence         0.322
## Balanced Accuracy            1.000