library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Load the built-in iris data set and keep a working copy for modelling.
data(iris)
data1 <- iris
# Check the column types: four numeric measurements plus the Species factor.
str(data1)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Tune a k-nearest-neighbours classifier for Species using repeated
# 10-fold cross-validation (3 repeats).
# The RNG seed is fixed so that the resampling folds, and therefore the
# accuracy table and the selected k, are the same on every run. The output
# recorded below was produced without a fixed seed, so re-running may give
# slightly different numbers.
set.seed(123)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knnMod2 <- train(
  Species ~ .,
  data = data1,
  method = "knn",
  trControl = fitControl,
  # kNN is distance based, so centre and scale every predictor first.
  preProcess = c("center", "scale"),
  # Evaluate 10 candidate values of k.
  tuneLength = 10
)
print(knnMod2)
## k-Nearest Neighbors
##
## 150 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9600000 0.9400000
## 7 0.9600000 0.9400000
## 9 0.9488889 0.9233333
## 11 0.9511111 0.9266667
## 13 0.9688889 0.9533333
## 15 0.9666667 0.9500000
## 17 0.9600000 0.9400000
## 19 0.9511111 0.9266667
## 21 0.9488889 0.9233333
## 23 0.9511111 0.9266667
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
# Plot the cross-validated accuracy for each candidate k.
plot(knnMod2)
# Predict on the same rows the model was fitted to, so the confusion matrix
# is an in-sample (resubstitution) summary, not a hold-out estimate.
pred <- predict(knnMod2, newdata = data1)
confusionMatrix(data = pred, reference = data1$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 3
## virginica 0 2 47
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.9239, 0.9891)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.95
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9600 0.9400
## Specificity 1.0000 0.9700 0.9800
## Pos Pred Value 1.0000 0.9412 0.9592
## Neg Pred Value 1.0000 0.9798 0.9703
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3200 0.3133
## Detection Prevalence 0.3333 0.3400 0.3267
## Balanced Accuracy 1.0000 0.9650 0.9600
# Example two: fruit_data_with_colors.txt
library(caret)
# Read the tab-separated fruit measurements.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.table() keeps
# strings as character by default, but the str() output below and the later
# confusionMatrix() call expect fruit_name to be a factor.
fruit <- read.table(
  "file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/KNN/case study- KNN/KNN-to-classify-fruits-master/fruit_data_with_colors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Keep the class label and the four numeric measurements (columns 2, 4, 5, 6
# and 7 of the raw file), selected by name so a change in column order cannot
# silently pick the wrong variables.
fruit <- fruit[, c("fruit_name", "mass", "width", "height", "color_score")]
str(fruit)
## 'data.frame': 59 obs. of 5 variables:
## $ fruit_name : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
## $ mass : num 192 180 176 86 84 80 80 76 178 172 ...
## $ width : num 8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
## $ height : num 7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
## $ color_score: num 0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
# Tune a kNN classifier for fruit_name using repeated 10-fold
# cross-validation (3 repeats).
# The RNG seed is fixed so the resampling folds and the selected k are
# reproducible. The output recorded below was produced without a fixed seed,
# so re-running may give slightly different numbers.
set.seed(123)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knnMod2 <- train(
  fruit_name ~ .,
  data = fruit,
  method = "knn",
  trControl = fitControl,
  # kNN is distance based, so centre and scale every predictor first.
  preProcess = c("center", "scale"),
  # Evaluate 10 candidate values of k.
  tuneLength = 10
)
# List the components stored in the fitted model object.
summary(knnMod2)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 4 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 4 -none- character
## param 0 -none- list
# Show the resampling results for every candidate k and the k that was chosen.
print(knnMod2)
## k-Nearest Neighbors
##
## 59 samples
## 4 predictor
## 4 classes: 'apple', 'lemon', 'mandarin', 'orange'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 53, 53, 53, 52, 54, 53, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9489683 0.92723064
## 7 0.9356349 0.90804204
## 9 0.8982540 0.85549704
## 11 0.8389683 0.77036974
## 13 0.7921429 0.69977445
## 15 0.6915079 0.55691944
## 17 0.6546032 0.49725470
## 19 0.5073810 0.27615439
## 21 0.3740476 0.07196162
## 23 0.3419841 0.02170672
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the cross-validated accuracy for each candidate k.
plot(knnMod2)
# Predict on the same rows the model was fitted to; the confusion matrix is
# therefore an in-sample summary. Column 1 of `fruit` is the class label.
pred <- predict(knnMod2, newdata = fruit)
confusionMatrix(data = pred, reference = fruit[, 1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction apple lemon mandarin orange
## apple 18 0 0 0
## lemon 0 16 0 0
## mandarin 0 0 5 0
## orange 1 0 0 19
##
## Overall Statistics
##
## Accuracy : 0.9831
## 95% CI : (0.9091, 0.9996)
## No Information Rate : 0.322
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9762
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: apple Class: lemon Class: mandarin
## Sensitivity 0.9474 1.0000 1.00000
## Specificity 1.0000 1.0000 1.00000
## Pos Pred Value 1.0000 1.0000 1.00000
## Neg Pred Value 0.9756 1.0000 1.00000
## Prevalence 0.3220 0.2712 0.08475
## Detection Rate 0.3051 0.2712 0.08475
## Detection Prevalence 0.3051 0.2712 0.08475
## Balanced Accuracy 0.9737 1.0000 1.00000
## Class: orange
## Sensitivity 1.0000
## Specificity 0.9750
## Pos Pred Value 0.9500
## Neg Pred Value 1.0000
## Prevalence 0.3220
## Detection Rate 0.3220
## Detection Prevalence 0.3390
## Balanced Accuracy 0.9875
# We can also split the data into training and testing sets.
# caret is loaded before its first use in this section (createDataPartition).
library(caret)
# Re-read the full table, this time keeping every column.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.table() keeps
# strings as character by default, but the str() output below shows
# fruit_name as a factor.
fruit <- read.table(
  "file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/KNN/case study- KNN/KNN-to-classify-fruits-master/fruit_data_with_colors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = TRUE
)
# 70/30 partition on the class label. The seed is fixed so the same rows land
# in each part on every run.
# NOTE: `split` and `train` share their names with base::split() and
# caret::train(). A call such as train(...) still reaches the function because
# R skips non-function objects when resolving a call.
# NOTE(review): `train` and `test` are not referenced by the code that follows
# in this section; the models below are fitted and evaluated on the full data.
set.seed(123)
split <- createDataPartition(fruit$fruit_name, p = 0.7, list = FALSE)
train <- fruit[split, ]
test <- fruit[-split, ]
# One-hot encode fruit_subtype and append the indicator columns.
d1 <- dummyVars(~ fruit_subtype, data = fruit)
dummies <- predict(d1, newdata = fruit)
fruit2 <- cbind(fruit, dummies)
# Drop the original fruit_subtype column (column 3) now that it is encoded.
fruit3 <- fruit2[, -3]
str(fruit3)
## 'data.frame': 59 obs. of 16 variables:
## $ fruit_label : int 1 1 1 2 2 2 2 2 1 1 ...
## $ fruit_name : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
## $ mass : num 192 180 176 86 84 80 80 76 178 172 ...
## $ width : num 8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
## $ height : num 7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
## $ color_score : num 0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
## $ fruit_subtype.braeburn : num 0 0 0 0 0 0 0 0 1 1 ...
## $ fruit_subtype.cripps_pink : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.golden_delicious: num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.granny_smith : num 1 1 1 0 0 0 0 0 0 0 ...
## $ fruit_subtype.mandarin : num 0 0 0 1 1 1 1 1 0 0 ...
## $ fruit_subtype.selected_seconds: num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.spanish_belsan : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.spanish_jumbo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.turkey_navel : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype.unknown : num 0 0 0 0 0 0 0 0 0 0 ...
# Tune a kNN classifier for fruit_name on the measurements plus the
# fruit_subtype indicator columns, using repeated 10-fold cross-validation
# (3 repeats).
#
# fruit_label is excluded from the predictors. It appears to be an integer
# coding of the outcome itself (in the str() output above, 1 lines up with
# apple and 2 with mandarin), so leaving it in lets the model read the answer
# from a predictor. The fastDummies variant later in this script drops that
# column as well. It is removed in the formula rather than from fruit3, so
# positional references such as fruit3[, 2] keep working.
#
# The RNG seed is fixed so the resampling folds and the selected k are
# reproducible. The output recorded below was generated with fruit_label
# still included (15 predictors) and without a fixed seed, so re-running will
# report 14 predictors and somewhat different accuracies.
set.seed(123)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knnMod2 <- train(
  fruit_name ~ . - fruit_label,
  data = fruit3,
  method = "knn",
  trControl = fitControl,
  # kNN is distance based, so centre and scale every predictor first.
  preProcess = c("center", "scale"),
  # Evaluate 10 candidate values of k.
  tuneLength = 10
)
# List the components stored in the fitted model object.
summary(knnMod2)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 15 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 4 -none- character
## param 0 -none- list
# Show the resampling results for every candidate k and the k that was chosen.
print(knnMod2)
## k-Nearest Neighbors
##
## 59 samples
## 15 predictors
## 4 classes: 'apple', 'lemon', 'mandarin', 'orange'
##
## Pre-processing: centered (15), scaled (15)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 52, 52, 54, 53, 54, 53, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9944444 0.9916667
## 7 0.9722222 0.9596154
## 9 0.8593651 0.7987543
## 11 0.7369841 0.6202357
## 13 0.7377778 0.6212243
## 15 0.6877778 0.5500664
## 17 0.6334921 0.4762837
## 19 0.5330159 0.3267684
## 21 0.4982540 0.2638573
## 23 0.4985714 0.2666597
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the cross-validated accuracy for each candidate k.
plot(knnMod2)
# The `pred` component is NULL here (see the recorded output), presumably
# because trainControl() was not asked to keep the held-out predictions
# (savePredictions) -- confirm against the caret documentation.
knnMod2$pred
# Predict on the same rows the model was fitted to; the confusion matrix is
# therefore an in-sample summary. Column 2 of fruit3 is fruit_name.
pred <- predict(knnMod2, newdata = fruit3)
confusionMatrix(data = pred, reference = fruit3[, 2])
## Confusion Matrix and Statistics
##
## Reference
## Prediction apple lemon mandarin orange
## apple 19 0 0 0
## lemon 0 16 0 0
## mandarin 0 0 5 0
## orange 0 0 0 19
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9394, 1)
## No Information Rate : 0.322
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: apple Class: lemon Class: mandarin
## Sensitivity 1.000 1.0000 1.00000
## Specificity 1.000 1.0000 1.00000
## Pos Pred Value 1.000 1.0000 1.00000
## Neg Pred Value 1.000 1.0000 1.00000
## Prevalence 0.322 0.2712 0.08475
## Detection Rate 0.322 0.2712 0.08475
## Detection Prevalence 0.322 0.2712 0.08475
## Balanced Accuracy 1.000 1.0000 1.00000
## Class: orange
## Sensitivity 1.000
## Specificity 1.000
## Pos Pred Value 1.000
## Neg Pred Value 1.000
## Prevalence 0.322
## Detection Rate 0.322
## Detection Prevalence 0.322
## Balanced Accuracy 1.000
# Same confusion matrix, with the reference column selected by name instead
# of by position.
confusionMatrix(data = pred, reference = fruit3[["fruit_name"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction apple lemon mandarin orange
## apple 19 0 0 0
## lemon 0 16 0 0
## mandarin 0 0 5 0
## orange 0 0 0 19
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9394, 1)
## No Information Rate : 0.322
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: apple Class: lemon Class: mandarin
## Sensitivity 1.000 1.0000 1.00000
## Specificity 1.000 1.0000 1.00000
## Pos Pred Value 1.000 1.0000 1.00000
## Neg Pred Value 1.000 1.0000 1.00000
## Prevalence 0.322 0.2712 0.08475
## Detection Rate 0.322 0.2712 0.08475
## Detection Prevalence 0.322 0.2712 0.08475
## Balanced Accuracy 1.000 1.0000 1.00000
## Class: orange
## Sensitivity 1.000
## Specificity 1.000
## Pos Pred Value 1.000
## Neg Pred Value 1.000
## Prevalence 0.322
## Detection Rate 0.322
## Detection Prevalence 0.322
## Balanced Accuracy 1.000
# Using scaling and a categorical variable in KNN:
# the fruit subtype categories are also used as predictors.
library(fastDummies)
# One-hot encode fruit_subtype only. The two earlier dummy_cols() calls were
# removed because their results were overwritten before being used.
# remove_first_dummy = TRUE omits one indicator so the remaining columns are
# not redundant (granny_smith is absent from the str() output below).
fruit4 <- dummy_cols(
  fruit,
  select_columns = "fruit_subtype",
  remove_first_dummy = TRUE
)
# Drop the numeric class code and the original subtype column (columns 1 and
# 3), selected by name so a change in column order cannot drop the wrong ones.
fruit5 <- fruit4[, setdiff(names(fruit4), c("fruit_label", "fruit_subtype"))]
str(fruit5)
## 'data.frame': 59 obs. of 14 variables:
## $ fruit_name : Factor w/ 4 levels "apple","lemon",..: 1 1 1 3 3 3 3 3 1 1 ...
## $ mass : num 192 180 176 86 84 80 80 76 178 172 ...
## $ width : num 8.4 8 7.4 6.2 6 5.8 5.9 5.8 7.1 7.4 ...
## $ height : num 7.3 6.8 7.2 4.7 4.6 4.3 4.3 4 7.8 7 ...
## $ color_score : num 0.55 0.59 0.6 0.8 0.79 0.77 0.81 0.81 0.92 0.89 ...
## $ fruit_subtype_mandarin : int 0 0 0 1 1 1 1 1 0 0 ...
## $ fruit_subtype_braeburn : int 0 0 0 0 0 0 0 0 1 1 ...
## $ fruit_subtype_golden_delicious: int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_cripps_pink : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_spanish_jumbo : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_selected_seconds: int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_turkey_navel : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_spanish_belsan : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit_subtype_unknown : int 0 0 0 0 0 0 0 0 0 0 ...
# Tune a kNN classifier for fruit_name on the measurements plus the
# fruit_subtype indicator columns, using repeated 10-fold cross-validation
# (3 repeats).
# The RNG seed is fixed so the resampling folds and the selected k are
# reproducible. The output recorded below was produced without a fixed seed,
# so re-running may give slightly different numbers.
set.seed(123)
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
knnMod2 <- train(
  fruit_name ~ .,
  data = fruit5,
  method = "knn",
  trControl = fitControl,
  # kNN is distance based, so centre and scale every predictor first.
  preProcess = c("center", "scale"),
  # Evaluate 10 candidate values of k.
  tuneLength = 10
)
print(knnMod2)
## k-Nearest Neighbors
##
## 59 samples
## 13 predictors
## 4 classes: 'apple', 'lemon', 'mandarin', 'orange'
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 54, 54, 53, 54, 54, 52, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9838095 0.9766204
## 7 0.9668254 0.9513889
## 9 0.8960317 0.8496546
## 11 0.7715079 0.6643729
## 13 0.7318254 0.6074530
## 15 0.6100000 0.4254632
## 17 0.5937302 0.4050109
## 19 0.5189683 0.2910614
## 21 0.4288095 0.1539879
## 23 0.4572222 0.2004943
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Plot the cross-validated accuracy for each candidate k.
plot(knnMod2)
# Predict on the same rows the model was fitted to; the confusion matrix is
# therefore an in-sample summary, not a hold-out estimate.
pred <- predict(knnMod2, newdata = fruit5)
confusionMatrix(data = pred, reference = fruit5$fruit_name)
## Confusion Matrix and Statistics
##
## Reference
## Prediction apple lemon mandarin orange
## apple 19 0 0 0
## lemon 0 16 0 0
## mandarin 0 0 5 0
## orange 0 0 0 19
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9394, 1)
## No Information Rate : 0.322
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: apple Class: lemon Class: mandarin
## Sensitivity 1.000 1.0000 1.00000
## Specificity 1.000 1.0000 1.00000
## Pos Pred Value 1.000 1.0000 1.00000
## Neg Pred Value 1.000 1.0000 1.00000
## Prevalence 0.322 0.2712 0.08475
## Detection Rate 0.322 0.2712 0.08475
## Detection Prevalence 0.322 0.2712 0.08475
## Balanced Accuracy 1.000 1.0000 1.00000
## Class: orange
## Sensitivity 1.000
## Specificity 1.000
## Pos Pred Value 1.000
## Neg Pred Value 1.000
## Prevalence 0.322
## Detection Rate 0.322
## Detection Prevalence 0.322
## Balanced Accuracy 1.000