Sunday, February 3, 2019

Basic of R Session 21.1- Support Vector Machine- e1071 package

# NOTE: the former rm(list = ls()) call was removed. It silently deletes every
# object in the reader's workspace, yet it does not detach packages or reset
# options, so it does not provide a clean session. Restart R instead.

# Work on a copy of the built-in iris data set.
data1 <- iris

# Number of rows and columns in the data set.
dim(data1)
## [1] 150   5
# Column types and a preview of the first values of each column.
str(data1)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# fix(data1) would open the data in an interactive editor; left disabled.
#fix(data1)
# Per-column summary statistics, plus the number of rows per Species level.
summary(data1)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

For the linear SVM we remove one of the species categories, because we want to consider only two categories.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Structure of the full data set before any rows are filtered out.
str(data1)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Remove the "setosa" rows. The namespace prefix makes explicit that dplyr's
# filter() is meant, not the stats::filter() it masks.
data2 <- dplyr::filter(data1, Species != "setosa")
str(object = data2)
## 'data.frame':    100 obs. of  5 variables:
##  $ Sepal.Length: num  7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
##  $ Sepal.Width : num  3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
##  $ Petal.Length: num  4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
##  $ Petal.Width : num  1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
# fix(data2)

# The "setosa" rows are gone, but the Species factor still carries that level,
# so the unused level has to be dropped from the factor as well.

data2 <- data.frame(data2)
str(data2[["Species"]])
##  Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
# Drop the now-empty "setosa" level and confirm only two levels remain.
data2[["Species"]] <- droplevels(data2[["Species"]])
str(data2[["Species"]])
##  Factor w/ 2 levels "versicolor","virginica": 1 1 1 1 1 1 1 1 1 1 ...

Plot the data points; if a graphics error occurs, call dev.off() and try again.

library(ggplot2)

# Scatter plots of four feature pairs, coloured by species.
# Columns are named bare inside aes(): ggplot2 looks them up in `data2`, which
# gives clean axis and legend labels and avoids the warning ggplot2 raises for
# `data2$...` inside aes().
ggplot(data2, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point()
ggplot(data2, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point()
ggplot(data2, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point()
ggplot(data2, aes(x = Petal.Length, y = Sepal.Width, color = Species)) +
  geom_point()

Using the e1071 package

library(e1071)

# Two classes and two predictors only: a linear-kernel SVM on the sepal
# measurements, with the predictors scaled before fitting.

dtm1 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "linear",
  scale = TRUE
)
summary(dtm1)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Components of the fitted model that can be inspected if needed:
# dtm1$SV
# dtm1$kernel
# dtm1$tot.nSV
# dtm1$decision.values
# dtm1$fitted
# dtm1$coefs

# Decision regions in the Sepal.Length / Sepal.Width plane.
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Predict on the same data the model was fitted on, so the fit can be judged
# with a confusion matrix.
predict1 <- predict(dtm1, newdata = data2)
library(caret)
## Loading required package: lattice
# Cross-tabulate predicted against actual species and report accuracy metrics.
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
## 

Change the value of C (the cost parameter)

# Linear SVM with a larger misclassification cost.
# The argument name is `cost` (lower case). The original call passed `Cost`,
# which svm() silently absorbs through `...`, so the model was still fitted
# with the default cost = 1 (as the recorded summary shows). Re-run this
# section to refresh the output printed below.
dtm2 <- svm(Species ~ Sepal.Length + Sepal.Width, data = data2,
            kernel = "linear", cost = 100, scale = TRUE)
summary(dtm2)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", Cost = 100, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Decision regions of the second linear model.
plot(dtm2, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Confusion matrix on the fitting data to judge the classifier.
predict2 <- predict(dtm2, newdata = data2)
confusionMatrix(data = predict2, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
## 

———-polynomial Kernel———

# Same two predictors, polynomial kernel (the recorded summary shows the
# defaults in effect: degree 3, coef.0 0), predictors scaled.
dtm3 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "polynomial",
  scale = TRUE
)
summary(dtm3)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "polynomial", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##       gamma:  0.5 
##      coef.0:  0 
## 
## Number of Support Vectors:  77
## 
##  ( 38 39 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Decision regions of the polynomial-kernel model.
plot(dtm3, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Confusion matrix on the fitting data (predict1 is reused and overwritten).
predict1 <- predict(dtm3, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         39        16
##   virginica          11        34
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.4414         
##                                          
##             Sensitivity : 0.7800         
##             Specificity : 0.6800         
##          Pos Pred Value : 0.7091         
##          Neg Pred Value : 0.7556         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3900         
##    Detection Prevalence : 0.5500         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
## 

———–Radial Kernel——————–

# Radial-basis kernel on the same two predictors. Unlike the linear and
# polynomial fits, the predictors are NOT scaled here (scale = FALSE).
dtm4 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "radial",
  scale = FALSE
)
summary(dtm4)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "radial", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  70
## 
##  ( 35 35 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Decision regions of the radial-kernel model.
plot(dtm4, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Confusion matrix on the fitting data (predict1 is reused and overwritten).
predict1 <- predict(dtm4, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        14
##   virginica          12        36
##                                           
##                Accuracy : 0.74            
##                  95% CI : (0.6427, 0.8226)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 8.337e-07       
##                                           
##                   Kappa : 0.48            
##  Mcnemar's Test P-Value : 0.8445          
##                                           
##             Sensitivity : 0.7600          
##             Specificity : 0.7200          
##          Pos Pred Value : 0.7308          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3800          
##    Detection Prevalence : 0.5200          
##       Balanced Accuracy : 0.7400          
##                                           
##        'Positive' Class : versicolor      
## 

——–Sigmoid Kernel—————

# Sigmoid kernel on the same two predictors, fitted on unscaled data.
# NOTE(review): scale = FALSE differs from the linear/polynomial fits, and the
# recorded confusion matrix shows every observation predicted as virginica.
# The unscaled inputs may be saturating the kernel -- consider scale = TRUE.
dtm5 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "sigmoid",
  scale = FALSE
)
summary(dtm5)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "sigmoid", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  1 
##       gamma:  0.5 
##      coef.0:  0 
## 
## Number of Support Vectors:  100
## 
##  ( 50 50 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Decision regions of the sigmoid-kernel model.
plot(dtm5, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Confusion matrix on the fitting data (predict1 is reused and overwritten).
predict1 <- predict(dtm5, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor          0         0
##   virginica          50        50
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.3983, 0.6017)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.5398          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 4.219e-12       
##                                           
##             Sensitivity : 0.0             
##             Specificity : 1.0             
##          Pos Pred Value : NaN             
##          Neg Pred Value : 0.5             
##              Prevalence : 0.5             
##          Detection Rate : 0.0             
##    Detection Prevalence : 0.0             
##       Balanced Accuracy : 0.5             
##                                           
##        'Positive' Class : versicolor      
## 

Use tuning to find the best parameters

# Grid search over gamma and cost using cross-validation (10-fold, per the
# recorded output).
# set.seed() fixes the random fold assignment so the selected parameters are
# reproducible from run to run. kernel = "radial" spells out the svm() default
# that is being tuned; gamma has no effect on a linear kernel.
set.seed(123)
tune1 <- tune.svm(Species ~ Sepal.Length + Sepal.Width, data = data2,
                  kernel = "radial",
                  gamma = c(0.1, 0.2, 0.3, 0.5, 0.7), cost = c(1, 10, 100))
print(tune1)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##    0.7    1
## 
## - best performance: 0.29
# Error surface of the grid search.
plot(tune1)
# Refit with the parameters selected by the tuning run.
# tune.svm() tuned svm()'s default radial kernel, so the refit uses the same
# kernel. The original call hard-coded gamma = 0.3 on a linear kernel, where
# gamma is ignored, and 0.3 did not match the tuned value either. Re-run this
# section to refresh the output printed below.

dtm1 <- svm(Species ~ Sepal.Length + Sepal.Width, data = data2,
            kernel = "radial", scale = TRUE,
            cost = tune1$best.parameters$cost,
            gamma = tune1$best.parameters$gamma)
summary(dtm1)
## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", cost = 1, gamma = 0.3, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.3 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Decision regions of the refitted model.
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width)
# Confusion matrix on the fitting data to judge the classifier.
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
## 

—————————————————

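Now using all four variables. (Note: the heading originally promised three categories, but the code below still fits on data2, which contains only the two categories versicolor and virginica; to use all three categories, replace data2 with data1 throughout this section.)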

—————————————–

# Re-check the structure of the full three-species data set.
str(data1)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Reload the untouched iris data (data1 is not used by the fit below).
data1 <- iris

# fix(data2)
# All four measurements as predictors (Species ~ .), linear kernel, fitted on
# data2, i.e. the two-species subset.
dtm1 <- svm(
  Species ~ .,
  data = data2,
  kernel = "linear",
  scale = TRUE
)
summary(dtm1)
## 
## Call:
## svm(formula = Species ~ ., data = data2, kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# The model uses four predictors, so a 2-D plot can only show a slice of it:
# two variables go on the axes and the other two are held fixed.
# Without `slice`, plot.svm() holds the remaining numeric variables at 0, far
# outside the observed data. Fixing them at their medians shows a slice
# through the bulk of the observations instead.
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width,
     slice = list(Petal.Length = median(data2$Petal.Length),
                  Petal.Width = median(data2$Petal.Width)))
# Confusion matrix on the fitting data to judge the classifier.
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         46         0
##   virginica           4        50
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.9007, 0.989)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.92           
##  Mcnemar's Test P-Value : 0.1336         
##                                          
##             Sensitivity : 0.9200         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9259         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4600         
##    Detection Prevalence : 0.4600         
##       Balanced Accuracy : 0.9600         
##                                          
##        'Positive' Class : versicolor     
## 

Use tuning to find the best parameters

# Grid search over gamma and cost for the four-predictor model, using
# cross-validation (10-fold, per the recorded output).
# set.seed() fixes the random fold assignment so the selected parameters are
# reproducible from run to run. kernel = "radial" spells out the svm() default
# that is being tuned; gamma has no effect on a linear kernel.
set.seed(123)
tune1 <- tune.svm(Species ~ ., data = data2,
                  kernel = "radial",
                  gamma = c(0.1, 0.2, 0.3, 0.5, 0.7), cost = c(1, 10, 100))
print(tune1)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##    0.1    1
## 
## - best performance: 0.04
# Error surface of the grid search, then the winning gamma / cost combination.
plot(tune1)
tune1[["best.parameters"]]
##   gamma cost
## 1   0.1    1
# Refit with the parameters selected by the tuning run.
# tune.svm() tuned svm()'s default radial kernel, so the refit uses the same
# kernel and reads cost and gamma from the tuning result. The original call
# used a linear kernel, where gamma is ignored. Re-run this section to refresh
# the output printed below.
dtm1 <- svm(Species ~ ., data = data2,
            kernel = "radial", scale = TRUE,
            cost = tune1$best.parameters$cost,
            gamma = tune1$best.parameters$gamma)
summary(dtm1)
## 
## Call:
## svm(formula = Species ~ ., data = data2, kernel = "linear", cost = 1, 
##     gamma = 0.1, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica
# Judge the refitted classifier with a confusion matrix on the fitting data.
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         46         0
##   virginica           4        50
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.9007, 0.989)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.92           
##  Mcnemar's Test P-Value : 0.1336         
##                                          
##             Sensitivity : 0.9200         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9259         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4600         
##    Detection Prevalence : 0.4600         
##       Balanced Accuracy : 0.9600         
##                                          
##        'Positive' Class : versicolor     
## 
#--------------------------*****-------------------------------#

No comments:

Post a Comment