# Load the built-in iris data set into a working copy and inspect it.
# The workspace is deliberately NOT cleared here: `rm(list = ls())` deletes
# every object of whoever runs or sources this script, which is a side effect
# a script should not impose. Start a fresh R session instead if needed.
data1 <- iris

# Number of rows and columns.
dim(data1)

## [1] 150   5

# Column names, types and the first few values of each column.
str(data1)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

#fix(data1)
# Per-column summary statistics (and counts per level for the factor).
summary(data1)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

For the linear SVM we remove one of the species categories, because we want to consider only two categories.

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Look at the full data once more before subsetting; Species has 3 levels.
str(data1)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Keep only the rows whose species is not "setosa", leaving two classes.
data2 <- data1 %>%
  filter(Species != "setosa")
str(data2)

## 'data.frame':    100 obs. of  5 variables:
##  $ Sepal.Length: num  7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
##  $ Sepal.Width : num  3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
##  $ Petal.Length: num  4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
##  $ Petal.Width : num  1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...

#fix(data2)

# Filtering removed the "setosa" rows, but the Species factor still lists
# "setosa" as a level, so the unused level has to be dropped as well.

data2 <- data.frame(data2)
str(data2[["Species"]])

##  Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...

# droplevels() discards factor levels that no longer occur in the data.
data2[["Species"]] <- droplevels(data2[["Species"]])
str(data2[["Species"]])

##  Factor w/ 2 levels "versicolor","virginica": 1 1 1 1 1 1 1 1 1 1 ...

Plot the data points; if you get a plotting error, call dev.off() and try again.

library(ggplot2)

# Pairwise scatter plots of the measurements, coloured by species.
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in `data2` (the `data` argument). Writing `data2$Sepal.Length` there
# bypasses the data argument and produces axis/legend titles such as
# "data2$Sepal.Length".
ggplot(data2, aes(Sepal.Length, Sepal.Width, color = Species)) +
  geom_point()

ggplot(data2, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point()

ggplot(data2, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point()

ggplot(data2, aes(Petal.Length, Sepal.Width, color = Species)) +
  geom_point()

Using the e1071 package

library(e1071)

# Two classes (versicolor / virginica) and two predictors only:
# a linear-kernel SVM on the sepal measurements, with scaling switched on.

dtm1 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "linear",
  scale = TRUE
)
summary(dtm1)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Components of the fitted model that can be inspected (left disabled):
#dtm1$SV
#dtm1$kernel
#dtm1$tot.nSV
#dtm1$decision.values
#dtm1$fitted
#dtm1$coefs

# Plot the fitted model over the two sepal measurements.
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Judge the classifier with a confusion matrix. Predictions are made on
# data2 itself, i.e. the same rows the model was trained on.
predict1 <- predict(dtm1, newdata = data2)
library(caret)

## Loading required package: lattice

confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
##

Change the value of C (the cost parameter)

# Refit the linear SVM with a larger penalty, cost = 100.
# The argument must be spelled `cost` in lower case: R argument names are
# case-sensitive, so the former `Cost = 100` never reached svm()'s `cost`
# parameter and the model was still fitted with cost = 1. The recorded
# output below predates this fix, which is why it still reports cost 1 and
# matches the first model.
dtm2 <- svm(Species ~ Sepal.Length + Sepal.Width,
            data = data2, kernel = "linear", cost = 100, scale = TRUE)
summary(dtm2)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", Cost = 100, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Plot the second linear model over the two sepal measurements.
plot(dtm2, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of dtm2's predictions on its own training rows (data2).
predict2 <- predict(dtm2, newdata = data2)
confusionMatrix(data = predict2, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
##

———-polynomial Kernel———

# Same two predictors, now with a polynomial kernel and scaling switched on.
dtm3 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "polynomial",
  scale = TRUE
)
summary(dtm3)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "polynomial", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##       gamma:  0.5 
##      coef.0:  0 
## 
## Number of Support Vectors:  77
## 
##  ( 38 39 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Plot the polynomial-kernel model over the two sepal measurements.
plot(dtm3, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of dtm3's predictions on its own training rows (data2).
# predict1 is overwritten here with the polynomial model's predictions.
predict1 <- predict(dtm3, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         39        16
##   virginica          11        34
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.4414         
##                                          
##             Sensitivity : 0.7800         
##             Specificity : 0.6800         
##          Pos Pred Value : 0.7091         
##          Neg Pred Value : 0.7556         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3900         
##    Detection Prevalence : 0.5500         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
##

———–Radial Kernel——————–

# Radial-kernel SVM on the two sepal measurements.
# Unlike the linear and polynomial fits, scaling is switched off here.
dtm4 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "radial",
  scale = FALSE
)
summary(dtm4)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "radial", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  70
## 
##  ( 35 35 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Plot the radial-kernel model over the two sepal measurements.
plot(dtm4, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of dtm4's predictions on its own training rows (data2).
# predict1 is overwritten here with the radial model's predictions.
predict1 <- predict(dtm4, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        14
##   virginica          12        36
##                                           
##                Accuracy : 0.74            
##                  95% CI : (0.6427, 0.8226)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 8.337e-07       
##                                           
##                   Kappa : 0.48            
##  Mcnemar's Test P-Value : 0.8445          
##                                           
##             Sensitivity : 0.7600          
##             Specificity : 0.7200          
##          Pos Pred Value : 0.7308          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3800          
##    Detection Prevalence : 0.5200          
##       Balanced Accuracy : 0.7400          
##                                           
##        'Positive' Class : versicolor      
##

——–Sigmoid Kernel—————

# Sigmoid-kernel SVM on the two sepal measurements, with scaling switched off.
# NOTE(review): the recorded run of this model uses all 100 rows as support
# vectors and predicts a single class; the unscaled inputs may be the cause.
# Worth re-checking with scale = TRUE.
dtm5 <- svm(
  Species ~ Sepal.Length + Sepal.Width,
  data = data2,
  kernel = "sigmoid",
  scale = FALSE
)
summary(dtm5)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "sigmoid", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  1 
##       gamma:  0.5 
##      coef.0:  0 
## 
## Number of Support Vectors:  100
## 
##  ( 50 50 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Plot the sigmoid-kernel model over the two sepal measurements.
plot(dtm5, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of dtm5's predictions on its own training rows (data2).
# predict1 is overwritten here with the sigmoid model's predictions.
predict1 <- predict(dtm5, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor          0         0
##   virginica          50        50
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.3983, 0.6017)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.5398          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 4.219e-12       
##                                           
##             Sensitivity : 0.0             
##             Specificity : 1.0             
##          Pos Pred Value : NaN             
##          Neg Pred Value : 0.5             
##              Prevalence : 0.5             
##          Detection Rate : 0.0             
##    Detection Prevalence : 0.0             
##       Balanced Accuracy : 0.5             
##                                           
##        'Positive' Class : versicolor      
##

Use tuning to find the best parameter values

# Grid-search gamma and cost with tune.svm(). As the printed result shows,
# each combination is scored by 10-fold cross-validation. The folds are drawn
# at random, so the RNG is seeded first to make the selected parameters
# repeatable from run to run. No `kernel` is passed, so svm()'s default
# kernel is the one being tuned.
set.seed(123)
tune1 <- tune.svm(Species ~ Sepal.Length + Sepal.Width, data = data2,
                  gamma = c(0.1, 0.2, 0.3, 0.5, 0.7), cost = c(1, 10, 100))
print(tune1)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##    0.7    1
## 
## - best performance: 0.29

# Plot the tuning result (performance over the gamma/cost grid).
plot(tune1)

# Fit the model with the parameters the tuning step actually selected.
# The earlier version hard-coded gamma = 0.3 (the recorded tuning run chose
# gamma = 0.7) and used a linear kernel, which does not use gamma at all, so
# the tuned values had no effect: the recorded summary is identical to the
# first untuned linear fit. The values are now read from
# tune1$best.parameters, and the kernel is the radial one that tune.svm()
# tuned (svm()'s default, since no kernel was passed to it).
# The recorded output below predates this change and will differ.
dtm1 <- svm(Species ~ Sepal.Length + Sepal.Width,
            data = data2, kernel = "radial", scale = TRUE,
            cost = tune1$best.parameters$cost,
            gamma = tune1$best.parameters$gamma)
summary(dtm1)

## 
## Call:
## svm(formula = Species ~ Sepal.Length + Sepal.Width, data = data2, 
##     kernel = "linear", cost = 1, gamma = 0.3, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.3 
## 
## Number of Support Vectors:  67
## 
##  ( 34 33 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Plot the refitted model over the two sepal measurements.
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of the refitted model on its own training rows (data2).
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         38        15
##   virginica          12        35
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.346e-06      
##                                          
##                   Kappa : 0.46           
##  Mcnemar's Test P-Value : 0.7003         
##                                          
##             Sensitivity : 0.7600         
##             Specificity : 0.7000         
##          Pos Pred Value : 0.7170         
##          Neg Pred Value : 0.7447         
##              Prevalence : 0.5000         
##          Detection Rate : 0.3800         
##    Detection Prevalence : 0.5300         
##       Balanced Accuracy : 0.7300         
##                                          
##        'Positive' Class : versicolor     
##

—————————————————

now using three categories and all the four variables

—————————————–

# Structure of the full iris data (three species, four measurements).
str(data1)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Reset data1 to a fresh copy of iris.
data1 <- iris

#fix(data2)
# Linear SVM using every other column of the data as a predictor
# (`Species ~ .`).
# NOTE(review): the model is trained on data2 (versicolor vs virginica), not
# on the three-species data1 that was just reloaded, and the recorded summary
# reports 2 classes. Confirm which data set was intended; switching to data1
# also requires changing the later predict()/confusionMatrix() calls.
dtm1 <- svm(
  Species ~ .,
  data = data2,
  kernel = "linear",
  scale = TRUE
)
summary(dtm1)

## 
## Call:
## svm(formula = Species ~ ., data = data2, kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# The model uses four predictors, which cannot all be shown in one 2-D plot,
# so two of them are picked explicitly (not a faithful picture of the model).
plot(dtm1, data = data2, formula = Sepal.Length ~ Sepal.Width)

# Confusion matrix of the four-predictor model on its training rows (data2).
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         46         0
##   virginica           4        50
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.9007, 0.989)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.92           
##  Mcnemar's Test P-Value : 0.1336         
##                                          
##             Sensitivity : 0.9200         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9259         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4600         
##    Detection Prevalence : 0.4600         
##       Balanced Accuracy : 0.9600         
##                                          
##        'Positive' Class : versicolor     
##

Use tuning to find the best parameter values

# Grid-search gamma and cost for the all-predictor model with tune.svm().
# As the printed result shows, scoring uses 10-fold cross-validation. The
# folds are random, so the RNG is seeded first to make the selected
# parameters repeatable. No `kernel` is passed, so svm()'s default kernel is
# the one being tuned.
set.seed(123)
tune1 <- tune.svm(Species ~ ., data = data2,
                  gamma = c(0.1, 0.2, 0.3, 0.5, 0.7), cost = c(1, 10, 100))
print(tune1)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##    0.1    1
## 
## - best performance: 0.04

# Plot the tuning result (performance over the gamma/cost grid).
plot(tune1)

# The winning gamma/cost combination as a one-row data frame.
tune1$best.parameters

##   gamma cost
## 1   0.1    1

# Build the model from the best parameters.
# The earlier version typed the values in by hand and used a linear kernel,
# which does not use gamma, so tuning changed nothing: the recorded summary
# still shows the same 16 support vectors as the untuned fit. The values are
# now read from tune1$best.parameters, and the kernel is the radial one that
# tune.svm() tuned (svm()'s default, since no kernel was passed to it).
# The recorded output below predates this change and will differ.
dtm1 <- svm(Species ~ ., data = data2, kernel = "radial", scale = TRUE,
            cost = tune1$best.parameters$cost,
            gamma = tune1$best.parameters$gamma)
summary(dtm1)

## 
## Call:
## svm(formula = Species ~ ., data = data2, kernel = "linear", cost = 1, 
##     gamma = 0.1, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  versicolor virginica

# Confusion matrix of the final model's predictions on its own training rows
# (data2), to see how well it separates the two species.
predict1 <- predict(dtm1, newdata = data2)
confusionMatrix(data = predict1, reference = data2$Species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   versicolor virginica
##   versicolor         46         0
##   virginica           4        50
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.9007, 0.989)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.92           
##  Mcnemar's Test P-Value : 0.1336         
##                                          
##             Sensitivity : 0.9200         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9259         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4600         
##    Detection Prevalence : 0.4600         
##       Balanced Accuracy : 0.9600         
##                                          
##        'Positive' Class : versicolor     
##

#--------------------------*****-------------------------------#

Learn R

Sunday, February 3, 2019

Basic of R Session 21.1- Support Vector Machine- e1071 package

Basic of R Session 21.1- Support Vector Machine- e1071 package

Dr Manohar Kapse

for linear svm we will remove one of the categories of species, we want to consider only two categories

plot the data points, if error use dev.off()

Using the e1071 package

change the value of C

———-polynomial Kernel———

———–Radial Kernel——————–

——–Sigmoid Kernel—————

Use tuning to find the best parameter values

—————————————————

now using three categories and all the four variables

—————————————–

Use tuning to find the best parameter values

No comments:

Post a Comment

Report Abuse