Thursday, January 10, 2019

Basics of R- Session 20- Random Forest

# NOTE: the original rm(list = ls()) call was removed. It wipes the workspace
# of whoever runs this script and does not reset packages or options; restart
# the R session if a clean state is needed.

# Load the "perceived score" data set.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, which breaks the factor-based steps
# later in this script (droplevels() on Specialization, per-level summary()
# counts). On older R versions this matches the previous default.
# NOTE(review): the first column is imported as "ï..F1" (see the summary
# output), which looks like a UTF-8 byte-order mark leaking into the header.
# fileEncoding = "UTF-8-BOM" would likely fix it but renames the column to
# "F1" - confirm nothing depends on the current name before changing it.
data1 <- read.csv(
  "D:/1 Research/1 preparedness for future/3 decision science specialization/sheet final.csv/final data1.7.csv",
  stringsAsFactors = TRUE
)

# Rows x columns of the imported data.
dim(data1)
## [1] 473  25
# fix(data1)
# Column-wise summary: quartiles for numeric columns, level counts for factors.
summary(data1)
##      ï..F1         pre.score       Age_in_years   Percentage_in_10_Class
##  Min.   :  4.0   Min.   : 30.00   Min.   :19.00   Min.   :58.90         
##  1st Qu.:194.0   1st Qu.: 60.00   1st Qu.:21.00   1st Qu.:79.00         
##  Median :389.0   Median : 70.00   Median :22.00   Median :85.00         
##  Mean   :394.5   Mean   : 68.18   Mean   :22.56   Mean   :84.02         
##  3rd Qu.:592.0   3rd Qu.: 76.67   3rd Qu.:24.00   3rd Qu.:90.00         
##  Max.   :783.0   Max.   :100.00   Max.   :26.00   Max.   :97.20         
##                                                                         
##  Percentage_in_12_Class Percentage_in_Under_Graduate Number_of_siblings
##  Min.   :60.29          Min.   :58.00                Min.   :0.000     
##  1st Qu.:75.20          1st Qu.:67.88                1st Qu.:1.000     
##  Median :83.40          Median :74.00                Median :1.000     
##  Mean   :81.56          Mean   :73.41                Mean   :1.034     
##  3rd Qu.:89.33          3rd Qu.:78.00                3rd Qu.:1.000     
##  Max.   :97.17          Max.   :94.00                Max.   :4.000     
##                                                                        
##  Over.all.percentage.in.MBA   post.score        Gender   
##  Min.   :57.00              Min.   :  7.00   Female:205  
##  1st Qu.:65.60              1st Qu.: 70.00   Male  :268  
##  Median :68.00              Median : 76.67               
##  Mean   :67.87              Mean   : 76.22               
##  3rd Qu.:70.48              3rd Qu.: 85.00               
##  Max.   :77.41              Max.   :100.00               
##                                                          
##           STATE        Previous_Degree               Fathers_qualification
##  Central Zone: 56   Arts       :  3    Under Graduate           :257      
##  East Zone   : 52   Commerce   :148    Post Graduation          :110      
##  North East  :  5   Engineering:235    HSC                      : 35      
##  North Zone  : 86   Management : 44    no formal Education      : 31      
##  South Zone  :258   Science    : 43    DIPLOMA                  : 17      
##  West Zone   : 16                      PhD or higher qualificati:  8      
##                                        (Other)                  : 15      
##                Mothers_qualification        Fathers_occupation
##  DIPLOMA                  :  2       Farming         : 23     
##  HSC                      : 48       Not Alive       :  3     
##  no formal Education      : 33       Professional Job:192     
##  PhD or higher qualificati:  9       RETIRED         : 18     
##  Post Graduation          :109       Self employed   :181     
##  SSC                      : 35       Technical Job   : 56     
##  Under Graduate           :237                                
##         Mothers_occupation Marital_status Place_you_belong_to
##  House wife      :307      Married: 11    Rural     : 44     
##  Professional Job:130      Single :462    Semi Urban:116     
##  Self employed   : 25                     Urban     :313     
##  Technical Job   : 11                                        
##                                                              
##                                                              
##                                                              
##    Total_Family_Income_per_annum  Funding_for_the_MBA_Program
##  0-5 Lakh         :181           Family         :200         
##  10-15 lakh       : 55           Loan           :103         
##  15 lakh and above: 51           Loan and family: 85         
##  5-10 Lakh        :186           Loan and self  : 37         
##                                  Self           : 41         
##                                  Self and Family:  7         
##                                                              
##          Work_Experience              Career_options_after_MBA
##  1-2 year        : 80    Higher Studies           :  5        
##  2-3 year        : 60    Job                      :449        
##  3-4 year        :  9    Not yet decided          :  4        
##  less than 1 year: 50    Self-employed/ entreprene: 15        
##  no experience   :274                                         
##                                                               
##                                                               
##  Alternate_Career_Option_after_MBA_if_not_selected_through_Instit
##  Family Business          : 20                                   
##  Higher Studies           : 47                                   
##  Job                      :244                                   
##  Not yet decided          : 42                                   
##  Self-employed/ entreprene:120                                   
##                                                                  
##                                                                  
##       perceived.Job.Skill   Specialization
##  desired skills :167      Finance  :147   
##  prefered skills:295      HR       : 47   
##  required skills: 11      LOS      :106   
##                           Marketing:173   
##                                           
##                                           
## 
# data set = data1
# Keep only the complete rows of data1.
data2 <- na.omit(data1)

# Discard any Specialization levels that no longer occur in the data.
data2[["Specialization"]] <- droplevels(data2[["Specialization"]])

# Rows x columns after cleaning.
dim(data2)
## [1] 473  25

Split the data into two parts (training and test) using the caret package

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Fix the RNG seed so the partition below is reproducible.
set.seed(100)

# Row indices for a single 70% partition, drawn on the outcome column.
datasplit <- createDataPartition(
  y = data2$Specialization,
  times = 1,
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set (70%);
# all remaining rows form the test set (30%).
datatrain <- data2[datasplit, ]
datatest <- data2[-datasplit, ]

# Check the split sizes: rows x columns of the training and test sets.
dim(datatrain)
## [1] 333  25
dim(datatest)
## [1] 140  25
# Class counts of the outcome (Specialization) in the training set.
summary(datatrain$Specialization)
##   Finance        HR       LOS Marketing 
##       103        33        75       122
# Column-wise summary of the test set.
summary(datatest)
##      ï..F1         pre.score       Age_in_years   Percentage_in_10_Class
##  Min.   :  5.0   Min.   : 33.33   Min.   :19.00   Min.   :61.00         
##  1st Qu.:176.5   1st Qu.: 60.00   1st Qu.:22.00   1st Qu.:79.00         
##  Median :444.5   Median : 71.67   Median :22.00   Median :85.25         
##  Mean   :407.0   Mean   : 68.36   Mean   :22.69   Mean   :83.77         
##  3rd Qu.:600.0   3rd Qu.: 76.67   3rd Qu.:24.00   3rd Qu.:89.40         
##  Max.   :781.0   Max.   :100.00   Max.   :26.00   Max.   :97.20         
##                                                                         
##  Percentage_in_12_Class Percentage_in_Under_Graduate Number_of_siblings
##  Min.   :61.00          Min.   :58.00                Min.   :0.000     
##  1st Qu.:74.80          1st Qu.:67.00                1st Qu.:1.000     
##  Median :80.70          Median :73.08                Median :1.000     
##  Mean   :80.28          Mean   :73.05                Mean   :1.007     
##  3rd Qu.:87.50          3rd Qu.:77.00                3rd Qu.:1.000     
##  Max.   :97.17          Max.   :94.00                Max.   :4.000     
##                                                                        
##  Over.all.percentage.in.MBA   post.score        Gender            STATE   
##  Min.   :59.16              Min.   :  8.00   Female:58   Central Zone:26  
##  1st Qu.:65.08              1st Qu.: 69.58   Male  :82   East Zone   :16  
##  Median :67.27              Median : 76.67               North East  : 1  
##  Mean   :67.52              Mean   : 75.78               North Zone  :19  
##  3rd Qu.:69.45              3rd Qu.: 85.00               South Zone  :74  
##  Max.   :77.41              Max.   :100.00               West Zone   : 4  
##                                                                           
##     Previous_Degree         Fathers_qualification
##  Arts       : 1     Under Graduate     :81       
##  Commerce   :44     Post Graduation    :31       
##  Engineering:74     HSC                : 9       
##  Management : 6     no formal Education: 9       
##  Science    :15     DIPLOMA            : 5       
##                     SSC                : 2       
##                     (Other)            : 3       
##                Mothers_qualification        Fathers_occupation
##  DIPLOMA                  : 0        Farming         :11      
##  HSC                      :20        Not Alive       : 1      
##  no formal Education      : 9        Professional Job:55      
##  PhD or higher qualificati: 1        RETIRED         : 2      
##  Post Graduation          :32        Self employed   :50      
##  SSC                      :11        Technical Job   :21      
##  Under Graduate           :67                                 
##         Mothers_occupation Marital_status Place_you_belong_to
##  House wife      :96       Married:  4    Rural     :13      
##  Professional Job:34       Single :136    Semi Urban:31      
##  Self employed   : 6                      Urban     :96      
##  Technical Job   : 4                                         
##                                                              
##                                                              
##                                                              
##    Total_Family_Income_per_annum  Funding_for_the_MBA_Program
##  0-5 Lakh         :52            Family         :58          
##  10-15 lakh       :19            Loan           :34          
##  15 lakh and above:17            Loan and family:25          
##  5-10 Lakh        :52            Loan and self  :12          
##                                  Self           :10          
##                                  Self and Family: 1          
##                                                              
##          Work_Experience              Career_options_after_MBA
##  1-2 year        :23     Higher Studies           :  2        
##  2-3 year        :15     Job                      :135        
##  3-4 year        : 3     Not yet decided          :  1        
##  less than 1 year:16     Self-employed/ entreprene:  2        
##  no experience   :83                                          
##                                                               
##                                                               
##  Alternate_Career_Option_after_MBA_if_not_selected_through_Instit
##  Family Business          : 5                                    
##  Higher Studies           :18                                    
##  Job                      :63                                    
##  Not yet decided          :15                                    
##  Self-employed/ entreprene:39                                    
##                                                                  
##                                                                  
##       perceived.Job.Skill   Specialization
##  desired skills :48       Finance  :44    
##  prefered skills:89       HR       :14    
##  required skills: 3       LOS      :31    
##                           Marketing:51    
##                                           
##                                           
## 
# Column-wise summary of the training set, for comparison with the test set.
summary(datatrain)
##      ï..F1         pre.score       Age_in_years  Percentage_in_10_Class
##  Min.   :  4.0   Min.   : 30.00   Min.   :19.0   Min.   :58.90         
##  1st Qu.:201.0   1st Qu.: 60.00   1st Qu.:21.0   1st Qu.:79.00         
##  Median :380.0   Median : 70.00   Median :22.0   Median :85.00         
##  Mean   :389.3   Mean   : 68.11   Mean   :22.5   Mean   :84.12         
##  3rd Qu.:586.0   3rd Qu.: 76.67   3rd Qu.:24.0   3rd Qu.:91.00         
##  Max.   :783.0   Max.   :100.00   Max.   :26.0   Max.   :97.20         
##                                                                        
##  Percentage_in_12_Class Percentage_in_Under_Graduate Number_of_siblings
##  Min.   :60.29          Min.   :58.00                Min.   :0.000     
##  1st Qu.:76.00          1st Qu.:68.00                1st Qu.:1.000     
##  Median :84.00          Median :74.00                Median :1.000     
##  Mean   :82.09          Mean   :73.55                Mean   :1.045     
##  3rd Qu.:90.20          3rd Qu.:78.13                3rd Qu.:1.000     
##  Max.   :97.17          Max.   :94.00                Max.   :4.000     
##                                                                        
##  Over.all.percentage.in.MBA   post.score        Gender   
##  Min.   :57.00              Min.   :  7.00   Female:147  
##  1st Qu.:66.00              1st Qu.: 71.67   Male  :186  
##  Median :68.10              Median : 76.67               
##  Mean   :68.02              Mean   : 76.40               
##  3rd Qu.:70.64              3rd Qu.: 85.00               
##  Max.   :77.41              Max.   :100.00               
##                                                          
##           STATE        Previous_Degree               Fathers_qualification
##  Central Zone: 30   Arts       :  2    Under Graduate           :176      
##  East Zone   : 36   Commerce   :104    Post Graduation          : 79      
##  North East  :  4   Engineering:161    HSC                      : 26      
##  North Zone  : 67   Management : 38    no formal Education      : 22      
##  South Zone  :184   Science    : 28    DIPLOMA                  : 12      
##  West Zone   : 12                      PhD or higher qualificati:  7      
##                                        (Other)                  : 11      
##                Mothers_qualification        Fathers_occupation
##  DIPLOMA                  :  2       Farming         : 12     
##  HSC                      : 28       Not Alive       :  2     
##  no formal Education      : 24       Professional Job:137     
##  PhD or higher qualificati:  8       RETIRED         : 16     
##  Post Graduation          : 77       Self employed   :131     
##  SSC                      : 24       Technical Job   : 35     
##  Under Graduate           :170                                
##         Mothers_occupation Marital_status Place_you_belong_to
##  House wife      :211      Married:  7    Rural     : 31     
##  Professional Job: 96      Single :326    Semi Urban: 85     
##  Self employed   : 19                     Urban     :217     
##  Technical Job   :  7                                        
##                                                              
##                                                              
##                                                              
##    Total_Family_Income_per_annum  Funding_for_the_MBA_Program
##  0-5 Lakh         :129           Family         :142         
##  10-15 lakh       : 36           Loan           : 69         
##  15 lakh and above: 34           Loan and family: 60         
##  5-10 Lakh        :134           Loan and self  : 25         
##                                  Self           : 31         
##                                  Self and Family:  6         
##                                                              
##          Work_Experience              Career_options_after_MBA
##  1-2 year        : 57    Higher Studies           :  3        
##  2-3 year        : 45    Job                      :314        
##  3-4 year        :  6    Not yet decided          :  3        
##  less than 1 year: 34    Self-employed/ entreprene: 13        
##  no experience   :191                                         
##                                                               
##                                                               
##  Alternate_Career_Option_after_MBA_if_not_selected_through_Instit
##  Family Business          : 15                                   
##  Higher Studies           : 29                                   
##  Job                      :181                                   
##  Not yet decided          : 27                                   
##  Self-employed/ entreprene: 81                                   
##                                                                  
##                                                                  
##       perceived.Job.Skill   Specialization
##  desired skills :119      Finance  :103   
##  prefered skills:206      HR       : 33   
##  required skills:  8      LOS      : 75   
##                           Marketing:122   
##                                           
##                                           
## 

Set the resampling control parameters used when training the model

# Resampling scheme for caret::train(): 5-fold cross-validation, keeping the
# held-out predictions. The original assigned control1 three times in a row;
# only the last assignment (number = 5) took effect, so the overwritten ones
# are dropped.
control1 <- trainControl(method = "cv", number = 5, savePredictions = TRUE)

# Fit a random forest ("rf") predicting Specialization from demographic and
# academic predictors, tuned with the resampling scheme defined above.
#
# Fixes relative to the original call:
#   * The argument is `trControl` (capital C). The misspelt `trcontrol` was
#     not picked up by train(), which therefore used its default resampling;
#     this is visible as "Resampling: Bootstrapped (25 reps)" in the printed
#     model.
#   * `savePredictions` is a trainControl() option, so it is set on control1
#     and no longer passed to train().
# NOTE: the console output recorded further down in this file was produced
# before this fix; a re-run reports cross-validated (5 fold) resampling.
dtm4 <- train(
  Specialization ~ Gender + Previous_Degree + Marital_status +
    Place_you_belong_to + Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + Over.all.percentage.in.MBA +
    Total_Family_Income_per_annum + Work_Experience,
  data = datatrain,
  method = "rf",
  trControl = control1
)

# Auto-print the fitted model: resampling scheme plus accuracy and kappa for
# each mtry value tried.
dtm4
  • mtry: Number of variables randomly sampled as candidates at each split.
  • ntree: Number of trees to grow.
## Random Forest 
## 
## 333 samples
##  10 predictor
##   4 classes: 'Finance', 'HR', 'LOS', 'Marketing' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 333, 333, 333, 333, 333, 333, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8420018  0.7733131
##   10    0.8762908  0.8244549
##   19    0.8716184  0.8182187
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

explore the model

# Auto-print the fitted model again as the starting point for exploring it.
dtm4
## Random Forest 
## 
## 333 samples
##  10 predictor
##   4 classes: 'Finance', 'HR', 'LOS', 'Marketing' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 333, 333, 333, 333, 333, 333, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8420018  0.7733131
##   10    0.8762908  0.8244549
##   19    0.8716184  0.8182187
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# Explicit print(); shows the same report as auto-printing the object.
print(dtm4)
## Random Forest 
## 
## 333 samples
##  10 predictor
##   4 classes: 'Finance', 'HR', 'LOS', 'Marketing' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 333, 333, 333, 333, 333, 333, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8420018  0.7733131
##   10    0.8762908  0.8244549
##   19    0.8716184  0.8182187
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
# List the components of the fitted model object (length, class and mode of
# each element).
summary(dtm4)
##                 Length Class      Mode     
## call               6   -none-     call     
## type               1   -none-     character
## predicted        333   factor     numeric  
## err.rate        2500   -none-     numeric  
## confusion         20   -none-     numeric  
## votes           1332   matrix     numeric  
## oob.times        333   -none-     numeric  
## classes            4   -none-     character
## importance        19   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                333   factor     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            19   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          4   -none-     character
## param              2   -none-     list
# Plot the final model stored inside the train object.
# NOTE(review): presumably the random forest error rate against the number of
# trees - confirm against the plot method of the underlying model class.
plot(dtm4$finalModel)
# Names of the predictor columns actually fed to the model; factor predictors
# appear expanded into one column per non-reference level.
dtm4$coefnames
##  [1] "GenderMale"                                    
##  [2] "Previous_DegreeCommerce"                       
##  [3] "Previous_DegreeEngineering"                    
##  [4] "Previous_DegreeManagement"                     
##  [5] "Previous_DegreeScience"                        
##  [6] "Marital_statusSingle"                          
##  [7] "Place_you_belong_toSemi Urban"                 
##  [8] "Place_you_belong_toUrban"                      
##  [9] "Percentage_in_10_Class"                        
## [10] "Percentage_in_12_Class"                        
## [11] "Percentage_in_Under_Graduate"                  
## [12] "Over.all.percentage.in.MBA"                    
## [13] "Total_Family_Income_per_annum10-15 lakh"       
## [14] "Total_Family_Income_per_annum15 lakh and above"
## [15] "Total_Family_Income_per_annum5-10 Lakh"        
## [16] "Work_Experience2-3 year"                       
## [17] "Work_Experience3-4 year"                       
## [18] "Work_Experienceless than 1 year"               
## [19] "Work_Experienceno experience"
# dtm4$modelInfo

# Resampling results for each mtry value tried: mean and SD of accuracy and
# kappa.
dtm4$results
##   mtry  Accuracy     Kappa AccuracySD    KappaSD
## 1    2 0.8420018 0.7733131 0.05062263 0.07319933
## 2   10 0.8762908 0.8244549 0.03340855 0.04748827
## 3   19 0.8716184 0.8182187 0.03675329 0.05249690
# Problem type recorded on the fitted train object.
dtm4$modelType
## [1] "Classification"
# Tuning parameter value selected for the final model.
dtm4$bestTune
##   mtry
## 2   10
# Variable importance per predictor column; the printed scores run from 0 to
# 100.
varImp(dtm4)
## rf variable importance
## 
##                                                Overall
## Percentage_in_Under_Graduate                   100.000
## Over.all.percentage.in.MBA                      88.979
## Percentage_in_12_Class                          75.238
## Percentage_in_10_Class                          69.144
## Previous_DegreeEngineering                      36.772
## GenderMale                                      16.762
## Work_Experienceno experience                    16.688
## Previous_DegreeCommerce                         16.285
## Total_Family_Income_per_annum5-10 Lakh          15.200
## Place_you_belong_toUrban                         9.885
## Work_Experienceless than 1 year                  8.519
## Previous_DegreeScience                           7.975
## Place_you_belong_toSemi Urban                    7.962
## Total_Family_Income_per_annum15 lakh and above   7.149
## Work_Experience2-3 year                          6.817
## Previous_DegreeManagement                        6.343
## Total_Family_Income_per_annum10-15 lakh          4.846
## Marital_statusSingle                             2.623
## Work_Experience3-4 year                          0.000
# Keep a handle on the underlying fitted model stored in the train object.
dtm4_f <- dtm4[["finalModel"]]

#-----------------#

# Predict the Specialization class for every row of the test set, then
# display the predicted labels.
predict_dtm4 <- predict(dtm4, newdata = datatest)
predict_dtm4
##   [1] Marketing LOS       Finance   Marketing Marketing Marketing Finance  
##   [8] Finance   Finance   LOS       Marketing Finance   Finance   Finance  
##  [15] Finance   Finance   Marketing Finance   LOS       Marketing Finance  
##  [22] Marketing Marketing Marketing Finance   Finance   Marketing Finance  
##  [29] Finance   LOS       Finance   Marketing Marketing Marketing HR       
##  [36] HR        Finance   Finance   Marketing HR        LOS       Marketing
##  [43] HR        Marketing LOS       LOS       LOS       Marketing Finance  
##  [50] HR        Finance   Marketing HR        LOS       Marketing Finance  
##  [57] Marketing Marketing HR        HR        LOS       Finance   Marketing
##  [64] Finance   Marketing LOS       LOS       Finance   LOS       Marketing
##  [71] Marketing LOS       Finance   Finance   Marketing LOS       Finance  
##  [78] HR        Marketing LOS       LOS       Finance   Finance   Marketing
##  [85] LOS       Marketing LOS       Finance   LOS       Finance   HR       
##  [92] Marketing LOS       Marketing HR        LOS       Finance   LOS      
##  [99] Marketing Marketing Marketing LOS       HR        Marketing Marketing
## [106] HR        Marketing Finance   Finance   Marketing Marketing Marketing
## [113] Marketing LOS       HR        Marketing Finance   Finance   LOS      
## [120] Marketing Finance   Finance   Finance   Marketing Marketing LOS      
## [127] LOS       Marketing Finance   LOS       Marketing LOS       Marketing
## [134] LOS       LOS       Finance   Marketing LOS       Finance   Finance  
## Levels: Finance HR LOS Marketing
# Cross-tabulate predicted vs. actual Specialization on the test set and
# report overall and per-class statistics.
confusionMatrix(
  data = predict_dtm4,
  reference = datatest$Specialization
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Finance HR LOS Marketing
##   Finance        40  0   0         3
##   HR              0 14   0         0
##   LOS             0  0  31         2
##   Marketing       4  0   0        46
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9357          
##                  95% CI : (0.8815, 0.9702)
##     No Information Rate : 0.3643          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9096          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Finance Class: HR Class: LOS Class: Marketing
## Sensitivity                  0.9091       1.0     1.0000           0.9020
## Specificity                  0.9688       1.0     0.9817           0.9551
## Pos Pred Value               0.9302       1.0     0.9394           0.9200
## Neg Pred Value               0.9588       1.0     1.0000           0.9444
## Prevalence                   0.3143       0.1     0.2214           0.3643
## Detection Rate               0.2857       0.1     0.2214           0.3286
## Detection Prevalence         0.3071       0.1     0.2357           0.3571
## Balanced Accuracy            0.9389       1.0     0.9908           0.9285

No comments:

Post a Comment