# Decision tree classification of `specialization` using the rpart package.
# NOTE: the workspace is deliberately not cleared with rm(list = ls());
# start a fresh R session instead, so that sourcing this script never deletes
# objects that belong to the caller.
library(rpart)
library(rpart.plot)

# Load the data set from the CSV file.
data1 <- read.csv("D:/1 Research/1 preparedness for future/3 decision science specialization/perceived score data 111.csv")

# fix() opens an interactive data editor, so only offer it in interactive
# sessions; in batch mode (Rscript) it would block or fail.
if (interactive()) {
  fix(data1)
}
summary(data1)

# Drop every row that contains a missing value.
data2 <- na.omit(data1)

# The outcome must be a factor: read.csv() returns character columns on
# R >= 4.0, droplevels() has no method for character vectors, and
# caret::confusionMatrix() requires factors. as.factor() is a no-op when the
# column is already a factor; droplevels() then removes any level that is no
# longer present after na.omit() (avoids "zero levels / no data in Y" errors).
data2$specialization <- droplevels(as.factor(data2$specialization))

# Split the data into two parts using the caret package.
library(caret)
set.seed(100)
# 70/30 split on the outcome. The `groups` argument is omitted because it
# only applies to a numeric outcome.
datasplit <- createDataPartition(
  data2$specialization,
  times = 1,
  p = 0.7,
  list = FALSE
)

# Create training and test data.
datatrain <- data2[datasplit, ]  # training data set, 70%
datatest <- data2[-datasplit, ]  # test data set, 30%
summary(datatrain)
summary(datatest)
# Decision trees ----
# All tuning parameters are passed through a single rpart.control() call.
# Mixing bare `minsplit=` / `minbucket=` arguments with `control =` is unsafe:
# rpart() overwrites the bare values with every element of `control`,
# including its defaults (minsplit = 20, minbucket = 7).

# Model 1: gender, previous degree, marital status, place and family income.
dtm1 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum,
  data = datatrain,
  method = "class",
  control = rpart.control(minsplit = 5, minbucket = 1)
)
summary(dtm1)
dtm1$frame
dtm1$where
dtm1$terms
dtm1$cptable
plot(dtm1)  # base graphics rendering; not easy to read
text(dtm1)  # adds the split labels to the base plot
rpart.plot(dtm1)
rpart.plot(dtm1, clip.facs = TRUE)

# Model 2: the four percentage columns.
dtm2 <- rpart(
  specialization ~ Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain,
  method = "class",
  control = rpart.control(minsplit = 10, minbucket = 1)
)
summary(dtm2)
dtm2$frame
dtm2$where
dtm2$terms
dtm2$cptable
plot(dtm2)  # base graphics rendering; not easy to read
text(dtm2)  # adds the split labels to the base plot

# Model 3: all predictors of models 1 and 2 combined.
# minsplit, minbucket and cp are set together so that minsplit = 10 and
# minbucket = 1 are actually used when the tree is grown.
dtm3 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum +
    Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain,
  method = "class",
  control = rpart.control(minsplit = 10, minbucket = 1, cp = 0.01)
)

# Compare rpart.plot display options on model 2.
rpart.plot(dtm2, type = 0)
rpart.plot(dtm2, type = 1)
rpart.plot(dtm2, type = 2)
rpart.plot(dtm2, branch = 0.2)
rpart.plot(dtm2, branch = 0.5)
rpart.plot(dtm2, branch = 0.8)
rpart.plot(dtm2, fallen.leaves = FALSE)
rpart.plot(dtm2, tweak = 2)  # text size equal to 200%
rpart.plot(dtm3, tweak = 2, fallen.leaves = TRUE)

library(rattle)
fancyRpartPlot(dtm2)
# Validation on the held-out test set ----
# For a classification tree, predict() needs type = "class" to return the
# predicted category rather than class probabilities.
# confusionMatrix() takes the predictions as `data` and the observed classes
# as `reference`; the arguments are named so they cannot be swapped, because
# swapping them transposes the table and exchanges sensitivity/specificity
# with the predictive values.
actual_class <- datatest$specialization

dtm1_predict <- predict(dtm1, datatest, type = "class")
confusionMatrix(data = dtm1_predict, reference = actual_class)

dtm2_predict <- predict(dtm2, datatest, type = "class")
confusionMatrix(data = dtm2_predict, reference = actual_class)

dtm3_predict <- predict(dtm3, datatest, type = "class")
confusionMatrix(data = dtm3_predict, reference = actual_class)
#------------------------------#
# pruning of decision tree
#-----------------------------------#
# prepruning
# based on minsplit, minbucket and cp, all fixed before the tree is grown
#-------------------------------------#
# post pruning based on complexity parameter----
# The complexity parameter (cp) in rpart is the minimum improvement in the
# model needed at each node. It's based on the cost complexity of the model.
# - For the given tree, add up the misclassification at every terminal node.
# - The cp value is a stopping parameter. It helps speed up the search for
#   splits because it can identify splits that don't meet this criterion and
#   prune them before going too far. So a very low value (0.0001) will create
#   deeper trees and a higher value (0.01) will make the tree simpler.
# Use the complexity table (and its plot) to decide the cp, the size of the
# tree and the error.
cp_table <- dtm3$cptable
cp_table

# Prune at the cp with the lowest cross-validated error ("xerror"). Pruning
# at the same cp the tree was grown with (0.01) would return the tree
# unchanged. The xerror column is only present when cross-validation was run,
# so fall back to the original cp = 0.01 otherwise.
if ("xerror" %in% colnames(cp_table)) {
  plotcp(dtm3)
  best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
} else {
  best_cp <- 0.01
}
dtm3_prune <- prune(dtm3, cp = best_cp)

# plot() for rpart objects fails on a tree that has been pruned back to the
# root, so only draw the base plot when at least one split remains.
if (nrow(dtm3_prune$frame) > 1) {
  plot(dtm3_prune)
}
rpart.plot(dtm3_prune)
dtm3_prune$cptable
#----------------------------------#
# Cross-validation pruning can be done using the caret library
# or library(tree).