# Wednesday, November 7, 2018
#
# Basics of R - Session 18 - Decision Theory - 2


# Decision trees with rpart ----
# NOTE(review): the original script began with rm(list = ls()); wiping the
# caller's global environment from inside a script is an anti-pattern (it
# silently destroys whatever the user had loaded), so the call was removed.

library(rpart)       # recursive partitioning (CART) trees
library(rpart.plot)  # readable plots of rpart trees

# Load the perceived-score data set (CSV on local disk).
# (The original comment said "iris", but a local CSV is what is read.)
data1 <- read.csv("D:/1 Research/1 preparedness for future/3 decision science specialization/perceived score data 111.csv")
str(data1)      # non-interactive structure check; fix() opens a blocking
                # spreadsheet editor and should not be used in a script
summary(data1)

# Keep complete cases only; rpart can handle NAs via surrogate splits, but a
# complete-case set keeps the train/test split and validation simple.
data2 <- na.omit(data1)

# Drop factor levels that no longer occur after na.omit(); otherwise
# createDataPartition()/rpart() can fail with "Y has zero levels" /
# "no data in Y".
data2$specialization <- droplevels(data2$specialization)

# if error with Y zero levels or no data in Y

# Split the data into training and test sets with the caret package ----
library(caret)
set.seed(100)  # reproducible partition
# createDataPartition() performs a stratified split on the outcome factor.
# NOTE(review): the 'groups' argument is only used when y is numeric; for a
# factor outcome it is ignored, so the original
# groups = min(5, length(data2$specialization)) was dropped as dead weight.
datasplit <- createDataPartition(data2$specialization,
                                 times = 1, p = 0.7, list = FALSE)

# Create training and test data
datatrain <- data2[datasplit, ]   # training data set, 70%
datatest  <- data2[-datasplit, ]  # test data set, 30%
                                  # (original comment wrongly said "training")

summary(datatrain)
summary(datatest)




# Decision tree on demographic predictors ----
dtm1 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum,
  data = datatrain,
  method = "class",
  minsplit = 5,   # forwarded to rpart.control()
  minbucket = 1
)
summary(dtm1)
dtm1$frame    # one row per node of the tree
dtm1$where    # leaf node each training observation falls into
dtm1$terms
dtm1$cptable  # complexity-parameter table
plot(dtm1)    # base-graphics tree: hard to read
text(dtm1)    # base-graphics labels: hard to read
rpart.plot(dtm1)
rpart.plot(dtm1, clip.facs = TRUE)  # shorten factor labels on the splits


# Decision tree on academic-performance predictors ----
dtm2 <- rpart(
  specialization ~ Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain,
  method = "class",
  minsplit = 10,  # forwarded to rpart.control()
  minbucket = 1
)

summary(dtm2)
dtm2$frame    # one row per node of the tree
dtm2$where    # leaf node each training observation falls into
dtm2$terms
dtm2$cptable  # complexity-parameter table
plot(dtm2)    # base-graphics tree: hard to read
text(dtm2)    # base-graphics labels: hard to read

# Combined tree: demographic + academic predictors ----
# NOTE(review): when rpart() receives an explicit control = rpart.control(...),
# that list REPLACES the whole control list built from the ... arguments, so
# the original's minsplit = 10 and minbucket = 1 (passed via ...) were
# silently discarded and rpart's defaults (20 / 7) were used instead.
# Putting all three settings inside rpart.control() makes them take effect.
dtm3 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum +
    Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain,
  method = "class",
  control = rpart.control(minsplit = 10, minbucket = 1, cp = 0.01)
)

# rpart.plot display options, demonstrated on the dtm2 tree ----

# type: node-label layout (0, 1, 2 show progressively more at the nodes)
for (label_type in 0:2) {
  rpart.plot(dtm2, type = label_type)
}

# branch: branch shape, from V-shaped (near 0) toward square shoulders (near 1)
for (branch_shape in c(0.2, 0.5, 0.8)) {
  rpart.plot(dtm2, branch = branch_shape)
}

rpart.plot(dtm2, fallen.leaves = FALSE)  # keep leaves at their natural depth
rpart.plot(dtm2, tweak = 2)              # text size equal to 200%

rpart.plot(dtm3, tweak = 2, fallen.leaves = TRUE)


library(rattle)
fancyRpartPlot(dtm2)  # rattle's prettier rendering of the same tree

# Validation ----
# For class-label predictions from a classification tree, use type = "class".
# confusionMatrix(data, reference): the FIRST argument is the prediction and
# the SECOND is the ground truth. The original passed them the other way
# around, which swaps sensitivity/specificity and the per-class predictive
# values (overall accuracy and kappa are unaffected). Named arguments below
# make the intent explicit.

dtm1_predict <- predict(dtm1, datatest, type = "class")
confusionMatrix(data = dtm1_predict, reference = datatest$specialization)

dtm2_predict <- predict(dtm2, datatest, type = "class")
confusionMatrix(data = dtm2_predict, reference = datatest$specialization)

dtm3_predict <- predict(dtm3, datatest, type = "class")
confusionMatrix(data = dtm3_predict, reference = datatest$specialization)

#------------------------------#
# pruning of decision tree

#-----------------------------------#
# pre-pruning:
# controlled by minsplit, minbucket, and a fixed cp set before growing the tree


#-------------------------------------#
# post pruning based on complexity parameter----

# The complexity parameter (cp) in rpart is the minimum improvement in the
# model needed at each node; it is based on the cost complexity of the model.
#   * For a given tree, add up the misclassification at every terminal node.
#   * The cp value is a stopping parameter: it speeds up the search for splits
#     because splits that do not meet this criterion are pruned before the
#     search goes too far. So a very low value (e.g. 0.0001) grows deeper
#     trees, and a higher value (e.g. 0.01) keeps the tree simple.
plotcp(dtm3)  # cross-validated relative error (xerror) vs. cp and tree size
# On the basis of the plot / table, choose the cp, tree size and error.
dtm3$cptable

# Post-prune at the cp with the minimum cross-validated error.
# NOTE(review): the original pruned at cp = 0.01, which is both rpart's
# default and the cp the tree was grown with, so it pruned nothing; picking
# the row of cptable with the smallest xerror implements the stated intent
# of "use minimum cp".
best_cp <- dtm3$cptable[which.min(dtm3$cptable[, "xerror"]), "CP"]
dtm3_prune <- prune(dtm3, cp = best_cp)

plot(dtm3_prune)         # base-graphics plot (hard to read)
rpart.plot(dtm3_prune)   # readable plot of the pruned tree
dtm3_prune$cptable       # cp table retained by the pruned tree


#----------------------------------#
# cross-validation pruning can be done using the caret package,
# or with the tree package: library(tree)