# Decision-tree classification with the rpart package
library(rpart)
library(rpart.plot)

# Read the perceived-score survey data.
# stringsAsFactors = TRUE is required since R 4.0 (where read.csv() stopped
# converting strings to factors by default): the outcome `specialization`
# must be a factor for droplevels(), createDataPartition() stratification,
# and rpart's method = "class" to work as intended below.
data1 <- read.csv(
  "D:/1 Research/1 preparedness for future/3 decision science specialization/perceived score data 111.csv",
  stringsAsFactors = TRUE
)
# fix(data1) opens a blocking interactive editor that can silently rewrite
# the data — avoid it in scripts; use View(data1) or str(data1) instead.
summary(data1)
# Working data set: data2 = data1 with incomplete rows removed
data2 <- na.omit(data1)
# Drop factor levels that disappeared with the removed rows; otherwise
# rpart can fail with "Y has zero levels" / "no data in Y".
data2$specialization <- droplevels(data2$specialization)

# Split the data into two parts (stratified on the outcome) using caret
library(caret)
set.seed(100)  # make the random partition reproducible
# NOTE: createDataPartition()'s `groups` argument applies only to NUMERIC
# outcomes (quantile-based grouping); for a factor outcome the sampling is
# stratified by class automatically, so the argument is dropped here.
datasplit <- createDataPartition(data2$specialization, times = 1, p = 0.7,
                                 list = FALSE)
datatrain <- data2[datasplit, ]  # training data set, 70%
datatest  <- data2[-datasplit, ] # test data set, 30%
summary(datatrain)
summary(datatest)
# Decision tree 1: demographic / background predictors only.
# minsplit/minbucket are stopping rules; placing them in rpart.control()
# (rather than passing them through `...`) makes that explicit — the
# fitted tree is identical either way.
dtm1 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum,
  data = datatrain, method = "class",
  control = rpart.control(minsplit = 5, minbucket = 1)
)
summary(dtm1)
dtm1$frame    # one row per node of the fitted tree
dtm1$where    # leaf node each training observation landed in
dtm1$terms    # model formula terms
dtm1$cptable  # complexity-parameter table (used later for pruning)
plot(dtm1)    # base-graphics rendering: hard to read
text(dtm1)    # base-graphics labels: hard to read
rpart.plot(dtm1)                    # much clearer rendering
rpart.plot(dtm1, clip.facs = TRUE)  # shorten factor names in split labels
# Decision tree 2: academic-performance predictors only.
# Control parameters go through rpart.control() for clarity (same fit as
# passing minsplit/minbucket via `...`).
dtm2 <- rpart(
  specialization ~ Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain, method = "class",
  control = rpart.control(minsplit = 10, minbucket = 1)
)
summary(dtm2)
dtm2$frame    # one row per node of the fitted tree
dtm2$where    # leaf node each training observation landed in
dtm2$terms    # model formula terms
dtm2$cptable  # complexity-parameter table (used later for pruning)
plot(dtm2)    # base-graphics rendering: hard to read
text(dtm2)    # base-graphics labels: hard to read
# Decision tree 3: all predictors combined. The original call split the
# control settings between `...` (minsplit, minbucket) and `control=`
# (cp); collecting them in one rpart.control() call is equivalent and
# easier to audit.
dtm3 <- rpart(
  specialization ~ Gender_MF + Previous_Degree + Marital_status +
    Place_you_belong_to + Total_Family_Income_per_annum +
    Percentage_in_10_Class + Percentage_in_12_Class +
    Percentage_in_Under_Graduate + mba.percentage,
  data = datatrain, method = "class",
  control = rpart.control(minsplit = 10, minbucket = 1, cp = 0.01)
)

# rpart.plot display options, demonstrated on dtm2:
rpart.plot(dtm2, type = 0)      # type controls the node-label layout
rpart.plot(dtm2, type = 1)
rpart.plot(dtm2, type = 2)
rpart.plot(dtm2, branch = 0.2)  # branch controls the connector shape
rpart.plot(dtm2, branch = 0.5)
rpart.plot(dtm2, branch = 0.8)
rpart.plot(dtm2, fallen.leaves = FALSE)  # leaves at their natural depth
rpart.plot(dtm2, tweak = 2)              # text size equal to 200%
rpart.plot(dtm3, tweak = 2, fallen.leaves = TRUE)

library(rattle)
fancyRpartPlot(dtm2)  # rattle's decorated rendering of the same tree
# Validation: for class labels from an rpart model, predict with
# type = "class".
# BUG FIX: caret::confusionMatrix(data, reference) expects the PREDICTED
# classes first and the OBSERVED classes second. The original calls passed
# them swapped, which transposes the table and flips sensitivity vs
# specificity and PPV vs NPV in the reported statistics.
dtm1_predict <- predict(dtm1, datatest, type = "class")
confusionMatrix(dtm1_predict, datatest$specialization)
dtm2_predict <- predict(dtm2, datatest, type = "class")
confusionMatrix(dtm2_predict, datatest$specialization)
dtm3_predict <- predict(dtm3, datatest, type = "class")
confusionMatrix(dtm3_predict, datatest$specialization)
#------------------------------#
# Pruning of decision trees
#-----------------------------------#
# Pre-pruning: fixed at fit time via minsplit, minbucket and cp.
#-------------------------------------#
# Post-pruning based on the complexity parameter (cp):
# The complexity parameter (cp) in rpart is the minimum improvement in
# fit a split must contribute to be kept; it is based on the cost
# complexity of the model.
# - For a given tree, add up the misclassification at every terminal node.
# - cp also acts as a stopping parameter: candidate splits that do not
#   meet the criterion are pruned before the search goes too far. A very
#   low value (e.g. 0.0001) grows deeper trees; a higher value (e.g. 0.01)
#   keeps the tree simple.

# Inspect the cp table to choose cp, tree size and error
dtm3$cptable
# Prune at the cp with the minimum cross-validated error (xerror).
# The original hard-coded cp = 0.01, which is just rpart's default and
# therefore prunes nothing; selecting from the cptable matches the stated
# intent of using the minimum-error cp.
best_cp <- dtm3$cptable[which.min(dtm3$cptable[, "xerror"]), "CP"]
dtm3_prune <- prune(dtm3, cp = best_cp)
plot(dtm3_prune)
rpart.plot(dtm3_prune)
dtm3_prune$cptable
#----------------------------------#
# Cross-validation pruning can also be done with the caret package
# or with the tree package.
# end of script