Monday, December 31, 2018

Basics of R- Session 19- Cluster Analysis

rm(list=ls())
# import the data
cluster1.1<-read.csv("file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/Cluster Analysis/Cluster Analysis MDP 2018/Data1.csv")

str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8
# remove the missing observations
cluster1.1<-na.omit(cluster1.1)
str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8
#fix(cluster1.1)

# remove the labels of the data set or extra variables if present
cluster1.2<-cluster1.1[,-1]

Calculate the distances using dist() or daisy()

dist1<-dist(cluster1.2,method = "euclidean")
dist1
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
dist2<-dist(cluster1.2,method = "maximum")
dist2
##   1 2 3 4 5 6 7 8
## 2 2              
## 3 1 1            
## 4 1 1 2          
## 5 3 2 2 3        
## 6 4 4 3 5 2      
## 7 5 3 4 4 2 3    
## 8 7 5 6 6 4 3 2  
## 9 5 3 4 4 2 1 2 2
dist3<-dist(cluster1.2,method = "manhattan")
dist3
##    1  2  3  4  5  6  7  8
## 2  2                     
## 3  2  2                  
## 4  2  2  2               
## 5  5  3  3  5            
## 6  8  6  6  8  3         
## 7  6  4  4  6  3  4      
## 8 10  8  8 10  5  4  4   
## 9  8  6  6  8  3  2  2  2
dist4<-dist(cluster1.2,method = "minkowski")
dist4
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
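
dist() works on numeric data; daisy() from the cluster package also handles mixed variable types. A minimal sketch, assuming the cluster package (shipped with R) is used:

library(cluster)
dist5<-daisy(cluster1.2, metric = "gower")  # Gower distance also works for mixed data types
dist5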

For hierarchical cluster analysis, use any one of the distance matrices together with any one linkage method

clusterout<-hclust(dist1,method = "single")
plot(clusterout)
clusterout<-hclust(dist1,method = "complete")
plot(clusterout)
clusterout<-hclust(dist1,method = "average")
plot(clusterout)
clusterout<-hclust(dist1,method = "median")
plot(clusterout)
clusterout<-hclust(dist1,method = "centroid")
plot(clusterout)
clusterout<-hclust(dist1,method = "ward.D")
plot(clusterout)
clusterout<-hclust(dist1,method = "ward.D2")
plot(clusterout)

Add labels to the R object clusterout

clusterout$labels<-cluster1.1[,1]
plot(clusterout)
# Reduce the tree (dendrogram) either on the basis of the
# number of clusters (k) or the height (h); generally h is not considered

groups<- cutree(clusterout, k=3)   # cut the existing tree into 3 clusters
groups
## A B C D E G H I J 
## 1 1 1 1 2 2 2 3 2
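
The tree can also be cut at a chosen height instead of a fixed number of clusters; a minimal sketch, where the height is read off the dendrogram (3 is just an illustrative value):

groups_h<- cutree(clusterout, h=3)   # keep together all merges below height 3
groups_h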

Draw the dendrogram with red borders around the 3 clusters

plot(clusterout)
rect.hclust(clusterout, k=3,border="red")
print(clusterout)
## 
## Call:
## hclust(d = dist1, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 9
summary(clusterout)
##             Length Class  Mode     
## merge       16     -none- numeric  
## height       8     -none- numeric  
## order        9     -none- numeric  
## labels       9     factor numeric  
## method       1     -none- character
## call         3     -none- call     
## dist.method  1     -none- character
clusterout$height
## [1] 1.414214 1.414214 1.414214 2.000000 2.236068 2.915476 3.535534 9.706813

Centroids of the clusters

aggregate(cluster1.2,by=list(groups),FUN=mean)
##   Group.1 Physics Mathematics
## 1       1    2.00         5.0
## 2       2    5.25         7.5
## 3       3    8.00         8.0
# using the centroids we can check the profiles of the clusters formed

#----------------------------------#

# Other Method
# Libraries for Cluster Analysis
library(NbClust)
library(mclust)
## Warning: package 'mclust' was built under R version 3.5.3
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.
library(fpc)
## Warning: package 'fpc' was built under R version 3.5.3
# max.nc can be larger as well, say 10
nb<-NbClust(cluster1.2, distance = "euclidean",min.nc = 2, max.nc = 4,method = "single")
## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 6 proposed 4 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
nb
## $All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2        -0.4219           -10.1102       0.8495
## 3        -0.5522            -5.6219       0.6295
## 4        -0.7431            -2.3458       0.3158
## 
## $Best.nc
##                      KL      CH Hartigan    CCC  Scott Marriot TrCovW
## Number_clusters  2.0000  2.0000   4.0000 2.0000 4.0000       3   4.00
## Value_Index     36.1587 15.9034   1.6548 4.8583 9.3067    -182  16.25
##                 TraceW Friedman  Rubin Cindex     DB Silhouette   Duda
## Number_clusters 3.0000   4.0000 3.0000 4.0000 3.0000     2.0000 2.0000
## Value_Index     2.2333  58.4028 5.4644 0.3717 0.6005     0.4819 0.8173
##                 PseudoT2  Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters       NA 2.0000    2.0000 3.00     2.0000 2.00  2.0000
## Value_Index           NA 0.1676    0.5726 5.45     0.7006 1.35  0.5909
##                   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters 3.0000      0  2.0000      0 4.0000
## Value_Index     0.7071      0  0.8562      0 0.1061
## 
## $Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
nb$All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
nb$Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
# or we can check a scree (elbow) plot as well, as sketched below
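# a minimal sketch of such a scree (elbow) plot: total within-cluster sum of squares
# of kmeans() for k = 1 to 8; look for the bend
wss<-sapply(1:8, function(k) kmeans(cluster1.2, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster SS")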
# if we require a specific number of clusters (say 2), then use kmeans()

nbfit<-kmeans(cluster1.2,2)
nbfit$cluster
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 2 2 2 2 2
nbfit$centers  # for profiling or prediction
##   Physics Mathematics
## 1     2.0         5.0
## 2     5.8         7.6
nbfit$totss
## [1] 65.11111
nbfit$withinss
## [1]  4 14
nbfit$tot.withinss
## [1] 18
nbfit$betweenss
## [1] 47.11111
nbfit$size
## [1] 4 5
nbfit$iter
## [1] 1
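
# the centroids in nbfit$centers can also be used to assign a new case to its nearest
# cluster; a minimal sketch with a hypothetical new student (Physics = 7, Mathematics = 9):
new_obs<-c(Physics = 7, Mathematics = 9)
which.min(colSums((t(nbfit$centers) - new_obs)^2))   # smallest squared distance = predicted cluster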

Wednesday, November 7, 2018

Basics of R- Session 18- Decision Tree- 2


rm(list=ls())

# decision tree
#rpart package

library(rpart)
library(rpart.plot)

# import the data set
data1<-read.csv("D:/1 Research/1 preparedness for future/3 decision science specialization/perceived score data 111.csv")
fix(data1)
summary(data1)



# data set = data1
data2<-na.omit(data1)

data2$specialization<-droplevels(data2$specialization)

# droplevels avoids the error about Y having zero levels / no data in Y

# split the data into two part using caret package

library(caret)
set.seed(100)
datasplit<-createDataPartition(data2$specialization, times = 1, p=0.7, list = FALSE, groups = min(5,length(data2$specialization)))



# create training and test data
datatrain<-data2[datasplit,]  # training data set 70%
datatest<-data2[-datasplit,]  # test data set 30%

summary(datatrain)
summary(datatest)




# decision tree
dtm1<-rpart(specialization~Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum, data = datatrain, method = "class", minsplit=5, minbucket=1)
summary(dtm1)
dtm1$frame
dtm1$where
dtm1$terms
dtm1$cptable
plot(dtm1) # not a good plot
text(dtm1) # not a good plot
rpart.plot(dtm1)
rpart.plot(dtm1, clip.facs = TRUE)


dtm2<-rpart(specialization ~ Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage, data = datatrain, method = "class", minsplit=10, minbucket=1)

summary(dtm2)
dtm2$frame
dtm2$where
dtm2$terms
dtm2$cptable
plot(dtm2) # not a good plot
text(dtm2) # not a good plot

dtm3<-rpart(specialization ~ Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum+
              Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage,data = datatrain, method = "class", minsplit=10, minbucket=1, control = rpart.control(cp=0.01))

rpart.plot(dtm2, type = 0)
rpart.plot(dtm2, type = 1)
rpart.plot(dtm2, type = 2)

rpart.plot(dtm2, branch= 0.2)
rpart.plot(dtm2, branch= 0.5)
rpart.plot(dtm2, branch= 0.8)

rpart.plot(dtm2, fallen.leaves = FALSE)
rpart.plot(dtm2, tweak = 2)  # text size equal to 200%

rpart.plot(dtm3, tweak = 2,fallen.leaves = TRUE)


library(rattle)
fancyRpartPlot(dtm2)

# Validation: when predicting classes from a decision tree we have to specify type = "class"


dtm1_predict<-predict(dtm1, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm1_predict)

dtm2_predict<-predict(dtm2, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm2_predict)

dtm3_predict<-predict(dtm3, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm3_predict)

#------------------------------#
# pruning of decision tree

#-----------------------------------#
# prepruning
# based on fixed values of minsplit, minbucket and cp


#-------------------------------------#
# post pruning based on complexity parameter----

The complexity parameter (cp) in rpart is the minimum improvement in the model required at each node. It is based on the cost complexity of the model.
  • For the given tree, add up the misclassification at every terminal node.
  • The cp value is a stopping parameter: it speeds up the search for splits because splits that do not meet this criterion are pruned before the search goes too far. A very low value (e.g. 0.0001) will create deeper trees, while a higher value (e.g. 0.01) will keep the tree simple.
plotcp(dtm3)
# on the basis of the plot decide the cp, size of tree and error
dtm3$cptable

# use the minimum cp and prune the tree; a sketch that picks the cp automatically follows below

dtm3_prune<-prune(dtm3, cp=0.01)

plot(dtm3_prune)
rpart.plot(dtm3_prune)
dtm3_prune$cptable
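
# instead of a fixed cp of 0.01, the cp with the minimum cross-validated error (xerror)
# can be picked from the cp table; a minimal sketch:
bestcp<-dtm3$cptable[which.min(dtm3$cptable[,"xerror"]), "CP"]
dtm3_prune2<-prune(dtm3, cp = bestcp)
rpart.plot(dtm3_prune2)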


#----------------------------------#
# cross-validation pruning can be done using the caret library or library(tree)

Sunday, October 7, 2018

Basics of R- Session 18- Decision Tree-1


library(rpart)
library(tree)
library(partykit)
library(rpart.plot)


library(CHAID)
car1<-read.csv("C:/Users/Administrator.vaio1/Desktop/data car.csv")
fix(car1)
dim(car1)

# CHAID: Chi-squared Automatic Interaction Detection
# all variables are categorical
# minsplit is the minimum number of units in a node before a split (e.g. 10% of cases or at least 20)
# minbucket is the minimum number of units in a leaf
# maxdepth is the maximum depth of the tree; 1 gives a single split

chaid1<-chaid(Car_accaptability~Buying+maintenance+safety, control=chaid_control(minprob = 0.0, minsplit = 500, minbucket = 1), data=car1)
plot(chaid1)
plot(chaid1, margin =0.1)  # page margin
plot(chaid1, margin =1)
plot(chaid1, margin =5)
print(chaid1)
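
# fitted CHAID classes can be checked against the observed classes; a minimal sketch
# (chaid objects are party trees, so partykit's predict method applies):
chaid1_pred<-predict(chaid1, newdata = car1)
table(Actual = car1$Car_accaptability, Predicted = chaid1_pred)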

#--------------------------------------------------------#

# CART (classification and regression trees) via rpart

chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1)
#cp complexity parameter
chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1, parms = list(split="gini"),cp=0.01)


chart1$parms
chart1$functions
chart1$frame
chart1$splits
chart1$csplit

plot(chart1)
text(chart1)

plot(chart1, uniform = TRUE)
text(chart1, all = TRUE, use.n = TRUE)

summary(chart1)

library(rpart.plot)
rpart.plot(chart1)

library(rattle)
fancyRpartPlot(chart1)

#-----------------------------------------------#
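
# the iris_train and iris_test objects used below are not created in this session;
# a minimal sketch of a 70/30 split of the built-in iris data, assumed here:
set.seed(123)
trainindex<-sample(seq_len(nrow(iris)), size = 0.7*nrow(iris))
iris_train<-iris[trainindex,]   # training data 70%
iris_test<-iris[-trainindex,]   # test data 30%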

dtm<-rpart(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width, data = iris_train, method = "class")
dtm

print(dtm)
summary(dtm)
plot(dtm) # not a good plot
text(dtm) # not a good plot
rpart.plot(dtm)

#prediction can also be done
predict1<-predict(dtm,iris_test, type = "class")
table(iris_test$Species,predict1)

library(caret)
library(e1071)
confusionMatrix(predict1,iris_test$Species)
#----------------------------------------------------------------#


Sunday, September 23, 2018

Basics of R- Session 17- Factor analysis



# Factor Analysis
library(psych)

#import data set
factoranalysis<-read.csv("C:/Users/Administrator.vaio1/Desktop/factor1.1.csv")


library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(FactoMineR)

#method 1---principal component analysis

#use principal
#default options will be used here

# use library(psych)

# Bartlett's test of sphericity

library(psych)
cortest.bartlett(factoranalysis)

#KMO test
KMO(factoranalysis)
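
# before fixing the number of factors, a scree plot or parallel analysis can suggest
# how many components to retain; a minimal sketch using psych (assuming all variables
# in factoranalysis are numeric):
scree(factoranalysis)                   # scree plot of the eigenvalues
fa.parallel(factoranalysis, fa = "pc")  # parallel analysis for principal components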



PCA2<-principal(factoranalysis)
PCA2$rotation
PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores

#change the options in Principal

PCA2<-principal(factoranalysis,nfactors = 6, residuals = FALSE, rotate = "varimax", n.obs = 414,covar = FALSE,scores = TRUE, missing = FALSE, impute = "median", oblique.scores = TRUE, method = "regression" )

PCA2$rotation # "none", "varimax", "quartimax", "promax", "oblimin", "simplimax", and "cluster" are also available

PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores
PCA2$loadings
PCA2$weights
PCA2$rot.mat
PCA2$chi



Basics of R- Session 16- Discriminant Analysis

Discriminant analysis for two groups

rm(list=ls())

# Discriminant analysis

#import data set

da1<-read.csv("C:/Users/Administrator.vaio1/Desktop/discriminant two group hair.csv")
fix(da1)

library(MASS)
library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(dawai)
#library(rattle)
library(car)
library(MVN)
library(perturb)
library(biotools)
library(FactoMineR)
library(DiscriMiner)


#create a subset of the data for simplicity say x2 and x6, x7,x8,x9,x10

dasub1<-da1[,c(3,7:19)]
fix(dasub1)

#Henze-Zirkler's MVN test

library(MVN)

multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz")  

# mvnTest "mardia" for Mardia’s test, 
# "hz" for HenzeZirkler’s test, 
# "royston" for Royston’s test, 
# "dh" for Doornik-Hansen’s test
# and energy for E-statistic

multinormality$multivariateNormality

# outliers can also be identified
multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz", showOutliers = TRUE)  
multinormality$multivariateOutliers
#multicollinearity

#Homogeneity of variance-covariance matrix using Anderson's test, Box's M test

#boxplot
boxplot(dasub1$x6~dasub1$x2)

#box M test
library(biotools)

# box m (IV, D-group)
boxM(dasub1[,-1],dasub1$x2)

# wilks lambda
library(rrcov)
Wilks.test(dasub1[,-1],dasub1$x2)
# or
Wilks.test(dasub1[,-1], dasub1$x4)
Wilks.test(x4~., data = dasub1)


# eigenvalues and canonical correlation

library(candisc)
reg3<-lm(as.matrix(dasub1[,-1])~dasub1$x4)
regcanno<-candisc(reg3)
regcanno


#discriminant analysis
library(MASS)

da11<-lda(x2~x6+x7+x8+x9+x10+x11+x12+x13+x14+x15+x16+x17+x18, data= da1, na.action = "na.omit")
# (for leave-one-out cross-validated predictions MASS::lda uses CV = TRUE, but that returns only class and posterior)
summary(da11)
da11$prior
da11$counts
da11$means
da11$scaling  # canonical discriminant function
da11$svd

# linear discriminant analysis (classification functions)
library(DiscriMiner)
dasub1$x2<-factor(dasub1$x2)

da12<-linDA(dasub1[,-1],dasub1$x2)
da12
summary(da12)
da12$functions  # fisher classification function
da12$confusion
da12$scores
da12$classification
da12$error_rate

discPower(dasub1[,-1],dasub1$x2)  # discriminating power, Wilks' lambda


Sunday, September 16, 2018

Basics of R- Session 15 Multiple Regression 2

# Machine Learning Regression Ridge lasso

install.packages(c("caret", "glmnet", "mlbench", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)

# data set, reg2

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
# keep only those variables which are supposed to be used in the analysis

reg1X<-reg1[,8:19]
reg1X<-as.matrix(reg1X)
fix(reg1X)

reg1Y<-as.matrix(reg1[,20])
fix(reg1Y)

lambda1<-10^seq(10,-2,length =100)

ridge1<-glmnet(reg1X, reg1Y, alpha = 0, lambda = lambda1)
plot(ridge1)

ridge1$lambda[1]
coef(ridge1)[,1] # display the first model (largest lambda)

ridge1$lambda[2]
coef(ridge1)[,2] # display the second model

ridge1$lambda[100]
coef(ridge1)[,100] # display the last (100th) model (smallest lambda)




lasso1<-glmnet(reg1X, reg1Y, alpha = 1, lambda = lambda1)
plot(lasso1)

lasso1$lambda[1]
coef(lasso1)[,1] # display the first model (largest lambda)

lasso1$lambda[2]
coef(lasso1)[,2] # display the second model

lasso1$lambda[100]
coef(lasso1)[,100] # display the last (100th) model (smallest lambda)
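
# rather than inspecting individual models, lambda can be chosen by cross-validation
# with cv.glmnet; a minimal sketch for the ridge case:
cvridge<-cv.glmnet(reg1X, reg1Y, alpha = 0, lambda = lambda1)
cvridge$lambda.min                  # lambda with the minimum cross-validated error
coef(cvridge, s = "lambda.min")     # coefficients at that lambda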

#-------------------------------------#
#--------------------------------------------------------------#

# Machine Learning Regression Ridge lasso

install.packages(c("caret", "glmnet", "mlbench", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)


reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
reg1<-reg1[,8:20]
# linear regression
lmfit<-train(X19~.,reg1, method= "lm")  # named lmfit to avoid masking stats::lm
summary(lmfit)
# ridge regression
ridge1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =0, lambda = seq(0.0001,1,length=100)))

ridge1
plot(ridge1)
plot(ridge1$finalModel)
plot(ridge1$finalModel, xvar = "lambda", label = TRUE)

# parameters alpha and lambda
ridge1
ridge1$bestTune

# coefficients of the best model
coef(ridge1$finalModel)
coef(ridge1$finalModel, s=ridge1$bestTune$lambda) # coefficients at the best value of lambda
# (alpha is already fixed by bestTune; coef() only takes the lambda value through s)

plot(varImp(ridge1))


# best model of ridge
ridge1$bestTune
bestridge<-ridge1$finalModel
coef(bestridge, s=ridge1$bestTune$lambda)




# lasso regression

lasso1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =1, lambda = seq(0.0001,1,length=100)))

lasso1
plot(lasso1)
plot(lasso1$finalModel)
plot(lasso1$finalModel, xvar = "lambda", label = TRUE)
plot(lasso1$finalModel, xvar = "dev", label = TRUE)
plot(varImp(lasso1))


# best model of lasso
lasso1$bestTune
bestlasso<-lasso1$finalModel
coef(bestlasso, s=lasso1$bestTune$lambda)


# elastic net regression


en1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =seq(0,1, length=10), lambda = seq(0.0001,1,length=100)))


plot(en1)
plot(en1$finalModel)
plot(en1$finalModel, xvar = "lambda", label = TRUE)
plot(en1$finalModel, xvar = "dev", label = TRUE)
plot(varImp(en1))

# best model of elastic net

en1$bestTune
besten<-en1$finalModel
coef(besten, s=en1$bestTune$lambda)
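
# the four caret models fitted above can be compared on their resampling results;
# a minimal sketch, assuming they were all trained with caret's default bootstrap resampling:
models<-list(lm = lmfit, ridge = ridge1, lasso = lasso1, enet = en1)
res<-resamples(models)
summary(res)   # RMSE, R-squared and MAE across resamples
bwplot(res)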







Basics of R- Session 15 Multiple Regression 1

rm(list=ls())

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")

#check/ explore the file
str(reg1)
head(reg1)

# Descriptive statistics of all the variables

library(psych)
describe(reg1)

library(pastecs)
stat.desc(reg1)


# here we will use X19 Satisfaction as the dependent variable
# and X7 - E-Commerce Activities, X8 - Technical Support,
# X9 - Complaint Resolution, X10 - Advertising, X11 - Product Line,
# X12 - Salesforce Image, X13 - Competitive Pricing, X14 - Warranty & Claims,
# X15 - New Products, X16 - Order & Billing,
# X17 - Price Flexibility, and X18 - Delivery Speed

# only the normality of the error terms has to be checked

#split the data set into two parts- training and test data set

library(caTools)
set.seed(1000)
split1 = sample.split(reg1$id, SplitRatio = 0.70)
summary(split1)


#training data set
reg1train<-subset(reg1,split1==TRUE)
str(reg1train)
dim(reg1train)
#validation data set
reg1Val<-subset(reg1,split1==FALSE)
str(reg1Val)
dim(reg1Val)


# Remove all the missing observation
dim(reg1train)
reg1train<-na.omit(reg1train)
dim(reg1train)


#outliers
# to identify outliers, use a box plot; here it is shown for a single variable
boxplot(reg1train$id)
# take the log to reduce the effect of outliers
boxplot(log(reg1train$id))



#assumptions of MLRegression
#1. Linear relationship
# plot to check the linearity - bivariate plot

plot(reg1train$X19...Satisfaction, reg1train$X7...E.Commerce.Activities)
# this plot is only indicative, it may not make any sense
library(ggplot2)
ggplot(reg1train, aes(reg1train$X19,reg1train$X7))+geom_smooth()

# fit a linear relation between the variables and check whether it is linear or not
#y~X
regx19_x7<-lm(reg1train$X19~reg1train$X7)
summary(regx19_x7)
# check the coefficient of b1 only, correlation

# use correlation
library(stats)
cor(reg1train$X7, reg1train$X19)
cor.test(reg1train$X7, reg1train$X19)


# Application of the Model

# Multiple Linear Regression Example

# ---------------fit the model------------------#

fit1<-lm(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data = reg1train)
summary(fit1)
attributes(fit1)

fit1$coefficients
fit1$residuals

# check outliers
library(car)
outlierTest(fit1)   # the test takes the fitted model, not the raw data

# gives the observations with influential values
influenceIndexPlot(fit1)


# for residual analysis, heteroscedasticity, etc. we use plots
plot(fit1)

#-----------Multicollinearity-------------------------#
library(car)
vif(fit1) #variance inflation factors  multicollinearity

#plot to find randomness of the residuals

plot(fit1$residuals, c(1:length(fit1$residuals)))


#normality for residual values
qqnorm(fit1$residuals, pch=20, cex=1)
qqline(fit1$residuals, col="red",lwd=3)
boxplot(fit1$residuals)
# the Shapiro-Wilk test has to be applied
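# a minimal sketch of that normality test on the residuals:
shapiro.test(fit1$residuals)   # H0: the residuals are normally distributed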

# there seem to be a lot of outliers, which create the problem of non-normality of the residuals

#autocorrelation
library(lmtest)
dwtest(fit1)

# test for heteroscedasticity
plot(fit1$residuals, fit1$fitted.values)

# from the plot it is not easy to see a pattern, so we apply the Breusch-Pagan test (bptest)
bptest(fit1)

#-----------------------------------#


# --------------------for stepwise forward backward Method------------#

library(leaps)
fit2<-regsubsets(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data=reg1train, nbest = 3)
fit2
summary(fit2)

plot(fit2, scale ="r2")
plot(fit2, scale ="adjr2")
plot(fit2, scale ="bic")

#-------------------#


library(MASS)

regback<-stepAIC(fit1,direction = "backward", trace = 1)
summary(regback)
regback$anova

regfor<-stepAIC(fit1, direction = "forward", trace = 1)
summary(regfor)
regfor$anova

regall<-stepAIC(fit1,direction = "both", trace = 1)

summary(regall)
regall$anova
#-----------------------------#

# training and test model comparison
Compare the two models with respect to R2, adjusted R2 and the coefficients; then compare them on their predicted values.


fitfinal<-lm(X19~X6+X7+X9+X11+X12+X13+X17,data = reg1train)
summary(fitfinal)

fitfinaltest<-lm(X19~X6+X7+X9+X11+X12+X13+X17,data = reg1Val)
summary(fitfinaltest)

predict1<-predict(fitfinal, reg1Val)

library(Metrics)
rmse(predict1, reg1Val$X19)

A low RMSE value indicates a good model.

AIC(fitfinal)
AIC(fitfinaltest)

BIC(fitfinal)
BIC(fitfinaltest)



Saturday, September 8, 2018

Basics of R- Session 14- using library dplyr

library(dplyr)
# import the file

mbaper<-read.csv("C:/Users/Administrator.vaio1/Desktop/MBAdata.csv")

# glimpse() is the dplyr equivalent of str(); both give the structure of the data
str(mbaper)
fix(mbaper)
glimpse(mbaper)

# selecting or dropping variables (columns) from the data with select()
mbaper_age<-select(mbaper, Age_in_years_completed)
fix(mbaper_age)
mbaper_age<-select(mbaper, -Age_in_years_completed)
fix(mbaper_age)

mbaper_age<-select(mbaper, Age_in_years_completed:Mothers_qualification)
fix(mbaper_age)

#-------------------------------------------------------#

# To filter data based on conditions, use the filter function: filter(dataset, variable == value)
mbaper_age<-filter(mbaper, Percentage_in_10_Class==75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class<75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class>75)
fix(mbaper_age)

#-------------------------------------------------------#

# specific category from a categorical variable
mbaper_zone<-filter(mbaper, STATE=="North East Zone")
fix(mbaper_zone)

# more than one category
#   , (comma) means AND; a row cannot be in both zones, so this returns no rows
mbaper_zone<-filter(mbaper, STATE=="North East Zone",STATE=="Central Zone")
fix(mbaper_zone)

#   | for or
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | STATE=="Central Zone")
fix(mbaper_zone)

#   | for or; the exact category label has to be used
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | Previous_Degree=="Commerce")
fix(mbaper_zone)

mbaper_zone<-filter(mbaper, STATE=="North East Zone" , Previous_Degree=="Commerce")
fix(mbaper_zone)

#-------------------------------------#

#Select columns and then subset data based on a condition, combination of rows and variables

mbaper_age<-select(mbaper, Age_in_years_completed)
fix(mbaper_age)

mbaper_age<-filter(select(mbaper, Age_in_years_completed),Age_in_years_completed==25)
fix(mbaper_age)

# --------------------------------------------------# pipe function

1:8 %>% sum
1:8 %>% sum %>% sqrt
1:8 %>% sum %>% sqrt %>% sum * 10

#---------------------------------#
mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  filter(Age_in_years_completed==22)

fix(mbaper_age)

mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(Age_in_years_completed)

mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(-Age_in_years_completed)