Monday, December 31, 2018

Basics of R Session -19- Cluster Analysis

rm(list=ls())
# import the data
cluster1.1<-read.csv("file:///D:/1 Teaching Material/1 inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/Cluster Analysis/Cluster Analysis MDP 2018/Data1.csv")

str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8
# remove the missing observations
cluster1.1<-na.omit(cluster1.1)
str(cluster1.1)
## 'data.frame':    9 obs. of  3 variables:
##  $ Name       : Factor w/ 9 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9
##  $ Physics    : int  1 3 2 2 4 5 6 8 6
##  $ Mathematics: int  5 5 6 4 7 9 6 8 8
#fix(cluster1.1)

# remove the labels of the data set or extra variables if present
cluster1.2<-cluster1.1[,-1]

Calculate the distance matrix using dist() or daisy()

dist1<-dist(cluster1.2,method = "euclidean")
dist1
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
dist2<-dist(cluster1.2,method = "maximum")
dist2
##   1 2 3 4 5 6 7 8
## 2 2              
## 3 1 1            
## 4 1 1 2          
## 5 3 2 2 3        
## 6 4 4 3 5 2      
## 7 5 3 4 4 2 3    
## 8 7 5 6 6 4 3 2  
## 9 5 3 4 4 2 1 2 2
dist3<-dist(cluster1.2,method = "manhattan")
dist3
##    1  2  3  4  5  6  7  8
## 2  2                     
## 3  2  2                  
## 4  2  2  2               
## 5  5  3  3  5            
## 6  8  6  6  8  3         
## 7  6  4  4  6  3  4      
## 8 10  8  8 10  5  4  4   
## 9  8  6  6  8  3  2  2  2
dist4<-dist(cluster1.2,method = "minkowski")
dist4
##          1        2        3        4        5        6        7        8
## 2 2.000000                                                               
## 3 1.414214 1.414214                                                      
## 4 1.414214 1.414214 2.000000                                             
## 5 3.605551 2.236068 2.236068 3.605551                                    
## 6 5.656854 4.472136 4.242641 5.830952 2.236068                           
## 7 5.099020 3.162278 4.000000 4.472136 2.236068 3.162278                  
## 8 7.615773 5.830952 6.324555 7.211103 4.123106 3.162278 2.828427         
## 9 5.830952 4.242641 4.472136 5.656854 2.236068 1.414214 2.000000 2.000000
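dist() covers the standard numeric metrics; daisy(), mentioned above, comes from the cluster package and also handles mixed variable types via the Gower coefficient. A minimal sketch, assuming the cluster package is installed:

library(cluster)
dist5<-daisy(cluster1.2, metric = "gower")   # Gower distance works for numeric and categorical columns alike
dist5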

Hierarchical cluster analysis: use any one of the distance matrices above and any one linkage method

clusterout<-hclust(dist1,method = "single")
plot(clusterout)
clusterout<-hclust(dist1,method = "complete")
plot(clusterout)
clusterout<-hclust(dist1,method = "average")
plot(clusterout)
clusterout<-hclust(dist1,method = "median")
plot(clusterout)
clusterout<-hclust(dist1,method = "centroid")
plot(clusterout)
clusterout<-hclust(dist1,method = "ward.D")
plot(clusterout)
clusterout<-hclust(dist1,method = "ward.D2")
plot(clusterout)

Adding labels to the R object clusterout

clusterout$labels<-cluster1.1[,1]
plot(clusterout)
# Reducing the tree (dendrogram) on the basis of either the
# number of clusters (k) or the height (h); generally h is not used

groups<- cutree(clusterout, k=3)   # cut the existing tree into 3 clusters
groups
## A B C D E G H I J 
## 1 1 1 1 2 2 2 3 2
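
Cutting by height is also possible, although, as noted above, h is generally not used; a small illustration with an arbitrary height of 3:

groups_h<- cutree(clusterout, h=3)   # cut the tree at height 3 instead of asking for k clusters
groups_h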

Draw the dendrogram with red borders around the 3 clusters

plot(clusterout)
rect.hclust(clusterout, k=3,border="red")
print(clusterout)
## 
## Call:
## hclust(d = dist1, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 9
summary(clusterout)
##             Length Class  Mode     
## merge       16     -none- numeric  
## height       8     -none- numeric  
## order        9     -none- numeric  
## labels       9     factor numeric  
## method       1     -none- character
## call         3     -none- call     
## dist.method  1     -none- character
clusterout$height
## [1] 1.414214 1.414214 1.414214 2.000000 2.236068 2.915476 3.535534 9.706813

Centroids of the clusters

aggregate(cluster1.2,by=list(groups),FUN=mean)
##   Group.1 Physics Mathematics
## 1       1    2.00         5.0
## 2       2    5.25         7.5
## 3       3    8.00         8.0
# using centroid we can check the profiles of the clusters formed
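
To profile the clusters case by case, the cluster membership can be attached back to the original data; a minimal sketch:

cluster1.3<-cbind(cluster1.1, cluster = groups)   # add the cluster membership as a new column
cluster1.3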

#----------------------------------#

# Other method
#----------------------------------#
# Libraries for Cluster Analysis
library(NbClust)
library(mclust)
## Warning: package 'mclust' was built under R version 3.5.3
## Package 'mclust' version 5.4.3
## Type 'citation("mclust")' for citing this R package in publications.
library(fpc)
## Warning: package 'fpc' was built under R version 3.5.3
# max.nc can be larger as well, say 10
nb<-NbClust(cluster1.2, distance = "euclidean",min.nc = 2, max.nc = 4,method = "single")
## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 6 proposed 4 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
nb
## $All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2        -0.4219           -10.1102       0.8495
## 3        -0.5522            -5.6219       0.6295
## 4        -0.7431            -2.3458       0.3158
## 
## $Best.nc
##                      KL      CH Hartigan    CCC  Scott Marriot TrCovW
## Number_clusters  2.0000  2.0000   4.0000 2.0000 4.0000       3   4.00
## Value_Index     36.1587 15.9034   1.6548 4.8583 9.3067    -182  16.25
##                 TraceW Friedman  Rubin Cindex     DB Silhouette   Duda
## Number_clusters 3.0000   4.0000 3.0000 4.0000 3.0000     2.0000 2.0000
## Value_Index     2.2333  58.4028 5.4644 0.3717 0.6005     0.4819 0.8173
##                 PseudoT2  Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters       NA 2.0000    2.0000 3.00     2.0000 2.00  2.0000
## Value_Index           NA 0.1676    0.5726 5.45     0.7006 1.35  0.5909
##                   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters 3.0000      0  2.0000      0 4.0000
## Value_Index     0.7071      0  0.8562      0 0.1061
## 
## $Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
nb$All.index
##        KL      CH Hartigan    CCC   Scott Marriot TrCovW  TraceW Friedman
## 2 36.1587 15.9034   3.3185 4.8583 41.2499     372  56.25 19.9000  61.1016
## 3  0.2211 11.4691   2.6786 2.5279 47.7833     405  56.25 13.5000  86.6250
## 4  0.2568  9.9603   4.3333 1.4017 57.0900     256  40.00  9.3333 145.0278
##     Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale Ratkowsky   Ball
## 2 29.6985 0.3788 0.6614     0.4819 0.8173   0.6706 0.1676    0.5726 9.9500
## 3 43.7778 0.4711 0.6005     0.3245 0.5614   1.5625 0.5208    0.5059 4.5000
## 4 63.3214 0.3717 0.6415     0.4064 0.1875   4.3333 2.1667    0.4513 2.3333
##   Ptbiserial   Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2     0.7006 1.3500  0.5909 0.6202 0.0158  0.8562 1.3638 0.7935
## 3     0.6437 0.9198  0.9087 0.7071 0.0168  1.1765 1.0730 0.1482
## 4     0.5955 0.5167  1.2746 0.6325 0.0192  1.5557 0.8513 0.1061
nb$Best.partition
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 1 2 2 2 2
# or we can check scree plot as well
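# A minimal sketch of such a scree (elbow) plot, using the total within-cluster sum of
# squares from kmeans(); k runs only up to 6 here because there are just 9 observations
wss<-sapply(1:6, function(k) kmeans(cluster1.2, centers = k, nstart = 25)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "number of clusters k", ylab = "total within-cluster SS")
# look for the bend (elbow) in the curve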
# if we require some specific number of cluster (say 2) then

nbfit<-kmeans(cluster1.2,2)
nbfit$cluster
## 1 2 3 4 5 6 7 8 9 
## 1 1 1 1 2 2 2 2 2
nbfit$centers  # for profiling or prediction
##   Physics Mathematics
## 1     2.0         5.0
## 2     5.8         7.6
nbfit$totss
## [1] 65.11111
nbfit$withinss
## [1]  4 14
nbfit$tot.withinss
## [1] 18
nbfit$betweenss
## [1] 47.11111
nbfit$size
## [1] 4 5
nbfit$iter
## [1] 1

Wednesday, November 7, 2018

Basics of R- Session 18- Decision Theory- 2


rm(list=ls())

# decision tree
#rpart package

library(rpart)
library(rpart.plot)

# import the data set (perceived score data)
data1<-read.csv("D:/1 Research/1 preparedness for future/3 decision science specialization/perceived score data 111.csv")
fix(data1)
summary(data1)



# data set = data1
data2<-na.omit(data1)

data2$specialization<-droplevels(data2$specialization)

# droplevels() prevents errors caused by empty factor levels (zero observations) in Y

# split the data into two part using caret package

library(caret)
set.seed(100)
datasplit<-createDataPartition(data2$specialization, times = 1, p=0.7, list = FALSE, groups = min(5,length(data2$specialization)))



# create training and test data
datatrain<-data2[datasplit,]  # training data set 70%
datatest<-data2[-datasplit,]  # test data set 30%

summary(datatrain)
summary(datatest)




# decision tree
dtm1<-rpart(specialization~Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum, data = datatrain, method = "class", minsplit=5, minbucket=1)
summary(dtm1)
dtm1$frame
dtm1$where
dtm1$terms
dtm1$cptable
plot(dtm1) # not a good plot
text(dtm1) # not a good plot
rpart.plot(dtm1)
rpart.plot(dtm1, clip.facs = TRUE)


dtm2<-rpart(specialization ~ Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage, data = datatrain, method = "class", minsplit=10, minbucket=1)

summary(dtm2)
dtm2$frame
dtm2$where
dtm2$terms
dtm2$cptable
plot(dtm2) # not a good plot
text(dtm2) # not a good plot

dtm3<-rpart(specialization ~ Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum+
              Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage,data = datatrain, method = "class", minsplit=10, minbucket=1, control = rpart.control(cp=0.01))

rpart.plot(dtm2, type = 0)
rpart.plot(dtm2, type = 1)
rpart.plot(dtm2, type = 2)

rpart.plot(dtm2, branch= 0.2)
rpart.plot(dtm2, branch= 0.5)
rpart.plot(dtm2, branch= 0.8)

rpart.plot(dtm2, fallen.leaves = FALSE)
rpart.plot(dtm2, tweak = 2)  # text size equal to 200%

rpart.plot(dtm3, tweak = 2,fallen.leaves = TRUE)


library(rattle)
fancyRpartPlot(dtm2)

# validation: when predicting classes with a decision tree, we have to specify type = "class"


dtm1_predict<-predict(dtm1, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm1_predict)

dtm2_predict<-predict(dtm2, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm2_predict)

dtm3_predict<-predict(dtm3, datatest, type = "class")

confusionMatrix(datatest$specialization, dtm3_predict)

#------------------------------#
# pruning of decision tree

#-----------------------------------#
# prepruning
# based on minimumsplit , minibucket, cp- fixed


#-------------------------------------#
# post pruning based on complexity parameter----

The complexity parameter (cp) in rpart is the minimum improvement in the model required at each node, and it is based on the cost-complexity of the model.
  • For a given tree, add up the misclassification error at every terminal node.
  • The cp value is a stopping parameter: it speeds up the search for splits because splits that do not meet this criterion are pruned away before the tree grows too far. So a very low value (e.g. 0.0001) creates deeper trees, while a higher value (e.g. 0.01) keeps the tree simple (see the short illustration below).
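
As an illustration (the formula is the same as in dtm3 above; only cp changes, and the two cp values here are arbitrary):

dtm_deep<-rpart(specialization ~ Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum+
              Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage,
              data = datatrain, method = "class", control = rpart.control(cp=0.0001))  # very low cp: deeper tree
dtm_simple<-rpart(specialization ~ Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum+
              Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage,
              data = datatrain, method = "class", control = rpart.control(cp=0.01))    # higher cp: simpler tree
rpart.plot(dtm_deep)
rpart.plot(dtm_simple)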
plotcp(dtm3)
# on the basis of the plot decide the cp, size of tree and error
dtm3$cptable

# use minimum cp and prune the tree

dtm3_prune<-prune(dtm3, cp=0.01)

plot(dtm3_prune)
rpart.plot(dtm3_prune)
dtm3_prune$cptable


#----------------------------------#
# cross-validation pruning can be done using the caret library or library(tree)
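
A minimal sketch of that cross-validated pruning with caret (10 folds and tuneLength = 10 are arbitrary choices; the formula is the one used for dtm3):

set.seed(100)
cvtree<-train(specialization ~ Gender_MF+Previous_Degree+Marital_status+Place_you_belong_to+Total_Family_Income_per_annum+
              Percentage_in_10_Class+Percentage_in_12_Class+Percentage_in_Under_Graduate+mba.percentage,
              data = datatrain, method = "rpart",
              trControl = trainControl(method = "cv", number = 10), tuneLength = 10)
cvtree$bestTune               # cp selected by 10-fold cross-validation
rpart.plot(cvtree$finalModel) # tree refit on the training data with that cp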

Sunday, October 7, 2018

Basics of R- Session 18- Decision Tree-1


library(rpart)
library(tree)
library(partykit)
library(rpart.plot)


library(CHAID)
car1<-read.csv("C:/Users/Administrator.vaio1/Desktop/data car.csv")
fix(car1)
dim(car1)

#CHAID - Chi-squared Automatic Interaction Detection
# all are categorical variable
#minsplit is the minimum number of units in a node before a split (e.g. 10% of the data or at least 20)
#minbucket is the minimum number of units in a leaf
#maxdepth is the depth of the tree; 1 gives a single split

chaid1<-chaid(Car_accaptability~Buying+maintenance+safety, control=chaid_control(minprob = 0.0, minsplit = 500, minbucket = 1), data=car1)
plot(chaid1)
plot(chaid1, margin =0.1)  # page margin
plot(chaid1, margin =1)
plot(chaid1, margin =5)
print(chaid1)

#--------------------------------------------------------#

#CART (Classification And Regression Trees), implemented here via rpart

chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1)
#cp complexity parameter
chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1, parms = list(split="gini"),cp=0.01)


chart1$parms
chart1$functions
chart1$frame
chart1$splits
chart1$csplit

plot(chart1)
text(chart1)

plot(chart1, uniform = TRUE)
text(chart1, all = TRUE, use.n = TRUE)

summary(chart1)

library(rpart.plot)
rpart.plot(chart1)

library(rattle)
fancyRpartPlot(chart1)

#-----------------------------------------------#
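
The next model is fit on iris_train and checked on iris_test, which are not created in this post; a minimal sketch of one way to build them (70/30 split with caret, arbitrary seed):

library(caret)
set.seed(123)                                                 # arbitrary seed, for reproducibility
iris_index<-createDataPartition(iris$Species, p = 0.7, list = FALSE)
iris_train<-iris[iris_index,]                                 # 70% training data
iris_test<-iris[-iris_index,]                                 # 30% test data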

dtm<-rpart(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width, data = iris_train, method = "class")
dtm

print(dtm)
summary(dtm)
plot(dtm) # not a good plot
text(dtm) # not a good plot
rpart.plot(dtm)

#prediction can also be done
predict1<-predict(dtm,iris_test, type = "class")
table(iris_test$Species,predict1)

library(caret)
library(e1071)
confusionMatrix(predict1,iris_test$Species)
#----------------------------------------------------------------#


Sunday, September 23, 2018

Basics of R- Session 17- Factor analysis



# Factor Analysis
library(psych)

#import data set
factoranalysis<-read.csv("C:/Users/Administrator.vaio1/Desktop/factor1.1.csv")


library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(FactoMineR)

#method 1---principal component analysis

#use principal
#default options will be used here

# use library(psych)

#Bartlett's test of sphericity

library(psych)
cortest.bartlett(factoranalysis)

#KMO test
KMO(factoranalysis)



PCA2<-principal(factoranalysis)
PCA2$rotation
PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores

#change the options in Principal

PCA2<-principal(factoranalysis,nfactors = 6, residuals = FALSE, rotate = "varimax", n.obs = 414,covar = FALSE,scores = TRUE, missing = FALSE, impute = "median", oblique.scores = TRUE, method = "regression" )

PCA2$rotation # "none", "varimax", "quartimax", "promax", "oblimin", "simplimax", and "cluster" are also available

PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores
PCA2$loadings
PCA2$weights
PCA2$rot.mat
PCA2$chi



Basics of R- Session 16- Discriminant Analysis

Discriminant analysis for two groups

rm(list=ls())

#Discriminant analysis

#import data set

da1<-read.csv("C:/Users/Administrator.vaio1/Desktop/discriminant two group hair.csv")
fix(da1)

library(MASS)
library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(dawai)
#library(rattle)
library(car)
library(MVN)
library(perturb)
library(biotools)
library(FactoMineR)
library(DiscriMiner)


#create a subset of the data for simplicity say x2 and x6, x7,x8,x9,x10

dasub1<-da1[,c(3,7:19)]
fix(dasub1)

#Henze-Zirkler's MVN test

library(MVN)

multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz")  

# mvnTest "mardia" for Mardia’s test, 
# "hz" for HenzeZirkler’s test, 
# "royston" for Royston’s test, 
# "dh" for Doornik-Hansen’s test
# and energy for E-statistic

multinormality$multivariateNormality

# outliers can also be identified
multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz", showOutliers = TRUE)  
multinormality$multivariateOutliers
#multicollinearity

#Homogeneity of variance-covariance matrix using Anderson's test, Box's M test

#boxplot
boxplot(dasub1$x6~dasub1$x2)

#box M test
library(biotools)

# box m (IV, D-group)
boxM(dasub1[,-1],dasub1$x2)

# Wilks' lambda
library(rrcov)
Wilks.test(dasub1[,-1],dasub1$x2)
or
Wilks.test(dasub1[,-1], dasub1$x4)
Wilks.test(x4~., data = dasub1)


# eigenvalue and canonical correlation

library(candisc)
reg3<-lm(as.matrix(dasub1[,-1])~dasub1$x4)
regcanno<-candisc(reg3)
regcanno


#discriminant analysis
library(MASS)

da11<-lda(x2~x6+x7+x8+x9+x10+x11+x12+x13+x14+x15+x16+x17+x18, data= da1, na.action = "na.omit", cv=TRUE)
summary(da11)
da11$prior
da11$counts
da11$means
da11$scaling  # canonical discriminant function
da11$svd
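
The fitted LDA model can also be used to classify cases; a minimal sketch on the same data (missing rows dropped first to match na.action in the lda() call):

da1_c<-na.omit(da1)
da11_pred<-predict(da11, da1_c)      # gives $class, $posterior and $x (discriminant scores)
table(da1_c$x2, da11_pred$class)     # confusion table on the fitting data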

#linear discriminant analysis (classification function)
library(DiscriMiner)
dasub1$x2<-factor(dasub1$x2)

da12<-linDA(dasub1[,-1],dasub1$x2)
da12
summary(da12)
da12$functions  # fisher classification function
da12$confusion
da12$scores
da12$classification
da12$error_rate

discPower(dasub1[,-1],dasub1$x2)  # discriminating power, Wilks' lambda


Sunday, September 16, 2018

Basics of R- Session 15 Multiple Regression 2

# Machine Learning Regression Ridge lasso

install.packages(c("caret", ";", "glmnet;", "mlbench;", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)

# data set, reg2

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
# keep only those variables which are suppose to be used in analysis

reg1X<-reg1[,8:19]
reg1X<-as.matrix(reg1X)
fix(reg1X)

reg1Y<-as.matrix(reg1[,20])
fix(reg1Y)

lambda1<-10^seq(10,-2,length =100)

ridge1<-glmnet(reg1X, reg1Y, alpha = 0, lambda = lambda1)
plot(ridge1)

ridge1$lambda[1]
coef(ridge1)[,1] # display the first model

ridge1$lambda[2]
coef(ridge1)[,2] # display the second model

ridge1$lambda[100]
coef(ridge1)[,100] # display the 100th model




lasso1<-glmnet(reg1X, reg1Y, alpha = 1, lambda = lambda1)
plot(lasso1)

lasso1$lambda[1]
coef(lasso1)[,1] # display the first model

lasso1$lambda[2]
coef(lasso1)[,2] # display the second model

lasso1$lambda[100]
coef(lasso1)[,100] # display the 100th model
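
Rather than inspecting individual lambda values by hand, cross-validation can choose one; a minimal sketch with cv.glmnet (ridge shown here, alpha = 1 does the same for lasso):

cvridge<-cv.glmnet(reg1X, reg1Y, alpha = 0, lambda = lambda1)
plot(cvridge)                       # CV error across the lambda grid
cvridge$lambda.min                  # lambda with the smallest CV error
coef(cvridge, s = "lambda.min")     # coefficients at that lambda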

#-------------------------------------#
#--------------------------------------------------------------#

# Machine Learning Regression Ridge lasso

install.packages(c("caret", ";", "glmnet;", "mlbench;", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)


reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
reg1<-reg1[,8:20]
# linear regression
lm<-train(X19~.,reg1, method= "lm")
summary(lm)
# ridge regression
ridge1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =0, lambda = seq(0.0001,1,length=100)))

ridge1
plot(ridge1)
plot(ridge1$finalModel)
plot(ridge1$finalModel, xvar = "lambda", label = TRUE)

# parameters alpha and lambda
ridge1
ridge1$bestTune

# coefficients of the best model
coef(ridge1$finalModel)
coef(ridge1$finalModel, s=ridge1$bestTune$lambda) # for the best value of lambda
coef(ridge1$finalModel, s=ridge1$bestTune$alpha)  # note: s expects a lambda value; alpha is fixed at 0 for ridge

plot(varImp(ridge1))


# best model of ridge
ridge1$bestTune
bestridge<-ridge1$finalModel
coef(bestridge, s=ridge1$bestTune$lambda)




# lasso regression

lasso1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =1, lambda = seq(0.0001,1,length=100)))

lasso1
plot(lasso1)
plot(lasso1$finalModel)
plot(lasso1$finalModel, xvar = "lambda", label = TRUE)
plot(lasso1$finalModel, xvar = "dev", label = TRUE)
plot(varImp(lasso1))


# best model of lasso
lasso1$bestTune
bestlasso<-lasso1$finalModel
coef(bestlasso, s=lasso1$bestTune$lambda)


# elastic net regression


en1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =seq(0,1, length=10), lambda = seq(0.0001,1,length=100)))


plot(en1)
plot(en1$finalModel)
plot(en1$finalModel, xvar = "lambda", label = TRUE)
plot(en1$finalModel, xvar = "dev", label = TRUE)
plot(varImp(en1))

# best model of elastic net

en1$bestTune
besten<-en1$finalModel
coef(besten, s=en1$bestTune$lambda)
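
A quick in-sample comparison of the three tuned models by RMSE; this assumes the outcome column is named X19, as in the train() calls above (no separate test split was created in this part):

pred_ridge<-predict(ridge1, reg1)
pred_lasso<-predict(lasso1, reg1)
pred_en<-predict(en1, reg1)
sqrt(mean((reg1$X19 - pred_ridge)^2))   # ridge RMSE
sqrt(mean((reg1$X19 - pred_lasso)^2))   # lasso RMSE
sqrt(mean((reg1$X19 - pred_en)^2))      # elastic net RMSE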







Basics of R- Session 15 Multiple Regression 1

rm(list=ls())

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")

#check/ explore the file
str(reg1)
head(reg1)

# Descriptive statistics of all the variables

library(psych)
describe(reg1)

library(pastecs)
stat.desc(reg1)


# here we will  use x19 satisfaction as dependent variable
# and X7 - E-Commerce Activities,X8 - Technical Support,
#X9 - Complaint,Resolution,X10 - Advertising,X11 - Product Line,
#X12 - Salesforce Image,X13 - Competitive Pricing,X14 - Warranty & Claims,
#X15 - New Products,X16 Order & Billing,
#X17 - Price Flexibility,and X18 - Delivery Speed

# Normality of only error terms has to be checked

#split the data set into two parts- training and test data set

library(caTools)
set.seed(1000)
split1 = sample.split(reg1$id, SplitRatio = 0.70)
summary(split1)


#training data set
reg1train<-subset(reg1,split1==TRUE)
str(reg1train)
dim(reg1train)
#validation data set
reg1Val<-subset(reg1,split1==FALSE)
str(reg1Val)
dim(reg1Val)


# Remove all the missing observation
dim(reg1train)
reg1train<-na.omit(reg1train)
dim(reg1train)


#outliers
#to identify the outliers, use a box plot; here it is shown for a single variable
boxplot(reg1train$id)
#convert them into log to remove the outliers effect
boxplot(log(reg1train$id))



#assumptions of multiple linear regression
#1. Linear relationship
# plot to check linearity - bivariate plot

plot(reg1train$X19...Satisfaction, reg1train$X7...E.Commerce.Activities)
# this plot is only indicative, it may not make any sense
library(ggplot2)
ggplot(reg1train, aes(reg1train$X19,reg1train$X7))+geom_smooth()

#fit a linear relation between the variables and check whether the relationship is linear or not
#y~X
regx19_x7<-lm(reg1train$X19~reg1train$X7)
summary(regx19_x7)
# check the coefficient of b1 only, correlation

# use correlation
library(stats)
cor(reg1train$X7, reg1train$X19)
cor.test(reg1train$X7, reg1train$X19)


# Application of the Model

# Multiple Linear Regression Example

# ---------------fit the model------------------#

fit1<-lm(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data = reg1train)
summary(fit1)
attributes(fit1)

fit1$coefficients
fit1$residuals

# check outliers (these functions take the fitted model, not the raw data)
library(car)
outlierTest(fit1)

# show the observations with influential values
influenceIndexPlot(fit1)


# for residual analysis, heteroscedasticity, etc., we use plots
plot(fit1)

#-----------Multicollinearity-------------------------#
library(car)
vif(fit1) #variance inflation factors  multicollinearity

#plot to find randomness of the residuals

plot(fit1$residuals, c(1:length(fit1$residuals)))


#normality for residual values
qqnorm(fit1$residuals, pch=20, cex=1)
qqline(fit1$residuals, col="red",lwd=3)
boxplot(fit1$residuals)
#the Shapiro-Wilk test has to be applied
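# A minimal sketch of that test on the residuals
shapiro.test(fit1$residuals)   # H0: the residuals are normally distributed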

#there seem to be a lot of outliers, which create the problem of non-normality of the residuals

#autocorrelation
library(lmtest)
dwtest(fit1)

#test for heteroscedasticity
plot(fit1$residuals, fit1$fitted.values)

#from the plot it is not easy to analyze the pattern of the data so we will apply bptest
bptest(fit1)

#-----------------------------------#


# --------------------for stepwise forward backward Method------------#

library(leaps)
fit2<-regsubsets(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data=reg1train, nbest = 3)
fit2
summary(fit2)

plot(fit2, scale ="r2")
plot(fit2, scale ="adjr2")
plot(fit2, scale ="bic")

#-------------------#


library(MASS)

regback<-stepAIC(fit1,direction = "backward", trace = 1)
summary(regback)
regback$anova

regfor<-stepAIC(fit1, direction = "forward", trace = 1)
summary(regfor)
regfor$anova

regall<-stepAIC(fit1,direction = "both", trace = 1)

summary(regall)
regall$anova
#-----------------------------#

# training and test model comparison
Compare the two models with respect to R2, adjusted R2 and the coefficients, and then compare them on their predicted values.


fitfinal<-lm(X19~X6+X7+X9+X11+X12+X13+X17,data = reg1train)
summary(fitfinal)

fitfinaltest<-lm(X19~X6+X7+X9+X11+X12+X13+X17,data = reg1Val)
summary(fitfinaltest)

predict1<-predict(fitfinal, reg1Val)

library(Metrics)
rmse(predict1, reg1Val$X19)

Low value of RMSE indicates a good model

AIC(fitfinal)
AIC(fitfinaltest)

BIC(fitfinal)
BIC(fitfinaltest)



Saturday, September 8, 2018

Basics of R- Session 14- using library dplyr

library(dplyr)
# import the file

mbaper<-read.csv("C:/Users/Administrator.vaio1/Desktop/MBAdata.csv")

#Equivalent of str in traditional approach. Gives the structure of data
str(mbaper)
fix(mbaper)
glimpse(mbaper)

# adding or removing variables from the file
mbaper_age<-select(mbaper, Age_in_years_completed)
fix(mbaper_age)
mbaper_age<-select(mbaper, -Age_in_years_completed)
fix(mbaper_age)

mbaper_age<-select(mbaper, Age_in_years_completed:Mothers_qualification)
fix(mbaper_age)

#-------------------------------------------------------#

#To filter data based on some conditions we could use the filter function=filter(dataset, variable==value)
mbaper_age<-filter(mbaper, Percentage_in_10_Class==75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class<75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class>75)
fix(mbaper_age)

#-------------------------------------------------------#

# specific category from a categorical variable
mbaper_zone<-filter(mbaper, STATE=="North East Zone")
fix(mbaper_zone)

# more than one category
#   the comma acts as AND; this example returns no rows, since a record cannot be in both zones
mbaper_zone<-filter(mbaper, STATE=="North East Zone",STATE=="Central Zone")
fix(mbaper_zone)

#   | for or
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | STATE=="Central Zone")
fix(mbaper_zone)

#   | for or### exact word has to be used
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | Previous_Degree=="Commerce")
fix(mbaper_zone)

mbaper_zone<-filter(mbaper, STATE=="North East Zone" , Previous_Degree=="Commerce")
fix(mbaper_zone)

#-------------------------------------#

#Select columns and then subset data based on a condition, combination of rows and variables

mbaper_age<-select(mbaper, Age_in_years_completed)
fix(mbaper_age)

mbaper_age<-filter(select(mbaper, Age_in_years_completed),Age_in_years_completed==25)
fix(mbaper_age)

# --------------------------------------------------# pipe function

1:8 %>% sum
1:8 %>% sum %>% sqrt
1:8 %>% sum %>% sqrt %>% sum*10

#---------------------------------#
mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  filter(Age_in_years_completed==22)

fix(mbaper_age)

mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(Age_in_years_completed)

mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(-Age_in_years_completed)
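
The verbs above can also be chained in a single pipeline; a small illustration using the same columns:

mbaper %>%
  select(Age_in_years_completed, Percentage_in_10_Class) %>%
  filter(Percentage_in_10_Class > 75) %>%
  arrange(-Age_in_years_completed)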

Monday, July 30, 2018

Basics of R Session 13 Conjoint Analysis


library(conjoint)

Case study- Beauty Bar Soap

There are three attributes of the problem/ case study

1) colour with three levels (red, blue, and yellow)
2) shape with three levels (cubic, cylindrical, and spherical)
3) aroma with two levels (scented and unscented)

Create an R object attribute1 for the factors with levels

attribute1<-list(
colour=c("red","blue", "yellow"),
shape =c("cubic", "cylindrical","spherical"),
aroma=c("scented", "unscented")
    )

attribute1
fix(attribute1)

All the possible profiles for the three attributes: 3*3*2 = 18

profiles1<-expand.grid(attribute1)
profiles1

fix(profiles1)
length(profiles1)
dim(profiles1)

Orthogonal design; here the number of cards equals 3*3*2 = 18 (the full factorial)

design1<-caFactorialDesign(data=profiles1,type="fractional", cards=18)

print(design1)
fix(design1)

load the files with the names of all the levels of the three factors

possiblefactors <- read.csv("file:///E:/2 presentation for class/inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/conjoint Analysis/Beauty Bar Soap Case IIM/profile.csv", header = FALSE)


possiblefactors


Load the file with the perceptions of the respondents on the 3 factors and their levels

perception1 <- read.csv("file:///E:/2 presentation for class/inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/conjoint Analysis/Beauty Bar Soap Case IIM/perception.csv", header=T)
Here the respondents are asked to rank the profiles formed from the factor levels



fix(perception1)


Conjoint(perception1, design1, possiblefactors)



Tuesday, July 10, 2018

Basics of R- Session 12- Market Basket Analysis


Market Basket Analysis

library(arules)
library(arulesViz)

# file mbaper
mbaper<-read.csv("E:/2 presentation for class/R/1 R Open elective/data set/mbadata.csv")
fix(mbaper)
str(mbaper)

Remove the first four variables, which are scale (numeric) in nature
mbaper<-mbaper[,c(-1:-4)]

# study the categorical variables using tables and cross tab and find the variables which are having associations or some sort of relations
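# for instance (column names as used with this file elsewhere in the blog; adjust to the actual columns)
table(mbaper$Gender_MF)                            # one-way frequency table
table(mbaper$Gender_MF, mbaper$Previous_Degree)    # cross tab of two categorical variables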

Convert the file into a transactions object; use all the factor/categorical variables in this analysis

mbaper<-as(mbaper, "transactions")

check the data set created after creating transactions

inspect(mbaper)

inspect(head(mbaper))

head(mbaper@itemInfo)

head(c(mbaper@itemsetInfo, mbaper@itemInfo))

# create a priori rules for the data set
# to check the code for the function or argument
args(apriori)
rules1<- apriori(mbaper)

rules1<- apriori(mbaper, parameter = list(supp = 0.10, conf = 0.80))
rules1<- apriori(mbaper, parameter = list(supp = 0.50, conf = 0.80))

# Number of items in the rules: minlen=1 and maxlen=4 means there will be at most 4 items in each rule
rules1<- apriori(mbaper, parameter = list(minlen=1, supp=0.10, conf=0.80, maxlen=4))

#----------------------------#
# check the frequency distribution of the transactions
itemFrequency(mbaper)

# we can use other methods as well
itemFrequency(mbaper,type="absolute")

# plot the frequency
itemFrequencyPlot(rules1)

# if it doesn't work, then use items()

items(rules1)
itemFrequencyPlot(items(rules1))


#--------------------------#

#inspect the rules created

inspect(rules1)

#first three rules
inspect(rules1[1:3])

#first ten rules
inspect(rules1[1:10])


#-----------------------------------------------------#
inspect(head(sort(rules1, by = "support"), n=10))

inspect(head(sort(rules1, by = "confidence"), n=10))

inspect(head(sort(rules1, by = "lift"), n=100))
#---------------------------------------------------------------#

#when we want to study some specific combination of LHS and RHS
rules with RHS containing "perceivedscorecat=required skills" only



rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills")))

rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills"), default="rhs"))


rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills"), default ="lhs"))

inspect(head(sort(rules2, by = "lift"), n=10))

#--------------------------------------------------#


#rules with LHS containing "perceivedscorecat=required skills" only

rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.05, conf=0.8),appearance = list(default ="rhs", lhs="perceivedscorecat=required skills"))

inspect(head(sort(rules2, by = "lift"), n=10))

#___________________________________________________________________________#


library(arulesViz)
plot(rules2)
plot(rules1, method = "graph")
plot(rules1, method = "grouped")
plotly_arules(rules1)


#----------------#

rules3<- apriori(mbaper, parameter = list(supp = 0.50, conf = 0.80))

plot(rules3,method="graph",interactive=TRUE,shading=NA)


Monday, July 9, 2018

Basics of R- Session 11- Factor Analysis

Factor Analysis

library(psych)
library(Hmisc)

#import data set

factoranalysis<-read.csv("E:/2 presentation for class/R/1 R Open elective/data set/FACTOR ANALYSIS.csv")

# check the structure of the data set, here we are using only 10 Variables

# labeling the variables
label(factoranalysis$ï..Resp)<-"Respondent"
label(factoranalysis$X1)<-"refreshing"
label(factoranalysis$X2)<-"bad for health"
label(factoranalysis$X3)<-"very convenient to serve"
label(factoranalysis$X4)<-"avoided with age"
label(factoranalysis$X5)<-"very tasty"
label(factoranalysis$X6)<-"not good for children"
label(factoranalysis$X7)<-"consumed occasionally"
label(factoranalysis$X8)<-"not be taken in large quantity"
label(factoranalysis$X9)<-"not as good as energy drinks"
label(factoranalysis$X10)<-"better than fruit juices"
label(factoranalysis$S)<-"Recommending aerated drinks to others"

str(factoranalysis)

fix(factoranalysis)
View(factoranalysis)

#-----------------#
#--------------------#
#----------------------#
# here the library(psych) will be used

# Bartlett's test of sphericity
# the file should contain only those variables for which we have to apply factor analysis
# remove those variables, which are not included

fix(factoranalysis)
factoranalysis<-factoranalysis[,c(-1,-12)]
fix(factoranalysis)

cortest.bartlett(factoranalysis)  # data set


#KMO test 
KMO(factoranalysis)

#-----------------#
#--------------------#
#----------------------#

#signature of principal() in the psych package, for reference:

#principal(r, nfactors = 1, residuals = FALSE, rotate = "varimax", n.obs = NA, covar = FALSE,
# scores = TRUE, missing = FALSE, impute = "median", oblique.scores = TRUE, method = "regression", ...)

Rotation-->
"none", "varimax", "quartimax", "promax", "oblimin", "simplimax", and "cluster"

PCA2<-principal(factoranalysis)
PCA2$rotation
PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores

# for a scree plot: component number on the x-axis, eigenvalue on the y-axis
X11<-1:length(PCA2$values)
Y11<-PCA2$values
plot(X11,Y11, type="l")

# scree plot
VSS.scree(factoranalysis)

The column h2 is a measure of communalities, and u2 is uniqueness; 

Communalities refer to shared variance with the other items, while uniqueness is variance not explained by the other items, but that could be explained by the latent variable as well as measurement error. 



Tuesday, July 3, 2018

Basics of R- Session 10- Data Visualization-4 Tree map


library(treemap)

# here index is the categorical variable or variables, vsize is the size of the rectangle (scale variable)

treemap(mbaper, index = "Gender_MF", vSize = "Percentage_in_10_Class")

# adding more categorical variables

treemap(mbaper, index = c("Gender_MF","Previous_Degree"), vSize = "Percentage_in_10_Class")


# interactive treemap using itreemap
# here index is the categorical variable or variables, vsize is the size of the rectangle (scale variable)

treemap::itreemap(mbaper, index = "Gender_MF", vSize = "Percentage_in_10_Class")