Sunday, October 7, 2018

Basics of R- Session 18- Decision Tree-1


library(rpart)
library(tree)
library(partykit)
library(rpart.plot)


library(CHAID)
car1<-read.csv("C:/Users/Administrator.vaio1/Desktop/data car.csv")
fix(car1)
dim(car1)

# CHAID: Chi-squared Automatic Interaction Detection
# (all variables used must be categorical)
# minsplit  - minimum number of units in a node before a split is attempted
#             (rule of thumb: ~10% of cases, or at least 20)
# minbucket - minimum number of units allowed in a terminal leaf
# maxdepth  - maximum depth of the tree (1 = a single split)

chaid1<-chaid(Car_accaptability~Buying+maintenance+safety, control=chaid_control(minprob = 0.0, minsplit = 500, minbucket = 1), data=car1)
plot(chaid1)
plot(chaid1, margin =0.1)  # margin around the plotted tree
plot(chaid1, margin =1)
plot(chaid1, margin =5)
print(chaid1)

#--------------------------------------------------------#

# CART (Classification And Regression Trees) via rpart
# NOTE(review): the original header said "CHART"; the method is CART.

chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1)
# cp: complexity parameter; split = "gini" selects the Gini impurity index
chart1<-rpart(Car_accaptability~Buying+maintenance+safety,data = car1,method = "class", minsplit=500, minbucket=1, parms = list(split="gini"),cp=0.01)


# Components of the fitted rpart object
chart1$parms      # fitting parameters used
chart1$functions  # summary/print/text functions for this method
chart1$frame      # one row per node of the tree
chart1$splits     # details of each split considered
chart1$csplit     # split directions for categorical predictors

plot(chart1)
text(chart1)

# uniform vertical spacing between levels; label all nodes with counts
plot(chart1, uniform = TRUE)
text(chart1, all = TRUE, use.n = TRUE)

summary(chart1)

library(rpart.plot)
rpart.plot(chart1)  # much more readable tree drawing

library(rattle)
fancyRpartPlot(chart1)  # annotated colour tree plot

#-----------------------------------------------#

# Classification tree on the iris data.
# NOTE(review): iris_train / iris_test are not created in this file —
# presumably a train/test split of iris; confirm they exist upstream.
dtm<-rpart(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width, data = iris_train, method = "class")
dtm

print(dtm)
summary(dtm)
plot(dtm) # base-graphics tree: not a good visual
text(dtm) # not a good visual
rpart.plot(dtm)

# prediction on the hold-out set
predict1<-predict(dtm,iris_test, type = "class")
table(iris_test$Species,predict1)  # confusion table: actual vs predicted

library(caret)
library(e1071)
confusionMatrix(predict1,iris_test$Species)
#----------------------------------------------------------------#


Sunday, September 23, 2018

Basics of R- Session 17- Factor analysis



# Factor analysis via principal component extraction (psych package)
library(psych)

# import data set
factoranalysis<-read.csv("C:/Users/Administrator.vaio1/Desktop/factor1.1.csv")


library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(FactoMineR)

# method 1 --- principal component analysis

# use principal(); default options will be used here
# (from library(psych))

# Bartlett's test of sphericity: tests whether the correlation matrix
# differs from the identity matrix (a prerequisite for factor analysis)

library(psych)
cortest.bartlett(factoranalysis)

# KMO (Kaiser-Meyer-Olkin) measure of sampling adequacy
KMO(factoranalysis)



PCA2<-principal(factoranalysis)
PCA2$rotation     # rotation method used
PCA2$values       # eigenvalues
PCA2$communality  # communalities (shared variance per item)
PCA2$factors      # number of factors extracted
PCA2$scores       # component scores

# re-run principal() with explicit options

PCA2<-principal(factoranalysis,nfactors = 6, residuals = FALSE, rotate = "varimax", n.obs = 414,covar = FALSE,scores = TRUE, missing = FALSE, impute = "median", oblique.scores = TRUE, method = "regression" )

PCA2$rotation # other rotations: "none", "quartimax", "promax", "oblimin", "simplimax", "cluster"

PCA2$values
PCA2$communality
PCA2$factors
PCA2$scores
PCA2$loadings  # factor loadings
PCA2$weights   # weights used to compute component scores
PCA2$rot.mat   # rotation matrix
PCA2$chi       # chi-square statistic



Basics of R- Session 16- Discriminant Analysis

Discriminant analysis for two groups

rm(list=ls())  # clear the workspace (session-notes habit; avoid in shared scripts)

# Discriminant analysis

# import data set

da1<-read.csv("C:/Users/Administrator.vaio1/Desktop/discriminant two group hair.csv")
fix(da1)

library(MASS)
library(psych)
library(Hmisc)
library(stats)
library(HSAUR)
library(dawai)
#library(rattle)
library(car)
library(MVN)
library(perturb)
library(biotools)
library(FactoMineR)
library(DiscriMiner)


# create a subset of the data for simplicity: the grouping variable
# plus the predictor columns (columns 3 and 7:19 of da1)

dasub1<-da1[,c(3,7:19)]
fix(dasub1)

# Henze-Zirkler's multivariate normality (MVN) test

library(MVN)

multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz")  

# mvnTest options:
# "mardia"  - Mardia's test
# "hz"      - Henze-Zirkler's test
# "royston" - Royston's test
# "dh"      - Doornik-Hansen's test
# "energy"  - E-statistic test

multinormality$multivariateNormality

# outliers can also be identified
multinormality<-mvn(dasub1, subset = "x4", mvnTest = "hz", showOutliers = TRUE)  
multinormality$multivariateOutliers
# multicollinearity

# Homogeneity of variance-covariance matrices: Anderson's test, Box's M test

# boxplot of x6 by group x2
boxplot(dasub1$x6~dasub1$x2)

# Box's M test
library(biotools)

# boxM(independent variables, grouping variable)
boxM(dasub1[,-1],dasub1$x2)

# Wilks' lambda: tests group mean differences on the discriminant functions
library(rrcov)
Wilks.test(dasub1[,-1],dasub1$x2)
# or, equivalently, grouping by x4
# NOTE(review): the bare word `or` in the original is a syntax error in R;
# it has been converted to a comment. Also confirm that column x4 exists in
# dasub1 (it was built from columns 3, 7:19 of da1).
Wilks.test(dasub1[,-1], dasub1$x4)
Wilks.test(x4~., data = dasub1)


# Eigenvalues and canonical correlation via candisc

library(candisc)
# multivariate lm: all predictors regressed on the grouping variable
reg3<-lm(as.matrix(dasub1[,-1])~dasub1$x4)
regcanno<-candisc(reg3)  # canonical discriminant analysis of the mlm fit
regcanno


# Linear discriminant analysis (MASS)
library(MASS)

da11<-lda(x2~x6+x7+x8+x9+x10+x11+x12+x13+x14+x15+x16+x17+x18, data= da1, na.action = "na.omit", cv=TRUE)
summary(da11)
da11$prior    # prior probabilities of the groups
da11$counts   # group sizes
da11$means    # group means of each predictor
da11$scaling  # coefficients of the canonical discriminant function
da11$svd      # singular values (between/within group SD ratio)

# Linear discriminant analysis with classification functions (DiscriMiner)
library(DiscriMiner)
dasub1$x2<-factor(dasub1$x2)  # linDA requires a factor grouping variable

da12<-linDA(dasub1[,-1],dasub1$x2)
da12
summary(da12)
da12$functions  # Fisher's classification functions
da12$confusion  # confusion matrix
da12$scores     # classification scores
da12$classification  # predicted group per observation
da12$error_rate      # misclassification rate

discPower(dasub1[,-1],dasub1$x2)  # discriminating power: Wilks' lambda per variable


Sunday, September 16, 2018

Basics of R- Session 15 Multiple Regression 2

# Machine-learning regression: ridge and lasso
# NOTE(review): the original call included ";" and "glmnet;", "mlbench;" as
# package names — install.packages() cannot resolve those and would fail.
# Corrected to the four real package names.
install.packages(c("caret", "glmnet", "mlbench", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)

# data set: reg2

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
# keep only those variables which are supposed to be used in the analysis

reg1X<-reg1[,8:19]
reg1X<-as.matrix(reg1X)  # glmnet requires a matrix of predictors
fix(reg1X)

reg1Y<-as.matrix(reg1[,20])  # response column
fix(reg1Y)

# grid of 100 lambda values from 10^10 down to 10^-2
lambda1<-10^seq(10,-2,length =100)

# ridge regression: alpha = 0
ridge1<-glmnet(reg1X, reg1Y, alpha = 0, lambda = lambda1)
plot(ridge1)

ridge1$lambda[1]
coef(ridge1)[,1] # coefficients for the 1st (largest-lambda) model

ridge1$lambda[2]
coef(ridge1)[,2] # coefficients for the 2nd model

ridge1$lambda[100]
coef(ridge1)[,100] # coefficients for the 100th (smallest-lambda) model




# lasso regression: alpha = 1
lasso1<-glmnet(reg1X, reg1Y, alpha = 1, lambda = lambda1)
plot(ridge1)

lasso1$lambda[1]
coef(lasso1)[,1] # coefficients for the 1st model

lasso1$lambda[2]
coef(lasso1)[,2] # coefficients for the 2nd model

lasso1$lambda[100]
coef(lasso1)[,100] # coefficients for the 100th model

#-------------------------------------#
#--------------------------------------------------------------#

# Machine-learning regression: ridge and lasso (caret workflow)
# NOTE(review): corrected the broken package names ";" / "glmnet;" /
# "mlbench;" from the original install.packages() call.
install.packages(c("caret", "glmnet", "mlbench", "psych"))

library(caret)
library(glmnet)
library(mlbench)
library(psych)
library(ggplot2)


reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")
reg1<-reg1[,8:20]  # keep predictors X7..X18 and the response X19
# linear regression baseline via caret
lm<-train(X19~.,reg1, method= "lm")
summary(lm)
# ridge regression: alpha fixed at 0, tune lambda over a grid of 100 values
ridge1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =0, lambda = seq(0.0001,1,length=100)))

ridge1
plot(ridge1)  # RMSE across the lambda grid
plot(ridge1$finalModel)  # coefficient paths
plot(ridge1$finalModel, xvar = "lambda", label = TRUE)

# Tuned parameters: alpha and lambda
# NOTE(review): the fitted object is named ridge1 (created above); the
# original referenced an undefined object `ridge`, which errors at runtime.
ridge1
ridge1$bestTune

# coefficients of the best model
coef(ridge1$finalModel)
coef(ridge1$finalModel, s=ridge1$bestTune$lambda) # at the chosen lambda
# NOTE(review): passing alpha as `s` requests coefficients at lambda = alpha,
# which is almost certainly not what was intended — verify before relying on it.
coef(ridge1$finalModel, s=ridge1$bestTune$alpha)

plot(varImp(ridge1))  # variable importance


# best model of ridge
ridge1$bestTune
bestridge<-ridge1$finalModel
coef(bestridge, s=ridge1$bestTune$lambda)




# Lasso regression: alpha fixed at 1, tune lambda over the same grid

lasso1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =1, lambda = seq(0.0001,1,length=100)))

lasso1
plot(lasso1)  # RMSE across the lambda grid
plot(lasso1$finalModel)
plot(lasso1$finalModel, xvar = "lambda", label = TRUE)
plot(lasso1$finalModel, xvar = "dev", label = TRUE)  # vs fraction deviance explained
plot(varImp(lasso1))


# best model of lasso
lasso1$bestTune
bestlasso<-lasso1$finalModel
# NOTE(review): the original called coef(bestlaso, ...) — a typo for the
# object `bestlasso` defined on the previous line.
coef(bestlasso, s=lasso1$bestTune$lambda)


# Elastic-net regression: tune alpha (mixing) and lambda jointly


en1<-train(X19~.,reg1, method= "glmnet", 
              tuneGrid=expand.grid(alpha =seq(0,1, length=10), lambda = seq(0.0001,1,length=100)))


plot(en1)  # RMSE surface over the (alpha, lambda) grid
plot(en1$finalModel)
plot(en1$finalModel, xvar = "lambda", label = TRUE)
plot(en1$finalModel, xvar = "dev", label = TRUE)
plot(varImp(en1))

# best model of elastic net

en1$bestTune
besten<-en1$finalModel
coef(besten, s=en1$bestTune$lambda)







Basics of R- Session 15 Multiple Regression 1

rm(list=ls())  # clear the workspace (session-notes habit; avoid in shared scripts)

reg1<-read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")

# check / explore the file
str(reg1)
head(reg1)

# descriptive statistics of all the variables

library(psych)
describe(reg1)

library(pastecs)
stat.desc(reg1)


# Dependent variable: X19 (satisfaction). Predictors:
# X7  - E-Commerce Activities,  X8  - Technical Support,
# X9  - Complaint Resolution,   X10 - Advertising,  X11 - Product Line,
# X12 - Salesforce Image,       X13 - Competitive Pricing,
# X14 - Warranty & Claims,      X15 - New Products, X16 - Order & Billing,
# X17 - Price Flexibility,      X18 - Delivery Speed

# Normality of only the error terms (residuals) has to be checked

# split the data set into two parts: training and test

library(caTools)
set.seed(1000)  # reproducible split
split1 = sample.split(reg1$id, SplitRatio = 0.70)
summary(split1)


# training data set (70%)
reg1train<-subset(reg1,split1==TRUE)
str(reg1train)
dim(reg1train)
# validation data set (30%)
reg1Val<-subset(reg1,split1==FALSE)
str(reg1Val)
dim(reg1Val)


# remove all rows with missing observations
dim(reg1train)
reg1train<-na.omit(reg1train)
dim(reg1train)


# outliers
# to identify outliers use a box plot; demonstrated for a single variable
boxplot(reg1train$id)
# log transform to dampen the effect of outliers
boxplot(log(reg1train$id))



# assumptions of multiple linear regression
# 1. linear relationship
# bivariate plot to check linearity

plot(reg1train$X19...Satisfaction, reg1train$X7...E.Commerce.Activities)
# this plot is only indicative; it may not make any sense on its own
library(ggplot2)
ggplot(reg1train, aes(reg1train$X19,reg1train$X7))+geom_smooth()

# fit a simple linear relation to judge whether the association is linear
# y ~ X
regx19_x7<-lm(reg1train$X19~reg1train$X7)
summary(regx19_x7)
# check the slope coefficient b1 only (direction/strength of association)

# use correlation
library(stats)
cor(reg1train$X7, reg1train$X19)
cor.test(reg1train$X7, reg1train$X19)  # adds a significance test


# Application of the model

# Multiple linear regression example

# ---------------fit the model------------------#

fit1<-lm(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data = reg1train)
summary(fit1)
attributes(fit1)

fit1$coefficients
fit1$residuals

# check outliers
# NOTE(review): car::outlierTest() and car::influenceIndexPlot() operate on a
# fitted model object, not on a data frame; the original passed reg1, which
# errors. Corrected to the model fit1.
library(car)
outlierTest(fit1)

# identify observations with influential values
influenceIndexPlot(fit1)


# residual analysis, heteroscedasticity, etc. via the default lm plots
plot(fit1)

#-----------Multicollinearity-------------------------#
library(car)
vif(fit1) # variance inflation factors: VIF > ~10 signals multicollinearity

# plot to check randomness of the residuals (no pattern expected)

plot(fit1$residuals, c(1:length(fit1$residuals)))


# normality of the residuals
qqnorm(fit1$residuals, pch=20, cex=1)
qqline(fit1$residuals, col="red",lwd=3)  # reference line for normality
boxplot(fit1$residuals)
# a Shapiro-Wilk test (shapiro.test) should also be applied

# many outliers appear to be creating the non-normality of the residuals

# autocorrelation of residuals: Durbin-Watson test
library(lmtest)
dwtest(fit1)

# test for heteroscedasticity: residuals vs fitted values
plot(fit1$residuals, fit1$fitted.values)

# the pattern is hard to judge visually, so apply the Breusch-Pagan test
bptest(fit1)

#-----------------------------------#


# --------------------stepwise / best-subsets selection------------#

library(leaps)
# best-subsets search; nbest = 3 keeps the top 3 models of each size
fit2<-regsubsets(X19~X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18, data=reg1train, nbest = 3)
fit2
summary(fit2)

# compare subsets under different criteria
plot(fit2, scale ="r2")
plot(fit2, scale ="adjr2")
plot(fit2, scale ="bic")

#-------------------#


library(MASS)

# backward elimination from the full model (AIC criterion)
regback<-stepAIC(fit1,direction = "backward", trace = 1)
summary(regback)
regback$anova  # record of the steps taken

# forward selection (starting from the full model there is nothing to add,
# so this effectively returns fit1 unchanged)
regfor<-stepAIC(fit1, direction = "forward", trace = 1)
summary(regfor)
regfor$anova

# stepwise in both directions
# NOTE(review): `direction` takes a single value; the original passed
# c("both", "backward", "forward"), which match.arg() silently reduces to
# "both". Made the choice explicit — behavior is identical.
regall<-stepAIC(fit1,direction = "both", trace = 1)

summary(regall)
regall$anova
#-----------------------------#
#-----------------------------#

# training and test model comparison
Compare the two models with respect to R2, adjusted R2, and the coefficients;

then compare them with respect to predicted values.


# Final model refit on training and test sets.
# NOTE(review): regtrain / regtest and lower-case x19, x6, ... are not defined
# in this file (earlier code uses reg1train/reg1Val and upper-case X names);
# confirm against the data actually loaded.
fitfinal<-lm(x19~x6+x7+x9+x11+x12+x13+x17,data = regtrain)
summary(fitfinal)

fitfinaltest<-lm(x19~x6+x7+x9+x11+x12+x13+x17,data = regtest)
summary(fitfinaltest)

# predict on the test set with the training model
predict1<-predict(fitfinal, regtest)

library(Metrics)
rmse(predict1, regtest$x19)  # root mean squared error of the predictions

Low value of RMSE indicates a good model

# information criteria: lower is better
AIC(fitfinal)
AIC(fitfinaltest)

BIC(fitfinal)
BIC(fitfinaltest)



Saturday, September 8, 2018

Basics of R- Session 14- using library dplyr

library(dplyr)
# import the file

mbaper<-read.csv("C:/Users/Administrator.vaio1/Desktop/MBAdata.csv")

# equivalent of str() in the traditional approach: gives the structure of data
str(mbaper)
fix(mbaper)
glimpse(mbaper)  # dplyr's compact structure view

# selecting / dropping variables from the file
mbaper_age<-select(mbaper, Age_in_years_completed)   # keep one column
fix(mbaper_age)
mbaper_age<-select(mbaper, -Age_in_years_completed)  # drop one column
fix(mbaper_age)

# a range of adjacent columns
mbaper_age<-select(mbaper, Age_in_years_completed:Mothers_qualification)
fix(mbaper_age)
fix(mbaper_age)

#-------------------------------------------------------#

# To filter rows on a condition use filter(dataset, variable == value)
mbaper_age<-filter(mbaper, Percentage_in_10_Class==75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class<75)
fix(mbaper_age)

mbaper_age<-filter(mbaper, Percentage_in_10_Class>75)
fix(mbaper_age)

#-------------------------------------------------------#

# specific category from a categorical variable
mbaper_zone<-filter(mbaper, STATE=="North East Zone")
fix(mbaper_zone)

# more than one category
#   , means AND
# NOTE(review): AND of two different values of the SAME variable can never
# match a row — this returns an empty result; use | for "either zone".
mbaper_zone<-filter(mbaper, STATE=="North East Zone",STATE=="Central Zone")
fix(mbaper_zone)

#   | means OR
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | STATE=="Central Zone")
fix(mbaper_zone)

#   | for OR across different variables; the exact category text must be used
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | Previous_Degree=="Commerce")
fix(mbaper_zone)

mbaper_zone<-filter(mbaper, STATE=="North East Zone" , Previous_Degree=="Commerce")
fix(mbaper_zone)

#-------------------------------------#

# combine select and filter: subset rows of selected columns

mbaper_age<-select(mbaper, Age_in_years_completed)
fix(mbaper_age)

mbaper_age<-filter(select(mbaper, Age_in_years_completed),Age_in_years_completed==25)
fix(mbaper_age)
fix(mbaper_age)

# --------------------------------------------------# pipe operator %>%

1:8 %>% sum            # sum(1:8)
1:8 %>% sum %>% sqrt   # sqrt(sum(1:8))
# NOTE(review): %>% binds tighter than *, so this is (sqrt(sum(1:8)))*10,
# not sum(... * 10)
1:8 %>% sum %>% sqrt%>% sum*10

#---------------------------------#
# select then filter, written as a pipe
mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  filter(Age_in_years_completed==22)

fix(mbaper_age)

# sort ascending
mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(Age_in_years_completed)

# sort descending (negate the column)
mbaper_age<-select(mbaper, Age_in_years_completed) %>%
  arrange(-Age_in_years_completed)

Monday, July 30, 2018

Basics of R Session 13 Conjoint Analysis


library(conjoint)

Case study- Beauty Bar Soap

There are three attributes of the problem/ case study

1)colour with three levels (red, blue, and yellow)
2)shape with three levels (cubic, cylindrical and spherical) and
3)aroma with two levels (scented, and unscented)

Create an R object attribute1 for the factors with levels

# named list of attributes, each with its levels
attribute1<-list(
colour=c("red","blue", "yellow"),
shape =c("cubic", "cylindrical","spherical"),
aroma=c("scented", "unscented")
    )

attribute1
fix(attribute1)

all the possible profiles for the 3 factors each with three levels 3*3*2=18

# full factorial: every combination of levels (3 x 3 x 2 = 18 profiles)
profiles1<-expand.grid(attribute1)
profiles1

fix(profiles1)
length(profiles1)
dim(profiles1)

orthogonal design, no of cards are equal to 3*3*2= 18

# fractional factorial design with 18 cards (here equal to the full design)
design1<-caFactorialDesign(data=profiles1,type="fractional", cards=18)

print(design1)
fix(design1)

load the files with the names of all the levels of the three factors

possiblefactors <- read.csv("file:///E:/2 presentation for class/inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/conjoint Analysis/Beauty Bar Soap Case IIM/profile.csv", header = FALSE)


possiblefactors


load the files with the perception of the respondents on the 3 factors and their sublevels

perception1 <- read.csv("file:///E:/2 presentation for class/inurture Lectures/1 multivariate data analysis/1 Multivariate Data Analysis PPts Self/conjoint Analysis/Beauty Bar Soap Case IIM/perception.csv", header=T)
here the respondents are asked to rank the levels for all the factors with their levels



fix(perception1)

# run the full conjoint analysis: part-worths, importances, utilities
Conjoint(perception1, design1, possiblefactors)



Tuesday, July 10, 2018

Basics of R- Session 12- Market Basket Analysis


Market Basket Analysis

library(arules)
library(arulesViz)

# file mbaper
mbaper<-read.csv("E:/2 presentation for class/R/1 R Open elective/data set/mbadata.csv")
fix(mbaper)
str(mbaper)

remove first four variables which are scale in Nature
# drop columns 1-4 (scale variables; arules needs categorical data)
mbaper<-mbaper[,c(-1:-4)]

# study the categorical variables using tables and cross tabs to find
# variables which have associations or some sort of relation

convert the files into transactions type use all the factor/ categorical variables in this analysis

# coerce the data frame of factors to an arules "transactions" object
mbaper<-as(mbaper, "transactions")

check the data set created after creating transactions

inspect(mbaper)

inspect(head(mbaper))

head(mbaper@itemInfo)

head(c(mbaper@itemsetInfo, mbaper@itemInfo))

# create apriori association rules for the data set
# args() shows the signature of a function
args(apriori)
rules1<- apriori(mbaper)  # default support/confidence thresholds

# explicit thresholds: minimum support and minimum confidence
rules1<- apriori(mbaper, parameter = list(supp = 0.10, conf = 0.80))
rules1<- apriori(mbaper, parameter = list(supp = 0.50, conf = 0.80))

# minlen/maxlen bound the number of items per rule: here at most 4 items
rules1<- apriori(mbaper, parameter = list(minlen=1, supp=0.10, conf=0.80, maxlen=4))

#----------------------------#
# check the frequency distribution of the items over transactions
itemFrequency(mbaper)  # relative frequency (support) per item

# absolute counts instead of relative frequencies
itemFrequency(mbaper,type="absolute")

# plot the item frequencies
itemFrequencyPlot(rules1)

# if that fails (rules are not transactions), extract the items first

items(rules1)
itemFrequencyPlot(items(rules1))


#--------------------------#

# inspect the rules created

inspect(rules1)

# first three rules
inspect(rules1[1:3])

# first ten rules
inspect(rules1[1:10])


#-----------------------------------------------------#
# top rules sorted by each quality measure
inspect(head(sort(rules1, by = "support"), n=10))

inspect(head(sort(rules1, by = "confidence"), n=10))

inspect(head(sort(rules1, by = "lift"), n=100))
#---------------------------------------------------------------#
#---------------------------------------------------------------#

# restrict rules to a specific combination of LHS and RHS
rules with RHS containing "perceivedscorecat=required skills" only


# appearance constrains which items may appear on each side of the rule
rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills")))

rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills"), default="rhs"))

rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills")))




# default="lhs": everything else may appear only on the left-hand side
rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.001, conf=0.51, maxlen=4),appearance = list(rhs=c("perceivedscorecat=required skills"), default ="lhs"))

inspect(head(sort(rules2, by = "lift"), n=10))

#--------------------------------------------------#


# rules with a fixed LHS item (everything else restricted to the RHS)

rules2<- apriori(mbaper, parameter = list(minlen=1, supp=0.05, conf=0.8),appearance = list(default ="rhs", lhs="perceivedscorecat=required skills"))

inspect(head(sort(rules2, by = "lift"), n=10))

#___________________________________________________________________________#

# visualisation of the rule sets
library(arulesViz)
plot(rules2)  # scatter of support vs confidence, shaded by lift
plot(rules1, method = "graph")    # network of items and rules
plot(rules1, method = "grouped")  # grouped matrix view
plotly_arules(rules1)             # interactive plotly scatter


#----------------#

rules3<- apriori(mbaper, parameter = list(supp = 0.50, conf = 0.80))

# interactive graph (click to explore rules)
plot(rules3,method="graph",interactive=TRUE,shading=NA)


Monday, July 9, 2018

Basics of R- Session 11- Factor Analysis

Factor Analysis

library(psych)
library(Hmisc)

# import data set

factoranalysis<-read.csv("E:/2 presentation for class/R/1 R Open elective/data set/FACTOR ANALYSIS.csv")

# check the structure of the data set; here we use only 10 variables

# labeling the variables (Hmisc::label)
# NOTE(review): "ï..Resp" is the first column name mangled by a UTF-8 BOM on
# Windows; reading with read.csv(..., fileEncoding = "UTF-8-BOM") avoids it.
label(factoranalysis$ï..Resp)<-"Respondent"
label(factoranalysis$X1)<-"refreshing"
label(factoranalysis$X2)<-"bad for health"
label(factoranalysis$X3)<-"very convenient to serve"
label(factoranalysis$X4)<-"avoided with age"
label(factoranalysis$X5)<-"very tasty"
label(factoranalysis$X6)<-"not good for children"
label(factoranalysis$X7)<-"consumed occasionally"
label(factoranalysis$X8)<-"not be taken in large quantity"
label(factoranalysis$X9)<-"not as good as energy drinks"
label(factoranalysis$X10)<-"better than fruit juices"
label(factoranalysis$S)<-"Recommending aerated drinks to others"

str(factoranalysis)

fix(factoranalysis)
View(factoranalysis)

#-----------------#
#--------------------#
#----------------------#
# here the library(psych) will be used

# Bartlett's test of sphericity
# the file should contain only the variables entering the factor analysis,
# so drop the respondent id (col 1) and the criterion S (col 12) first

fix(factoranalysis)
factoranalysis<-factoranalysis[,c(-1,-12)]
fix(factoranalysis)

cortest.bartlett(factoranalysis)  # on the data set (correlations computed internally)


# KMO (Kaiser-Meyer-Olkin) sampling adequacy test
KMO(factoranalysis)

#-----------------#
#--------------------#
#----------------------#

# signature of psych::principal for reference:

principal(r, nfactors = 1, residuals = FALSE,rotate="varimax",n.obs=NA, covar=FALSE,
 scores=TRUE,missing=FALSE,impute="median",oblique.scores=TRUE,method="regression",...)

Rotation-->
"none", "varimax", "quartimax", "promax", "oblimin", "simplimax", and "cluster"

PCA2<-principal(factoranalysis)
PCA2$rotation     # rotation method used
PCA2$values       # eigenvalues
PCA2$communality  # communalities per item
PCA2$factors      # number of factors extracted
PCA2$scores       # component scores

# manual scree plot: eigenvalues vs component number
X11<-PCA2$values
Y11<-1:length(PCA2$values)
plot(X11,Y11, type="l")

# built-in scree plot
VSS.scree(factoranalysis)

The column h2 is a measure of communalities, and u2 is uniqueness; 

Communalities refer to shared variance with the other items, while uniqueness is variance not explained by the other items, but that could be explained by the latent variable as well as measurement error. 



Tuesday, July 3, 2018

Basics of R- Session 10- Data Visualization-4 Tree map


library(treemap)

# index: the categorical variable(s); vSize: the scale variable that
# determines the size of each rectangle

treemap(mbaper, index = "Gender_MF", vSize = "Percentage_in_10_Class")

# adding more categorical variables produces nested rectangles

treemap(mbaper, index = c("Gender_MF","Previous_Degree"), vSize = "Percentage_in_10_Class")


# interactive treemap using itreemap
# same index/vSize arguments as treemap()

treemap::itreemap(mbaper, index = "Gender_MF", vSize = "Percentage_in_10_Class")

Tuesday, June 19, 2018

Basics of R- Session 9- Data Visualization-3 using facets for adding layers in ggplot2


# Use of facet_grid in ggplot2 to split a plot by categorical variables
# (formula syntax: rows ~ columns)

ggplot(mbaper, aes(mbaper$Percentage_in_10_Class))+geom_dotplot()+facet_grid(~mbaper$Gender_MF)

ggplot(mbaper, aes(mbaper$Percentage_in_10_Class))+geom_dotplot()+facet_grid(~mbaper$Gender_MF+mbaper$Previous_Degree)

ggplot(mbaper, aes(mbaper$Percentage_in_10_Class))+geom_dotplot()+facet_grid(mbaper$Gender_MF~mbaper$Previous_Degree+mbaper$Place_you_belong_to)

# bar charts of a categorical variable, faceted the same way
ggplot(mbaper, aes(mbaper$perceivedscorecat))+geom_bar()+facet_grid(~mbaper$Gender_MF)

ggplot(mbaper, aes(mbaper$perceivedscorecat))+geom_bar()+facet_grid(~mbaper$Gender_MF+mbaper$Previous_Degree)

ggplot(mbaper, aes(mbaper$perceivedscorecat))+geom_bar()+facet_grid(mbaper$Gender_MF~mbaper$Previous_Degree+mbaper$Place_you_belong_to)

Basics of R- session 8- data visualization-2

one categorical and one scale variable
x- axis categorical, y axis scale
box plot


# one categorical (x) and one scale (y) variable: several geoms compared

ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_boxplot()
ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_col()
ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_count()
ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_bin2d()
ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_jitter()
ggplot(mbaper, aes(mbaper$perceivedscorecat, mbaper$Percentage_in_12_Class))+geom_violin()


one categorical and one scale variable
X axis scale, Y- axis categorical
box plot

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$perceivedscorecat))+geom_bin2d()
ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$perceivedscorecat))+geom_jitter()

# better to have the x axis as the categorical variable

multiple variables
3 variables
2 scale and one categorical

# scatter of two scale variables
ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point()

# add a layer: colour encodes a categorical variable

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point(aes(color=perceivedscorecat))

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point(aes(color=Previous_Degree))

# add a layer: shape encodes a categorical variable

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point(aes(shape=perceivedscorecat))

# split into panels with facet_wrap()

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point()+facet_wrap(~perceivedscorecat)

# facet on two variables
ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point()+facet_wrap(~perceivedscorecat+Marital_status)

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point()+facet_wrap(perceivedscorecat~Marital_status)

# colour and facets can be combined

ggplot(mbaper, aes(mbaper$Percentage_in_12_Class, mbaper$Percentage_in_10_Class))+geom_point(aes(color=Previous_Degree))+facet_wrap(~perceivedscorecat)

Basics of R session 7- Data Visualization Mosaic Plot

Use library vcd
mosaic will be used for categorical variables
# Import the file MBAdata.csv and save as an R object mbaper

mbaper<-read.csv("D:/1 Teaching Material/R/importfile/MBAdata.csv")

library(vcd)
## Loading required package: grid
# data set: mbaper; variable: Marital_status

# two equivalent calls; the second labels blocks with the level names
mosaic(~mbaper$Marital_status)
mosaic(~Marital_status, data= mbaper)
# here the output differs only in labeling: block names come from the
# levels of Marital_status, and the title is the name of the data set
Adding variables
mosaic(~mbaper$Gender+mbaper$Marital_status+mbaper$Place_you_belong_to)
mosaic(~Gender+Marital_status+Place_you_belong_to, data = mbaper)

cross tab mosaic

# formula with ~ on both sides: cross-tabulated mosaic
mosaic(mbaper$Gender~mbaper$Marital_status)

adding variables

mosaic(mbaper$Gender~mbaper$Marital_status+mbaper$perceivedscorecat)
mosaic(mbaper$Gender~mbaper$Marital_status+mbaper$perceivedscorecat+mbaper$Place_you_belong_to)
mosaic(mbaper$Gender~mbaper$Marital_status)
rotating the label for better visibility
mosaic(~mbaper$Gender+mbaper$Marital_status, labeling= labeling_border(rot_labels = c(45,45,45,45)))
if there are null blocks or no observation in the combination cell, nothing will be displayed if zero_size=0
mosaic(~mbaper$Gender+mbaper$STATE, zero_size= 0)
for colouring the mosaic, wth specific colour
mosaic(~mbaper$Marital_status, gp= gpar(fill=c("red", "green")))
different types of shading based on residual
# shading functions colour tiles by the sign/size of Pearson residuals
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_hcl)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_hsv)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_max)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_Friendly)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_Friendly2)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_sieve)
mosaic(~mbaper$Gender+mbaper$Marital_status, gp = shading_binary)