# Sunday, September 16, 2018
#
# Basics of R - Session 15: Multiple Regression (Part 1)

# Load the regression data set.
# NOTE(review): rm(list = ls()) was removed -- clearing the global workspace
# inside a script is an anti-pattern (it silently destroys the user's session
# objects); restart R for a clean environment instead.
# NOTE(review): hard-coded absolute Windows path -- confirm where reg2.csv
# actually lives before running.
reg1 <- read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")

# Check / explore the file: column types and the first few rows
str(reg1)
head(reg1)

# Descriptive statistics of all the variables

library(psych)
describe(reg1)

library(pastecs)
stat.desc(reg1)


# Here we will use X19 (Satisfaction) as the dependent variable, and
# X7 - E-Commerce Activities, X8 - Technical Support,
# X9 - Complaint Resolution, X10 - Advertising, X11 - Product Line,
# X12 - Salesforce Image, X13 - Competitive Pricing, X14 - Warranty & Claims,
# X15 - New Products, X16 - Order & Billing,
# X17 - Price Flexibility, and X18 - Delivery Speed as the predictors.

# Only the normality of the error terms (residuals) has to be checked,
# not of the raw variables.

# Split the data into two parts: a 70% training set and a 30% validation set.

library(caTools)
set.seed(1000)                                 # reproducible split
split1 <- sample.split(reg1$id, SplitRatio = 0.70)
summary(split1)


# Training set: the rows flagged TRUE by the split vector
reg1train <- subset(reg1, split1)
str(reg1train)
dim(reg1train)
# Validation set: the remaining rows
reg1Val <- subset(reg1, !split1)
str(reg1Val)
dim(reg1Val)


# Remove all rows with missing observations from the training set.
# Comparing dim() before and after shows how many rows were dropped.
dim(reg1train)
reg1train <- na.omit(reg1train)
dim(reg1train)


# Outliers
# A box plot flags potential outliers; demonstrated here on a single column.
boxplot(reg1train$id)
# A log transform compresses large values, damping the effect of outliers.
boxplot(log(reg1train$id))



# Assumptions of multiple linear regression
# 1. Linear relationship -- check with a bivariate plot.
# NOTE(review): the original plot used long column names
# (X19...Satisfaction, X7...E.Commerce.Activities) while every other line in
# this file uses the short names X7..X19; the short names are used
# consistently here -- confirm the actual names produced by read.csv().
plot(reg1train$X19, reg1train$X7)
# This plot is only indicative; it may not make much sense on its own.
library(ggplot2)
# FIX: use bare column names inside aes() -- writing reg1train$X19 inside a
# ggplot aesthetic is an anti-pattern (it bypasses the data argument).
ggplot(reg1train, aes(X19, X7)) + geom_smooth()

# Fit a simple linear model (y ~ x) to judge whether the relation is linear.
# FIX: use the data= argument instead of df$col terms in the formula, so the
# model is readable and reusable with predict().
regx19_x7 <- lm(X19 ~ X7, data = reg1train)
summary(regx19_x7)
# Check only the slope coefficient b1; it reflects the correlation.

# Use correlation directly
library(stats)   # stats is attached by default; call kept from the original
cor(reg1train$X7, reg1train$X19)
cor.test(reg1train$X7, reg1train$X19)


# Application of the Model

# Multiple Linear Regression Example

# ---------------fit the model------------------#

fit1 <- lm(X19 ~ X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 +
             X17 + X18, data = reg1train)
summary(fit1)
attributes(fit1)

fit1$coefficients
fit1$residuals

# Check outliers
library(car)
# BUG FIX: outlierTest() expects the fitted model object, not the raw data
# frame -- the original passed reg1.
outlierTest(fit1)

# Flag the observations with influential values
# BUG FIX: influenceIndexPlot() likewise takes the fitted model, not reg1.
influenceIndexPlot(fit1)


# Residual analysis, heteroscedasticity, etc. via the four standard lm plots
plot(fit1)

#-----------Multicollinearity-------------------------#
library(car)
vif(fit1) # variance inflation factors; large values indicate multicollinearity

# Plot to check the randomness of the residuals.
# FIX: put the observation index on the x axis and the residuals on the
# y axis (the original had the axes swapped); seq_along() is the safe
# replacement for 1:length().
plot(seq_along(fit1$residuals), fit1$residuals)


# Normality of the residual values
qqnorm(fit1$residuals, pch = 20, cex = 1)
qqline(fit1$residuals, col = "red", lwd = 3)
boxplot(fit1$residuals)
# Shapiro-Wilk normality test -- the "sapiro test" the original comment said
# had to be applied but never was:
shapiro.test(fit1$residuals)

# There seem to be many outliers, which create the non-normality of residuals.

# Autocorrelation (Durbin-Watson test)
library(lmtest)
dwtest(fit1)

# Test for heteroscedasticity
# FIX: convention is fitted values on x and residuals on y (original swapped).
plot(fit1$fitted.values, fit1$residuals)

# The pattern is hard to judge visually, so apply the Breusch-Pagan test.
bptest(fit1)

#-----------------------------------#


# ----------- best-subsets selection with leaps::regsubsets -----------#

library(leaps)
fit2 <- regsubsets(
  X19 ~ X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18,
  data = reg1train,
  nbest = 3
)
fit2
summary(fit2)

# Visualise which predictors enter the best subsets under each criterion.
plot(fit2, scale = "r2")
plot(fit2, scale = "adjr2")
plot(fit2, scale = "bic")

#-------------------#


library(MASS)

# Backward elimination, starting from the full model fit1
regback <- stepAIC(fit1, direction = "backward", trace = 1)
summary(regback)
regback$anova

# Forward selection.
# NOTE(review): forward selection started FROM the full model cannot add any
# term, so this returns fit1 unchanged; to be meaningful it should start from
# an intercept-only model with scope = formula(fit1). Kept as-is pending
# confirmation of the intended demonstration.
regfor <- stepAIC(fit1, direction = "forward", trace = 1)
summary(regfor)
regfor$anova

# Stepwise selection in both directions.
# BUG FIX: direction takes a single string; the original passed
# c("both", "backward", "forward"), of which match.arg() silently used only
# the first element.
regall <- stepAIC(fit1, direction = "both", trace = 1)

summary(regall)
regall$anova
#-----------------------------#

# Training and test model comparison.
# Compare the two models with respect to R2, adjusted R2, and the
# coefficients; then compare them through their predicted values.

# BUG FIX: the original lines referred to undefined objects regtrain/regtest
# and lower-case column names (x19, x6, ...). The data sets created above are
# reg1train / reg1Val and the columns are upper-case X7..X19.
# NOTE(review): X6 is not in the candidate predictor set X7..X18 used in
# fit1 -- confirm whether X6 exists in the data or was a typo for another
# predictor.
fitfinal <- lm(X19 ~ X6 + X7 + X9 + X11 + X12 + X13 + X17, data = reg1train)
summary(fitfinal)

fitfinaltest <- lm(X19 ~ X6 + X7 + X9 + X11 + X12 + X13 + X17, data = reg1Val)
summary(fitfinaltest)

# Predict on the validation data with the model fitted on the training data.
predict1 <- predict(fitfinal, reg1Val)

library(Metrics)
# Root mean squared error on the validation set; a low RMSE indicates a
# good model.
rmse(predict1, reg1Val$X19)

# Information criteria for model comparison (lower is better).
AIC(fitfinal)
AIC(fitfinaltest)

BIC(fitfinal)
BIC(fitfinaltest)



# (Blog-scrape residue, commented out so the script parses:)
# No comments:
# Post a Comment