# Multiple linear regression example on customer-satisfaction data.
# X19 (satisfaction) is modeled on perception variables X7-X18.
# NOTE(review): avoid rm(list = ls()) in scripts; run in a fresh R session
# instead (it silently wipes the user's workspace).
reg1 <- read.csv("C:/Users/Administrator.vaio1/Desktop/reg2.csv")

# Check / explore the file
str(reg1)
head(reg1)

# Descriptive statistics of all the variables
library(psych)
describe(reg1)
library(pastecs)
stat.desc(reg1)

# Dependent variable: X19 (satisfaction).
# Predictors: X7 e-commerce activities, X8 technical support,
# X9 complaint resolution, X10 advertising, X11 product line,
# X12 salesforce image, X13 competitive pricing, X14 warranty & claims,
# X15 new products, X16 order & billing, X17 price flexibility,
# X18 delivery speed.
# Only the error terms need to be checked for normality.

# Split the data set into training (70%) and validation parts
library(caTools)
set.seed(1000)
split1 <- sample.split(reg1$id, SplitRatio = 0.70)
summary(split1)

# Training data set
reg1train <- subset(reg1, split1 == TRUE)
str(reg1train)
dim(reg1train)

# Validation data set
reg1Val <- subset(reg1, split1 == FALSE)
str(reg1Val)
dim(reg1Val)

# Remove all rows with missing observations
dim(reg1train)
reg1train <- na.omit(reg1train)
dim(reg1train)

# Outliers: box plot, illustrated here for a single variable
boxplot(reg1train$id)
# Log transform to dampen the effect of outliers
boxplot(log(reg1train$id))

# Assumptions of multiple linear regression
# 1. Linear relationship -- bivariate plot (indicative only)
plot(reg1train$X19...Satisfaction, reg1train$X7...E.Commerce.Activities)
library(ggplot2)
# assumes columns named X19 / X7 exist -- TODO confirm against the CSV header
ggplot(reg1train, aes(X19, X7)) + geom_smooth()

# Fit a simple regression y ~ x and judge linearity from the slope b1
regx19_x7 <- lm(X19 ~ X7, data = reg1train)
summary(regx19_x7)

# Correlation (the stats package is attached by default; no library() needed)
cor(reg1train$X7, reg1train$X19)
cor.test(reg1train$X7, reg1train$X19)

# ---------------- Fit the full multiple regression model ----------------
fit1 <- lm(X19 ~ X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 +
             X17 + X18, data = reg1train)
summary(fit1)
attributes(fit1)
fit1$coefficients
fit1$residuals
# check outliers
# NOTE(review): the original script repeated the entire load / explore /
# split / fit sequence here verbatim (same file, same set.seed(1000)),
# recreating reg1, split1, reg1train, reg1Val, regx19_x7 and fit1 with
# identical values.  The duplicate has been removed; the objects created
# by the first run above are reused unchanged.
# ---------------- Outlier and influence diagnostics ----------------
library(car)
# outlierTest() takes the fitted model, not the raw data frame
outlierTest(fit1)
# Observations with influential values (also expects the model)
influenceIndexPlot(fit1)
# Residual analysis, heteroscedasticity, etc. via the standard lm plots
plot(fit1)

# ---------------- Multicollinearity ----------------
vif(fit1)  # variance inflation factors

# Randomness of the residuals: residuals against observation order
plot(seq_along(fit1$residuals), fit1$residuals)

# Normality of the residual values
qqnorm(fit1$residuals, pch = 20, cex = 1)
qqline(fit1$residuals, col = "red", lwd = 3)
boxplot(fit1$residuals)
# Shapiro-Wilk test of residual normality (the comment in the original
# promised this but never ran it)
shapiro.test(fit1$residuals)
# Many outliers appear to drive the non-normality of the residuals

# Autocorrelation (Durbin-Watson)
library(lmtest)
dwtest(fit1)

# Heteroscedasticity: residuals against fitted values
plot(fit1$fitted.values, fit1$residuals)
# The pattern is hard to judge visually, so apply the Breusch-Pagan test
bptest(fit1)
# ---------------- Best-subsets and stepwise selection ----------------
library(leaps)
fit2 <- regsubsets(X19 ~ X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 +
                     X16 + X17 + X18, data = reg1train, nbest = 3)
fit2
summary(fit2)
plot(fit2, scale = "r2")
plot(fit2, scale = "adjr2")
plot(fit2, scale = "bic")

# ---------------- Stepwise selection by AIC ----------------
library(MASS)
regback <- stepAIC(fit1, direction = "backward", trace = 1)
summary(regback)
regback$anova
# NOTE(review): forward selection starting from the FULL model cannot add
# any terms; to get true forward selection, start from an intercept-only
# model and supply a scope.
regfor <- stepAIC(fit1, direction = "forward", trace = 1)
summary(regfor)
regfor$anova
# direction takes a single string; "both" already combines forward and
# backward steps (the original passed the whole choice vector)
regall <- stepAIC(fit1, direction = "both", trace = 1)
summary(regall)
regall$anova
# NOTE(review): the original script repeated the diagnostics (plot(fit1),
# vif, residual/QQ plots, dwtest, bptest) and the regsubsets / stepAIC
# selection here verbatim.  The duplicate has been removed; fit1, fit2,
# regback, regfor and regall from the first run above are unchanged.
# ---------------- Training vs test model comparison ----------------
# Compare the two models with respect to R^2, adjusted R^2 and the
# coefficients, then compare them through predicted values.
# NOTE(review): the original referenced undefined objects regtrain/regtest
# and lower-case column names; the sets created above are reg1train/reg1Val
# with upper-case X columns.  Also confirm the predictor list: X6 does not
# appear in the earlier full model (X7-X18).
fitfinal <- lm(X19 ~ X6 + X7 + X9 + X11 + X12 + X13 + X17, data = reg1train)
summary(fitfinal)
fitfinaltest <- lm(X19 ~ X6 + X7 + X9 + X11 + X12 + X13 + X17, data = reg1Val)
summary(fitfinaltest)
predict1 <- predict(fitfinal, reg1Val)
library(Metrics)
rmse(predict1, reg1Val$X19)
# A low RMSE value indicates a good model
AIC(fitfinal)
AIC(fitfinaltest)
BIC(fitfinal)
BIC(fitfinaltest)
# NOTE(review): the original script repeated the training/test comparison
# (fitfinal, fitfinaltest, predict, RMSE, AIC/BIC) here verbatim, including
# its uncommented prose lines that are R syntax errors.  The duplicate has
# been removed; the objects from the first comparison above are unchanged.
# (Blog-page footer residue removed -- "No comments: / Post a Comment" is
# not R code and would break sourcing this script.)