Monday, March 4, 2019

Basics of R — Session: Image classification using JPG images with SVM and kNN

# classification of Image into two group
# Library raster, rasterimage

# NOTE(review): rm(list = ls()) wipes the user's current workspace and is
# discouraged in shared scripts -- prefer running in a fresh R session.
rm(list=ls())

library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
# Load every .jpg image of IMCU faculty members from the directory.
# NOTE(review): hard-coded absolute path -- adjust for the local machine.
IMCU.Images<- load.dir(path="C:/Users/LENOVO/Desktop/Image classification/institute of Management", pattern=".jpg")
#str(IMCU.Images)
# plot(IMCU.Images[1])

# Use for loop

#--------------------------------------------#
# Convert every image to grayscale (single channel).
# seq_along() is safe when the image list is empty; 1:length(x) would
# iterate over c(1, 0) in that case.
for (i in seq_along(IMCU.Images)) {
  IMCU.Images[[i]] <- grayscale(IMCU.Images[[i]])
}

#str(IMCU.Images)
class(IMCU.Images)
## [1] "imlist" "list"
# plot(IMCU.Images[[1]])

#--------------------------------------------#
# Resize every image to a common 100 x 100 single-channel raster so
# each image contributes exactly 10000 pixel features.
for (i in seq_along(IMCU.Images)) {  # seq_along() handles an empty list safely
  IMCU.Images[[i]] <- resize(IMCU.Images[[i]], size_x = 100, size_y = 100,
                             size_z = 1, size_c = 1)
}

IMCU.Images
## Image list of size 66
#str(IMCU.Images)
class(IMCU.Images)
## [1] "imlist" "list"
#plot(IMCU.Images[[1]])

#-------------------------------------------------#

# Store each image's pixels as a plain numeric vector (drops the cimg
# class so the values can be assembled into a data frame).

for (i in seq_along(IMCU.Images)) {
  IMCU.Images[[i]] <- as.numeric(IMCU.Images[[i]])
}

#str(IMCU.Images)
class(IMCU.Images)
## [1] "imlist" "list"
#-------------------------------------------------#
# convert data into data frame

# Flatten the image list into a long data frame: file name in `im`,
# gray value in `v`, one row per pixel.
IMCU.Images.data <- as.data.frame(IMCU.Images)
str(IMCU.Images.data)

#---------------------------------------------#
# Pixel id 1..10000, repeated once per 100x100 image.
# rep(..., length.out = nrow(...)) makes the recycling explicit instead
# of relying on seq(1:10000) being silently recycled across the images.
IMCU.Images.data$id <- rep(seq_len(10000), length.out = nrow(IMCU.Images.data))
str(IMCU.Images.data)

#---------------------------------#
# Department label for the classifier.
IMCU.Images.data$group1 <- "imcu"
str(IMCU.Images.data)
IMCU.Images.data$group1 <- as.factor(IMCU.Images.data$group1)

# Column order: id, group, file name, pixel value.
IMCU.Images.data <- IMCU.Images.data[, c("id", "group1", "im", "v")]
str(IMCU.Images.data)
#IMCU.Images.data

#----------------------------#
# Reshape long -> wide: each of the 10000 pixel ids becomes its own
# column, giving one row per image.

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:imager':
## 
##     fill
## The following object is masked from 'package:magrittr':
## 
##     extract
# pivot_wider() supersedes spread(); as.data.frame() keeps a plain
# data.frame because later code indexes columns positionally ([, 1]).
IMCU.Images.data.1 <- as.data.frame(
  pivot_wider(IMCU.Images.data, names_from = id, values_from = v)
)
#str(IMCU.Images.data.1)
Now create a data set for the other department, Management Studies.
# Load every .jpg image of Management Studies faculty members.
# NOTE(review): hard-coded absolute path -- adjust for the local machine.
MS.Images<- load.dir(path="C:/Users/LENOVO/Desktop/Image classification/management studies", pattern=".jpg")
#str(MS.Images)
#plot(MS.Images[1])

# Use for loop

#--------------------------------------------#
# Convert every MS image to grayscale (single channel).
# seq_along() is safe for an empty list; 1:length(x) is not.
for (i in seq_along(MS.Images)) {
  MS.Images[[i]] <- grayscale(MS.Images[[i]])
}

#str(MS.Images)
class(MS.Images)
## [1] "imlist" "list"
#--------------------------------------------#
# Resize to a common 100 x 100 single-channel raster (10000 pixels).
for (i in seq_along(MS.Images)) {
  MS.Images[[i]] <- resize(MS.Images[[i]], size_x = 100, size_y = 100,
                           size_z = 1, size_c = 1)
}

MS.Images
## Image list of size 40
#str(MS.Images)
class(MS.Images)
## [1] "imlist" "list"
# plot(MS.Images[[1]])

#-------------------------------------------------#

# Store each image's pixels as a plain numeric vector.

for (i in seq_along(MS.Images)) {
  MS.Images[[i]] <- as.numeric(MS.Images[[i]])
}

#str(MS.Images)
class(MS.Images)
## [1] "imlist" "list"
#-------------------------------------------------#
# Flatten into a long data frame: file name `im`, gray value `v`,
# one row per pixel.

MS.Images.data <- as.data.frame(MS.Images)

#---------------------------------------------#
# Pixel id 1..10000 per 100x100 image; rep() makes the recycling
# explicit instead of relying on seq(1:10000) silently recycling.
MS.Images.data$id <- rep(seq_len(10000), length.out = nrow(MS.Images.data))
str(MS.Images.data)

#---------------------------------#
# Department label.
MS.Images.data$group1 <- "MS"
str(MS.Images.data)
MS.Images.data$group1 <- as.factor(MS.Images.data$group1)

# Column order: id, group, file name, pixel value.
MS.Images.data <- MS.Images.data[, c("id", "group1", "im", "v")]
str(MS.Images.data)
# MS.Images.data

#----------------------------#
# Reshape long -> wide: one column per pixel id, one row per image.

library(tidyr)
# pivot_wider() supersedes spread(); as.data.frame() keeps a plain
# data.frame for downstream positional indexing.
MS.Images.data.1 <- as.data.frame(
  pivot_wider(MS.Images.data, names_from = id, values_from = v)
)
#str(MS.Images.data.1)
Combine the two data sets created.
# Stack the two departments' wide data frames (identical column sets).
CU.images.data<-rbind(IMCU.Images.data.1, MS.Images.data.1)
# str(CU.images.data)

Apply models for classification.

Apply kNN to the whole data set.

# Drop the file-name column `im` by name; the positional [ , -2] form
# silently removes the wrong column if the layout ever changes.
CU.images.data$im <- NULL
# Fit a k-nearest-neighbour classifier with the caret package.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Resample with 10-fold cross-validation.
fit_ctrl <- trainControl(method = "cv")

# Centre and scale the 10000 pixel predictors, then evaluate five
# candidate values of k by cross-validated accuracy.
knnMod2 <- train(
  group1 ~ .,
  data = CU.images.data,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fit_ctrl,
  tuneLength = 5
)

# Structural summary of the caret train object's components.
summary(knnMod2)
##             Length Class      Mode     
## learn           2  -none-     list     
## k               1  -none-     numeric  
## theDots         0  -none-     list     
## xNames      10000  -none-     character
## problemType     1  -none-     character
## tuneValue       1  data.frame list     
## obsLevels       2  -none-     character
## param           0  -none-     list
# Cross-validation results per k; the best model is chosen by accuracy.
print(knnMod2)
## k-Nearest Neighbors 
## 
##   106 samples
## 10000 predictors
##     2 classes: 'imcu', 'MS' 
## 
## Pre-processing: centered (10000), scaled (10000) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 96, 95, 96, 95, 95, 96, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.6754545  0.3341953
##    7  0.6690909  0.3081981
##    9  0.6300000  0.2157474
##   11  0.6209091  0.1961218
##   13  0.6127273  0.1482331
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Accuracy-vs-k profile from cross-validation.
plot(knnMod2)
# NOTE(review): the predictions below are on the training data itself,
# so this confusion matrix is an in-sample (optimistic) estimate.
pred = predict(knnMod2, newdata=CU.images.data)
confusionMatrix(pred, CU.images.data[,1])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction imcu MS
##       imcu   56 10
##       MS     10 30
##                                           
##                Accuracy : 0.8113          
##                  95% CI : (0.7238, 0.8808)
##     No Information Rate : 0.6226          
##     P-Value [Acc > NIR] : 2.122e-05       
##                                           
##                   Kappa : 0.5985          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8485          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.8485          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.6226          
##          Detection Rate : 0.5283          
##    Detection Prevalence : 0.6226          
##       Balanced Accuracy : 0.7992          
##                                           
##        'Positive' Class : imcu            
## 

—————————————————-

Support Vector Machine

library(e1071)

# Linear-kernel SVM on all 10000 pixel predictors (scaled internally).
dtm1<-svm(group1~., data = CU.images.data,kernel = "linear",scale = TRUE)
summary(dtm1)
## 
## Call:
## svm(formula = group1 ~ ., data = CU.images.data, kernel = "linear", 
##     scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  1e-04 
## 
## Number of Support Vectors:  85
## 
##  ( 50 35 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  imcu MS
#dtm1$SV
#dtm1$kernel
#dtm1$tot.nSV
#dtm1$decision.values
#dtm1$fitted
#dtm1$coefs

# Evaluate with a confusion matrix.
# NOTE(review): this predicts on the training data; the perfect accuracy
# printed below is in-sample and likely optimistic -- there are 10000
# predictors for only ~106 images, so a linear SVM can separate the
# training set exactly.  Evaluate on held-out images instead.
predict1<-predict(dtm1,CU.images.data)
confusionMatrix(predict1,CU.images.data$group1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction imcu MS
##       imcu   66  0
##       MS      0 40
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9658, 1)
##     No Information Rate : 0.6226     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6226     
##          Detection Rate : 0.6226     
##    Detection Prevalence : 0.6226     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : imcu       
## 

————————————————————-

Predicting for an image outside the data set

test data set

import images from Directory or load images of IMCU faculty members

# Load the held-out .jpg images; the path is machine-specific.
test.Images<- load.dir(path="C:/Users/LENOVO/Desktop/Image classification/test", pattern=".jpg")

# Preprocess the test images exactly like the training images.

# Grayscale (single channel); seq_along() is safe for an empty list.
for (i in seq_along(test.Images)) {
  test.Images[[i]] <- grayscale(test.Images[[i]])
}

# Resize to 100 x 100, one channel (10000 pixel features).
for (i in seq_along(test.Images)) {
  test.Images[[i]] <- resize(test.Images[[i]], size_x = 100, size_y = 100,
                             size_z = 1, size_c = 1)
}

# Plain numeric pixel vectors.
for (i in seq_along(test.Images)) {
  test.Images[[i]] <- as.numeric(test.Images[[i]])
}

#-------------------------------------------------#
# Flatten the test images into a long data frame (one row per pixel).

test.Images.data <- as.data.frame(test.Images)

#---------------------------------------------#
# Pixel id 1..10000 per 100x100 image; rep() makes the recycling explicit.
test.Images.data$id <- rep(seq_len(10000), length.out = nrow(test.Images.data))

#---------------------------------#
# Placeholder label -- the true group of the test images is unknown.
test.Images.data$group1 <- "test"
test.Images.data$group1 <- as.factor(test.Images.data$group1)

# Column order: id, group, file name, pixel value.
test.Images.data <- test.Images.data[, c("id", "group1", "im", "v")]

#----------------------------#
# One column per pixel id.  pivot_wider() supersedes spread();
# as.data.frame() keeps a plain data.frame for positional indexing.

library(tidyr)

test.Images.data.1 <- as.data.frame(
  pivot_wider(test.Images.data, names_from = id, values_from = v)
)
#str(test.Images.data.1)
# Drop the file-name column by name instead of positional [ , -2].
test.Images.data.1$im <- NULL

# Predict group membership of the unseen images with the trained SVM.
# NOTE(review): there are no ground-truth labels here, so this is a raw
# prediction, not a confusion-matrix check.
predict1<-predict(dtm1,test.Images.data.1)
predict1
##    1    2    3    4 
## imcu imcu imcu imcu 
## Levels: imcu MS

———————————————-

Random Forest tree

Takes a lot of time — not useful here.

Thursday, February 7, 2019

Basics of R — Session: Hypothesis testing

# Data set MBAper; hard-coded local path -- adjust for your machine.
# NOTE(review): the file:/// URL form works in read.csv, but a plain
# path ("C:/Users/LENOVO/Desktop/MBAdata.csv") is conventional.
data1<-read.csv("file:///C:/Users/LENOVO/Desktop/MBAdata.csv")

summary and descriptive statistics

descriptive statistics

t-test

# Gender is a two-level factor (Female, Male).
str(data1$Gender)
##  Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
# one sample t-test

# t.test(y,mu=50) # Ho: mu=50

# H0: mean 10th-class percentage = 80.  The printed output below rejects
# H0 (t = 6.58, p ~ 2.4e-10; sample mean 83.3).
t.test(data1$Percentage_in_10_Class, mu=80)
## 
##  One Sample t-test
## 
## data:  data1$Percentage_in_10_Class
## t = 6.5793, df = 272, p-value = 2.417e-10
## alternative hypothesis: true mean is not equal to 80
## 95 percent confidence interval:
##  82.32584 84.31211
## sample estimates:
## mean of x 
##  83.31897

independent 2-group t-test

# t.test(y~x) # where y is numeric and x is a binary factor

# Welch t-test of age by gender (males slightly older; p = 0.037 in the
# printed output below).
t.test(data1$Age_in_years_completed~data1$Gender)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Age_in_years_completed by data1$Gender
## t = -2.0978, df = 246.26, p-value = 0.03694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.83004766 -0.02616149
## sample estimates:
## mean in group Female   mean in group Male 
##             22.16667             22.59477
# 10th-class percentage by gender; output below shows females higher
# (p = 0.00038).
t.test(data1$Percentage_in_10_Class~data1$Gender)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_10_Class by data1$Gender
## t = 3.5962, df = 266.85, p-value = 0.0003846
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.596708 5.460344
## sample estimates:
## mean in group Female   mean in group Male 
##             85.29650             81.76797
# 12th-class percentage by gender.
t.test(data1$Percentage_in_12_Class~data1$Gender)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_12_Class by data1$Gender
## t = 4.7554, df = 264.43, p-value = 3.259e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.114591 7.516383
## sample estimates:
## mean in group Female   mean in group Male 
##             83.54483             78.22935
# Undergraduate percentage by gender.
t.test(data1$Percentage_in_Under_Graduate~data1$Gender)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_Under_Graduate by data1$Gender
## t = 5.0318, df = 246.04, p-value = 9.381e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.735661 6.254914
## sample estimates:
## mean in group Female   mean in group Male 
##             75.86800             71.37271
#-----------------------------------------------#

# alternative = "two.sided" is the default (same result as above).
t.test(data1$Age_in_years_completed~data1$Gender, alternative=c("two.sided"))
## 
##  Welch Two Sample t-test
## 
## data:  data1$Age_in_years_completed by data1$Gender
## t = -2.0978, df = 246.26, p-value = 0.03694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.83004766 -0.02616149
## sample estimates:
## mean in group Female   mean in group Male 
##             22.16667             22.59477
# One-sided: H1 is mean(Female) - mean(Male) < 0 (Female is the first
# factor level, so the difference is Female minus Male).
t.test(data1$Age_in_years_completed~data1$Gender, alternative=c("less"))
## 
##  Welch Two Sample t-test
## 
## data:  data1$Age_in_years_completed by data1$Gender
## t = -2.0978, df = 246.26, p-value = 0.01847
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##         -Inf -0.09117358
## sample estimates:
## mean in group Female   mean in group Male 
##             22.16667             22.59477
# One-sided in the other direction: H1 is mean(Female) - mean(Male) > 0.
t.test(data1$Age_in_years_completed~data1$Gender, alternative=c("greater"))
## 
##  Welch Two Sample t-test
## 
## data:  data1$Age_in_years_completed by data1$Gender
## t = -2.0978, df = 246.26, p-value = 0.9815
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.7650356        Inf
## sample estimates:
## mean in group Female   mean in group Male 
##             22.16667             22.59477

independent 2-group t-test

# Template: t.test(y1, y2) where y1 and y2 are numeric vectors.
# (Kept as a comment -- y1/y2 are undefined here, so calling it would error.)
# t.test(y1, y2)

# Welch two-sample t-test (unequal variances assumed by default).
t.test(data1$Percentage_in_10_Class,data1$Percentage_in_12_Class)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_10_Class and data1$Percentage_in_12_Class
## t = 3.5745, df = 533.24, p-value = 0.0003827
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.240114 4.266186
## sample estimates:
## mean of x mean of y 
##  83.31897  80.56582
#-------------------
# Pooled-variance (classical) two-sample t-test.
t.test(data1$Percentage_in_10_Class,data1$Percentage_in_12_Class, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  data1$Percentage_in_10_Class and data1$Percentage_in_12_Class
## t = 3.5745, df = 544, p-value = 0.0003821
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.240182 4.266118
## sample estimates:
## mean of x mean of y 
##  83.31897  80.56582
# var.equal = FALSE is the default, so this repeats the Welch test above.
t.test(data1$Percentage_in_10_Class,data1$Percentage_in_12_Class, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_10_Class and data1$Percentage_in_12_Class
## t = 3.5745, df = 533.24, p-value = 0.0003827
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.240114 4.266186
## sample estimates:
## mean of x mean of y 
##  83.31897  80.56582
#---------------------
# conf.level = 0.95 is the default; shown explicitly for completeness.
t.test(data1$Percentage_in_10_Class,data1$Percentage_in_12_Class, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  data1$Percentage_in_10_Class and data1$Percentage_in_12_Class
## t = 3.5745, df = 533.24, p-value = 0.0003827
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.240114 4.266186
## sample estimates:
## mean of x mean of y 
##  83.31897  80.56582

paired t-test

# Template: t.test(y1, y2, paired = TRUE) with numeric y1, y2.
# (Kept as a comment -- y1/y2 are undefined here, so calling it would error.)
# t.test(y1, y2, paired = TRUE)

# Paired t-test: each student has both a 10th- and a 12th-class score,
# so this tests the mean of the within-student differences.
t.test(data1$Percentage_in_10_Class,data1$Percentage_in_12_Class, paired = TRUE)
## 
##  Paired t-test
## 
## data:  data1$Percentage_in_10_Class and data1$Percentage_in_12_Class
## t = 4.845, df = 272, p-value = 2.129e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.634436 3.871864
## sample estimates:
## mean of the differences 
##                 2.75315

analysis of variance

library(psych)

# Descriptive statistics of 10th-class percentage, grouped by previous
# degree.
describeBy(data1$Percentage_in_10_Class, data1$Previous_Degree)
## 
##  Descriptive statistics by group 
## group: Arts
##    vars n  mean   sd median trimmed  mad  min  max range  skew kurtosis
## X1    1 4 81.03 8.48   82.8   81.03 5.41 69.2 89.3  20.1 -0.44    -1.83
##      se
## X1 4.24
## -------------------------------------------------------- 
## group: Commerce
##    vars   n  mean   sd median trimmed  mad min  max range  skew kurtosis
## X1    1 101 81.28 8.79   81.7   81.52 9.93  61 96.4  35.4 -0.19    -0.85
##      se
## X1 0.87
## -------------------------------------------------------- 
## group: Engineering
##    vars   n mean   sd median trimmed  mad min  max range  skew kurtosis
## X1    1 108 85.5 6.91   86.1   85.83 7.56  68 97.2  29.2 -0.45    -0.49
##      se
## X1 0.67
## -------------------------------------------------------- 
## group: Journalism
##    vars n mean sd median trimmed mad min max range skew kurtosis se
## X1    1 1   65 NA     65      65   0  65  65     0   NA       NA NA
## -------------------------------------------------------- 
## group: Management
##    vars  n  mean   sd median trimmed  mad  min max range  skew kurtosis
## X1    1 36 81.84 8.47     83   82.29 8.45 58.9  94  35.1 -0.59    -0.34
##      se
## X1 1.41
## -------------------------------------------------------- 
## group: Science
##    vars  n  mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 23 85.53 9.16   88.6   87.09 6.23  56  95    39 -1.69     2.71
##      se
## X1 1.91
# mat = TRUE returns the same statistics in a single matrix-style table.
describeBy(data1$Percentage_in_10_Class, data1$Previous_Degree, mat = TRUE)
##     item      group1 vars   n     mean       sd median  trimmed     mad
## X11    1        Arts    1   4 81.02500 8.483071   82.8 81.02500 5.41149
## X12    2    Commerce    1 101 81.27931 8.787344   81.7 81.51617 9.93342
## X13    3 Engineering    1 108 85.50361 6.913155   86.1 85.82818 7.56126
## X14    4  Journalism    1   1 65.00000       NA   65.0 65.00000 0.00000
## X15    5  Management    1  36 81.83944 8.474893   83.0 82.29067 8.45082
## X16    6     Science    1  23 85.52870 9.164647   88.6 87.09263 6.22692
##      min  max range       skew   kurtosis        se
## X11 69.2 89.3  20.1 -0.4377254 -1.8275607 4.2415357
## X12 61.0 96.4  35.4 -0.1926382 -0.8546568 0.8743734
## X13 68.0 97.2  29.2 -0.4537738 -0.4903248 0.6652186
## X14 65.0 65.0   0.0         NA         NA        NA
## X15 58.9 94.0  35.1 -0.5920874 -0.3430281 1.4124822
## X16 56.0 95.0  39.0 -1.6928706  2.7076566 1.9109610
# mat = FALSE is the default, so this repeats the first describeBy call.
describeBy(data1$Percentage_in_10_Class, data1$Previous_Degree, mat = FALSE)
## 
##  Descriptive statistics by group 
## group: Arts
##    vars n  mean   sd median trimmed  mad  min  max range  skew kurtosis
## X1    1 4 81.03 8.48   82.8   81.03 5.41 69.2 89.3  20.1 -0.44    -1.83
##      se
## X1 4.24
## -------------------------------------------------------- 
## group: Commerce
##    vars   n  mean   sd median trimmed  mad min  max range  skew kurtosis
## X1    1 101 81.28 8.79   81.7   81.52 9.93  61 96.4  35.4 -0.19    -0.85
##      se
## X1 0.87
## -------------------------------------------------------- 
## group: Engineering
##    vars   n mean   sd median trimmed  mad min  max range  skew kurtosis
## X1    1 108 85.5 6.91   86.1   85.83 7.56  68 97.2  29.2 -0.45    -0.49
##      se
## X1 0.67
## -------------------------------------------------------- 
## group: Journalism
##    vars n mean sd median trimmed mad min max range skew kurtosis se
## X1    1 1   65 NA     65      65   0  65  65     0   NA       NA NA
## -------------------------------------------------------- 
## group: Management
##    vars  n  mean   sd median trimmed  mad  min max range  skew kurtosis
## X1    1 36 81.84 8.47     83   82.29 8.45 58.9  94  35.1 -0.59    -0.34
##      se
## X1 1.41
## -------------------------------------------------------- 
## group: Science
##    vars  n  mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 23 85.53 9.16   88.6   87.09 6.23  56  95    39 -1.69     2.71
##      se
## X1 1.91
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Boxplots of 10th-class percentage by previous degree.  Use bare column
# names inside aes(); writing data1$col inside aes() is an anti-pattern
# (it bypasses ggplot2's data masking and breaks faceting).
ggplot(data1, aes(Previous_Degree, Percentage_in_10_Class)) +
  geom_boxplot()
ggplot(data1, aes(Previous_Degree, Percentage_in_10_Class)) +
  geom_boxplot(aes(color = Previous_Degree))
# One-way ANOVA: does mean 10th-class percentage differ by previous degree?
anova1<-aov(Percentage_in_10_Class~Previous_Degree, data=data1)
anova1
## Call:
##    aov(formula = Percentage_in_10_Class ~ Previous_Degree, data = data1)
## 
## Terms:
##                 Previous_Degree Residuals
## Sum of Squares         1483.374 17412.972
## Deg. of Freedom               5       267
## 
## Residual standard error: 8.075712
## Estimated effects may be unbalanced
# Same as the autoprint of anova1 above.
print(anova1)
## Call:
##    aov(formula = Percentage_in_10_Class ~ Previous_Degree, data = data1)
## 
## Terms:
##                 Previous_Degree Residuals
## Sum of Squares         1483.374 17412.972
## Deg. of Freedom               5       267
## 
## Residual standard error: 8.075712
## Estimated effects may be unbalanced
# ANOVA table: F(5, 267) = 4.55, p = 0.00054 (below) -> group means differ.
summary(anova1)
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## Previous_Degree   5   1483  296.67   4.549 0.000535 ***
## Residuals       267  17413   65.22                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc test: Tukey HSD pairwise comparisons.  In the printed output
# below only Engineering-Commerce is significant at the 5% level.
TukeyHSD(anova1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Percentage_in_10_Class ~ Previous_Degree, data = data1)
## 
## $Previous_Degree
##                                diff        lwr        upr     p adj
## Commerce-Arts            0.25430693 -11.563795 12.0724093 0.9999999
## Engineering-Arts         4.47861111  -7.324892 16.2821142 0.8854315
## Journalism-Arts        -16.02500000 -41.942839  9.8928389 0.4837954
## Management-Arts          0.81444444 -11.403342 13.0322309 0.9999643
## Science-Arts             4.50369565  -8.054630 17.0620215 0.9077213
## Engineering-Commerce     4.22430418   1.015492  7.4331168 0.0026482
## Journalism-Commerce    -16.27930693 -39.575405  7.0167907 0.3416660
## Management-Commerce      0.56013751  -3.939651  5.0599263 0.9992302
## Science-Commerce         4.24938872  -1.106481  9.6052586 0.2069561
## Journalism-Engineering -20.50361111 -43.792306  2.7850837 0.1199175
## Management-Engineering  -3.66416667  -8.125471  0.7971381 0.1752232
## Science-Engineering      0.02508454  -5.298494  5.3486627 1.0000000
## Management-Journalism   16.83944444  -6.661937 40.3408258 0.3133739
## Science-Journalism      20.52869565  -3.151511 44.2089020 0.1311848
## Science-Management       3.68925121  -2.498810  9.8773126 0.5254607
Two-way ANOVA
# Two-way (additive) ANOVA: previous-degree and gender main effects.
anova2<-aov(Percentage_in_10_Class~Previous_Degree+Gender, data=data1)
anova2
Interaction Effect
# Add the degree x gender interaction term (equivalent to the formula
# Previous_Degree*Gender).
anova3<-aov(Percentage_in_10_Class~Previous_Degree+Gender+Previous_Degree:Gender, data=data1)
anova3