Basics of R – Session 4: Import Data from Excel and Explore the Data
Dr Manohar Kapse
19 March 2019
Import the dataset: Big Mart Dataset
# Import the Big Mart data from a CSV file. read.csv() reads comma-separated
# text, not Excel workbooks; for those use RStudio's "Import Dataset" or readxl.
# Location of the data set---D:\1 Teaching Material\R\1 R Open elective\data set
# Method 1: read from a fixed path.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, where the
# default became FALSE; the factor-based output shown below depends on it.
Bigmart <- read.csv(
  "D:/1 Teaching Material/R/1 R Open elective/data set/Big Mart Dataset.csv",
  stringsAsFactors = TRUE
)
# Method 2: pick the file interactively (file.choose() works on every platform,
# whereas choose.files() is Windows-only)
# Bigmart <- read.csv(file.choose(), stringsAsFactors = TRUE)
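To import an .xls file, use the readxl package; it reads Excel files directly and does not require Java (unlike the xlsx or XLConnect packages).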
library(readxl)
# Read the first worksheet (sheet = 1) of the .xls workbook
Bigmart1 <- read_xls(
  "D:/1 Teaching Material/R/1 R Open elective/data set/Big Mart Dataset.xls",
  sheet = 1
)
Exploration of the data set
# Bigmart   (typing the object name alone would print every row)
# str() gives a compact overview: the type and first few values of each column
str(Bigmart)
## 'data.frame': 8523 obs. of 12 variables:
## $ Item_Identifier : Factor w/ 1559 levels "DRA12","DRA24",..: 157 9 663 1122 1298 759 697 739 441 991 ...
## $ Item_Weight : num 9.3 5.92 17.5 19.2 8.93 ...
## $ Item_Fat_Content : Factor w/ 5 levels "LF","low fat",..: 3 5 3 5 3 5 5 3 5 5 ...
## $ Item_Visibility : num 0.016 0.0193 0.0168 0 0 ...
## $ Item_Type : Factor w/ 16 levels "Baking Goods",..: 5 15 11 7 10 1 14 14 6 6 ...
## $ Item_MRP : num 249.8 48.3 141.6 182.1 53.9 ...
## $ Outlet_Identifier : Factor w/ 10 levels "OUT010","OUT013",..: 10 4 10 1 2 4 2 6 8 3 ...
## $ Outlet_Establishment_Year: int 1999 2009 1999 1998 1987 2009 1987 1985 2002 2007 ...
## $ Outlet_Size : Factor w/ 4 levels "","High","Medium",..: 3 3 3 1 2 3 2 3 1 1 ...
## $ Outlet_Location_Type : Factor w/ 3 levels "Tier 1","Tier 2",..: 1 3 1 3 3 3 3 3 2 2 ...
## $ Outlet_Type : Factor w/ 4 levels "Grocery Store",..: 2 3 2 1 2 3 2 4 2 2 ...
## $ Item_Outlet_Sales : num 3735 443 2097 732 995 ...
# dim() returns the number of rows followed by the number of columns
dim(Bigmart)
## [1] 8523 12
# class() reports the type of an object: the whole table is a data frame ...
class(Bigmart)
## [1] "data.frame"
# ... while a single text column was read in as a factor
class(Bigmart$Item_Identifier)
## [1] "factor"
# Indexing a data frame uses [rows, columns]; an empty slot means "all".
# The next four forms are left commented out because they print a lot of data.
# Bigmart[]
# Bigmart[,]
#Bigmart[,2]
# Bigmart[1,]
# Row 1, columns 2 to 3
Bigmart[1,2:3]
## Item_Weight Item_Fat_Content
## 1 9.3 Low Fat
# Bigmart$Item_Identifier
# $ extracts one column as a vector, which is then indexed by position;
# here the second element
Bigmart$Item_Identifier[2]
## [1] DRC01
## 1559 Levels: DRA12 DRA24 DRA59 DRB01 DRB13 DRB24 DRB25 DRB48 DRC01 ... NCZ54
# Elements 2 to 4
Bigmart$Item_Identifier[2:4]
## [1] DRC01 FDN15 FDX07
## 1559 Levels: DRA12 DRA24 DRA59 DRB01 DRB13 DRB24 DRB25 DRB48 DRC01 ... NCZ54
# c() joins several ranges: elements 2 to 4 and 10 to 12
Bigmart$Item_Identifier[c(2:4, 10:12)]
## [1] DRC01 FDN15 FDX07 FDU28 FDY07 FDA03
## 1559 Levels: DRA12 DRA24 DRA59 DRB01 DRB13 DRB24 DRB25 DRB48 DRC01 ... NCZ54
# Same call as above, repeated
Bigmart$Item_Identifier[2:4]
## [1] DRC01 FDN15 FDX07
## 1559 Levels: DRA12 DRA24 DRA59 DRB01 DRB13 DRB24 DRB25 DRB48 DRC01 ... NCZ54
# fix(Bigmart)   (would open the data frame in R's editor; left commented out)
# Number of variables: length() of a data frame counts its columns
length(Bigmart)
## [1] 12
# Number of observations: length() of a single column counts its elements,
# missing values included
length(Bigmart$Item_Weight)
## [1] 8523
# Number of rows and number of columns, asked for directly
nrow(Bigmart)
## [1] 8523
ncol(Bigmart)
## [1] 12
# head() prints the first rows of the data frame (six by default)
head(Bigmart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 1 FDA15 9.300 Low Fat 0.016047
## 2 DRC01 5.920 Regular 0.019278
## 3 FDN15 17.500 Low Fat 0.016760
## 4 FDX07 19.200 Regular 0.000000
## 5 NCD19 8.930 Low Fat 0.000000
## 6 FDP36 10.395 Regular 0.000000
## Item_Type Item_MRP Outlet_Identifier
## 1 Dairy 249.8092 OUT049
## 2 Soft Drinks 48.2692 OUT018
## 3 Meat 141.6180 OUT049
## 4 Fruits and Vegetables 182.0950 OUT010
## 5 Household 53.8614 OUT013
## 6 Baking Goods 51.4008 OUT018
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 1 1999 Medium Tier 1
## 2 2009 Medium Tier 3
## 3 1999 Medium Tier 1
## 4 1998 Tier 3
## 5 1987 High Tier 3
## 6 2009 Medium Tier 3
## Outlet_Type Item_Outlet_Sales
## 1 Supermarket Type1 3735.1380
## 2 Supermarket Type2 443.4228
## 3 Supermarket Type1 2097.2700
## 4 Grocery Store 732.3800
## 5 Supermarket Type1 994.7052
## 6 Supermarket Type2 556.6088
# tail() prints the last rows of the data frame (six by default)
tail(Bigmart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 8518 FDF53 20.750 reg 0.083607
## 8519 FDF22 6.865 Low Fat 0.056783
## 8520 FDS36 8.380 Regular 0.046982
## 8521 NCJ29 10.600 Low Fat 0.035186
## 8522 FDN46 7.210 Regular 0.145221
## 8523 DRG01 14.800 Low Fat 0.044878
## Item_Type Item_MRP Outlet_Identifier
## 8518 Frozen Foods 178.8318 OUT046
## 8519 Snack Foods 214.5218 OUT013
## 8520 Baking Goods 108.1570 OUT045
## 8521 Health and Hygiene 85.1224 OUT035
## 8522 Snack Foods 103.1332 OUT018
## 8523 Soft Drinks 75.4670 OUT046
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 8518 1997 Small Tier 1
## 8519 1987 High Tier 3
## 8520 2002 Tier 2
## 8521 2004 Small Tier 2
## 8522 2009 Medium Tier 3
## 8523 1997 Small Tier 1
## Outlet_Type Item_Outlet_Sales
## 8518 Supermarket Type1 3608.636
## 8519 Supermarket Type1 2778.383
## 8520 Supermarket Type1 549.285
## 8521 Supermarket Type1 1193.114
## 8522 Supermarket Type2 1845.598
## 8523 Supermarket Type1 765.670
# n sets how many rows head() returns; here the first 10
head(Bigmart, n=10)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 1 FDA15 9.300 Low Fat 0.016047
## 2 DRC01 5.920 Regular 0.019278
## 3 FDN15 17.500 Low Fat 0.016760
## 4 FDX07 19.200 Regular 0.000000
## 5 NCD19 8.930 Low Fat 0.000000
## 6 FDP36 10.395 Regular 0.000000
## 7 FDO10 13.650 Regular 0.012741
## 8 FDP10 NA Low Fat 0.127470
## 9 FDH17 16.200 Regular 0.016687
## 10 FDU28 19.200 Regular 0.094450
## Item_Type Item_MRP Outlet_Identifier
## 1 Dairy 249.8092 OUT049
## 2 Soft Drinks 48.2692 OUT018
## 3 Meat 141.6180 OUT049
## 4 Fruits and Vegetables 182.0950 OUT010
## 5 Household 53.8614 OUT013
## 6 Baking Goods 51.4008 OUT018
## 7 Snack Foods 57.6588 OUT013
## 8 Snack Foods 107.7622 OUT027
## 9 Frozen Foods 96.9726 OUT045
## 10 Frozen Foods 187.8214 OUT017
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 1 1999 Medium Tier 1
## 2 2009 Medium Tier 3
## 3 1999 Medium Tier 1
## 4 1998 Tier 3
## 5 1987 High Tier 3
## 6 2009 Medium Tier 3
## 7 1987 High Tier 3
## 8 1985 Medium Tier 3
## 9 2002 Tier 2
## 10 2007 Tier 2
## Outlet_Type Item_Outlet_Sales
## 1 Supermarket Type1 3735.1380
## 2 Supermarket Type2 443.4228
## 3 Supermarket Type1 2097.2700
## 4 Grocery Store 732.3800
## 5 Supermarket Type1 994.7052
## 6 Supermarket Type2 556.6088
## 7 Supermarket Type1 343.5528
## 8 Supermarket Type3 4022.7640
## 9 Supermarket Type1 1076.5990
## 10 Supermarket Type1 4710.5350
# n sets how many rows tail() returns; here the last 10
tail(Bigmart, n=10)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 8514 FDH31 12.000 Regular 0.020407
## 8515 FDA01 15.000 Regular 0.054489
## 8516 FDH24 20.700 Low Fat 0.021518
## 8517 NCJ19 18.600 Low Fat 0.118661
## 8518 FDF53 20.750 reg 0.083607
## 8519 FDF22 6.865 Low Fat 0.056783
## 8520 FDS36 8.380 Regular 0.046982
## 8521 NCJ29 10.600 Low Fat 0.035186
## 8522 FDN46 7.210 Regular 0.145221
## 8523 DRG01 14.800 Low Fat 0.044878
## Item_Type Item_MRP Outlet_Identifier
## 8514 Meat 99.9042 OUT035
## 8515 Canned 57.5904 OUT045
## 8516 Baking Goods 157.5288 OUT018
## 8517 Others 58.7588 OUT018
## 8518 Frozen Foods 178.8318 OUT046
## 8519 Snack Foods 214.5218 OUT013
## 8520 Baking Goods 108.1570 OUT045
## 8521 Health and Hygiene 85.1224 OUT035
## 8522 Snack Foods 103.1332 OUT018
## 8523 Soft Drinks 75.4670 OUT046
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 8514 2004 Small Tier 2
## 8515 2002 Tier 2
## 8516 2009 Medium Tier 3
## 8517 2009 Medium Tier 3
## 8518 1997 Small Tier 1
## 8519 1987 High Tier 3
## 8520 2002 Tier 2
## 8521 2004 Small Tier 2
## 8522 2009 Medium Tier 3
## 8523 1997 Small Tier 1
## Outlet_Type Item_Outlet_Sales
## 8514 Supermarket Type1 595.2252
## 8515 Supermarket Type1 468.7232
## 8516 Supermarket Type2 1571.2880
## 8517 Supermarket Type2 858.8820
## 8518 Supermarket Type1 3608.6360
## 8519 Supermarket Type1 2778.3830
## 8520 Supermarket Type1 549.2850
## 8521 Supermarket Type1 1193.1140
## 8522 Supermarket Type2 1845.5980
## 8523 Supermarket Type1 765.6700
Extracting variables or observations from the data set and saving them as a new object
# Keep only the first 10 variables (drops the last 2 of the 12 columns)
Bigmart_10 <- Bigmart[, 1:10]
# fix(Bigmart_10)
# Keep only observations 1 to 100. Stored under its own name so that it does
# not overwrite the 10-variable data frame created just before it.
Bigmart_100 <- Bigmart[1:100, ]
# fix(Bigmart_100)
To combine two data sets by column, use cbind()
# cbind() places the data frames side by side; binding Bigmart to itself keeps
# the 8523 rows and doubles the columns to 24
bigmart_C<-cbind(Bigmart, Bigmart)
# fix(bigmart_C)
dim(bigmart_C)
## [1] 8523 24
To combine two data sets by row, use rbind()
# rbind() stacks the data frames; binding Bigmart to itself doubles the rows
# (2 x 8523 = 17046) and keeps the 12 columns
bigmart_r<-rbind(Bigmart, Bigmart)
# fix(bigmart_r)
dim(bigmart_r)
## [1] 17046 12
Random sampling: create a data frame with a random sample of 20 observations
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# sample_n() draws 20 rows at random, so a different set of rows is returned on
# each run (call set.seed() beforehand for a repeatable sample)
bigmart_20<-sample_n(Bigmart, 20)
dim(bigmart_20)
## [1] 20 12
Create a random sample containing 25% of the data
# sample_frac() draws a random fraction of the rows; size = 0.25 of the 8523
# rows gives the 2131 rows reported below
bigmart_25_per<-sample_frac(Bigmart, size = 0.25)
dim(bigmart_25_per)
## [1] 2131 12
Explore the scale variables-
# summary() gives min, quartiles, mean and max for numeric columns and level
# counts for factor columns; NA counts are listed where values are missing
summary(Bigmart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## FDG33 : 10 Min. : 4.555 LF : 316 Min. :0.00000
## FDW13 : 10 1st Qu.: 8.774 low fat: 112 1st Qu.:0.02699
## DRE49 : 9 Median :12.600 Low Fat:5089 Median :0.05393
## DRN47 : 9 Mean :12.858 reg : 117 Mean :0.06613
## FDD38 : 9 3rd Qu.:16.850 Regular:2889 3rd Qu.:0.09459
## FDF52 : 9 Max. :21.350 Max. :0.32839
## (Other):8467 NA's :1463
## Item_Type Item_MRP Outlet_Identifier
## Fruits and Vegetables:1232 Min. : 31.29 OUT027 : 935
## Snack Foods :1200 1st Qu.: 93.83 OUT013 : 932
## Household : 910 Median :143.01 OUT035 : 930
## Frozen Foods : 856 Mean :140.99 OUT046 : 930
## Dairy : 682 3rd Qu.:185.64 OUT049 : 930
## Canned : 649 Max. :266.89 OUT045 : 929
## (Other) :2994 (Other):2937
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## Min. :1985 :2410 Tier 1:2388
## 1st Qu.:1987 High : 932 Tier 2:2785
## Median :1999 Medium:2793 Tier 3:3350
## Mean :1998 Small :2388
## 3rd Qu.:2004
## Max. :2009
##
## Outlet_Type Item_Outlet_Sales
## Grocery Store :1083 Min. : 33.29
## Supermarket Type1:5577 1st Qu.: 834.25
## Supermarket Type2: 928 Median : 1794.33
## Supermarket Type3: 935 Mean : 2181.29
## 3rd Qu.: 3101.30
## Max. :13086.96
##
# The fat-content labels are inconsistent ("LF", "low fat", "Low Fat" and
# "reg", "Regular"), so they would need recoding before analysis
summary(Bigmart$Item_Fat_Content)
## LF low fat Low Fat reg Regular
## 316 112 5089 117 2889
# Item_Weight has 1463 missing values, reported in the NA's column
summary(Bigmart$Item_Weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.555 8.774 12.600 12.858 16.850 21.350 1463
# Better than summary(): descr() from the descr package lists more of the
# levels of each factor column
library(descr)
descr(Bigmart)
##
## Item_Identifier
## FDG33 FDW13 DRE49 DRN47 FDD38 FDF52 FDF56 FDG09 FDO19
## 10 10 9 9 9 9 9 9 9
## FDP25 FDQ40 FDT07 FDU12 FDV38 FDV60 FDW26 FDW49 FDX04
## 9 9 9 9 9 9 9 9 9
## FDX20 FDX31 NCB18 NCF42 NCI54 NCJ30 NCL31 NCQ06 NCY18
## 9 9 9 9 9 9 9 9 9
## DRA59 DRD25 DRF01 DRF03 DRF23 DRF27 DRI03 DRJ24 DRK12
## 8 8 8 8 8 8 8 8 8
## DRK35 DRP35 FDA04 FDA13 FDA15 FDA39 FDA44 FDA50 FDB17
## 8 8 8 8 8 8 8 8 8
## FDC14 FDD05 FDD29 FDE11 FDF04 FDF05 FDF16 FDF22 FDG24
## 8 8 8 8 8 8 8 8 8
## FDG38 FDG57 FDH10 FDH27 FDH28 FDH33 FDI22 FDI41 FDJ44
## 8 8 8 8 8 8 8 8 8
## FDJ55 FDJ58 FDK20 FDK58 FDL10 FDL20 FDL34 FDL58 FDN56
## 8 8 8 8 8 8 8 8 8
## FDO10 FDO32 FDO37 FDO52 FDP11 FDP28 FDQ39 FDR04 FDR23
## 8 8 8 8 8 8 8 8 8
## FDR43 FDR44 FDR46 FDR48 FDR52 FDR59 FDS33 FDS52 FDS55
## 8 8 8 8 8 8 8 8 8
## FDT21 FDT24 FDT32 FDT40 FDT49 FDT55 FDU13 FDU19 FDU23
## 8 8 8 8 8 8 8 8 8
## (Other)
## 7702
##
## Item_Weight
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.555 8.774 12.600 12.858 16.850 21.350 1463
##
## Item_Fat_Content
## LF low fat Low Fat reg Regular
## 316 112 5089 117 2889
##
## Item_Visibility
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02699 0.05393 0.06613 0.09459 0.32839
##
## Item_Type
## Baking Goods Breads Breakfast
## 648 251 110
## Canned Dairy Frozen Foods
## 649 682 856
## Fruits and Vegetables Hard Drinks Health and Hygiene
## 1232 214 520
## Household Meat Others
## 910 425 169
## Seafood Snack Foods Soft Drinks
## 64 1200 445
## Starchy Foods
## 148
##
## Item_MRP
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.29 93.83 143.01 140.99 185.64 266.89
##
## Outlet_Identifier
## OUT010 OUT013 OUT017 OUT018 OUT019 OUT027 OUT035 OUT045 OUT046 OUT049
## 555 932 926 928 528 935 930 929 930 930
##
## Outlet_Establishment_Year
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1985 1987 1999 1998 2004 2009
##
## Outlet_Size
## High Medium Small
## 2410 932 2793 2388
##
## Outlet_Location_Type
## Tier 1 Tier 2 Tier 3
## 2388 2785 3350
##
## Outlet_Type
## Grocery Store Supermarket Type1 Supermarket Type2 Supermarket Type3
## 1083 5577 928 935
##
## Item_Outlet_Sales
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.29 834.25 1794.33 2181.29 3101.30 13086.96
library(pastecs)
## Warning: package 'pastecs' was built under R version 3.5.3
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# stat.desc() (pastecs) reports descriptive statistics for the numeric columns;
# the non-numeric columns show NA throughout
stat.desc(Bigmart)
## Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## nbr.val NA 7.060000e+03 NA 8.523000e+03
## nbr.null NA 0.000000e+00 NA 5.260000e+02
## nbr.na NA 1.463000e+03 NA 0.000000e+00
## min NA 4.555000e+00 NA 0.000000e+00
## max NA 2.135000e+01 NA 3.283910e-01
## range NA 1.679500e+01 NA 3.283910e-01
## sum NA 9.077498e+04 NA 5.636433e+02
## median NA 1.260000e+01 NA 5.393100e-02
## mean NA 1.285765e+01 NA 6.613203e-02
## SE.mean NA 5.526358e-02 NA 5.589013e-04
## CI.mean NA 1.083332e-01 NA 1.095582e-03
## var NA 2.156169e+01 NA 2.662335e-03
## std.dev NA 4.643456e+00 NA 5.159782e-02
## coef.var NA 3.611436e-01 NA 7.802244e-01
## Item_Type Item_MRP Outlet_Identifier
## nbr.val NA 8.523000e+03 NA
## nbr.null NA 0.000000e+00 NA
## nbr.na NA 0.000000e+00 NA
## min NA 3.129000e+01 NA
## max NA 2.668884e+02 NA
## range NA 2.355984e+02 NA
## sum NA 1.201681e+06 NA
## median NA 1.430128e+02 NA
## mean NA 1.409928e+02 NA
## SE.mean NA 6.745559e-01 NA
## CI.mean NA 1.322293e+00 NA
## var NA 3.878184e+03 NA
## std.dev NA 6.227507e+01 NA
## coef.var NA 4.416897e-01 NA
## Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## nbr.val 8.523000e+03 NA NA
## nbr.null 0.000000e+00 NA NA
## nbr.na 0.000000e+00 NA NA
## min 1.985000e+03 NA NA
## max 2.009000e+03 NA NA
## range 2.400000e+01 NA NA
## sum 1.702752e+07 NA NA
## median 1.999000e+03 NA NA
## mean 1.997832e+03 NA NA
## SE.mean 9.068189e-02 NA NA
## CI.mean 1.777585e-01 NA NA
## var 7.008637e+01 NA NA
## std.dev 8.371760e+00 NA NA
## coef.var 4.190423e-03 NA NA
## Outlet_Type Item_Outlet_Sales
## nbr.val NA 8.523000e+03
## nbr.null NA 0.000000e+00
## nbr.na NA 0.000000e+00
## min NA 3.329000e+01
## max NA 1.308696e+04
## range NA 1.305367e+04
## sum NA 1.859113e+07
## median NA 1.794331e+03
## mean NA 2.181289e+03
## SE.mean NA 1.848460e+01
## CI.mean NA 3.623429e+01
## var NA 2.912141e+06
## std.dev NA 1.706500e+03
## coef.var NA 7.823354e-01
library(psych)
# describe() (psych) also reports trimmed mean, mad, skew, kurtosis and se.
# Rows marked with * in the output are the factor columns, summarised through
# their integer level codes, so those figures are not meaningful statistics.
describe(Bigmart)
## vars n mean sd median trimmed
## Item_Identifier* 1 8523 780.71 449.22 784.00 781.25
## Item_Weight 2 7060 12.86 4.64 12.60 12.80
## Item_Fat_Content* 3 8523 3.60 1.08 3.00 3.61
## Item_Visibility 4 8523 0.07 0.05 0.05 0.06
## Item_Type* 5 8523 8.23 4.21 7.00 8.27
## Item_MRP 6 8523 140.99 62.28 143.01 139.70
## Outlet_Identifier* 7 8523 5.72 2.84 6.00 5.73
## Outlet_Establishment_Year 8 8523 1997.83 8.37 1999.00 1998.04
## Outlet_Size* 9 8523 2.61 1.17 3.00 2.63
## Outlet_Location_Type* 10 8523 2.11 0.81 2.00 2.14
## Outlet_Type* 11 8523 2.20 0.80 2.00 2.13
## Item_Outlet_Sales 12 8523 2181.29 1706.50 1794.33 1971.33
## mad min max range skew kurtosis
## Item_Identifier* 572.28 1.00 1559.00 1558.00 -0.01 -1.20
## Item_Weight 6.08 4.55 21.35 16.80 0.08 -1.23
## Item_Fat_Content* 0.00 1.00 5.00 4.00 0.06 -0.68
## Item_Visibility 0.05 0.00 0.33 0.33 1.17 1.68
## Item_Type* 4.45 1.00 16.00 15.00 0.10 -0.97
## Item_MRP 68.26 31.29 266.89 235.60 0.13 -0.89
## Outlet_Identifier* 4.45 1.00 10.00 9.00 -0.06 -1.26
## Outlet_Establishment_Year 7.41 1985.00 2009.00 24.00 -0.40 -1.21
## Outlet_Size* 1.48 1.00 4.00 3.00 -0.26 -1.41
## Outlet_Location_Type* 1.48 1.00 3.00 2.00 -0.21 -1.46
## Outlet_Type* 0.00 1.00 4.00 3.00 0.93 0.62
## Item_Outlet_Sales 1604.06 33.29 13086.96 13053.67 1.18 1.61
## se
## Item_Identifier* 4.87
## Item_Weight 0.06
## Item_Fat_Content* 0.01
## Item_Visibility 0.00
## Item_Type* 0.05
## Item_MRP 0.67
## Outlet_Identifier* 0.03
## Outlet_Establishment_Year 0.09
## Outlet_Size* 0.01
## Outlet_Location_Type* 0.01
## Outlet_Type* 0.01
## Item_Outlet_Sales 18.48
# describe() on a single column; n is 7060 in the output because the 1463
# missing weights are left out
describe(Bigmart$Item_Weight)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 7060 12.86 4.64 12.6 12.8 6.08 4.55 21.35 16.8 0.08 -1.23
## se
## X1 0.06
# skim() comes from the skimr package, which must be attached before use
library(skimr)
# R names are case-sensitive: the data frame imported with read_xls() is
# Bigmart1, not bigmart1
skim(Bigmart1)
library(DataExplorer)
# create_report() builds a data-profiling report for the data frame
create_report(Bigmart1)
Note: A few lines of code are intentionally not run (they are commented out), for example:
# Bigmart
# Bigmart[,]
# Bigmart[2,]
and so on
No comments:
Post a Comment