Basics of R Session 5.1 - Library(dplyr)
Dr Manohar Kapse
19 March 2019
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Import the data from the location C:/Users/LENOVO/Desktop/MBAdata.csv.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer converts
# text columns to factors by default; the factor-based output shown below
# relies on that conversion.
mbaper <- read.csv(
  "C:/Users/LENOVO/Desktop/MBAdata.csv",
  stringsAsFactors = TRUE
)
# str() is the traditional (base R) way to inspect the data: it gives the structure of the data
str(mbaper)
## 'data.frame': 273 obs. of 19 variables:
## $ Age_in_years_completed : int 22 26 24 22 22 26 22 20 22 25 ...
## $ Percentage_in_10_Class : num 91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
## $ Percentage_in_12_Class : num 91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
## $ Percentage_in_Under_Graduate : num 70 72.7 60.3 76.5 75.4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
## $ STATE : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
## $ Marital_status : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
#fix(mbaper)
# glimpse() from dplyr lists the number of observations and variables,
# then one line per variable with its type
glimpse(mbaper)
## Observations: 273
## Variables: 19
## $ Age_in_years_completed <int> ...
## $ Percentage_in_10_Class <dbl> ...
## $ Percentage_in_12_Class <dbl> ...
## $ Percentage_in_Under_Graduate <dbl> ...
## $ Gender <fct> ...
## $ STATE <fct> ...
## $ Previous_Degree <fct> ...
## $ Fathers_qualification <fct> ...
## $ Mothers_qualification <fct> ...
## $ Fathers_occupation <fct> ...
## $ Mothers_occupation <fct> ...
## $ Marital_status <fct> ...
## $ Place_you_belong_to <fct> ...
## $ Total_Family_Income_per_annum <fct> ...
## $ Funding_for_the_MBA_Program <fct> ...
## $ Work_Experience <fct> ...
## $ Career_options_after_MBA <fct> ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit <fct> ...
## $ perceivedscorecat <fct> ...
Keeping or removing variables (columns) of the data set with select()
# Keep only the Age_in_years_completed column
mbaper_age<-select(mbaper, Age_in_years_completed)
str(mbaper_age)
## 'data.frame': 273 obs. of 1 variable:
## $ Age_in_years_completed: int 22 26 24 22 22 26 22 20 22 25 ...
# A minus sign drops the named column and keeps all the others
mbaper_age<-select(mbaper, -Age_in_years_completed)
str(mbaper_age)
## 'data.frame': 273 obs. of 18 variables:
## $ Percentage_in_10_Class : num 91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
## $ Percentage_in_12_Class : num 91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
## $ Percentage_in_Under_Graduate : num 70 72.7 60.3 76.5 75.4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
## $ STATE : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
## $ Marital_status : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
# first:last keeps every column from the first name through the last name
mbaper_age<-select(mbaper, Age_in_years_completed:Mothers_qualification)
str(mbaper_age)
## 'data.frame': 273 obs. of 9 variables:
## $ Age_in_years_completed : int 22 26 24 22 22 26 22 20 22 25 ...
## $ Percentage_in_10_Class : num 91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
## $ Percentage_in_12_Class : num 91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
## $ Percentage_in_Under_Graduate: num 70 72.7 60.3 76.5 75.4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
## $ STATE : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
To filter the data based on one or more conditions we can use the filter() function; filter() works on cases (rows).
Syntax: filter(dataset, variable == value)
mbaper_age<-filter(mbaper, Percentage_in_10_Class==75)
str(mbaper_age$Percentage_in_10_Class)
## num [1:4] 75 75 75 75
mbaper_age<-filter(mbaper, Percentage_in_10_Class<75)
str(mbaper_age$Percentage_in_10_Class)
## num [1:43] 71 70 72.2 67 74 66.4 72 69.6 72 69.2 ...
mbaper_age<-filter(mbaper, Percentage_in_10_Class>75)
str(mbaper_age$Percentage_in_10_Class)
## num [1:226] 91.2 77 79.8 82 95 86.4 91.2 91.4 77.6 87.4 ...
#-------------------------------------------------------#
Selecting a specific category of a categorical variable
mbaper_zone<-filter(mbaper, STATE =="GOA")
str(mbaper_zone$STATE)
## Factor w/ 9 levels "Central Zone",..: 3
# dim() of a single column (a vector) is always NULL; nrow() on the filtered
# data frame gives the number of matching observations.
nrow(mbaper_zone)
## [1] 1
head(mbaper_zone$STATE) # only one observation is there
## [1] GOA
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
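More than one category
A comma (,) between conditions means AND: a row must satisfy every condition. No row can have two different STATE values at once, so the example below returns 0 rows.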
mbaper_zone<-filter(mbaper, STATE=="North East Zone",STATE=="Central Zone")
str(mbaper_zone)
## 'data.frame': 0 obs. of 19 variables:
## $ Age_in_years_completed : int
## $ Percentage_in_10_Class : num
## $ Percentage_in_12_Class : num
## $ Percentage_in_Under_Graduate : num
## $ Gender : Factor w/ 2 levels "Female","Male":
## $ STATE : Factor w/ 9 levels "Central Zone",..:
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..:
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..:
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..:
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..:
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..:
## $ Marital_status : Factor w/ 2 levels "Married","Single":
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..:
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..:
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..:
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..:
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..:
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..:
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..:
# dim() of a single column (a vector) is always NULL; nrow() on the filtered
# data frame gives the number of matching observations.
nrow(mbaper_zone)
## [1] 0
head(mbaper_zone$STATE)
## factor(0)
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
# | for or
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | STATE=="Central Zone")
str(mbaper_zone)
## 'data.frame': 24 obs. of 19 variables:
## $ Age_in_years_completed : int 22 26 24 22 22 26 22 20 22 25 ...
## $ Percentage_in_10_Class : num 91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
## $ Percentage_in_12_Class : num 91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
## $ Percentage_in_Under_Graduate : num 70 72.7 60.3 76.5 75.4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
## $ STATE : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
## $ Marital_status : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
# dim() of a single column (a vector) is always NULL; nrow() on the filtered
# data frame gives the number of matching observations.
nrow(mbaper_zone)
## [1] 24
head(mbaper_zone$STATE)
## [1] Central Zone Central Zone Central Zone Central Zone Central Zone
## [6] Central Zone
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
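| means OR, and it can combine conditions on different variables. The category value must be written exactly as it appears in the data (same spelling, case and spaces).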
mbaper_zone<-filter(mbaper, STATE=="North East Zone" | Previous_Degree=="Commerce")
str(mbaper_zone)
## 'data.frame': 102 obs. of 19 variables:
## $ Age_in_years_completed : int 22 22 20 23 22 22 22 22 21 21 ...
## $ Percentage_in_10_Class : num 91.2 91.2 91.4 67 74 85.5 83 76 85 92.4 ...
## $ Percentage_in_12_Class : num 91.2 80 77 62 61 84.6 82 75 88 89 ...
## $ Percentage_in_Under_Graduate : num 70 67 65 67 66 86.9 75.6 69 79 77 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 1 2 1 1 1 ...
## $ STATE : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 9 9 7 9 9 9 9 9 9 ...
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 7 5 7 5 6 7 7 7 ...
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..: 4 6 6 6 6 6 6 6 6 6 ...
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..: 2 2 1 1 2 2 1 1 1 1 ...
## $ Marital_status : Factor w/ 2 levels "Married","Single": 2 2 2 2 2 2 2 2 2 2 ...
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..: 3 2 3 3 3 1 3 2 3 3 ...
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 4 1 6 6 5 5 1 4 ...
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..: 3 4 1 3 4 3 5 1 2 3 ...
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 6 6 1 6 5 1 6 6 6 ...
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 5 3 3 2 ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 5 3 3 5 5 2 5 3 2 ...
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..: 3 2 2 1 1 2 2 2 2 2 ...
# dim() of a single column (a vector) is always NULL; nrow() on the filtered
# data frame gives the number of matching observations.
nrow(mbaper_zone)
## [1] 102
head(mbaper_zone$STATE)
## [1] Central Zone Central Zone Central Zone Central Zone Central Zone
## [6] East Zone
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
mbaper_zone<-filter(mbaper, STATE=="North East Zone" , Previous_Degree=="Commerce")
str(mbaper_zone)
## 'data.frame': 3 obs. of 19 variables:
## $ Age_in_years_completed : int 23 21 25
## $ Percentage_in_10_Class : num 75 68 71.7
## $ Percentage_in_12_Class : num 74 78 90
## $ Percentage_in_Under_Graduate : num 67 65 73
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 1
## $ STATE : Factor w/ 9 levels "Central Zone",..: 6 6 6
## $ Previous_Degree : Factor w/ 6 levels "Arts","Commerce",..: 2 2 2
## $ Fathers_qualification : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 2 9 9
## $ Mothers_qualification : Factor w/ 7 levels "DIPLOMA","HSC",..: 1 6 7
## $ Fathers_occupation : Factor w/ 7 levels "Farming","Not Alive",..: 4 6 7
## $ Mothers_occupation : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1
## $ Marital_status : Factor w/ 2 levels "Married","Single": 2 2 2
## $ Place_you_belong_to : Factor w/ 3 levels "Rural","Semi Urban",..: 3 1 3
## $ Total_Family_Income_per_annum : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 6 2 2
## $ Funding_for_the_MBA_Program : Factor w/ 6 levels "Family","Loan",..: 4 1 1
## $ Work_Experience : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 6 2
## $ Career_options_after_MBA : Factor w/ 5 levels "Family Business",..: 3 4 3
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 4 3
## $ perceivedscorecat : Factor w/ 3 levels "desired skills",..: 2 2 1
# dim() of a single column (a vector) is always NULL; nrow() on the filtered
# data frame gives the number of matching observations.
nrow(mbaper_zone)
In filter() conditions we can use ==, >, >=, &, |, !, xor(), between(), near()
head(mbaper_zone$STATE)
## [1] North East Zone North East Zone North East Zone
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
#-------------------------------------#
Select columns and then subset the rows based on a condition (a combination of column selection and row filtering)
mbaper_age<-select(mbaper, Age_in_years_completed)
str(mbaper_age)
## 'data.frame': 273 obs. of 1 variable:
## $ Age_in_years_completed: int 22 26 24 22 22 26 22 20 22 25 ...
dim(mbaper_age)
## [1] 273 1
head(mbaper_age)
## Age_in_years_completed
## 1 22
## 2 26
## 3 24
## 4 22
## 5 22
## 6 26
# Nested form: select() runs first (inner call), then filter() keeps the rows
# of that one-column result where the age equals 25.
mbaper_age <- filter(
  select(mbaper, Age_in_years_completed),
  Age_in_years_completed == 25
)
str(mbaper_age)
## 'data.frame': 22 obs. of 1 variable:
## $ Age_in_years_completed: int 25 25 25 25 25 25 25 25 25 25 ...
dim(mbaper_age)
## [1] 22 1
head(mbaper_age)
## Age_in_years_completed
## 1 25
## 2 25
## 3 25
## 4 25
## 5 25
## 6 25
Use of the pipe operator (%>%): it passes the result on its left as the first argument of the function on its right
# sum(1:8)
1:8 %>% sum()
## [1] 36
# sqrt(sum(1:8))
1:8 %>% sum() %>% sqrt()
## [1] 6
# %>% binds tighter than *, so the whole pipeline is evaluated first and
# its result is then multiplied by 10.
(1:8 %>% sum() %>% sqrt() %>% sum()) * 10
## [1] 60
#---------------------------------#
# Same select-then-filter idea written as a pipeline: keep the age column,
# then keep the rows where the age equals 22.
mbaper_age <- mbaper %>%
  select(Age_in_years_completed) %>%
  filter(Age_in_years_completed == 22)
str(mbaper_age)
## 'data.frame': 70 obs. of 1 variable:
## $ Age_in_years_completed: int 22 22 22 22 22 22 22 22 22 22 ...
dim(mbaper_age)
## [1] 70 1
head(mbaper_age)
## Age_in_years_completed
## 1 22
## 2 22
## 3 22
## 4 22
## 5 22
## 6 22
# Sort by age in ascending order (arrange() sorts ascending by default).
mbaper_age <- select(mbaper, Age_in_years_completed) %>%
  arrange(Age_in_years_completed)
# Sort by age in descending order. desc() is the dplyr idiom and, unlike a
# unary minus, also works for non-numeric columns.
mbaper_age <- select(mbaper, Age_in_years_completed) %>%
  arrange(desc(Age_in_years_completed))
We can also use the helper functions starts_with(), ends_with() and contains() inside select(), for example: select(mbaper, starts_with("percentage"))
No comments:
Post a Comment