Thursday, January 17, 2019

Basics of R Session 5.2 - Missing values

Basics of R Session 5.1 - Library(dplyr)

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Import the data from C:/Users/LENOVO/Desktop/MBAdata.csv.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads text columns
# as character by default; the factor columns and levels shown in the output
# below rely on the older (R 3.x) behaviour.

mbaper <- read.csv("C:/Users/LENOVO/Desktop/MBAdata.csv",
                   stringsAsFactors = TRUE)

# str() is the base-R way to inspect the structure of the data:
# the number of rows, and each column's name, type and first few values.
str(mbaper)
## 'data.frame':    273 obs. of  19 variables:
##  $ Age_in_years_completed                                          : int  22 26 24 22 22 26 22 20 22 25 ...
##  $ Percentage_in_10_Class                                          : num  91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
##  $ Percentage_in_12_Class                                          : num  91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
##  $ Percentage_in_Under_Graduate                                    : num  70 72.7 60.3 76.5 75.4 ...
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
# fix(mbaper) is left commented out (it opens an interactive data editor).
# glimpse() is dplyr's counterpart to str(): one line per column with its type.
glimpse(mbaper)
## Observations: 273
## Variables: 19
## $ Age_in_years_completed                                           <int> ...
## $ Percentage_in_10_Class                                           <dbl> ...
## $ Percentage_in_12_Class                                           <dbl> ...
## $ Percentage_in_Under_Graduate                                     <dbl> ...
## $ Gender                                                           <fct> ...
## $ STATE                                                            <fct> ...
## $ Previous_Degree                                                  <fct> ...
## $ Fathers_qualification                                            <fct> ...
## $ Mothers_qualification                                            <fct> ...
## $ Fathers_occupation                                               <fct> ...
## $ Mothers_occupation                                               <fct> ...
## $ Marital_status                                                   <fct> ...
## $ Place_you_belong_to                                              <fct> ...
## $ Total_Family_Income_per_annum                                    <fct> ...
## $ Funding_for_the_MBA_Program                                      <fct> ...
## $ Work_Experience                                                  <fct> ...
## $ Career_options_after_MBA                                         <fct> ...
## $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit <fct> ...
## $ perceivedscorecat                                                <fct> ...

Adding or removing variables (columns) from the data frame with select()

# Keep only the age column
mbaper_age <- select(mbaper, Age_in_years_completed)
str(mbaper_age)
## 'data.frame':    273 obs. of  1 variable:
##  $ Age_in_years_completed: int  22 26 24 22 22 26 22 20 22 25 ...
# A leading minus drops the column: everything except age is kept
mbaper_age <- select(mbaper, -Age_in_years_completed)
str(mbaper_age)
## 'data.frame':    273 obs. of  18 variables:
##  $ Percentage_in_10_Class                                          : num  91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
##  $ Percentage_in_12_Class                                          : num  91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
##  $ Percentage_in_Under_Graduate                                    : num  70 72.7 60.3 76.5 75.4 ...
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
# first:last keeps the contiguous range of columns from age through
# mother's qualification
mbaper_age <- select(mbaper, Age_in_years_completed:Mothers_qualification)
str(mbaper_age)
## 'data.frame':    273 obs. of  9 variables:
##  $ Age_in_years_completed      : int  22 26 24 22 22 26 22 20 22 25 ...
##  $ Percentage_in_10_Class      : num  91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
##  $ Percentage_in_12_Class      : num  91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
##  $ Percentage_in_Under_Graduate: num  70 72.7 60.3 76.5 75.4 ...
##  $ Gender                      : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
##  $ STATE                       : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Previous_Degree             : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
##  $ Fathers_qualification       : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
##  $ Mothers_qualification       : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...

To subset rows based on one or more conditions, use filter(). filter() works on cases (rows), whereas select() works on variables (columns).

Syntax: filter(dataset, variable == value)

# Rows whose class-10 percentage equals 75. The column is a double, so the
# comparison uses near() (tolerance-based) rather than ==.
mbaper_age <- filter(mbaper, near(Percentage_in_10_Class, 75))
str(mbaper_age$Percentage_in_10_Class)
##  num [1:4] 75 75 75 75
# Rows with a class-10 percentage below 75
mbaper_age <- filter(mbaper, Percentage_in_10_Class < 75)
str(mbaper_age$Percentage_in_10_Class)
##  num [1:43] 71 70 72.2 67 74 66.4 72 69.6 72 69.2 ...
# Rows with a class-10 percentage above 75
mbaper_age <- filter(mbaper, Percentage_in_10_Class > 75)
str(mbaper_age$Percentage_in_10_Class)
##  num [1:226] 91.2 77 79.8 82 95 86.4 91.2 91.4 77.6 87.4 ...
#-------------------------------------------------------#

specific category from a categorical variable

# Keep only the rows whose STATE is "GOA"
mbaper_zone <- filter(mbaper, STATE == "GOA")
str(mbaper_zone$STATE)
##  Factor w/ 9 levels "Central Zone",..: 3
# A single column is a plain vector, so dim() is always NULL;
# length() gives the number of observations.
length(mbaper_zone$STATE)
## [1] 1
head(mbaper_zone$STATE)  # only one observation is there
## [1] GOA
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone

more than one category

Use a comma (,) between conditions for AND

# Comma-separated conditions are combined with AND. A row's STATE cannot be
# both zones at once, so this returns 0 rows.
mbaper_zone <- filter(
  mbaper,
  STATE == "North East Zone",
  STATE == "Central Zone"
)
str(mbaper_zone)
## 'data.frame':    0 obs. of  19 variables:
##  $ Age_in_years_completed                                          : int 
##  $ Percentage_in_10_Class                                          : num 
##  $ Percentage_in_12_Class                                          : num 
##  $ Percentage_in_Under_Graduate                                    : num 
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..:
# dim() of a single column is always NULL; length() reports the 0 matching rows.
length(mbaper_zone$STATE)
## [1] 0
head(mbaper_zone$STATE)
## factor(0)
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
# Use | for OR: keep rows that belong to either zone
mbaper_zone <- filter(
  mbaper,
  STATE == "North East Zone" | STATE == "Central Zone"
)

str(mbaper_zone)
## 'data.frame':    24 obs. of  19 variables:
##  $ Age_in_years_completed                                          : int  22 26 24 22 22 26 22 20 22 25 ...
##  $ Percentage_in_10_Class                                          : num  91.2 77 79.8 82 95 86.4 91.2 91.4 71 77.6 ...
##  $ Percentage_in_12_Class                                          : num  91.2 68 61.6 70 78.6 80.4 80 77 74.8 82.6 ...
##  $ Percentage_in_Under_Graduate                                    : num  70 72.7 60.3 76.5 75.4 ...
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 1 2 2 ...
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 2 3 3 3 3 3 2 2 3 3 ...
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 5 7 7 9 9 9 9 7 9 ...
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 2 7 7 7 5 7 7 7 ...
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 4 2 4 4 4 7 6 6 7 7 ...
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1 2 1 1 2 1 1 1 ...
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 3 3 3 2 3 2 2 3 3 2 ...
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 2 6 6 3 1 4 6 6 ...
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 3 1 2 3 1 1 4 1 2 3 ...
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 4 6 6 6 2 6 6 6 6 ...
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 2 5 3 5 3 ...
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..: 3 2 2 2 2 1 2 2 2 2 ...
# dim() of a single column is always NULL; length() reports the 24 matching rows.
length(mbaper_zone$STATE)
## [1] 24
head(mbaper_zone$STATE)
## [1] Central Zone Central Zone Central Zone Central Zone Central Zone
## [6] Central Zone
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone

Use | for OR (here across two different variables). The category labels must be typed exactly as they appear in the data.

# OR across two different variables: rows from the North East Zone,
# rows with a Commerce degree, or both
mbaper_zone <- filter(
  mbaper,
  STATE == "North East Zone" | Previous_Degree == "Commerce"
)

str(mbaper_zone)
## 'data.frame':    102 obs. of  19 variables:
##  $ Age_in_years_completed                                          : int  22 22 20 23 22 22 22 22 21 21 ...
##  $ Percentage_in_10_Class                                          : num  91.2 91.2 91.4 67 74 85.5 83 76 85 92.4 ...
##  $ Percentage_in_12_Class                                          : num  91.2 80 77 62 61 84.6 82 75 88 89 ...
##  $ Percentage_in_Under_Graduate                                    : num  70 67 65 67 66 86.9 75.6 69 79 77 ...
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 1 2 1 1 1 ...
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 9 9 9 7 9 9 9 9 9 9 ...
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 5 5 7 5 7 5 6 7 7 7 ...
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 4 6 6 6 6 6 6 6 6 6 ...
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 2 2 1 1 2 2 1 1 1 1 ...
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 3 2 3 3 3 1 3 2 3 3 ...
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 4 1 4 1 6 6 5 5 1 4 ...
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 3 4 1 3 4 3 5 1 2 3 ...
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 6 6 1 6 5 1 6 6 6 ...
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 3 3 3 3 3 3 5 3 3 2 ...
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 5 3 3 5 5 2 5 3 2 ...
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..: 3 2 2 1 1 2 2 2 2 2 ...
# dim() of a single column is always NULL; length() reports the 102 matching rows.
length(mbaper_zone$STATE)
## [1] 102
head(mbaper_zone$STATE)
## [1] Central Zone Central Zone Central Zone Central Zone Central Zone
## [6] East Zone   
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
# AND across two variables: rows from the North East Zone that also have
# a Commerce degree
mbaper_zone <- filter(
  mbaper,
  STATE == "North East Zone",
  Previous_Degree == "Commerce"
)

str(mbaper_zone)
## 'data.frame':    3 obs. of  19 variables:
##  $ Age_in_years_completed                                          : int  23 21 25
##  $ Percentage_in_10_Class                                          : num  75 68 71.7
##  $ Percentage_in_12_Class                                          : num  74 78 90
##  $ Percentage_in_Under_Graduate                                    : num  67 65 73
##  $ Gender                                                          : Factor w/ 2 levels "Female","Male": 2 1 1
##  $ STATE                                                           : Factor w/ 9 levels "Central Zone",..: 6 6 6
##  $ Previous_Degree                                                 : Factor w/ 6 levels "Arts","Commerce",..: 2 2 2
##  $ Fathers_qualification                                           : Factor w/ 9 levels "CA/CS","DIPLOMA",..: 2 9 9
##  $ Mothers_qualification                                           : Factor w/ 7 levels "DIPLOMA","HSC",..: 1 6 7
##  $ Fathers_occupation                                              : Factor w/ 7 levels "Farming","Not Alive",..: 4 6 7
##  $ Mothers_occupation                                              : Factor w/ 4 levels "House wife","Professional Job",..: 2 1 1
##  $ Marital_status                                                  : Factor w/ 2 levels "Married","Single": 2 2 2
##  $ Place_you_belong_to                                             : Factor w/ 3 levels "Rural","Semi Urban",..: 3 1 3
##  $ Total_Family_Income_per_annum                                   : Factor w/ 6 levels "0-3 Lakh","10-15 lakh",..: 6 2 2
##  $ Funding_for_the_MBA_Program                                     : Factor w/ 6 levels "Family","Loan",..: 4 1 1
##  $ Work_Experience                                                 : Factor w/ 6 levels "1-2 year","2-3 year",..: 6 6 2
##  $ Career_options_after_MBA                                        : Factor w/ 5 levels "Family Business",..: 3 4 3
##  $ Alternate_Career_Option_after_MBA_if_not_selected_through_Instit: Factor w/ 5 levels "Family Business",..: 3 4 3
##  $ perceivedscorecat                                               : Factor w/ 3 levels "desired skills",..: 2 2 1
# dim() of a single column is always NULL; length() reports the 3 matching rows.
length(mbaper_zone$STATE)
## [1] 3
We can use ==, >, >=, &, |, !, xor(), between(), near() inside filter()
head(mbaper_zone$STATE)
## [1] North East Zone North East Zone North East Zone
## 9 Levels: Central Zone East Zone GOA GUJARAT Nepal ... west Zone
#-------------------------------------#

Select columns and then subset data based on a condition, combination of rows and variables

# Start from the age column only
mbaper_age <- select(mbaper, Age_in_years_completed)

str(mbaper_age)
## 'data.frame':    273 obs. of  1 variable:
##  $ Age_in_years_completed: int  22 26 24 22 22 26 22 20 22 25 ...
dim(mbaper_age)
## [1] 273   1
head(mbaper_age)
##   Age_in_years_completed
## 1                     22
## 2                     26
## 3                     24
## 4                     22
## 5                     22
## 6                     26
# Nested form: select() runs first, then filter() keeps the rows with age 25
mbaper_age <- filter(
  select(mbaper, Age_in_years_completed),
  Age_in_years_completed == 25
)

str(mbaper_age)
## 'data.frame':    22 obs. of  1 variable:
##  $ Age_in_years_completed: int  25 25 25 25 25 25 25 25 25 25 ...
dim(mbaper_age)
## [1] 22  1
head(mbaper_age)
##   Age_in_years_completed
## 1                     25
## 2                     25
## 3                     25
## 4                     25
## 5                     25
## 6                     25
Use of the pipe operator (%>%)
# x %>% f() passes x as the first argument of f()
1:8 %>% sum()
## [1] 36
1:8 %>% sum() %>% sqrt()
## [1] 6
# %>% binds tighter than *, so the whole chain is evaluated before multiplying
(1:8 %>% sum() %>% sqrt() %>% sum()) * 10
## [1] 60
#---------------------------------#

# The same select-then-filter written as a pipeline, here for age 22
mbaper_age <- mbaper %>%
  select(Age_in_years_completed) %>%
  filter(Age_in_years_completed == 22)

str(mbaper_age)
## 'data.frame':    70 obs. of  1 variable:
##  $ Age_in_years_completed: int  22 22 22 22 22 22 22 22 22 22 ...
dim(mbaper_age)
## [1] 70  1
head(mbaper_age)
##   Age_in_years_completed
## 1                     22
## 2                     22
## 3                     22
## 4                     22
## 5                     22
## 6                     22
# Sort by age in ascending order
mbaper_age <- mbaper %>%
  select(Age_in_years_completed) %>%
  arrange(Age_in_years_completed)

# Sort by age in descending order. desc() is used rather than unary minus
# because it states the intent and also works for non-numeric columns.
mbaper_age <- mbaper %>%
  select(Age_in_years_completed) %>%
  arrange(desc(Age_in_years_completed))
We can also use starts_with(), ends_with() and contains() inside select(), for example: select(mbaper, starts_with("percentage"))