Tuesday, June 16, 2020

Basics of R Session 5.4 - Missing values

# Load the example data set that contains missing values.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of missing1.1.csv before running.
data1 <- read.csv("file:///C:/Users/LENOVO/Desktop/Missing/missing1.1.csv")

library(descr)
descr(data1)     # descriptive summary of the whole data frame
descr(data1$v1)  # descriptive summary of the single variable v1

# Logical matrix shaped like data1: TRUE marks a missing (NA) cell.
is.na(data1)

# Total number of missing cells in the data set.
sum(is.na(data1))

# Proportion (0-1) of missing cells in the whole data set; multiply by 100 for
# a percentage. Taking the mean of the logical matrix avoids the integer
# overflow (NA with a warning) that nrow(data1) * ncol(data1) produces on very
# large tables.
mean(is.na(data1))

# Proportion (0-1) of missing values in the single variable v1.
mean(is.na(data1$v1))

library(Amelia)

# Open Amelia's interactive GUI; used here only to look at the pattern of the
# missing data.
# Note: if there is perfect multicollinearity between any two variables, it
# will not work.
AmeliaView()

# Multiple imputation producing three imputed data sets (m = 3); all other
# arguments are left at their defaults.
data1_impute <- amelia(data1, m = 3)

# Keep the first imputed data set under its own name (data1_impute_1), then
# display it in full, display its v1 column, and inspect its structure.
data1_impute_1 <- data1_impute$imputations[[1]]
data1_impute_1
data1_impute_1$v1
str(data1_impute_1)

# Arguments of amelia() that describe the role of particular columns:
#   idvars - a vector of column numbers or column names that indicates
#            identification variables.
#   noms   - a vector of numbers or names indicating columns in the data that
#            are nominal variables.
#   ords   - a vector of numbers or names indicating columns in the data that
#            should be treated as ordinal variables.
data2_impute <- amelia(
  data1,
  m = 3,
  idvars = 1,
  noms = c(11, 12, 13),
  ords = 14
)

# First imputed data set from the second run.
data2_impute$imputations[[1]]

# Amelia uses a bootstrap EM (Expectation Maximization) algorithm.

#-------------------------#



library(Hmisc)

# Single-value imputation of v1: each call fills the missing entries using the
# given function of the observed values, or random draws for "random".
impute(data1$v1, fun = mean)
impute(data1$v1, fun = median)
impute(data1$v1, fun = min)
impute(data1$v1, fun = max)
impute(data1$v1, fun = "random")

# Regression-based multiple imputation (three imputations) on the model
# v1 ~ v2.
impute_data1_2 <- aregImpute(
  v1 ~ v2,
  data = data1,
  n.impute = 3,
  type = "regression"
)

# Imputed values stored for v2.
# NOTE(review): the formula's left-hand side is v1, yet the values displayed
# here are those for v2 - confirm which variable was meant.
impute_data1_2$imputed$v2

# There are many more imputation methods, which you can explore yourself.

No comments:

Post a Comment