Learn R: June 2020

Saturday, June 20, 2020

Basics of R Session 5.5 - Outliers

# Outlier detection walkthrough on the HB column (plus AGE, TLC and PLATELET
# for the regression at the end). Each section shows a different technique;
# results are printed or plotted for interactive inspection.
#
# Calls into `outliers` and `psych` are namespace-qualified because both
# packages export a function named `outlier()`. Without the prefix, which one
# runs depends on package attach order, so re-running the script in the same
# session would silently switch implementations.

options(scipen = 999) # turn off scientific notation in printed output
data1 <- read.csv("file:///C:/Users/LENOVO/Desktop/Missing/outliers sumeer.csv")

#-----------------------#
library(outliers)

library(car)
boxplot(data1$AGE)

# Convert HB to standard (z-score) form.
# scale() returns a one-column matrix; as.numeric() flattens it so that
# HB_scale is stored as an ordinary numeric column of the data frame.
data1$HB_scale <- as.numeric(scale(data1$HB, center = TRUE, scale = TRUE))

summary(data1$HB_scale)
plot(data1$HB_scale)

data1$HB_scale
# Row positions whose standardised HB lies at or beyond +/- 1.96
which(data1$HB_scale >= 1.96)
which(data1$HB_scale <= -1.96)

# Chi-square test for a single outlier, and the most extreme HB value
outliers::chisq.out.test(data1$HB)
outliers::outlier(data1$HB)

# Inter-quartile range
# na.rm = TRUE so that a missing HB value does not abort these summaries.
IQR(data1$HB, na.rm = TRUE)
quantile(data1$HB, na.rm = TRUE)

# Score-based flags at the 95% level, one result per method.
# The original assigned all three to `score1`, so only the last one survived;
# each now has its own name.
score_chisq <- outliers::scores(data1$HB, type = "chisq", prob = 0.95)
score_z <- outliers::scores(data1$HB, type = "z", prob = 0.95)
score_t <- outliers::scores(data1$HB, type = "t", prob = 0.95)
# `score1` is kept for compatibility: it ends up holding the t-based result,
# exactly as it did before.
score1 <- score_t

# Mahalanobis distance
library(psych)
# Reduce the data to only those variables which are required.
# NOTE(review): columns 2 and 3 are selected by position; confirm they are the
# intended variables for this file.
data2 <- data1[, 2:3]
psych::outlier(data2, plot = TRUE)

# Regression residuals to find outliers

reg1 <- lm(AGE ~ HB + TLC + PLATELET, data = data1)
plot(reg1) # diagnostic plots for the fitted model

Tuesday, June 16, 2020

Basics of R Session 5.4 - Missing values

# Missing-value walkthrough: measure how much of the data is missing, then
# fill it in with Amelia (multiple imputation) and Hmisc (simple and
# regression-based imputation). Results are printed for interactive inspection.

data1 <- read.csv("file:///C:/Users/LENOVO/Desktop/Missing/missing1.1.csv")

library(descr)
descr(data1)
descr(data1$v1)

# TRUE/FALSE map marking every missing cell in the data
is.na(data1)

# Total number of missing cells
sum(is.na(data1))
# Share of all cells that are missing (a proportion between 0 and 1, not a
# percentage); the mean of the TRUE/FALSE map equals missing cells / all cells
mean(is.na(data1))

# Share of missing values within a single variable (again a proportion)
mean(is.na(data1$v1))

library(Amelia)
# Opens the Amelia GUI; used here only to look at the missing-data pattern.
# Note: if any two variables are perfectly collinear, it will not work.
AmeliaView()

# Multiple imputation producing m = 3 completed data sets.
# NOTE(review): the original comment called m = 3 the "default"; Amelia's
# documented default is m = 5.
data1_impute <- amelia(data1, m = 3)

# Keep the first imputed data set as data1_impute_1, then inspect it
data1_impute_1 <- data1_impute$imputations[[1]]
data1_impute_1 # imputed data set 1
data1_impute_1$v1 # column v1 of imputed data set 1
str(data1_impute_1)

# Arguments that tell amelia() how to treat particular columns:
#   idvars - column numbers or names of identification variables
#   noms   - column numbers or names of nominal variables
#   ords   - column numbers or names of variables to be treated as ordinal

data2_impute <- amelia(
  data1,
  m = 3,
  idvars = 1,
  noms = c(11, 12, 13),
  ords = 14
)

data2_impute$imputations[[1]]

# Amelia uses a bootstrap EM (Expectation Maximization) algorithm

#-------------------------#

library(Hmisc)

# Single-value imputation of v1: the second argument is the function (or the
# string "random") used to produce the fill-in value(s)
impute(data1$v1, mean)
impute(data1$v1, median)
impute(data1$v1, min)
impute(data1$v1, max)
impute(data1$v1, "random")

# Regression-based multiple imputation with three imputations
impute_data1_2 <- aregImpute(
  v1 ~ v2,
  data = data1,
  n.impute = 3,
  type = "regression"
)
impute_data1_2$imputed$v2 # imputed values stored for v2

# There are many more methods, which you can explore yourself