[This article was first published on R on Datentrang, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

The goal is to compare a few algorithms for missing imputation when used before k-means clustering is performed. For the latter we use the same algorithm as in ClustImpute to ensure that only the computation time of the imputation is compared. In a nutshell, we’ll se that ClustImpute scales like a random imputation and hence is much faster than a pre-processing with MICE / MissRanger. This is not surprising since ClustImpute basically runs a fixed number of random imputations conditional on the current cluster assignment. Below is the code that has been used.

First we’ll load all relevant packages

Then we define a function that creates us an artificial data set with variable size $$n$$.

create_random_data <- function(n,nr_other_vars=4,seedvar=739) {
n <- round(n/3)*3
set.seed(seedvar)
mat <- matrix(rnorm(nr_other_vars*n),n,nr_other_vars)
me<-4 # mean
x <- c(rnorm(n/3,me/2,1),rnorm(2*n/3,-me/2,1))
y <- c(rnorm(n/3,0,1),rnorm(n/3,me,1),rnorm(n/3,-me,1))
true_clust <- c(rep(1,n/3),rep(2,n/3),rep(3,n/3)) # true clusters
dat <- cbind(mat,x,y)
dat<- as.data.frame(scale(dat)) # scaling

return(list(data=dat,true_clusters=true_clust))
}

The parametrization $$nr_iter_other$$ of the clustering for external imputation strategies is set so that we have a comparable number of steps after imputation (steps of Clustimpute with a weight < 1 are considered the burn-in phase since only afterwards the distribution is considered credible). Due to a very slow performance we decided to drop MissForrest and only consider MissRanger. The results below are for a missing rate of 20%, though this could be changed easily. For each data set, imputation and clustering are repeated 5 times with a differnt seed each time.

### for various n and p
N <- 2^(0:4)*400
p <- .2
### Parameters
# ClustImpute
nr_iter <- 14 # iterations of procedure
n_end <- 10 # step until convergence of weight function to 1
nr_cluster <- 3 # number of clusters
c_steps <- 50 # numer of cluster steps per iteration
# Clustering based on imputation
nr_iter_other <- (nr_iter-n_end) * c_steps # comparable number of steps "after" imputation

param_times <- 5 # how often to compute the benchmark

results <- list()
count <- 0

for (n in N) {
count <- count + 1
# Create random data with missings
random_data <- create_random_data(n)
dat <- random_data$data dat_with_miss <- miss_sim(dat,p,seed_nr=123) ### Benchmark time and calculate rand index rand_ClustImpute <- 0 rand_missRanger <- 0 rand_RandomImp <- 0 rand_mice <- 0 rand_amelia <- 0 # Use a different seed each time mbm <- microbenchmark("ClustImpute" = { res <- ClustImpute(dat_with_miss,nr_cluster=nr_cluster, nr_iter=nr_iter, c_steps=c_steps, n_end=n_end, seed_nr = random_seed) rand_ClustImpute <- rand_ClustImpute + external_validation(random_data$true_clusters, res$clusters) }, "RandomImp"={ dat_random_imp <- dat_with_miss for (j in 1:dim(dat)[2]) { dat_random_imp[,j] <- impute(dat_random_imp[,j],fun="random") } cl_RandomImp <- KMeans_arma(data=dat_random_imp,clusters=3,n_iter=nr_iter_other,seed=random_seed) pred_RandomImp <- predict_KMeans(dat_random_imp,cl_RandomImp) class(pred_RandomImp) <- "numeric" rand_RandomImp <- rand_RandomImp + external_validation(random_data$true_clusters, pred_RandomImp)
},
"MissRanger"={
imp_missRanger <- missRanger(dat_with_miss,pmm.k = 3)
cl_missRanger <- KMeans_arma(data=imp_missRanger,clusters=3,n_iter=nr_iter_other,seed=random_seed)
pred_missRanger <- predict_KMeans(imp_missRanger,cl_missRanger)
class(pred_missRanger) <- "numeric"
rand_missRanger <- rand_missRanger + external_validation(random_data$true_clusters, pred_missRanger) }, "MICE"={ dat_mice <- mice(dat_with_miss,m=1,maxit=50,meth='pmm') # single data set dat_mice <- complete(dat_mice) cl_mice <- KMeans_arma(data=dat_mice,clusters=3,n_iter=nr_iter_other,seed=random_seed) pred_mice <- predict_KMeans(dat_mice,cl_mice) class(pred_mice) <- "numeric" rand_mice <- rand_mice + external_validation(random_data$true_clusters, pred_mice)
}, "AMELIA"= {
dat_amelia <- amelia(dat_with_miss,m=1) # single data set
dat_amelia <- dat_amelia$imputations$imp1
cl_amelia <- KMeans_arma(data=dat_amelia,clusters=3,n_iter=nr_iter_other,seed=random_seed)
pred_amelia <- predict_KMeans(dat_amelia,cl_amelia)
class(pred_amelia) <- "numeric"
rand_amelia <- rand_amelia + external_validation(random_data$true_clusters, pred_amelia) } ,times = param_times, setup = {random_seed=round(runif(1)*1e5)}, unit = "s") # compute average rand index rand_ClustImpute <- rand_ClustImpute/param_times rand_missRanger <- rand_missRanger/param_times rand_RandomImp <- rand_RandomImp/param_times rand_mice <- rand_mice/param_times rand_amelia <- rand_amelia/param_times results$randIndex[[count]] <- c(ClustImpute=rand_ClustImpute,RandomImp=rand_RandomImp,#missForest=rand_missForest,
missRanger=rand_missRanger,MICE=rand_mice,AMELIA=rand_amelia)

results$benchmark[[count]] <- mbm mbm_median <- print(mbm)$median
results$benchmark_median[[count]] <- mbm_median } Below are the results for the four data sets (only differing in size). Clearly a reandom imputation is fastest, but the AMELIA package is only slightly slower. ClustImpute is 3rd and considerably faster than MICE and missRanger. plot_grid(autoplot(results$benchmark[[2]]),autoplot(results$benchmark[[3]]),autoplot(results$benchmark[[4]]),
autoplot(results$benchmark[[5]]), ncol=2,labels=sprintf("n = %s",N),label_size = 12,vjust=1) Let’s focus on the scalabiltiy here: Amelia and ClustImpute scale much better than MICE and missRanger. ClustImpute scales like a simple random imputation and similarly as AMELIA. data2plot <- data.frame(median=unlist(results$benchmark_median))
data2plot$method <- rep(Hmisc::Cs(ClustImpute,RandomImp,MissRanger,MICE,AMELIA),length(N)) data2plot$n <- rep(N,each=5)

# with shared legend
ps1 <- ggplot(data2plot,aes(x=n,y=median,color=method)) + geom_point() + theme_cowplot() + geom_smooth() +
xlab("Numer of observations n") + ylab("Median running time in seconds")
legend <- get_legend(ps1)
ps2 <- ggplot(data2plot,aes(x=n,y=median,color=method)) + geom_point() + theme_cowplot() + geom_smooth() +
scale_y_log10() + xlab("Numer of observations n") + ylab("Median running time in seconds") + theme(legend.position="none")
plot_grid(ps1 + theme(legend.position="none"),ps2,legend,nrow=1,rel_widths = c(3,3,1))

Of course, above plots only consider the running time. Below is a table of the rand indices comparing the resulting clusters with the true clusters. ClustImpute provides the highest numbers, although the other imputation methods (except random imputation) could provide better results if they are tuned.

randtbl <- data.frame(matrix(unlist(results$randIndex),nrow=length(results$randIndex), byrow=T))
colnames(randtbl) <- names(results$randIndex[[1]]) randtbl$N <- N
knitr::kable(x=randtbl)
ClustImpute RandomImp missRanger MICE AMELIA N
0.6784811 0.3162996 0.3031195 0.2920014 0.3529032 400
0.6518900 0.3149466 0.3107541 0.3186570 0.2839927 800
0.6895598 0.4122350 0.2648335 0.2978446 0.2051036 1600
0.6732008 0.3458818 0.3365066 0.4451816 0.4263087 3200
0.6672859 0.2768001 0.3134265 0.2249913 0.3039771 6400