Motivated by my young friend, HongMing Song, I found several handier ways to calculate aggregated statistics by group in R. They require loading additional packages (plyr, doBy, Hmisc, and gdata) and are extremely user-friendly. In terms of CPU time, the method with summarize() from Hmisc is as efficient as the 2nd method with by() introduced yesterday, while summaryBy() in the doBy package seems to be the slowest.

```r
> # METHOD 5: USING DDPLY()
> library(plyr)
> summ5 <- ddply(df, .(SELFEMPL, OWNRENT), summarize, INCOME = mean(INCOME), BAD = mean(BAD))
> print(summ5)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       0 2742.247 0.06896552
4        1       1 3487.910 0.05316973
>
> # METHOD 6: USING SUMMARYBY()
> library(doBy)
> summ6 <- summaryBy(INCOME + BAD ~ SELFEMPL + OWNRENT, data = df, fun = c(mean), keep.names = TRUE)
> print(summ6)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       0 2742.247 0.06896552
4        1       1 3487.910 0.05316973
>
> # METHOD 7: USING SUMMARIZE()
> library(Hmisc)
> summ7 <- summarize(df[c('INCOME', 'BAD', 'SELFEMPL', 'OWNRENT')], df[c('SELFEMPL', 'OWNRENT')], colMeans, stat.name = NULL)
> print(summ7)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       0 2742.247 0.06896552
4        1       1 3487.910 0.05316973
>
> # METHOD 8: USING FRAMEAPPLY()
> library(gdata)
> summ8 <- frameApply(df, by = c('SELFEMPL', 'OWNRENT'), on = c('INCOME', 'BAD'), fun = colMeans)
> rownames(summ8) <- NULL
> print(summ8)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       0 2742.247 0.06896552
4        1       1 3487.910 0.05316973
```

**Efficiency Comparison**

```r
> test5 <- function(n){
+   for (i in 1:n){
+     summ5 <- ddply(df, .(SELFEMPL, OWNRENT), summarize, INCOME = mean(INCOME), BAD = mean(BAD))
+   }
+ }
> system.time(test5(10))
   user  system elapsed
  0.524   0.068   0.622
>
> test6 <- function(n){
+   for (i in 1:n){
+     summ6 <- summaryBy(INCOME + BAD ~ SELFEMPL + OWNRENT, data = df, fun = c(mean), keep.names = TRUE)
+   }
+ }
> system.time(test6(10))
   user  system elapsed
  1.800   0.060   1.903
>
> test7 <- function(n){
+   for (i in 1:n){
+     summ7 <- summarize(df[c('INCOME', 'BAD', 'SELFEMPL', 'OWNRENT')], df[c('SELFEMPL', 'OWNRENT')], colMeans, stat.name = NULL)
+   }
+ }
> system.time(test7(10))
   user  system elapsed
  0.236   0.020   0.274
>
> test8 <- function(n){
+   for (i in 1:n){
+     summ8 <- frameApply(df, by = c('SELFEMPL', 'OWNRENT'), on = c('INCOME', 'BAD'), fun = colMeans)
+     rownames(summ8) <- NULL
+   }
+ }
> system.time(test8(10))
   user  system elapsed
  0.580   0.008   0.668
```

