# This post does some basic correlation analysis between responses
# to the survey I recently released through R Shiny at:
# I have saved the data from the survey after completing the survey myself.
# This data is incomplete because the survey has kept running since I
# saved the data and because shinyApp.io server automatically resets
# data every so often. Thus survey results are sometimes lost.
# (Something to be remedied at a future time.)
# Let's get our data
# Load the saved survey responses into the data frame `Rsurvey`.
# NOTE(review): this read.csv(paste0( call is cut off in this copy -- the
# path/URL pieces passed to paste0() and both closing parentheses are
# missing, so the script will not parse until they are restored.
Rsurvey <- read.csv(paste0(
# Looking at the data we have 321 responses in total though
# on average it looks like we have closer to 250 responses
# to work with.
# The majority of respondents consider their knowledge of R
# to be either Advanced (119) or Moderate (92).
# The majority of users are aged 26-35 (121) or 36-45 (79).
# Respondents most frequently reported reading R bloggers
# daily (164) or weekly (75).
# Respondents most frequently reported reading my blog never (149)
# or monthly (48).
# The self-reported technical knowledge of most users in a
# theoretical statistics/econometrics/psychometrics field was
# most frequently reported as either moderate (103) or
# advanced (97).
# The respondents' favorite color was most frequently blue (114)
# and green (66).
# The vast majority of respondents were male (243) compared
# with only 22 females. Looks like R bloggers is not going
# to become a dating website in the near future.
# Of those who selected an area of research the majority
# chose data analysis (150) followed by (statistics).
# As for the users' knowledge of Shiny, few had much at all, with
# Basic (118) and None (76) being the most frequent responses.
# Finally, as to the question of "What is the air speed velocity
# of an unladen swallow" the majority of respondents chose
# the correct response to the Monty Python reference (129)
# while the next largest group indicated that they "did not
# know" (98).
# Well let's see if there is any correlation between particular
# outcomes of interest.
# Let's see if there is a correlation between knowledge of R
# and frequency of reading R bloggers.
# Ordinal coding of the self-reported R knowledge answers (survey column 2).
rSkillCodes <- c(None = 0, Basic = 1, Moderate = 2, Advanced = 3, Expert = 4)
# Look each answer up by its label; answers that match no label (or are
# missing) come out as NA.
knowledgeR <- unname(rSkillCodes[match(Rsurvey[, 2], names(rSkillCodes))])
# Convert the R bloggers reading rate (survey column 5) into an approximate
# number of reading days per year; answers matching no label stay NA.
readingDays <- c(None = 0, daily = 360, weekly = 50, monthly = 12)
frequency <- unname(readingDays[match(Rsurvey[, 5], names(readingDays))])
# Correlate reading frequency with R knowledge, using only the rows where
# both values are available.
cor(frequency, knowledgeR, use = "pairwise.complete.obs")
# There seems to be a modest correlation between the number of days spent
# reading R bloggers and self-assessment of R expertise.
# Let's see if coldness or warmth along the color
# spectrum is a useful variable.
# Place favorite colors (survey column 8) on a cold-to-warm scale:
# blue = 0, green = 1, orange = 2, red = 3. Any other answer stays NA.
colorOrder <- c("blue", "green", "orange", "red")
# factor() codes run 1..4 in the order of `colorOrder`; shift them to 0..3.
warmth <- as.numeric(factor(Rsurvey[, 8], levels = colorOrder)) - 1
cor(warmth, knowledgeR, use = "pairwise.complete.obs")
# There is a slight negative correlation between
# the favorite color warmth of users and self-reported
# knowledge of R.
# Finally, let's look at success on the Monty Python
# trivia question.
# Score the Monty Python trivia question (survey column 12): 1 for the
# answer that recognises the reference, 0 for the other two listed answers.
# Any other answer stays NA.
monte <- rep(NA, nrow(Rsurvey))
monte[Rsurvey[,12]=="I don't know!"] <- 0
monte[Rsurvey[,12]=="~50 MPH"] <- 0
# NOTE(review): this line had lost its left-hand side and did not parse; it
# is restored by analogy with the two assignments above -- confirm that
# column 12 is the intended column.
monte[Rsurvey[,12]=="What do you mean? An African or European swallow?"] <- 1
# Correlate trivia success with R knowledge over rows where both are known.
cor(monte, knowledgeR, use="pairwise.complete.obs")
# We see a modest positive correlation between
# knowledge of R and being able to answer the Monty Python
# trivia question.
# Ordinal coding of self-reported statistics knowledge (survey column 6),
# using the same 0-4 scale as the R knowledge score.
statsLabels <- c("None", "Basic", "Moderate", "Advanced", "Expert")
# match() gives each answer's position in `statsLabels` (NA when the answer
# matches none of them); subtracting one turns positions 1..5 into 0..4.
knowledgeStats <- match(Rsurvey[, 6], statsLabels) - 1
cor(knowledgeStats, knowledgeR, use = "pairwise.complete.obs")
# There seems to be a very strong correlation between
# self-reported knowledge of Statistics and knowledge
# of R.
# In order to see the data points a little more clearly
# I will add some tiny noise to both knowledge sets
# Jitter both knowledge scores with small Gaussian noise (standard
# deviation 0.15) so that points sharing the same pair of integer scores do
# not sit exactly on top of each other in the scatter plot.
knowledgeStatsN <- knowledgeStats + .15*rnorm(nrow(Rsurvey))
knowledgeRN <- knowledgeR + .15*rnorm(nrow(Rsurvey))
# NOTE(review): only the `main=` argument of the plotting call survived in
# this copy. The call is reconstructed as a plain scatter plot of the
# jittered scores; the axis order and any further arguments of the original
# (axis labels etc.) are unknown -- confirm.
plot(knowledgeStatsN, knowledgeRN,
     main="Knowledge of Stats against that of R")
# We can see the diagonal elements (2,2) and (3,3) have the most responses.
# Overall, we can see that knowledge of Statistics
# strongly positively predicts knowledge of R.
# Frequency of reading R bloggers also seems to
# have an effect that is nearly significant at the
# 5% level.
# The coefficient on frequency is very small but that is because
# the scale is quite large (from 0 to 360). However,
# if someone were to start reading R bloggers daily
# (assuming R-bloggers -> R knowledge one directionally)
# then we would expect a change of R knowledge of:
# Not as large a predictor as knowledge of stats but certainly
# there exists some relationship.
# Of course, few causal relationships can be inferred from the
# data. We cannot expect reading R bloggers to be independent
# of self-assessed knowledge of R any more than we can expect knowledge of
# statistics to be uncorrelated with knowledge of R, since
# many users learn R simultaneously with statistics.