# This post does some basic correlation analysis between responses
# to the survey I recently released through R Shiny at:
# http://www.econometricsbysimulation.com/2013/11/Shiny-Survey-Tool.html

# I have saved the data from the survey after completing the survey myself.
# This data is incomplete because the survey has been running since I
# saved the results and because the shinyApp.io server automatically
# resets data every so often. Thus survey results are sometimes lost.
# (Something to be remedied at a future time.)

# Let's get our data.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv()
# keeps text columns as character by default, in which case summary()
# reports only Length/Class/Mode for them instead of the per-answer
# counts discussed below.
Rsurvey <- read.csv(
  paste0(
    "https://raw.github.com/EconometricsBySimulation/",
    "Shiny-Demos/master/Survey/sample-results.csv"
  ),
  stringsAsFactors = TRUE
)

summary(Rsurvey)

# Looking at the data we have 321 responses in total, though
# on average it looks like we have closer to 250 responses
# to work with.

# The majority of respondents consider their knowledge of R
# to be either Advanced (119) or Moderate (92).

# The majority of users are aged 26-35 (121) or 36-45 (79).

# Respondents most frequently read R bloggers
# daily (164) or weekly (75).

# Respondents most frequently read my blog never (149)
# or monthly (48).

# The self-reported technical knowledge of most users in a
# theoretical statistics/econometrics/psychometrics field was
# most frequently reported as either moderate (103) or
# advanced (97).

# The favorite colors of respondents were most frequently blue (114)
# and green (66).

# The vast majority of respondents were male (243) compared
# with only 22 females. Looks like R bloggers is not going
# to become a dating website in the near future.

# Of those who selected an area of research, the majority
# chose data analysis (150), followed by statistics.

# As for the users' knowledge of Shiny, few had much at all, with
# Basic (118) and None (76) being the most frequent responses.
# Finally, as to the question of "What is the air speed velocity
# of an unladen swallow", the majority of respondents chose
# the correct response to the Monty Python reference (129),
# while the next largest group indicated that "they did not
# know" (98).

# Well, let's see if there is any correlation between particular
# outcomes of interest.

# Translate survey answers into numeric scores.
#   answers: one column of the survey (factor or character).
#   scores:  named numeric vector; the names are the answer labels.
# Answers that match none of the labels (including missing answers)
# are scored NA.
recode_answers <- function(answers, scores) {
  unname(scores[match(as.character(answers), names(scores))])
}

# Let's see if there is a correlation between knowledge of R
# and frequency of reading R bloggers.
knowledgeR <- recode_answers(
  Rsurvey[[2]],
  c(None = 0, Basic = 1, Moderate = 2, Advanced = 3, Expert = 4)
)

# Convert the reading rates to number of days per year
# spent reading R bloggers.
# NOTE(review): confirm that "None" (rather than e.g. "never") is the
# label actually used for this question; any unlisted answer becomes NA.
frequency <- recode_answers(
  Rsurvey[[5]],
  c(None = 0, daily = 360, weekly = 50, monthly = 12)
)

cor(frequency, knowledgeR, use = "pairwise.complete.obs")
# There seems to be a modest correlation between the number of days
# spent reading R bloggers and self-assessment of R expertise.

# Let's see if coldness or warmth along the color
# spectrum is a useful variable.
warmth <- recode_answers(
  Rsurvey[[8]],
  c(blue = 0, green = 1, orange = 2, red = 3)
)

cor(warmth, knowledgeR, use = "pairwise.complete.obs")
# There is a slight negative correlation between
# the favorite color warmth of users and self-reported
# knowledge of R.

# Finally, let's look at success on the Monty Python
# trivia question (1 = answered with the movie quote, 0 = otherwise).
monte <- recode_answers(
  Rsurvey[[12]],
  c(
    "I don't know!" = 0,
    "~50 MPH" = 0,
    "What do you mean? An African or European swallow?" = 1
  )
)

cor(monte, knowledgeR, use = "pairwise.complete.obs")
# We see a modest positive correlation between
# knowledge of R and being able to answer Monty Python
# trivia.
# Score self-reported statistics knowledge on the same 0-4 scale used
# for knowledge of R. match() gives the 1-based position of each answer
# in the ordered label set (NA for any other answer), so subtracting 1
# yields None = 0 ... Expert = 4.
knowledgeStats <- match(
  as.character(Rsurvey[, 6]),
  c("None", "Basic", "Moderate", "Advanced", "Expert")
) - 1

cor(knowledgeStats, knowledgeR, use = "pairwise.complete.obs")
# There seems to be a very strong correlation between
# self-reported knowledge of statistics and knowledge
# of R.

# In order to see the data points a little more clearly
# I will add some tiny noise to both knowledge sets.
knowledgeStatsN <- knowledgeStats + .15 * rnorm(nrow(Rsurvey))
knowledgeRN <- knowledgeR + .15 * rnorm(nrow(Rsurvey))

plot(knowledgeRN, knowledgeStatsN,
     main = "Knowledge of Stats against that of R",
     xlab = "R", ylab = "Stats")
# We can see the diagonal elements 2,2 and 3,3 have the highest
# frequency.

# Keep the fitted model so that the effect size below is taken from
# the fit itself rather than from a hand-copied coefficient.
knowledgeModel <- lm(knowledgeR ~ warmth + frequency + monte + knowledgeStats)
summary(knowledgeModel)
# Overall, we can see that knowledge of statistics
# strongly positively predicts knowledge of R.
# Frequency of reading R bloggers also seems to
# have an effect size that is significant at nearly the
# 5% level.

# The coefficient on frequency is very small, but that is because
# the scale is quite large (from 0 to 360). However,
# if someone were to start reading R bloggers daily
# (assuming R-bloggers -> R knowledge one directionally)
# then we would expect a change of R knowledge of:
coef(knowledgeModel)[["frequency"]] * 360
# With the data snapshot used for this post the coefficient was
# .0006994, giving 0.251784.

# Not as large a predictor as knowledge of stats, but certainly
# there exists some relationship.

# Of course, few causal relationships can be inferred from the
# data. We cannot expect reading R bloggers to be independent
# of self-assessed knowledge of R any more than knowledge of
# statistics to be uncorrelated with knowledge of R, since
# many users learn R simultaneously with statistics.