## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.
I remember my dad telling me that when he was at Northwestern in the mid-70s, the team was essentially winless. As a small consolation, he remembered that the football team had actually been full of good students.
Some time ago, I stumbled across nflcombineresults.com. Among other measures, the site reports Wonderlic scores for a bunch of players entering the NFL.
By adding data on a school's institutional strength (from US News and World Report), I can look for an association between a quarterback's Wonderlic score (a measure of cognitive ability) and the academic strength of that quarterback's alma mater.
Results
While this is fairly rough, it looks like there is a relationship here – quarterbacks who attend better schools have higher Wonderlic scores – although the relationship seems to hold only for top-50 schools.
There are a bunch of causal relationships that could give rise to this pattern, and we really don't have the data to separate these stories. It could be that better students choose better schools, or that attending some schools will increase a Wonderlic score. Alternatively, it may be that students at top schools are more likely to attend college for 4 years.
While the assessment is really preliminary, it looks like there might be something here.
I've attached my code below. At some point, I'll add the csv file with institutional strength (this information is freely available at US News as well).
# loading libraries:
# readHTMLTable() comes from XML; ddply(), .() and summarise() come from plyr.
library(XML)
library(plyr)
# library(ggplot2)  # only needed for the commented-out plot at the end

# gathering wonderlic data:
url <- "http://nflcombineresults.com/nflcombinedata.php?year=&pos=&college="
# readHTMLTable() on a whole page returns a list with one entry per table.
test <- readHTMLTable(url)
# Extract a single data frame from that list. `test[]` would keep the whole
# list, leaving the column clean-up further down with no columns to work on.
# NOTE(review): assumes the combine results are the first table on the page --
# confirm against the live site.
if (length(test) == 0L || !is.data.frame(test[[1]])) {
  stop("no combine results table found at ", url, call. = FALSE)
}
dat <- test[[1]]
## cleaning data:
# Normalise the column names: lower-case them first ...
names(dat) <- tolower(names(dat))
# ... then replace every whitespace character in a name with ".".
names(dat) <- gsub(pattern = "\\s", replacement = ".", x = names(dat))
# adjusting vert.leap.(in): rename only that one column. Assigning a single
# string to names(dat) would rename the first column and set every other
# column name to NA.
names(dat)[names(dat) == "vert.leap.(in)"] <- "vert.leap.in"
# cleaning individual columns:
# as.character() comes first so that a factor column is converted by its
# labels rather than by its internal integer codes; values that are not
# numbers become NA (with a coercion warning).
dat$year <- as.numeric(as.character(dat$year))
dat$name <- as.character(dat$name)
dat$wonderlic <- as.numeric(as.character(dat$wonderlic))
dat$bench.press <- as.numeric(as.character(dat$bench.press))
dat$vert.leap.in <- as.numeric(as.character(dat$vert.leap.in))
# Keep only the players that have a wonderlic score, highest score first.
dat.sub <- dat[!is.na(dat$wonderlic), ]
dat.sub <- dat.sub[with(dat.sub, order(wonderlic, decreasing = TRUE)), ]
# Per-position summary: mean score and number of scored players.
pos.dat <- ddply(
  dat.sub, "pos", summarise,
  mean.wonderlic = mean(wonderlic, na.rm = TRUE),
  count = length(wonderlic)
)
# note: not really enough to compare by position.
# Restrict the rest of the analysis to quarterbacks.
qb.dat <- dat.sub[dat.sub$pos == "QB", ]
# Load the school table from qbschools.csv in the working directory.
qb.schools <- read.csv(file = "qbschools.csv")
# Make both join keys plain character vectors before merging.
qb.dat$college <- as.character(qb.dat$college)
qb.schools$school <- as.character(qb.schools$school)
# Inner join: only quarterbacks whose college appears in qb.schools are kept.
merged <- merge(
  x = qb.dat, y = qb.schools,
  by.x = "college", by.y = "school"
)
# Lower-case the merged column names, then make the rank numeric.
names(merged) <- tolower(names(merged))
merged$usnewsrank <- as.numeric(as.character(merged$usnewsrank))
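# to generate plot (requires ggplot2):
# p = ggplot(merged, aes(x = usnewsrank, y = wonderlic)) +
#   geom_point() + geom_smooth()
# p + ggtitle('QB Wonderlic score \n by (academic) strength of undergraduate institution') +
#   xlab('US News and World Report institution rank (as of 2013)') +
#   ylab('Wonderlic score')
# (the title was originally set with opts(title = ...), which was deprecated
# and later removed from ggplot2; ggtitle() is the replacement)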