(This article was first published on **Jason Bryer » R**, and kindly contributed to R-bloggers)

Here is a quick analysis of the relationship between SAT scores and student retention. The data are from the Integrated Postsecondary Education Data System (IPEDS) and were analyzed using R. This was a quick analysis, and I would be careful about drawing any strong conclusions from it. The source for running this analysis is provided below, along with code for some additional graphics that are not included in this post.

Here are the results of the regression analysis:

| | Estimate | Std. Error | t value | Pr(>\|t\|) | |
---|---|---|---|---|---|

(Intercept) | 17.9209 | 3.3090 | 5.42 | 0.0000 | *** |

SATWriting | 0.0307 | 0.0118 | 2.61 | 0.0093 | ** |

SATMath | 0.0921 | 0.0112 | 8.19 | 0.0000 | *** |

AcceptanceTotal | -0.5566 | 1.5400 | -0.36 | 0.7179 | |

UseAdmissionTestScoresRecommended | -8.1989 | 2.4935 | -3.29 | 0.0011 | ** |

UseAdmissionTestScoresRequired | -4.7632 | 2.1289 | -2.24 | 0.0256 | * |

* p < .05; ** p < .01; *** p < .001 |

Residual standard error: 144.5 on 684 degrees of freedom

(2013 observations deleted due to missingness)

Multiple R-squared: 0.7376, Adjusted R-squared: 0.7356

F-statistic: 384.4 on 5 and 684 DF, p-value: < 2.2e-16

# Install the ipeds package together with its dependencies, searching the
# R-Forge repository and the CRAN mirror listed below.
# NOTE(review): these are the repository URLs from the original post; confirm
# that both are still reachable before relying on them.
install.packages(
  "ipeds",
  repos = c("http://R-Forge.R-project.org", "http://lib.stat.cmu.edu/R/CRAN"),
  dependencies = TRUE
)

library(ipeds)

library(ggplot2)

# The ipedsHelp function will return the data dictionary for the given surveys.
# Look up the 2008 dictionaries for the three surveys used below:
# 'HD', 'IC' and 'EFD'.
ipedsHelp("HD", 2008)

ipedsHelp("IC", 2008)

ipedsHelp("EFD", 2008)

# Fetch the 2008 IPEDS surveys used in this analysis. Each survey is keyed by
# 'unitid', which is the column the later merges join on.
directory <- getIPEDSSurvey("HD", 2008)

admissions <- getIPEDSSurvey("IC", 2008)

retention <- getIPEDSSurvey("EFD", 2008)

# Keep only the identifier and descriptive columns from the directory survey.
directory <- directory[, c("unitid", "instnm", "sector", "control")]

# Map of admissions-survey column name -> descriptive name used in the rest of
# the script. Keeping selection and renaming in one place guarantees that every
# selected column receives the name intended for it.
admission_columns <- c(
  unitid  = "unitid",
  admcon1 = "UseHSGPA",
  admcon2 = "UseHSRank",
  admcon7 = "UseAdmissionTestScores",
  applcnm = "ApplicantsMen",
  applcnw = "ApplicantsWomen",
  applcn  = "ApplicantsTotal",
  admssnm = "AdmissionsMen",
  admssnw = "AdmissionsWomen",
  admssn  = "AdmissionsTotal",
  enrlftm = "EnrolledFullTimeMen",
  enrlftw = "EnrolledFullTimeWomen",
  enrlptm = "EnrolledPartTimeMen",
  enrlptw = "EnrolledPartTimeWomen",
  enrlt   = "EnrolledTotal",
  satnum  = "NumSATScores",
  satpct  = "PercentSATScores",
  actnum  = "NumACTScores",
  actpct  = "PercentACTScores",
  satvr25 = "SATReading25",
  satvr75 = "SATReading75",
  satmt25 = "SATMath25",
  satmt75 = "SATMath75",
  satwr25 = "SATWriting25",
  satwr75 = "SATWriting75",
  actcm25 = "ACTComposite25",
  actcm75 = "ACTComposite75",
  acten25 = "ACTEnglish25",
  acten75 = "ACTEnglish75",
  actmt25 = "ACTMath25",
  actmt75 = "ACTMath75",
  actwr25 = "ACTWriting25",
  actwr75 = "ACTWriting75"
)

admissions <- admissions[, names(admission_columns)]

# Recode the three admissions-consideration columns from their numeric codes to
# labelled factors. The spelling 'requiered' is kept on purpose: the subset
# that builds ret2 later in the script filters on this exact label.
admcon_columns <- c("admcon1", "admcon2", "admcon7")
admcon_levels <- c(1, 2, 3, 4, -1, -2)
admcon_labels <- c(
  "Required", "Recommended", "Neither requiered nor recommended",
  "Do not know", "Not reported", "Not applicable"
)
admissions[admcon_columns] <- lapply(
  admissions[admcon_columns],
  factor,
  levels = admcon_levels,
  labels = admcon_labels
)

names(admissions) <- unname(admission_columns)

# Keep the identifier and the two retention columns, then give the retention
# columns descriptive names.
retention <- retention[, c("unitid", "ret_pcf", "ret_pcp")]

names(retention) <- c("unitid", "FullTimeRetentionRate", "PartTimeRetentionRate")

# Merge the data frames. Note that schools that do not appear in all three data
# frames will not be included in the final analysis (merge() keeps only rows
# whose 'unitid' is present on both sides by default).
ret <- merge(directory, admissions, by = "unitid")

ret <- merge(ret, retention, by = "unitid")

# Keep schools that reported how they use admission test scores: required,
# recommended, or neither required nor recommended. Schools coded
# 'Do not know', 'Not reported' or 'Not applicable' (and NAs) are dropped.
# The spelling 'requiered' must match the factor label assigned to
# UseAdmissionTestScores earlier in the script.
test_score_policies <- c(
  "Required", "Recommended", "Neither requiered nor recommended"
)
ret2 <- ret[ret$UseAdmissionTestScores %in% test_score_policies, ]

# Remove schools with low retention rates. Are these errors in the data?
# A logical mask is used rather than ret2[-which(...), ]: when no school is
# below the cutoff, which() returns integer(0) and negative indexing with an
# empty vector selects zero rows, silently discarding the whole data set.
# Rows with a missing retention rate are kept, exactly as which() kept them.
low_retention <- !is.na(ret2$FullTimeRetentionRate) &
  ret2$FullTimeRetentionRate < 20
ret2 <- ret2[!low_retention, ]

# Midpoint of the *25 and *75 score columns for each SAT section
# (presumably the 25th/75th percentile scores; confirm with ipedsHelp('IC', 2008)).
ret2$SATMath <- (ret2$SATMath75 + ret2$SATMath25) / 2

ret2$SATWriting <- (ret2$SATWriting75 + ret2$SATWriting25) / 2

# Combined score used for plotting; it is the sum of the math and writing
# midpoints only.
ret2$SATTotal <- ret2$SATMath + ret2$SATWriting

# Fraction of applicants who were admitted.
ret2$AcceptanceTotal <- ret2$AdmissionsTotal / ret2$ApplicantsTotal

# Rebuild the factor from its labels. This drops the now-unused levels and
# re-sorts the remaining levels alphabetically, which makes
# 'Neither requiered nor recommended' the reference level in the regression.
# Do not replace this with droplevels(): that would keep the original level
# order and change the baseline category.
ret2$UseAdmissionTestScores <- as.factor(as.character(ret2$UseAdmissionTestScores))

# Histogram of the full-time retention rate, one unit per bin.
ggplot(data = ret2, mapping = aes(x = FullTimeRetentionRate)) +
  geom_histogram(binwidth = 1, alpha = 0.6)

# Histogram of the SAT math midpoint, ten points per bin.
ggplot(data = ret2, mapping = aes(x = SATMath)) +
  geom_histogram(binwidth = 10, alpha = 0.6)

# Histogram of the SAT writing midpoint, ten points per bin.
ggplot(data = ret2, mapping = aes(x = SATWriting)) +
  geom_histogram(binwidth = 10, alpha = 0.6)

# Reshape the three SAT math columns into long format: one row per school per
# column, with the source column name in 'variable' and the score in 'value'.
# Base R stack() is used instead of melt(), because no package that provides
# melt() is attached by this script. stack() concatenates the columns in order
# and returns a factor whose levels follow that same column order, so unitid
# is repeated once per stacked column.
math_columns <- c("SATMath25", "SATMath75", "SATMath")
math_stacked <- stack(ret2[, math_columns])
retMath <- data.frame(
  unitid = rep(ret2$unitid, times = length(math_columns)),
  variable = math_stacked$ind,
  value = math_stacked$values
)

# One histogram per math column, stacked vertically for comparison.
ggplot(retMath, aes(x = value)) +
  geom_histogram(binwidth = 10, alpha = 0.6) +
  facet_wrap(~ variable, ncol = 1)

# Reshape the three SAT writing columns into long format: one row per school
# per column, with the source column name in 'variable' and the score in
# 'value'. Base R stack() is used instead of melt(), because no package that
# provides melt() is attached by this script. stack() concatenates the columns
# in order and returns a factor whose levels follow that same column order, so
# unitid is repeated once per stacked column.
writing_columns <- c("SATWriting25", "SATWriting75", "SATWriting")
writing_stacked <- stack(ret2[, writing_columns])
retWriting <- data.frame(
  unitid = rep(ret2$unitid, times = length(writing_columns)),
  variable = writing_stacked$ind,
  value = writing_stacked$values
)

# One histogram per writing column, stacked vertically for comparison.
ggplot(retWriting, aes(x = value)) +
  geom_histogram(binwidth = 10, alpha = 0.6) +
  facet_wrap(~ variable, ncol = 1)

# Scatter plot of full-time retention rate against the combined SAT score.
# Point size follows NumSATScores and point colour follows the school's
# admission-test policy.
ggplot(
  data = ret2,
  mapping = aes(
    x = SATTotal,
    y = FullTimeRetentionRate,
    size = NumSATScores,
    color = UseAdmissionTestScores
  )
) +
  geom_point()

# Regression
# Linear model of the full-time retention rate on the SAT writing and math
# midpoints, the acceptance fraction and the admission-test policy, with each
# school weighted by NumSATScores.
fit <- lm(
  formula = FullTimeRetentionRate ~ SATWriting + SATMath + AcceptanceTotal + UseAdmissionTestScores,
  data = ret2,
  weights = NumSATScores
)

# Print the coefficient table and fit statistics.
summary(fit)

To **leave a comment** for the author, please follow the link and comment on his blog:

**Jason Bryer » R**.

R-bloggers.com offers

**daily e-mail updates** about R news and tutorials on topics such as: visualization (ggplot2, Boxplots, maps, animation), programming (RStudio, Sweave, LaTeX, SQL, Eclipse, git, hadoop, Web Scraping), statistics (regression, PCA, time series, trading) and more...