#code for Chapter 2 "Propensity Score Estimation" of book:
#Leite, W. L. (2016). Practical propensity score analysis using R.
#Thousand Oaks, CA: Sage.
#
#this is the code that was used to generate the example results in the book
#As the R software and the R packages used in this example are updated frequently
#some incompatibilities between the current code and new R versions or package versions
#may appear
#Any updates to the code will be posted at:
# http://www.practicalpropensityscore.com
# This example estimates the effect of participation in career academy programs on income
# This example used data from the Education Longitudinal Study of 2002 (ELS)
# More details about the dataset can be found at http://nces.ed.gov/surveys/els2002/
# Load the ELS example data; presumably creates the ELS.data object used below
# (confirm against the .Rdata file contents)
load("Chapter2_ELS_data_example_career_academy.Rdata")
#===============================================
#CODE FOR TREATMENT OF MISSING DATA BY MULTIPLE IMPUTATION
library(mice) #load package that performs multiple imputation by chained equations
#flag missing cells and compute the proportion of missing data per variable
missing.indicator <- data.frame(is.na(ELS.data))
propMissing <- colMeans(missing.indicator)
#rename indicator columns for variables with any missing data, appending "NA"
#(paste0 replaces paste(..., sep=""))
names(missing.indicator)[propMissing > 0] <- paste0(names(ELS.data)[propMissing > 0], "NA")
#convert dummy missing indicators from logical to numeric 0/1 variables
#(vectorized replacement for the original 1:ncol() index loop;
#df[] <- lapply(...) preserves the data frame shape and column names)
missing.indicator[] <- lapply(missing.indicator, as.numeric)
#append the dummy indicators for variables with missing data to the analysis data
ELS.data <- cbind(ELS.data, missing.indicator[, propMissing > 0])
#show percentage missing per variable
print(round(propMissing, 3))
#create a treatment indicator from BYS33K
#(career academy participation, per the file header)
ELS.data$treat <- factor(ELS.data$BYS33K)
#======================
#impute missing data separately for the treated and untreated groups,
#so that imputation models do not pool across treatment conditions
#create storage for the stacked long-format imputations
long.imputation <- c()
#loop through the treatment groups (0 = untreated, 1 = treated)
for (group in 0:1) {
#create a predictor matrix: predictors of missing data with a minimum
#correlation of 0.1 and at least 50% usable cases; the id variable
#STU_ID is excluded from the predictor set
predictor.selection <- quickpred(subset(ELS.data,treat==group), mincor=0.1, minpuc=0.5,method='pearson',
exclude=c("STU_ID"))
#impute variables from least missing to most missing ("monotone" visit sequence)
#using multiple imputation by chained equations (m = 5 imputations)
#with predictive mean matching as the univariate imputation method
#NOTE(review): no set.seed() before mice(), so imputed values differ across
#runs — set a seed to reproduce the book's exact numbers
imputation <- mice(subset(ELS.data,treat==group), m=5, method="pmm", visitSequence="monotone",
predictorMatrix = predictor.selection)
#extract the stacked (long-format) imputed data and append to the accumulator
long.imputation = rbind(long.imputation,complete(imputation, action="long"))
} #finish loop
#extract the first imputed dataset for single-imputation analyses
imputation1 <- subset(long.imputation, subset = .imp == 1)
#create a list of all imputed datasets
library(mitools) #load package to combine imputed datasets
#imputationList objects are designed to be analyzed with the survey package;
#build the list programmatically instead of repeating subset() five times
#(5 matches the m = 5 imputations requested from mice above)
allImputations <- imputationList(lapply(1:5, function(i) {
  subset(long.imputation, subset = .imp == i)
}))
#------------------------------------
#ESTIMATE PROPENSITY SCORES WITH LOGISTIC REGRESSION
#Covariates to be balanced by propensity score methods; this vector feeds
#the propensity score model formula built below.
covariateNames <- c(
  "BYSTLNG2", # sample member's English fluency
  "byplang",  # parent's English fluency
  "byfcomp",  # family composition
  "bysibstr", # BY in-home sibling structure
  "bysibhom", # BY number of in-home siblings
  "bygnstat", # generational status
  "bypared",  # parents' highest level of education
  "bygpared", # highest reported education among parents' parents
  "BYSES1QU", # quartile coding of SES1 variable
  "bygrdrpt", # number of grades repeated (K-10)
  "byhomlit", # BY home literacy resources
  "byriskfc", # number of academic risk factors in 10th grade
  "bystexp",  # how far in school student thinks will get (composite)
  "F1SEX",    # F1 sex (composite)
  "F1RACE",   # F1 student's race/ethnicity (composite)
  "F1HOMLNG", # F1 student's native language (composite)
  "BYS26",    # high school program, student self-report
  "BYS27D",   # education is important to get a job later
  "BYS27I",   # parents expect success in school
  "BYS28",    # how much likes school
  "BYS37",    # importance of good grades to student
  "BYS38C",   # how often goes to class without homework done
  "BYS54A",   # importance of being successful in line of work
  "BYS54O",   # importance of getting good education
  "BYS57",    # plans to continue education after high school
  "BYS58",    # type of school plans to attend
  "bysctrl",  # school control: public, private, other
  "byurban",  # urbanicity
  "byregion", # geographic region of school
  "BY10FLP")  # grade 10 percent free or reduced lunch
#check whether any dummy missing indicators are redundant because
#two variables are missing for (nearly) the same cases
#correlations among indicators of variables with more than 5% missing data
missingCorrelations <- cor(missing.indicator[,propMissing>0.05])
#zero the diagonal so a variable's self-correlation (1.0) is ignored
diag(missingCorrelations) <- 0
#largest correlation of each indicator with any other indicator
maxCorrelations <- apply(missingCorrelations,2,max)
#keep every indicator that is not highly correlated (r < 0.8) with another
dummyNAnames <- names(maxCorrelations)[maxCorrelations < 0.8]
#among highly correlated indicators, keep one representative per group
#NOTE(review): de-duplication is by identical correlation VALUE — this assumes
#members of a redundant pair share the same max correlation and that no two
#unrelated indicators coincidentally have equal values; verify on new data
maxCorrelationsHigh <- maxCorrelations[!duplicated(maxCorrelations)]
dummyNAnames <- c(dummyNAnames,names(maxCorrelationsHigh)[maxCorrelationsHigh>=0.8])
#add the missing value indicators for variables with more than 5%
#of missing data to the covariate list for the propensity score model
covariateNames <- c(covariateNames, dummyNAnames)
#create the propensity score model formula from the covariate list
#(includes both individual-level and school-level covariates)
#paste0 replaces paste(..., sep = "") and the intermediate reuse of
#psFormula as a character string
psFormula <- formula(paste0("treat~", paste(covariateNames, collapse = "+")))
print(psFormula)
#define design objects that describe the complex sample characteristics:
# - psu identifies the primary sampling units (cluster ids)
# - STRAT_ID identifies the strata ids
# - bystuwt holds the base-year sampling weights for respondents of the
#   2002 and 2004 waves (base year and 1st follow-up)
library(survey)
#adjust variance estimation for strata that contain a single PSU
options(survey.lonely.psu = "adjust")
#design for the first imputed dataset
surveyDesign1 <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = imputation1, nest=T)
#design for the list of all five imputed datasets
#(data is an imputationList, supported via the mitools package)
surveyDesignAll <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = allImputations, nest=T)
#fit a logistic regression model with adjustments for clustering and stratification
#to the first imputed dataset (quasibinomial is used with sampling weights,
#which are generally non-integer)
psModel1 <- svyglm(psFormula, design=surveyDesign1, family=quasibinomial)
#fit the same logistic regression model to all imputed datasets
psModelAll <- with(surveyDesignAll, svyglm(psFormula, family=quasibinomial))
#extract propensity scores (fitted probabilities) from the first dataset's model
pScores <- fitted(psModel1)
imputation1$pScores <- pScores
#extract propensity scores from all imputed datasets (one column per imputation)
pScoresAll <- sapply(psModelAll, fitted)
#combine propensity scores across imputed datasets by taking the mean per case
pScoresMean <- apply(pScoresAll,1,mean)
#add the averaged propensity score to each dataset in the imputation list
allImputations <- update(allImputations, pScores = pScoresMean)
#====================================================
#ESTIMATE PROPENSITY SCORES USING CLASSIFICATION TREES
library(party)
#fit a conditional inference tree to the first imputed dataset
myctree <- ctree(psFormula, data = imputation1)
#save the tree diagram as a TIFF image
tiff("Chapter2_figure2-2.tif", res=600, compression = "lzw", height=6, width=15, units="in")
plot(myctree) #plot the tree
dev.off()
#save the tree diagram as a PDF
#(removed the stray empty argument from the original pdf() call)
pdf("Chapter2_Figure2-2.pdf", height=6, width=15)
plot(myctree)
dev.off()
#fit random forest with settings that produce results which are not biased
#towards continuous variables or categorical variables with many categories
set.seed(2014)
mycontrols <- cforest_unbiased(ntree=1000, mtry=5)
mycforest <- cforest(psFormula, data=imputation1, weights = imputation1$bystuwt,
controls=mycontrols)
#obtain a list of predicted probabilities (one two-element entry per case)
pScoresRf <- predict(mycforest, type="prob")
#organize the list into a two-column matrix: probability of being in the
#control group (col 1) and treated group (col 2); keep only the second
#column, which holds the propensity scores
#(named ncol=/byrow=TRUE replace the empty positional argument and T)
imputation1$pScoresRf <- matrix(unlist(pScoresRf), ncol = 2, byrow = TRUE)[, 2]
#save results of random forests
#save(list=c("myctree","mycforest","pScoresRf"),
#	file="random_forest_results_career_academy_example.Rdata", compress=T)
#=======================================================================
#Estimate propensity scores with generalized boosted modeling (GBM)
library(twang)
set.seed(2015) #set random seed to allow replication of results
#change the treatment indicator from factor to numeric, which is required by GBM
imputation1$treat <- as.numeric(imputation1$treat == 1)
#fit GBM for the ATT; stop when the maximum effect size statistic (es.max)
#is minimized; sampw passes the base-year sampling weights
myGBM <- ps(psFormula, data = imputation1, n.trees=10000, interaction.depth=4,
shrinkage=0.01, stop.method=c("es.max"), estimand = "ATT",
verbose=TRUE, sampw = imputation1$bystuwt)
#obtain information about balance achieved and iteration numbers
summary(myGBM)
#save the balance/iteration plot as a TIFF image
#(FALSE replaces the reassignable shorthand F)
tiff("Chapter2_figure2-3.tif", res=600, compression = "lzw", height=6, width=15, units="in")
plot(myGBM, type="b", color=FALSE, lwd=2)
dev.off()
#save the same plot as a PDF
#(removed the stray empty argument from the original pdf() call)
pdf("Chapter2_Figure2-3.pdf", height=6, width=15)
plot(myGBM, type="b", color=FALSE)
dev.off()
#extract estimated propensity scores from the fitted object
pScoresGBM <- myGBM$ps
names(pScoresGBM) <- "pScoresGBM" #<- replaces = for assignment
imputation1$pScoresGBM <- unlist(pScoresGBM)
#save results of GBM
#save(list=c("myGBM","pScoresGBM"),
#	file="GBM_results_career_academy_example.Rdata", compress=T)
# #save data file
ELS.data.imputed <- imputation1
# #========================================================
#obtain summary statistics of common support for
#propensity scores estimated with the first imputed dataset
#for logistic regression
with(ELS.data.imputed, by(pScores, treat, summary))
#for all three sets of propensity scores, selected by column name rather
#than the fragile hard-coded positions 63:65 used previously
#(these are the columns appended earlier in the script)
psVars <- c("pScores", "pScoresRf", "pScoresGBM")
by(ELS.data.imputed[, psVars], ELS.data.imputed$treat, summary)
#create a table of the common-support summaries
#(summary() yields six rows — Min, 1st Qu., Median, Mean, 3rd Qu., Max — per group)
tableCommonSupport <- rbind(
summary(ELS.data.imputed[ELS.data.imputed$treat == 1, psVars]),
summary(ELS.data.imputed[ELS.data.imputed$treat == 0, psVars]))
rownames(tableCommonSupport) <- c(rep("Treated", 6), rep("Control", 6))
write.csv(tableCommonSupport, file = "Table_common_support.csv")
#obtain proportion of treated cases above the maximum control case
#and proportion of control cases below the minimum treated case
#for logistic regression
with(ELS.data.imputed, 100*c(
mean(as.numeric(pScores[treat==1] > max(pScores[treat==0]))),
mean(as.numeric(pScores[treat==0] < min(pScores[treat==1])))))
#obtain proportions of treated cases above the maximum control case,
#for each propensity score estimation method
#BUG FIX: pScoresGBM is a plain vector column (assigned via unlist() above),
#so matrix-style indexing pScoresGBM[treat==1,] raised
#"incorrect number of dimensions"; the trailing comma is removed
percentageAbove = with(ELS.data.imputed, 100*c(
mean(as.numeric(pScores[treat==1] > max(pScores[treat==0]))),
mean(as.numeric(pScoresRf[treat==1] > max(pScoresRf[treat==0]))),
mean(as.numeric(pScoresGBM[treat==1] > max(pScoresGBM[treat==0])))))
#obtain proportions of control cases below the minimum treated case,
#for each propensity score estimation method
percentageBelow = with(ELS.data.imputed, 100*c(
mean(as.numeric(pScores[treat==0] < min(pScores[treat==1]))),
mean(as.numeric(pScoresRf[treat==0] < min(pScoresRf[treat==1]))),
mean(as.numeric(pScoresGBM[treat==0] < min(pScoresGBM[treat==1])))))
#evaluate common support with box and whiskers plots
#convert the (now numeric 0/1) treatment indicator back to a factor for grouping
ELS.data.imputed$treat = factor(ELS.data.imputed$treat)
library(lattice)
#use a black-and-white theme for publication figures
lattice.options(default.theme = standard.theme(color = FALSE))
#box plot: logistic regression propensity scores by treatment group
tiff("Chapter2_figure2-4.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( pScores~treat, data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#box plot: random forest propensity scores by treatment group
tiff("Chapter2_figure2-5.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( pScoresRf~treat , data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#box plot: GBM propensity scores by treatment group
tiff("Chapter2_figure2-6.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( pScoresGBM~treat, data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#evaluate common support with kernel density plots
require(lattice)
lattice.options(default.theme = standard.theme(color = FALSE))
#density plot: logistic regression propensity scores, grouped by treatment
tiff("Chapter2_figure2-4b.tif", res=600, compression = "lzw", height=6, width=15, units="in")
densityplot( ~pScores, groups=treat, plot.points=F, xlim=c(0,1), lwd=2,
data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#density plot: random forest propensity scores, grouped by treatment
tiff("Chapter2_figure2-5b.tif", res=600, compression = "lzw", height=6, width=15, units="in")
densityplot( ~pScoresRf, groups=treat, plot.points=F, xlim=c(0,1), lwd=2,
data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#density plot: GBM propensity scores, grouped by treatment
tiff("Chapter2_figure2-6b.tif", res=600, compression = "lzw", height=6, width=15, units="in")
densityplot( ~pScoresGBM, groups=treat, plot.points=F, xlim=c(0,1), lwd=2,
data = ELS.data.imputed, ylab = "Propensity Scores", xlab = "Treatment",auto.key = TRUE)
dev.off()
#obtain density plots of propensity scores estimated with logistic regression
#for all imputed datasets
#first stack the five imputed datasets into one data frame,
#tagging each row with its imputation number
allImputationsStacked <- data.frame()
for (imp in 1:5) { temp <- cbind(allImputations$imputations[[imp]],imputation = imp)
allImputationsStacked = rbind(allImputationsStacked, temp) }
#label the treatment groups and imputation numbers for the plot panels
allImputationsStacked$treat <- factor(allImputationsStacked$treat,
levels=c(0,1),labels=c("Untreated","Treated"))
allImputationsStacked$imputation <- factor(allImputationsStacked$imputation,
labels=paste("Imputation",1:5))
library(lattice) #used for the density plots
lattice.options(default.theme = standard.theme(color = FALSE))
#kernel density plots by treatment group, one panel per imputation
tiff("Chapter2_figure2-7.tif", res=600, compression = "lzw", height=6, width=15, units="in")
densityplot( ~pScores | imputation, data = allImputationsStacked, plot.points=F, lwd=2,
groups = treat, xlab = "Propensity Scores", auto.key = TRUE)
dev.off()
#box-and-whisker plots by treatment group, one panel per imputation
tiff("Chapter2_figure2-8.tif", res=600, compression = "lzw", height=6, width=15, units="in")
bwplot( pScores~treat | imputation, data = allImputationsStacked, lwd=2,
ylab = "Propensity Scores", auto.key = TRUE)
dev.off()
#save intermediate results for use in Chapter 3 (commented out)
#save(list=c("ELS.data.imputed", "allImputations","covariateNames","psFormula"),
#	file="Chapter3_ELS_data_imputed_example_career_academy.Rdata", compress=T)
#save final results (commented out)
#save(list=ls(), file="Chapter2_All_results.Rdata", compress=T)