For a longer version of this tutorial, see: Sorensen, Hohenstein, Vasishth, 2016.

If you don't know what the above code means, first read chapter 4 of my lecture notes.

lmer(log(rt) ~ 1+RCType+dist+int+(1+RCType+dist+int|subj) + (1+RCType+dist+int|item), dat)

Assume that these data are stored in R as a data-frame with name rDat.

"subj" "item" "rt" "RCType" "dist" "int"

1 14 438 -1 -1 1

1 16 531 1 -1 -1

1 15 422 1 1 1

1 18 1000 -1 -1 1

...

// Maximal linear mixed model for reading times: crossed subject and item
// random effects with full variance-covariance matrices, non-centered
// (Cholesky) parameterization.
data {
  int<lower=1> N;                //no trials
  int<lower=1> P;                //no fixefs
  int<lower=1> J;                //no subjects
  int<lower=1> n_u;              //no subj ranefs
  int<lower=1> K;                //no items
  int<lower=1> n_w;              //no item ranefs
  int<lower=1,upper=J> subj[N];  //subject indicator
  int<lower=1,upper=K> item[N];  //item indicator
  row_vector[P] X[N];            //fixef design matrix
  row_vector[n_u] Z_u[N];        //subj ranef design matrix
  row_vector[n_w] Z_w[N];        //item ranef design matrix
  vector[N] rt;                  //reading time (positive; modeled as lognormal)
}

parameters {
  vector[P] beta;                //fixef coefs
  cholesky_factor_corr[n_u] L_u; //cholesky factor of subj ranef corr matrix
  cholesky_factor_corr[n_w] L_w; //cholesky factor of item ranef corr matrix
  vector<lower=0>[n_u] sigma_u;  //subj ranef std (must be positive)
  vector<lower=0>[n_w] sigma_w;  //item ranef std (must be positive)
  real<lower=0> sigma_e;         //residual std (must be positive)
  vector[n_u] z_u[J];            //spherical subj ranef
  vector[n_w] z_w[K];            //spherical item ranef
}

transformed parameters {
  vector[n_u] u[J];              //subj ranefs
  vector[n_w] w[K];              //item ranefs
  {
    //NOTE: these are Cholesky factors of the ranef covariance matrices
    //(diag(sigma) * L), not the covariance matrices themselves
    matrix[n_u,n_u] Sigma_u;
    matrix[n_w,n_w] Sigma_w;
    Sigma_u = diag_pre_multiply(sigma_u,L_u);
    Sigma_w = diag_pre_multiply(sigma_w,L_w);
    //non-centered parameterization: correlated ranefs from spherical draws
    for(j in 1:J)
      u[j] = Sigma_u * z_u[j];
    for(k in 1:K)
      w[k] = Sigma_w * z_w[k];
  }
}

model {
  //priors (beta, sigma_u, sigma_w, sigma_e get implicit flat priors
  //over their declared support)
  L_u ~ lkj_corr_cholesky(2.0);
  L_w ~ lkj_corr_cholesky(2.0);
  for (j in 1:J)
    z_u[j] ~ normal(0,1);
  for (k in 1:K)
    z_w[k] ~ normal(0,1);
  //likelihood: lognormal, i.e. a normal model of log(rt)
  for (i in 1:N)
    rt[i] ~ lognormal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);
}

# Make design matrix

X <- unname(model.matrix(~ 1 + RCType + dist + int, rDat))

attr(X, "assign") <- NULL

# Make Stan data

stanDat <- list(N = nrow(X),

P = ncol(X),

n_u = ncol(X),

n_w = ncol(X),

X = X,

Z_u = X,

Z_w = X,

J = nlevels(rDat$subj),

K = nlevels(rDat$item),

rt = rDat$rt,

subj = as.integer(rDat$subj),

item = as.integer(rDat$item))

library(rstan)

rstan_options(auto_write = TRUE)

options(mc.cores = parallel::detectCores())

# Fit the model

matrixFit <- stan(file = "matrixModel.stan", data = stanDat,

iter = 2000, chains = 4)

This print output is overly verbose. I wrote a simple function to get the essential information quickly.

print(matrixFit)

For example, if I want to see only the posteriors of the four beta parameters, I can write:

stan_results <- function(m, params) {
  m_extr <- extract(m, pars = params)
  means <- lapply(m_extr, mean)
  quantiles <- lapply(m_extr,
                      function(x) quantile(x, probs = c(0.025, 0.975)))
  means <- data.frame(means)
  quants <- data.frame(quantiles)
  summry <- t(rbind(means, quants))
  colnames(summry) <- c("mean", "lower", "upper")
  summry
}

For more details, such as interpreting the results and computing things like Bayes Factors, see Nicenboim and Vasishth 2016.

stan_results(matrixFit, params=c("beta[1]","beta[2]","beta[3]","beta[4]"))

If this upsets you deeply and you want to use a normal distribution (and in fact, for EEG data this makes sense), go right ahead and change the lognormal to normal:

rt[i] ~ lognormal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);

rt[i] ~ normal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);

// Maximal logistic mixed model for trial-level binary responses: crossed
// subject and item random effects with full variance-covariance matrices,
// non-centered (Cholesky) parameterization.
data {
  int<lower=1> N;                //no trials
  int<lower=1> P;                //no fixefs
  int<lower=1> J;                //no subjects
  int<lower=1> n_u;              //no subj ranefs
  int<lower=1> K;                //no items
  int<lower=1> n_w;              //no item ranefs
  int<lower=1,upper=J> subj[N];  //subject indicator
  int<lower=1,upper=K> item[N];  //item indicator
  row_vector[P] X[N];            //fixef design matrix
  row_vector[n_u] Z_u[N];        //subj ranef design matrix
  row_vector[n_w] Z_w[N];        //item ranef design matrix
  int<lower=0,upper=1> response[N]; //binary response
}

parameters {
  vector[P] beta;                //fixef coefs
  cholesky_factor_corr[n_u] L_u; //cholesky factor of subj ranef corr matrix
  cholesky_factor_corr[n_w] L_w; //cholesky factor of item ranef corr matrix
  vector<lower=0>[n_u] sigma_u;  //subj ranef std (positive => half-Cauchy prior)
  vector<lower=0>[n_w] sigma_w;  //item ranef std (positive => half-Cauchy prior)
  vector[n_u] z_u[J];            //spherical subj ranef
  vector[n_w] z_w[K];            //spherical item ranef
}

transformed parameters {
  vector[n_u] u[J];              //subj ranefs
  vector[n_w] w[K];              //item ranefs
  {
    //NOTE: these are Cholesky factors of the ranef covariance matrices
    //(diag(sigma) * L), not the covariance matrices themselves
    matrix[n_u,n_u] Sigma_u;
    matrix[n_w,n_w] Sigma_w;
    Sigma_u = diag_pre_multiply(sigma_u,L_u);
    Sigma_w = diag_pre_multiply(sigma_w,L_w);
    //non-centered parameterization: correlated ranefs from spherical draws
    for(j in 1:J)
      u[j] = Sigma_u * z_u[j];
    for(k in 1:K)
      w[k] = Sigma_w * z_w[k];
  }
}

model {
  //priors
  beta ~ cauchy(0,2.5);
  sigma_u ~ cauchy(0,2.5);  //half-Cauchy via <lower=0> constraint
  sigma_w ~ cauchy(0,2.5);  //half-Cauchy via <lower=0> constraint
  L_u ~ lkj_corr_cholesky(2.0);
  L_w ~ lkj_corr_cholesky(2.0);
  for (j in 1:J)
    z_u[j] ~ normal(0,1);
  for (k in 1:K)
    z_w[k] ~ normal(0,1);
  //likelihood: Bernoulli with logit link on the linear predictor
  for (i in 1:N)
    response[i] ~ bernoulli_logit(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]]);
}

(This article was first published on ** Shravan Vasishth's Slog (Statistics blog)**, and kindly contributed to R-bloggers)

I want to give a quick tutorial on fitting Linear Mixed Models (hierarchical models) with a full variance-covariance matrix for random effects (what Barr et al 2013 call a maximal model) using Stan.

For a longer version of this tutorial, see: Sorensen, Hohenstein, Vasishth, 2016.

**Prerequisites**: You need to have R and preferably RStudio installed; RStudio is optional. You need to have rstan installed. See here. I am also assuming you have fit lmer models like these before:

lmer(log(rt) ~ 1+RCType+dist+int+(1+RCType+dist+int|subj) + (1+RCType+dist+int|item), dat)

If you don’t know what the above code means, first read chapter 4 of my lecture notes.

I assume you have a 2×2 repeated measures design with some continuous measure like reading time (rt) data and want to do a main effects and interaction contrast coding. Let’s say your main effects are RCType and dist, and the interaction is coded as int. All these contrast codings are $\pm 1$. If you don’t know what contrast coding is, see these notes and read section 4.3 (although it’s best to read the whole chapter). I am using an excerpt of an example data-set from Husain et al. 2014.

"subj" "item" "rt" "RCType" "dist" "int"

1 14 438 -1 -1 1

1 16 531 1 -1 -1

1 15 422 1 1 1

1 18 1000 -1 -1 1

...

Assume that these data are stored in R as a data-frame with name rDat.

Copy the following Stan code into a text file and save it as the file matrixModel.stan. For continuous data like reading times or EEG, you never need to touch this file again. You will only ever specify the design matrix X and the structure of the data. The rest is all taken care of.

// Maximal linear mixed model for reading times: crossed subject and item
// random effects with full variance-covariance matrices, non-centered
// (Cholesky) parameterization.
// (Scrape had fused tokens such as "intN;" / "realsigma_e;", which are
// invalid Stan; restored to "int N;" / "real sigma_e;" forms.)
data {
  int<lower=1> N;                //no trials
  int<lower=1> P;                //no fixefs
  int<lower=1> J;                //no subjects
  int<lower=1> n_u;              //no subj ranefs
  int<lower=1> K;                //no items
  int<lower=1> n_w;              //no item ranefs
  int<lower=1,upper=J> subj[N];  //subject indicator
  int<lower=1,upper=K> item[N];  //item indicator
  row_vector[P] X[N];            //fixef design matrix
  row_vector[n_u] Z_u[N];        //subj ranef design matrix
  row_vector[n_w] Z_w[N];        //item ranef design matrix
  vector[N] rt;                  //reading time (positive; modeled as lognormal)
}

parameters {
  vector[P] beta;                //fixef coefs
  cholesky_factor_corr[n_u] L_u; //cholesky factor of subj ranef corr matrix
  cholesky_factor_corr[n_w] L_w; //cholesky factor of item ranef corr matrix
  vector<lower=0>[n_u] sigma_u;  //subj ranef std (must be positive)
  vector<lower=0>[n_w] sigma_w;  //item ranef std (must be positive)
  real<lower=0> sigma_e;         //residual std (must be positive)
  vector[n_u] z_u[J];            //spherical subj ranef
  vector[n_w] z_w[K];            //spherical item ranef
}

transformed parameters {
  vector[n_u] u[J];              //subj ranefs
  vector[n_w] w[K];              //item ranefs
  {
    //NOTE: these are Cholesky factors of the ranef covariance matrices
    //(diag(sigma) * L), not the covariance matrices themselves
    matrix[n_u,n_u] Sigma_u;
    matrix[n_w,n_w] Sigma_w;
    Sigma_u = diag_pre_multiply(sigma_u,L_u);
    Sigma_w = diag_pre_multiply(sigma_w,L_w);
    //non-centered parameterization: correlated ranefs from spherical draws
    for(j in 1:J)
      u[j] = Sigma_u * z_u[j];
    for(k in 1:K)
      w[k] = Sigma_w * z_w[k];
  }
}

model {
  //priors (beta, sigma_u, sigma_w, sigma_e get implicit flat priors
  //over their declared support)
  L_u ~ lkj_corr_cholesky(2.0);
  L_w ~ lkj_corr_cholesky(2.0);
  for (j in 1:J)
    z_u[j] ~ normal(0,1);
  for (k in 1:K)
    z_w[k] ~ normal(0,1);
  //likelihood: lognormal, i.e. a normal model of log(rt)
  for (i in 1:N)
    rt[i] ~ lognormal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);
}

Since we want to test the main effects coded as the columns RCType, dist, and int, our design matrix will look like this:

# Make design matrix
# Intercept plus the three +/-1 contrast columns (RCType, dist, int).
# unname() and dropping the "assign" attribute strip model.matrix metadata
# so the matrix passes cleanly to Stan.
X <- unname(model.matrix(~ 1 + RCType + dist + int, rDat))

attr(X, "assign") <- NULL

Stan expects the data in a list form, not as a data frame (unlike lmer). So we set it up as follows:

# Make Stan data

# Stan takes data as a named list whose names match the model's data block.
# Here the subject and item random effects use the same design as the fixed
# effects (the "maximal" model), hence Z_u = Z_w = X and n_u = n_w = P.
stanDat <- list(N = nrow(X),            # number of trials
P = ncol(X),                            # number of fixed effects
n_u = ncol(X),                          # number of subject ranefs
n_w = ncol(X),                          # number of item ranefs
X = X,                                  # fixed-effects design matrix
Z_u = X,                                # subject ranef design matrix
Z_w = X,                                # item ranef design matrix
J = nlevels(rDat$subj),                 # number of subjects (subj assumed a factor)
K = nlevels(rDat$item),                 # number of items (item assumed a factor)
rt = rDat$rt,                           # reading times
subj = as.integer(rDat$subj),           # subject index 1..J
item = as.integer(rDat$item))           # item index 1..K

library(rstan)

rstan_options(auto_write = TRUE)        # cache the compiled model on disk

options(mc.cores = parallel::detectCores()) # run MCMC chains in parallel

# Fit the model: 4 chains x 2000 iterations (first half is warmup by default)
matrixFit <- stan(file = "matrixModel.stan", data = stanDat,

iter = 2000, chains = 4)

print(matrixFit)

This print output is overly verbose. I wrote a simple function to get the essential information quickly.

# Summarize posterior draws from a stanfit object: posterior mean plus a
# 95% credible interval (2.5% and 97.5% quantiles) per requested parameter.
#
# m      : a stanfit object (as returned by rstan::stan)
# params : character vector of parameter names, e.g. "beta[1]".
#          NOTE(review): the original default referenced a global
#          `paramnames`; kept for backward compatibility, but callers
#          should pass `params` explicitly.
#
# Returns a matrix with one row per parameter and columns mean/lower/upper.
stan_results <- function(m, params = paramnames) {
  # Bug fix: use the `params` argument; the original ignored it and always
  # read the global `paramnames`, so `params=` had no effect.
  m_extr <- extract(m, pars = params)
  means <- lapply(m_extr, mean)
  quantiles <- lapply(m_extr,
                      function(x) quantile(x, probs = c(0.025, 0.975)))
  means <- data.frame(means)
  quants <- data.frame(quantiles)
  summry <- t(rbind(means, quants))
  colnames(summry) <- c("mean", "lower", "upper")
  summry
}

For example, if I want to see only the posteriors of the four beta parameters, I can write:

stan_results(matrixFit, params=c("beta[1]","beta[2]","beta[3]","beta[4]"))

For more details, such as interpreting the results and computing things like Bayes Factors, see Nicenboim and Vasishth 2016.

In the Stan code above, I assume a lognormal function for the reading times:

rt[i] ~ lognormal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);

If this upsets you deeply and you want to use a normal distribution (and in fact, for EEG data this makes sense), go right ahead and change the lognormal to normal:

rt[i] ~ normal(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]], sigma_e);

Use this Stan code instead of the one shown above. Here, I assume that you have a column called response in the data, which has 0,1 values. These are the trial level binary responses.

// Maximal logistic mixed model for trial-level binary responses: crossed
// subject and item random effects with full variance-covariance matrices,
// non-centered (Cholesky) parameterization.
// (Scrape had fused tokens such as "intN;", which are invalid Stan;
// restored to "int N;" forms.)
data {
  int<lower=1> N;                //no trials
  int<lower=1> P;                //no fixefs
  int<lower=1> J;                //no subjects
  int<lower=1> n_u;              //no subj ranefs
  int<lower=1> K;                //no items
  int<lower=1> n_w;              //no item ranefs
  int<lower=1,upper=J> subj[N];  //subject indicator
  int<lower=1,upper=K> item[N];  //item indicator
  row_vector[P] X[N];            //fixef design matrix
  row_vector[n_u] Z_u[N];        //subj ranef design matrix
  row_vector[n_w] Z_w[N];        //item ranef design matrix
  int<lower=0,upper=1> response[N]; //binary response
}

parameters {
  vector[P] beta;                //fixef coefs
  cholesky_factor_corr[n_u] L_u; //cholesky factor of subj ranef corr matrix
  cholesky_factor_corr[n_w] L_w; //cholesky factor of item ranef corr matrix
  vector<lower=0>[n_u] sigma_u;  //subj ranef std (positive => half-Cauchy prior)
  vector<lower=0>[n_w] sigma_w;  //item ranef std (positive => half-Cauchy prior)
  vector[n_u] z_u[J];            //spherical subj ranef
  vector[n_w] z_w[K];            //spherical item ranef
}

transformed parameters {
  vector[n_u] u[J];              //subj ranefs
  vector[n_w] w[K];              //item ranefs
  {
    //NOTE: these are Cholesky factors of the ranef covariance matrices
    //(diag(sigma) * L), not the covariance matrices themselves
    matrix[n_u,n_u] Sigma_u;
    matrix[n_w,n_w] Sigma_w;
    Sigma_u = diag_pre_multiply(sigma_u,L_u);
    Sigma_w = diag_pre_multiply(sigma_w,L_w);
    //non-centered parameterization: correlated ranefs from spherical draws
    for(j in 1:J)
      u[j] = Sigma_u * z_u[j];
    for(k in 1:K)
      w[k] = Sigma_w * z_w[k];
  }
}

model {
  //priors
  beta ~ cauchy(0,2.5);
  sigma_u ~ cauchy(0,2.5);  //half-Cauchy via <lower=0> constraint
  sigma_w ~ cauchy(0,2.5);  //half-Cauchy via <lower=0> constraint
  L_u ~ lkj_corr_cholesky(2.0);
  L_w ~ lkj_corr_cholesky(2.0);
  for (j in 1:J)
    z_u[j] ~ normal(0,1);
  for (k in 1:K)
    z_w[k] ~ normal(0,1);
  //likelihood: Bernoulli with logit link on the linear predictor
  for (i in 1:N)
    response[i] ~ bernoulli_logit(X[i] * beta + Z_u[i] * u[subj[i]] + Z_w[i] * w[item[i]]);
}

See here.

To **leave a comment** for the author, please follow the link and comment on their blog: ** Shravan Vasishth's Slog (Statistics blog)**.

R-bloggers.com offers

In addition to the underlying data, the update removed some of the processing that was happening inside the application, and put it into the pre-processing stage. This processing needs to happen only the once, and is not related to the reactivity of the application. This will improve the speed of the application; in addition to reducing the processing, it also shrinks the size of the data table loaded into the application.

The third set of changes were a consequence of the updates to the Shiny and ggplot2 packages in the two years that have passed since I built the app. In Shiny, there was a deprecation for "format" in the sliderInput widget. And in ggplot2, it was a change in the quotes around the "method" specification in stat_smooth(). A little thing that took a few minutes to debug! Next up will be some formatting changes, and a different approach to one of the visualizations.

-30-]]>

(This article was first published on ** Bayes Ball**, and kindly contributed to R-bloggers)

A short post to alert the world that my modest Shiny application, showing Major League Baseball run scoring trends since 1901, has been updated to include the 2016 season. The application can be found here:

https://monkmanmh.shinyapps.io/MLBrunscoring_shiny/.

In addition to the underlying data, the update removed some of the processing that was happening inside the application, and put it into the pre-processing stage. This processing needs to happen only the once, and is not related to the reactivity of the application. This will improve the speed of the application; in addition to reducing the processing, it also shrinks the size of the data table loaded into the application.

The third set of changes were a consequence of the updates to the Shiny and ggplot2 packages in the two years that have passed since I built the app. In Shiny, there was a deprecation for “format” in the sliderInput widget. And in ggplot2, it was a change in the quotes around the “method” specification in stat_smooth(). A little thing that took a few minutes to debug! Next up will be some formatting changes, and a different approach to one of the visualizations.

-30-

To **leave a comment** for the author, please follow the link and comment on their blog: ** Bayes Ball**.

R-bloggers.com offers

(This article was first published on ** R – Xi'an's Og**, and kindly contributed to R-bloggers)

**A**fter a long lag *(due to my missing the free copies distributed at Paris-Dauphine!)*, here is a Sudoku-like Le Monde mathematical puzzle:

A grid of size (n, n) holds integer values such that any entry larger than 1 is the sum of one term in the same column and one term in the same row. What is the maximal possible value observed in such a grid when n = 3, 4?

**T**his can be solved in R by a random exploration of such possible grids in a simulated annealing spirit:

mat=matrix(1,N,N) goal=1 targ=function(mat){ #check constraints d=0 for (i in (1:(N*N))[mat>1]){ r=(i-1)%%N+1;c=(i-1)%/%N+1 d=d+(min(abs(mat[i]-outer(mat[-r,c],mat[r,-c],"+")))>0)} return(d)} cur=0 for (t in 1:1e6){ i=sample(1:(N*N),1);prop=mat prop[i]=sample(1:(2*goal),1) d=targ(prop) if (10*log(runif(1))/t<cur-d){ mat=prop;cur=d} if ((d==0)&(max(prop)>goal)){ goal=max(prop);maxx=prop}}

returning a value of 8 for n=3 and 37 for n=4. However, the method is quite myopic and I tried instead a random filling of the grid, using each time the maximum possible sum for empty cells:

goal=1 for (v in 1:1e6){ mat=matrix(0,N,N) #one 1 per row/col for (i in 1:N) mat[i,sample(1:N,1)]=1 for (i in 1:N) if (max(mat[,i])==0) mat[sample(1:N,1),i]=1 while (min(mat)==0){ parm=sample(1:(N*N)) #random order for (i in parm[mat[parm]==0]){ r=(i-1)%%N+1;c=(i-1)%/%N+1 if ((max(mat[-r,c])>0)&(max(mat[r,-c])>0)){ mat[i]=max(mat[-r,c])+max(mat[r,-c]) break()}}} if (goal<max(mat)){ goal=max(mat);maxx=mat}}

which recovered a maximum of 8 for n=3, but reached 48 for n=4. And 211 for n=5, 647 for n=6… For instance, here is the solution for n=4:

[1,] 1 5 11 10 [2,] 2 4 1 5 [3,] 48 2 24 1 [4,] 24 1 22 11

While the update in the above is random and associated with the first term in the permutation, it may be preferable to favour the largest possible term at each iteration, which I tried as

while (min(mat)==0){ parm=sample(1:(N*N)) val=0*parm for (i in parm[mat[parm]==0]){ r=(i-1)%%N+1;c=(i-1)%/%N+1 if ((max(mat[-r,c])>0)&(max(mat[r,-c])>0)){ val[i]=max(mat[-r,c])+max(mat[r,-c])} } #largest term i=order(-val)[1];mat[i]=val[i]}

For n=4, I did not recover the maximal value 48, but achieved larger values for n=5 (264) and n=6 (2256).

As an aside, the R code sometimes led to a strange error message linked with the function sample(), which is that too large a bound in the range produces the following

> sample(1:1e10,1) Error: cannot allocate vector of size 74.5 Gb

meaning that 1:1e10 first creates a vector for all the possible values. The alternative

> sample.int(1e10,1) [1] 7572058778

works, however. And only breaks down for 10¹².

Filed under: Kids, R Tagged: Le Monde, mathematical puzzle, R, sample, sudoku

To **leave a comment** for the author, please follow the link and comment on their blog: ** R – Xi'an's Og**.

R-bloggers.com offers

RcppTOML brings TOML to R. TOML is a file format that is most suitable for configurations, as it is meant to be edited by humans but read by computers. It emphasizes strong readability for humans while at the same time supporting strong typing as well as immediate and clear error reports. On small typos you get parse errors, rather than silently corrupted garbage. Much preferable to any and all of XML, JSON or YAML -- though sadly these may be too ubiquitous now. TOML is making good inroads with newer and more flexible projects such as the Hugo static blog compiler, or the Cargo system of Crates (aka "packages") for the Rust language.

## Changes in version 0.1.2 (2017-03-26)

Dates and Datetimes in arrays in the input now preserve their types instead of converting to numeric vectors (#13)

Boolean vectors are also correctly handled (#14)

TableArray types are now stored as lists in a single named list (#15)

The README.md file was expanded with an example and screenshot.

Added file

Added file `init.c` with calls to `R_registerRoutines()` and `R_useDynamicSymbols()`; also use `.registration=TRUE` in `useDynLib` in `NAMESPACE`

Two example files were updated.

Courtesy of CRANberries, there is a diffstat report for this release.

More information is on the RcppTOML page. Issues and bugreports should go to the GitHub issue tracker.

This post by Dirk Eddelbuettel originated on his Thinking inside the box blog. Please report excessive re-aggregation in third-party for-profit settings.

]]>

(This article was first published on ** Thinking inside the box **, and kindly contributed to R-bloggers)

A new release of RcppTOML is now on CRAN. This release fixes a few parsing issues for less frequently-used inputs: vectors of boolean or date(time) types, as well as table array input.

RcppTOML brings TOML to R. TOML is a file format that is most suitable for configurations, as it is meant to be *edited by humans* but read by computers. It emphasizes *strong readability for humans* while at the same time supporting *strong typing* as well as *immediate and clear error reports*. On small typos you get parse errors, rather than silently corrupted garbage. Much preferable to any and all of XML, JSON or YAML — though sadly these may be too ubiquitous now. TOML is making good inroads with newer and more flexible projects such as the Hugo static blog compiler, or the Cargo system of Crates (aka "packages") for the Rust language.

## Changes in version 0.1.2 (2017-03-26)

Dates and Datetimes in arrays in the input now preserve their types instead of converting to numeric vectors (#13)

Boolean vectors are also correctly handled (#14)

TableArray types are now stored as lists in a single named list (#15)

The README.md file was expanded with an example and screenshot.

Added file

Added file `init.c` with calls to `R_registerRoutines()` and `R_useDynamicSymbols()`; also use `.registration=TRUE` in `useDynLib` in `NAMESPACE`

Two example files were updated.

Courtesy of CRANberries, there is a diffstat report for this release.

More information is on the RcppTOML page. Issues and bugreports should go to the GitHub issue tracker.

This post by Dirk Eddelbuettel originated on his Thinking inside the box blog. Please report excessive re-aggregation in third-party for-profit settings.

To **leave a comment** for the author, please follow the link and comment on their blog: ** Thinking inside the box **.

R-bloggers.com offers

(This article was first published on ** The Shape of Code » R**, and kindly contributed to R-bloggers)

The Economics chapter of my Empirical software engineering book has been added to the draft pdf (download here).

This is a slim chapter, it might grow a bit, but I suspect not by a huge amount. Reasons include lots of interesting data being confidential and me not having spent a lot of time on this topic over the years (so my stash of accumulated data is tiny). Also, a significant chunk of the economics data I have is used to discuss issues in the Ecosystems and Projects chapters, perhaps some of this material will migrate back once these chapters are finalized.

You might argue that Economics is more important than Human cognitive characteristics and should have appeared before it (in chapter order). I would argue that hedonism by those involved in producing software is the important factor that pushes (financial) economics into second place (still waiting for data to argue my case in print).

Some of the cognitive characteristics data I have been waiting for arrived, and has been added to this chapter (some still to be added).

As always, if you know of any interesting software engineering data, please tell me.

I am after a front cover. A woodcut of alchemists concocting a potion appeals, perhaps with various software references discretely included, or astronomy related (the obvious candidate has already been used). The related modern stuff I have seen does not appeal. Suggestions welcome.

Ecosystems next.

To **leave a comment** for the author, please follow the link and comment on their blog: ** The Shape of Code » R**.

R-bloggers.com offers

This note asks whether high frequency satellite images do capture the extent to which conflict is ongoing in Yemen and asks in particular, whether there is distinct geographic variation suggesting which areas are most affected by the ongoing conflict.

Satellite images have been used to study urban sprawl and general economic growth and development. The extent to which satellite images can be used to study man-made disasters such as conflicts is not widely explored.

There are lots of other papers that have used various night light data sets to study urbanization, ethnic favoritism, and economic growth (see Henderson et al, 2012 ; Michalopoulos and Papaioannou 2013, Hodler and Raschky, 2014).

In related work Fetzer et al., 2016, I studied the extent to which light emissions in the early 1990s can be used to obtain a measure of the extent of power rationing in Colombia following El-Nino induced droughts. In another project, we use the DMSP night light images to study the evolution of cities over time and how democratization can change the relative distribution of cities Fetzer and Shanghavi, 2015.

Since 2012, the VIIRS

high frequency and high resolution satellite images capturing night lights emissions are available from NASA’s Earth Observation Group. They have now been made available for analysis on Google’s Earth Engine, making them much more accessible to the wider research audience.

Lets have a look at night light Yemen before and after the Saudi Arabian military intervention.

The light scales are identical, indicating that relative to the border with Saudi Arabia, the night light emissions from Yemen have dropped dramatically, especially around the capital city Sana’a. The circular blobs indicated are around the main oil/ gas producing parts of Yemen, where there may be light emissions due to flaring of natural gas.

A minimal average light emissions of 0.5 was imposed

Zooming in to Sana’a, the figures look as follows.

```
library(data.table)
library(foreign)
library(plyr)
library(parallel)
options(stringsAsFactors = FALSE)
# NOTE(review): user-specific absolute path; setwd() in a script makes it
# non-portable -- replace with a relative path or a project root variable.
setwd("/Users/thiemo/Dropbox/Research/Yemen")
# A DATA SET OF 34k populated places (or historically populated places)
YE <- data.table(read.csv(file = "~/Dropbox/Research/Yemen/Yemen-Cities.csv"))
# LIGHTS DATA IS FROM VIIRS Images made available on the Google Earth Engine
LIGHTS <- data.table(read.csv(file = "~/Dropbox/Research/Yemen/lightsall.csv"))
# Parse year and month from the Earth Engine image id (e.g. "201403..."):
# characters 1-4 are the year, 5-6 the month.
LIGHTS[, `:=`(year, as.numeric(substr(system.index, 1, 4)))]
LIGHTS[, `:=`(month, as.numeric(substr(system.index, 5, 6)))]
# Drop columns not needed for the analysis.
LIGHTS[, `:=`(.geo, NULL)]
LIGHTS[, `:=`(UFI, NULL)]
LIGHTS[, `:=`(LONGITUDE, NULL)]
# First-of-month date stamp for each observation.
LIGHTS[, `:=`(date, strptime(paste(year, month, "01", sep = "-"), "%Y-%m-%d"))]
# plyr::join; per the printed message below it matches on `rownum`.
LIGHTS <- join(LIGHTS, YE)
```

## Joining by: rownum

Some simple plots are quite suggestive. The following plots the average light emissions around populated places over time by month. The date of the intervention onset, which coincides with the date of the fall of Sana’a coincides with dramatic drop in light emissions.

Average lights dropped by a almost 2/3, suggesting a stand still in economic activity. Overall light emissions are still visible as indicated in the graphs suggesting that the places do not turn pitch black. The

```
plot(LIGHTS[, mean(list), by = date], type = "l")
```

The Houthi movement has been gaining influence over a longer time period. In particular, since the 2012 the Houthi’s have gained influence spreading from North to the South. The European Council of Foreign Relations has produced maps illustrating the spatial expansion of Houthi control in Yemen.

A central question relates to the strategy of the Saudi military intervention. In particular, whether the intervention is aimed at territories that came under Houthi control since 2012 or whether the intervention is targeted at the Houthi-heartland.

A simple exercise that allows this study is to look at the evolution of lights in the northern Houthi-heartland relative to the populated places in the rest of the country that came under Houthi control since 2012.

A definition of what consists of the Houthi-heartland is subject to contention. But a conservative definition may consist of the four governerates Ammran, Sada’ah, Al Jawf and Hajjah.

```
# Indicator for the four governorates treated as the Houthi heartland
# (ADM1 codes "15","22","21","19" -- per the text: Ammran, Sada'ah,
# Al Jawf, Hajjah).
LIGHTS[, `:=`(HOUTHI, as.numeric(ADM1 %in% c("15", "22", "21", "19")))]
require(ggplot2)
# Mean monthly lights by heartland status; V1 is the computed mean.
ggplot(LIGHTS[, mean(list), by = c("HOUTHI", "date")], aes(date, V1, colour = as.factor(HOUTHI))) +
geom_line() + geom_point() + theme_bw() + theme(legend.position = "bottom")
```

The summary statistics suggest that in absolute terms much larger in the non-Houthi heartland. Though given that the initial level in the Houthi heartland is much lower, suggesting that that part of the country is much less developed. Given that there is a notional minimum light emissions of zero, this truncation of the data is a concern.

One way around this is to dummify the lights measure and look at whether a populated place is lit above a certain threshold.

```
# Dummified lights measure: is the place lit above the 0.25 threshold?
# (Works around the truncation of the lights value at zero.)
LIGHTS[, `:=`(anylit, list > 0.25)]
# Share of lit places by heartland status over time; V1 is the computed mean.
ggplot(LIGHTS[, mean(anylit), by = c("HOUTHI", "date")], aes(date, V1, colour = as.factor(HOUTHI))) +
geom_line() + geom_point() + theme_bw() + theme(legend.position = "bottom")
```

Again it is hard to see whether there is any divergence in trends in this dummified measure, but this naturally is less prone to be affected by the truncation inherent to this type of data.

A regression with location and time fixed effects that measures whether there was a distinct change in nightlights in places in the Houthi-heartland relative to the non-Houthi heartland suggests that there is indeed a marked difference, indicating that the conflict is concentrated in the non-Houthi heartland.

Definint the discrete variable for a difference in difference estimation and loading the lfe package that allows for high dimensional fixed effects:

```
# Variables for the difference-in-differences estimation.
LIGHTS[, `:=`(anylit, list > 0.25)]
# Post-treatment dummy: months after the March 2015 Saudi intervention onset.
LIGHTS[, `:=`(postKSAintervention, as.numeric(date > "2015-03-01"))]
# lfe::felm handles the high-dimensional place and month fixed effects.
library(lfe)
# Factor version of date, needed for the month fixed effects / interactions.
LIGHTS[, `:=`(date, as.factor(date))]
```

Running the actual difference in difference regressions:

```
# Diff-in-diff in levels: post-intervention x heartland interaction, with
# place (rownum) and month (date) fixed effects, clustered by governorate (ADM1).
summary(felm(list ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS))
```

## ## Call: ## felm(formula = list ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS) ## ## Residuals: ## Min 1Q Median 3Q Max ## -74.347 -0.205 0.043 0.194 82.063 ## ## Coefficients: ## Estimate Cluster s.e. t value Pr(>|t|) ## postKSAintervention:HOUTHI 0.4184 0.1900 2.202 0.0277 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 1.758 on 1172455 degrees of freedom ## Multiple R-squared(full model): 0.752 Adjusted R-squared: 0.7447 ## Multiple R-squared(proj model): 0.003315 Adjusted R-squared: -0.02603 ## F-statistic(full model, *iid*): 103 on 34519 and 1172455 DF, p-value: < 2.2e-16 ## F-statistic(proj model): 4.848 on 1 and 22 DF, p-value: 0.03846

```
# Same diff-in-diff on the dummified (lit above threshold) measure, which is
# less affected by the truncation of lights at zero.
summary(felm(anylit ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1,
data = LIGHTS))
```

## ## Call: ## felm(formula = anylit ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.12247 -0.10416 0.00593 0.06185 1.06958 ## ## Coefficients: ## Estimate Cluster s.e. t value Pr(>|t|) ## postKSAintervention:HOUTHI 0.08470 0.02359 3.59 0.00033 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.2223 on 1172455 degrees of freedom ## Multiple R-squared(full model): 0.5762 Adjusted R-squared: 0.5637 ## Multiple R-squared(proj model): 0.008458 Adjusted R-squared: -0.02073 ## F-statistic(full model, *iid*):46.18 on 34519 and 1172455 DF, p-value: < 2.2e-16 ## F-statistic(proj model): 12.89 on 1 and 22 DF, p-value: 0.00163

```
# Same diff-in-diff in logs; zero-light observations give -Inf and are
# dropped by the filter (visible as "observations deleted" in the output).
summary(felm(log(list) ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1,
data = LIGHTS[!is.infinite(log(list))]))
```

## ## Call: ## felm(formula = log(list) ~ postKSAintervention:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS[!is.infinite(log(list))]) ## ## Residuals: ## Min 1Q Median 3Q Max ## -12.8918 -0.3725 0.1060 0.5223 6.5958 ## ## Coefficients: ## Estimate Cluster s.e. t value Pr(>|t|) ## postKSAintervention:HOUTHI 0.4133 0.1234 3.35 0.000809 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.8958 on 844476 degrees of freedom ## (327294 observations deleted due to missingness) ## Multiple R-squared(full model): 0.6534 Adjusted R-squared: 0.6393 ## Multiple R-squared(proj model): 0.01248 Adjusted R-squared: -0.02789 ## F-statistic(full model, *iid*):46.12 on 34519 and 844476 DF, p-value: < 2.2e-16 ## F-statistic(proj model): 11.22 on 1 and 22 DF, p-value: 0.002899

An alternative way to study this is by doing a flexible non-parametric estimation to rule out diverging trends prior to the military intervention.

```
summary(felm(anylit ~ date:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS))
```

## ## Call: ## felm(formula = anylit ~ date:HOUTHI | rownum + date | 0 | ADM1, data = LIGHTS) ## ## Residuals: ## Min 1Q Median 3Q Max ## -1.12574 -0.10765 0.00313 0.06437 1.06515 ## ## Coefficients: ## Estimate Cluster s.e. t value Pr(>|t|) ## date2014-01-01:HOUTHI NA 0.00000 NA NA ## date2014-02-01:HOUTHI 0.01095 0.01320 0.830 0.406641 ## date2014-03-01:HOUTHI 0.03173 0.02764 1.148 0.250884 ## date2014-04-01:HOUTHI 0.11048 0.06028 1.833 0.066814 . ## date2014-05-01:HOUTHI 0.09762 0.05271 1.852 0.063989 . ## date2014-06-01:HOUTHI 0.10249 0.05861 1.749 0.080336 . ## date2014-07-01:HOUTHI 0.07204 0.06053 1.190 0.233987 ## date2014-08-01:HOUTHI 0.06338 0.04866 1.302 0.192778 ## date2014-09-01:HOUTHI 0.03816 0.04690 0.814 0.415860 ## date2014-10-01:HOUTHI 0.04247 0.04359 0.974 0.329930 ## date2014-11-01:HOUTHI 0.05621 0.03646 1.542 0.123115 ## date2014-12-01:HOUTHI 0.02213 0.03037 0.729 0.466205 ## date2015-01-01:HOUTHI -0.02596 0.02585 -1.004 0.315415 ## date2015-02-01:HOUTHI 0.02250 0.05141 0.438 0.661649 ## date2015-03-01:HOUTHI 0.06080 0.05740 1.059 0.289437 ## date2015-04-01:HOUTHI 0.13514 0.04806 2.812 0.004925 ** ## date2015-05-01:HOUTHI 0.15874 0.04647 3.416 0.000635 *** ## date2015-06-01:HOUTHI 0.15493 0.05151 3.008 0.002632 ** ## date2015-07-01:HOUTHI 0.12681 0.04697 2.700 0.006944 ** ## date2015-08-01:HOUTHI 0.12363 0.04319 2.863 0.004202 ** ## date2015-09-01:HOUTHI 0.13972 0.05276 2.648 0.008088 ** ## date2015-10-01:HOUTHI 0.13422 0.04697 2.857 0.004273 ** ## date2015-11-01:HOUTHI 0.12408 0.04566 2.717 0.006578 ** ## date2015-12-01:HOUTHI 0.12125 0.04505 2.691 0.007119 ** ## date2016-01-01:HOUTHI 0.11971 0.03905 3.065 0.002176 ** ## date2016-02-01:HOUTHI 0.11952 0.04151 2.879 0.003984 ** ## date2016-03-01:HOUTHI 0.12721 0.04239 3.001 0.002693 ** ## date2016-04-01:HOUTHI 0.12537 0.04532 2.766 0.005669 ** ## date2016-05-01:HOUTHI 0.12989 0.05297 2.452 0.014209 * ## date2016-06-01:HOUTHI 0.13070 0.05936 2.202 0.027675 * ## date2016-07-01:HOUTHI 0.14831 
0.06597 2.248 0.024573 * ## date2016-08-01:HOUTHI 0.13047 0.04614 2.827 0.004693 ** ## date2016-09-01:HOUTHI 0.14481 0.06024 2.404 0.016227 * ## date2016-10-01:HOUTHI 0.11782 0.05255 2.242 0.024959 * ## date2016-11-01:HOUTHI 0.12175 0.04473 2.722 0.006486 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.2219 on 1172422 degrees of freedom ## Multiple R-squared(full model): 0.5776 Adjusted R-squared: 0.5652 ## Multiple R-squared(proj model): 0.01175 Adjusted R-squared: -0.01738 ## F-statistic(full model, *iid*): 46.4 on 34552 and 1172422 DF, p-value: < 2.2e-16 ## F-statistic(proj model): 147.2 on 35 and 22 DF, p-value: < 2.2e-16

This suggests that the differential drop in lights occurred only after March 2015, the month in which Saudi Arabia’s military intervention commenced.

On average, the regressions suggest that the drop in lights was significantly more pronounced outside the Houthi heartland. This suggests that the conflict and the bombing carried out by Saudi Arabia is mostly concentrated outside the Houthi rebel heartland.

That the dramatic drops in light emissions are associated with the Saudi military intervention is quite clear. The conflict between the Houthi rebels and the government had been ongoing for several years, but only starting with the intervention of Saudi Arabia do marked differences between Houthi and non-Houthi heartland provinces appear.

This analysis can be further refined by studying the religious makeup of different provinces, as the divide between Shia and Sunni Muslim groups is said to be an important factor driving this conflict.

Nevertheless, this analysis suggests that high-frequency satellite images such as these can be useful in assessing the extent to which areas are directly affected by conflict, which may be useful for targeting humanitarian relief.

(This article was first published on ** R and Finance**, and kindly contributed to R-bloggers)

Processing and modelling financial data with R –

My Portuguese book about finance and R was

published

a couple of months ago and, given its positive feedback, I decided to

work on the english version immediately. You can find details about my

experience in self publishing the book in this

post.

The English book is not a simple translation of text and examples. This

is a long term project that I always dreamed of doing. With time, I plan

to keep the Portuguese and English version synchronized. The feedback I

got from the Portuguese version was taken into account and I wrote

additional sections covering advanced use of `dplyr`

with list-columns,

storing data in SQLITE, reporting tables with `xtable`

and `stargazer`

,

and many more.

The book is not yet finished. I’m taking my time in reviewing everything

and making sure that it comes out perfectly. I believe it will be ready

in a few months or less. If you are interested in the book, please go to

its website where you can

find its current TOC (table of contents), code and data.

If you want to be notified about the publication of the book, please

sign this form and I’ll let

you know as soon as it is available.

`pmfdR`

Yesterday I released package `pmfdR`

, which provides access to all

material from my book **Processing and Modelling Financial Data with
R**, including code, data and exercises.

The exercises are still not complete. I expect to have at least 100

exercises covering all chapters of the book. As soon as the book is

finished, I’ll start working on it.

With package `pmfdR`

you can:

- Download data and code with function
`pmfdR_download.code.and.data`

- Build exercises with function
`pmfdR_build.exercise`

All the R code from the book is publicly available in

github. Function

`pmfdR_download.code.and.data`

will download a zip file from the

repository and unzip it into the specified folder. Have a look at its usage:

```
# Install pmfdR on first use, then attach it.
if (!require(pmfdR)) {
  install.packages('pmfdR')
  library(pmfdR)
}

my.lan <- 'en' # language of code and data ('en' or 'pt-br')

# Download and unzip the book's code/data repository from GitHub
# (the download may take some time -- around 60 MB).
pmfdR_download.code.and.data(lan = my.lan)

dir.out <- 'pmfdR-en-code_data-master'

# List the R scripts in the unzipped folder.
# FIX: list.files() takes a regular expression, not a shell glob;
# '*.R' only matched by accident -- '\\.R$' anchors on the file extension.
list.files(dir.out, pattern = '\\.R$')

# List the bundled data files (file.path joins path components portably).
list.files(file.path(dir.out, 'data'))
```

All exercises from the book are based on package `exams`

. This means

that every reader will have a different version of the exercise, with

different values and correct answers. I’ve written extensively about the

positive aspects of using `exams`

. You can find the full post

here

You can create your custom exercise file using function

`pmfdR_build.exercise`

. Give it a try, just copy and paste the following

chunk of code in your R prompt.

```
# Attach pmfdR, installing it first if it is not yet available.
if (!require(pmfdR)) {
  install.packages('pmfdR')
  library(pmfdR)
}

exercise_lang <- 'en'           # language of exercises
src_folder <- 'pmfdR-exercises' # folder with exercise files (fetched from GitHub)
out_folder <- 'PdfOut'          # destination for the pdf and its answer sheet

# Build a randomized exercise pdf, then show the generated files.
pmfdR_build.exercise(lan = exercise_lang,
                     exercise.folder = src_folder,
                     pdf.folder = out_folder)
list.files(out_folder)
```

To **leave a comment** for the author, please follow the link and comment on their blog: ** R and Finance**.

R-bloggers.com offers

(This article was first published on ** R – Giga thoughts …**, and kindly contributed to R-bloggers)

The 2nd edition of both my books

a) Cricket analytics with cricketr

b) Beaten by sheer pace – Cricket analytics with yorkr

is now available on Amazon, both as Paperback and Kindle versions.

Pick up your copies today!!!

A) **Cricket analytics with cricketr: Second Edition**

B) **Beaten by sheer pace: Cricket analytics with yorkr(2nd edition)**

Pick up your copies today!!!

To **leave a comment** for the author, please follow the link and comment on their blog: ** R – Giga thoughts …**.

R-bloggers.com offers