Double y-axis plots with ggplot2 and purrr

[This article was first published on pacha.dev/blog, and kindly contributed to R-bloggers]. (You can report issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

If this post is useful to you I kindly ask a minimal donation on Buy Me a Coffee. It shall be used to continue my Open Source efforts. The full explanation is here: A Personal Message from an Open Source Contributor.

You can send me questions for the blog using this form and subscribe to receive an email when there is a new post.

I did not expect the Spurious Correlations post to be so popular, but it seems that many people found it useful and it became the top weekly post on R-Bloggers. Because of that, I will detail a bit more how to create double y-axis plots with ggplot2, using the spuriouscorrelations package.

All about the Spurious Correlations package is:

Correlation is not Causation

Number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in

Let’s start by loading the package:

# install.packages("spuriouscorrelations", repos = "https://cran.rstudio.com")
library(spuriouscorrelations)

Let’s start with the data for number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in:

library(dplyr)

nic_cage <- filter(spurious_correlations, var2_short == "Nicholas Cage")

glimpse(nic_cage)
Rows: 11
Columns: 10
$ year       <int> 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,…
$ var1       <fct> "Number of people who drowned by falling into a pool", "Num…
$ var2       <fct> "Films Nicolas Cage appeared in", "Films Nicolas Cage appea…
$ var1_short <fct> Falling into a pool drownings, Falling into a pool drowning…
$ var2_short <fct> Nicholas Cage, Nicholas Cage, Nicholas Cage, Nicholas Cage,…
$ var1_unit  <fct> "drownings", "drownings", "drownings", "drownings", "drowni…
$ var2_unit  <fct> "films", "films", "films", "films", "films", "films", "film…
$ var1_value <dbl> 109, 102, 102, 98, 85, 95, 96, 98, 123, 94, 102
$ var2_value <dbl> 2, 2, 2, 3, 1, 1, 2, 3, 4, 1, 4
$ source     <fct> Centers for Disease Control & Prevention and Internet Movie…

Now let’s define a function that will:

  1. Compute the standard deviations of two series and .
  2. Compute the scaling factor .
  3. Compute the offset .
  4. Return the vector .
# Align the two series visually (works for any two series v1 and v2)
fun_adjust <- function(v1, v2) {
  s1 <- sd(v1, na.rm = TRUE)
  s2 <- sd(v2, na.rm = TRUE)
  a <- ifelse(s2 == 0, 1, s1 / s2)
  b <- mean(v1, na.rm = TRUE) - a * mean(v2, na.rm = TRUE)
  c(a = as.numeric(a), b = as.numeric(b))
}

v1 <- nic_cage$var1_value
v2 <- nic_cage$var2_value

adjust <- fun_adjust(v1, v2)
scale_a <- adjust["a"]
scale_b <- adjust["b"]

With this function in mind, we need to reshape the data to a long format, so we can plot both series with ggplot2 and we can apply the transformation to the second series for plotting.

# install.packages("tintin", repos = "https://cran.rstudio.com")
library(tidyr)
library(tintin)

y1_title <- as.character(unique(nic_cage$var1))
y2_title <- as.character(unique(nic_cage$var2))

nic_cage_long <- nic_cage %>%
  select(year, var1_value, var2_value) %>%
  pivot_longer(
    cols = c(var1_value, var2_value),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    # apply transform to var2 for plotting: plot_value = a * var2 + b
    plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value),

    # add proper labels for legend
    variable_label = case_when(
      variable == "var1_value" ~ y1_title,
      variable == "var2_value" ~ y2_title
    )
  )

Now we can compute the correlation value and make the double y-axis plot:

library(ggplot2)

cor_val <- cor(nic_cage$var1_value, nic_cage$var2_value)

# make a double y axis plot with year on the x axis
ggplot(nic_cage_long, aes(x = year)) +
  geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
  geom_point(aes(y = plot_value, color = variable_label), size = 3) +
  labs(
    x = "Year",
    y = y1_title,
    title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
    subtitle = sprintf("Correlation: %.2f", cor_val),
    color = ""
  ) +
  # display all years on the x axis
  scale_x_continuous(breaks = nic_cage$year) +
  # primary y axis is the var1 scale
  # secondary shows var2 original scale by inverse-transforming
  scale_y_continuous(
    sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top") +
  # use tintin color palette
  scale_colour_manual(
    values = tintin_pal(option = "the black island")(2), 
    name = ""
  ) +
  # center title and subtitle
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )

Per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US

Reusing the same code as above, we can create a similar plot for the per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US:

engineering_doctorates <- filter(spurious_correlations, var2_short == "Engineering doctorates")

cor_val <- cor(engineering_doctorates$var1_value, engineering_doctorates$var2_value)

v1 <- engineering_doctorates$var1_value
v2 <- engineering_doctorates$var2_value

adjust <- fun_adjust(v1, v2)
scale_a <- adjust["a"]
scale_b <- adjust["b"]

y1_title <- as.character(unique(engineering_doctorates$var1))
y2_title <- as.character(unique(engineering_doctorates$var2))

engineering_doctorates_long <- engineering_doctorates %>%
  select(year, var1_value, var2_value) %>%
  pivot_longer(
    cols = c(var1_value, var2_value),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    variable_label = case_when(
      variable == "var1_value" ~ y1_title,
      variable == "var2_value" ~ y2_title
    ),
  # apply transform to var2 for plotting: plot_value = a * var2 + b
  plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value)
  )

# make a double y axis plot with year on the x axis
ggplot(engineering_doctorates_long, aes(x = year)) +
  geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
  geom_point(aes(y = plot_value, color = variable_label), size = 3) +
  labs(
    x = "Year",
    y = y1_title,
    title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
    subtitle = sprintf("Correlation: %.2f", cor_val),
    color = ""
  ) +
  # display all years on the x axis
  scale_x_continuous(breaks = nic_cage$year) +
  # primary y axis is the var1 scale
  # secondary shows var2 original scale by inverse-transforming
  scale_y_continuous(
    sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top") +
  # use tintin color palette
  scale_colour_manual(
    values = tintin_pal(option = "the black island")(2), 
    name = ""
  ) +
  # center title and subtitle
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )

All the correlations in the package

Instead of repeating the same code for each correlation, we can create a function that does the plotting for two series and then use purrr::walk() to iterate over all the correlations in the package.

It is important to include a print(p) statement inside the function to ensure that each plot is displayed when using walk().

Happy plotting!

library(purrr)

# All variable 2 in the dataset
var2_all <- unique(spurious_correlations$var2_short)

# Function to plot a single correlation
plot_correlation <- function(var2_name) {
  data <- filter(spurious_correlations, var2_short == var2_name)
  
  v1 <- data$var1_value
  v2 <- data$var2_value
  
  adjust <- fun_adjust(v1, v2)
  scale_a <- adjust["a"]
  scale_b <- adjust["b"]
  
  cor_val <- cor(v1, v2)
  
  y1_title <- as.character(unique(data$var1))
  y2_title <- as.character(unique(data$var2))
  
  data_long <- data %>%
    select(year, var1_value, var2_value) %>%
    pivot_longer(
      cols = c(var1_value, var2_value),
      names_to = "variable",
      values_to = "value"
    ) %>%
    mutate(
      variable_label = case_when(
        variable == "var1_value" ~ y1_title,
        variable == "var2_value" ~ y2_title
      ),
      plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value)
    )
  
  p <- ggplot(data_long, aes(x = year)) +
    geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
    geom_point(aes(y = plot_value, color = variable_label), size = 3) +
    labs(
      x = "Year",
      y = y1_title,
      title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
      subtitle = sprintf("Correlation: %.2f", cor_val),
      color = ""
    ) +
    scale_x_continuous(breaks = data$year) +
    scale_y_continuous(
      sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
    ) +
    theme_minimal(base_size = 13) +
    theme(legend.position = "top") +
    scale_colour_manual(
      values = tintin_pal(option = "the black island")(2), 
      name = ""
    ) +
    theme(
      plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
      plot.subtitle = element_text(hjust = 0.5)
    )
  
  print(p)
}

walk(var2_all, plot_correlation)

To leave a comment for the author, please follow the link and comment on their blog: pacha.dev/blog.

R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.

Never miss an update!
Subscribe to R-bloggers to receive
e-mails with the latest R posts.
(You will not see this message again.)

Click here to close (This popup will not appear again)