Double y-axis plots with ggplot2 and purrr
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
If this post is useful to you I kindly ask a minimal donation on Buy Me a Coffee. It shall be used to continue my Open Source efforts. The full explanation is here: A Personal Message from an Open Source Contributor.
You can send me questions for the blog using this form and subscribe to receive an email when there is a new post.
I did not expect the Spurious Correlations post to be so popular, but it seems that many people found it useful and it became the top weekly post on R-Bloggers. Because of that, I will detail a bit more how to create double y-axis plots with ggplot2, using the spuriouscorrelations package.
All about the Spurious Correlations package is:
Number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in
Let’s start by loading the package:
# install.packages("spuriouscorrelations", repos = "https://cran.rstudio.com") library(spuriouscorrelations)
Let’s start with the data for number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in:
library(dplyr) nic_cage <- filter(spurious_correlations, var2_short == "Nicholas Cage") glimpse(nic_cage)
Rows: 11 Columns: 10 $ year <int> 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,… $ var1 <fct> "Number of people who drowned by falling into a pool", "Num… $ var2 <fct> "Films Nicolas Cage appeared in", "Films Nicolas Cage appea… $ var1_short <fct> Falling into a pool drownings, Falling into a pool drowning… $ var2_short <fct> Nicholas Cage, Nicholas Cage, Nicholas Cage, Nicholas Cage,… $ var1_unit <fct> "drownings", "drownings", "drownings", "drownings", "drowni… $ var2_unit <fct> "films", "films", "films", "films", "films", "films", "film… $ var1_value <dbl> 109, 102, 102, 98, 85, 95, 96, 98, 123, 94, 102 $ var2_value <dbl> 2, 2, 2, 3, 1, 1, 2, 3, 4, 1, 4 $ source <fct> Centers for Disease Control & Prevention and Internet Movie…
Now let’s define a function that will:
- Compute the standard deviations of two series
and
.
- Compute the scaling factor
.
- Compute the offset
.
- Return the vector
.
# Align the two series visually (works for any two series v1 and v2) fun_adjust <- function(v1, v2) { s1 <- sd(v1, na.rm = TRUE) s2 <- sd(v2, na.rm = TRUE) a <- ifelse(s2 == 0, 1, s1 / s2) b <- mean(v1, na.rm = TRUE) - a * mean(v2, na.rm = TRUE) c(a = as.numeric(a), b = as.numeric(b)) } v1 <- nic_cage$var1_value v2 <- nic_cage$var2_value adjust <- fun_adjust(v1, v2) scale_a <- adjust["a"] scale_b <- adjust["b"]
With this function in mind, we need to reshape the data to a long format, so we can plot both series with ggplot2 and we can apply the transformation to the second series for plotting.
# install.packages("tintin", repos = "https://cran.rstudio.com") library(tidyr) library(tintin) y1_title <- as.character(unique(nic_cage$var1)) y2_title <- as.character(unique(nic_cage$var2)) nic_cage_long <- nic_cage %>% select(year, var1_value, var2_value) %>% pivot_longer( cols = c(var1_value, var2_value), names_to = "variable", values_to = "value" ) %>% mutate( # apply transform to var2 for plotting: plot_value = a * var2 + b plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value), # add proper labels for legend variable_label = case_when( variable == "var1_value" ~ y1_title, variable == "var2_value" ~ y2_title ) )
Now we can compute the correlation value and make the double y-axis plot:
library(ggplot2) cor_val <- cor(nic_cage$var1_value, nic_cage$var2_value) # make a double y axis plot with year on the x axis ggplot(nic_cage_long, aes(x = year)) + geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) + geom_point(aes(y = plot_value, color = variable_label), size = 3) + labs( x = "Year", y = y1_title, title = sprintf("%s\nvs\n%s\n", y1_title, y2_title), subtitle = sprintf("Correlation: %.2f", cor_val), color = "" ) + # display all years on the x axis scale_x_continuous(breaks = nic_cage$year) + # primary y axis is the var1 scale # secondary shows var2 original scale by inverse-transforming scale_y_continuous( sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title) ) + theme_minimal(base_size = 13) + theme(legend.position = "top") + # use tintin color palette scale_colour_manual( values = tintin_pal(option = "the black island")(2), name = "" ) + # center title and subtitle theme( plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), plot.subtitle = element_text(hjust = 0.5) )
Per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US
Reusing the same code as above, we can create a similar plot for the per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US:
engineering_doctorates <- filter(spurious_correlations, var2_short == "Engineering doctorates") cor_val <- cor(engineering_doctorates$var1_value, engineering_doctorates$var2_value) v1 <- engineering_doctorates$var1_value v2 <- engineering_doctorates$var2_value adjust <- fun_adjust(v1, v2) scale_a <- adjust["a"] scale_b <- adjust["b"] y1_title <- as.character(unique(engineering_doctorates$var1)) y2_title <- as.character(unique(engineering_doctorates$var2)) engineering_doctorates_long <- engineering_doctorates %>% select(year, var1_value, var2_value) %>% pivot_longer( cols = c(var1_value, var2_value), names_to = "variable", values_to = "value" ) %>% mutate( variable_label = case_when( variable == "var1_value" ~ y1_title, variable == "var2_value" ~ y2_title ), # apply transform to var2 for plotting: plot_value = a * var2 + b plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value) ) # make a double y axis plot with year on the x axis ggplot(engineering_doctorates_long, aes(x = year)) + geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) + geom_point(aes(y = plot_value, color = variable_label), size = 3) + labs( x = "Year", y = y1_title, title = sprintf("%s\nvs\n%s\n", y1_title, y2_title), subtitle = sprintf("Correlation: %.2f", cor_val), color = "" ) + # display all years on the x axis scale_x_continuous(breaks = nic_cage$year) + # primary y axis is the var1 scale # secondary shows var2 original scale by inverse-transforming scale_y_continuous( sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title) ) + theme_minimal(base_size = 13) + theme(legend.position = "top") + # use tintin color palette scale_colour_manual( values = tintin_pal(option = "the black island")(2), name = "" ) + # center title and subtitle theme( plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), plot.subtitle = element_text(hjust = 0.5) )
All the correlations in the package
Instead of repeating the same code for each correlation, we can create a function that does the plotting for two series and then use
purrr::walk()
to iterate over all the correlations in the package.
It is important to include a print(p)
statement inside the function to ensure that each plot is displayed when using walk()
.
Happy plotting!
library(purrr) # All variable 2 in the dataset var2_all <- unique(spurious_correlations$var2_short) # Function to plot a single correlation plot_correlation <- function(var2_name) { data <- filter(spurious_correlations, var2_short == var2_name) v1 <- data$var1_value v2 <- data$var2_value adjust <- fun_adjust(v1, v2) scale_a <- adjust["a"] scale_b <- adjust["b"] cor_val <- cor(v1, v2) y1_title <- as.character(unique(data$var1)) y2_title <- as.character(unique(data$var2)) data_long <- data %>% select(year, var1_value, var2_value) %>% pivot_longer( cols = c(var1_value, var2_value), names_to = "variable", values_to = "value" ) %>% mutate( variable_label = case_when( variable == "var1_value" ~ y1_title, variable == "var2_value" ~ y2_title ), plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value) ) p <- ggplot(data_long, aes(x = year)) + geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) + geom_point(aes(y = plot_value, color = variable_label), size = 3) + labs( x = "Year", y = y1_title, title = sprintf("%s\nvs\n%s\n", y1_title, y2_title), subtitle = sprintf("Correlation: %.2f", cor_val), color = "" ) + scale_x_continuous(breaks = data$year) + scale_y_continuous( sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title) ) + theme_minimal(base_size = 13) + theme(legend.position = "top") + scale_colour_manual( values = tintin_pal(option = "the black island")(2), name = "" ) + theme( plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), plot.subtitle = element_text(hjust = 0.5) ) print(p) } walk(var2_all, plot_correlation)
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.