Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
If this post is useful to you I kindly ask a minimal donation on Buy Me a Coffee. It shall be used to continue my Open Source efforts. The full explanation is here: A Personal Message from an Open Source Contributor.
You can send me questions for the blog using this form and subscribe to receive an email when there is a new post.
I did not expect the Spurious Correlations post to be so popular, but it seems that many people found it useful and it became the top weekly post on R-Bloggers. Because of that, I will detail a bit more how to create double y-axis plots with ggplot2, using the spuriouscorrelations package.
All about the Spurious Correlations package is:
Number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in
Let’s start by loading the package:
# install.packages("spuriouscorrelations", repos = "https://cran.rstudio.com")
library(spuriouscorrelations)
Let’s start with the data for number of people who drowned by falling into a pool and the number of films Nicolas Cage appeared in:
library(dplyr) nic_cage <- filter(spurious_correlations, var2_short == "Nicholas Cage") glimpse(nic_cage)
Rows: 11 Columns: 10 $ year <int> 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,… $ var1 <fct> "Number of people who drowned by falling into a pool", "Num… $ var2 <fct> "Films Nicolas Cage appeared in", "Films Nicolas Cage appea… $ var1_short <fct> Falling into a pool drownings, Falling into a pool drowning… $ var2_short <fct> Nicholas Cage, Nicholas Cage, Nicholas Cage, Nicholas Cage,… $ var1_unit <fct> "drownings", "drownings", "drownings", "drownings", "drowni… $ var2_unit <fct> "films", "films", "films", "films", "films", "films", "film… $ var1_value <dbl> 109, 102, 102, 98, 85, 95, 96, 98, 123, 94, 102 $ var2_value <dbl> 2, 2, 2, 3, 1, 1, 2, 3, 4, 1, 4 $ source <fct> Centers for Disease Control & Prevention and Internet Movie…
Now let’s define a function that will:
- Compute the standard deviations of two series
and . - Compute the scaling factor
. - Compute the offset
. - Return the vector
.
# Align the two series visually (works for any two series v1 and v2)
fun_adjust <- function(v1, v2) {
s1 <- sd(v1, na.rm = TRUE)
s2 <- sd(v2, na.rm = TRUE)
a <- ifelse(s2 == 0, 1, s1 / s2)
b <- mean(v1, na.rm = TRUE) - a * mean(v2, na.rm = TRUE)
c(a = as.numeric(a), b = as.numeric(b))
}
v1 <- nic_cage$var1_value
v2 <- nic_cage$var2_value
adjust <- fun_adjust(v1, v2)
scale_a <- adjust["a"]
scale_b <- adjust["b"]
With this function in mind, we need to reshape the data to a long format, so we can plot both series with ggplot2 and we can apply the transformation to the second series for plotting.
# install.packages("tintin", repos = "https://cran.rstudio.com")
library(tidyr)
library(tintin)
y1_title <- as.character(unique(nic_cage$var1))
y2_title <- as.character(unique(nic_cage$var2))
nic_cage_long <- nic_cage %>%
select(year, var1_value, var2_value) %>%
pivot_longer(
cols = c(var1_value, var2_value),
names_to = "variable",
values_to = "value"
) %>%
mutate(
# apply transform to var2 for plotting: plot_value = a * var2 + b
plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value),
# add proper labels for legend
variable_label = case_when(
variable == "var1_value" ~ y1_title,
variable == "var2_value" ~ y2_title
)
)
Now we can compute the correlation value and make the double y-axis plot:
library(ggplot2)
cor_val <- cor(nic_cage$var1_value, nic_cage$var2_value)
# make a double y axis plot with year on the x axis
ggplot(nic_cage_long, aes(x = year)) +
geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
geom_point(aes(y = plot_value, color = variable_label), size = 3) +
labs(
x = "Year",
y = y1_title,
title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
subtitle = sprintf("Correlation: %.2f", cor_val),
color = ""
) +
# display all years on the x axis
scale_x_continuous(breaks = nic_cage$year) +
# primary y axis is the var1 scale
# secondary shows var2 original scale by inverse-transforming
scale_y_continuous(
sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
) +
theme_minimal(base_size = 13) +
theme(legend.position = "top") +
# use tintin color palette
scale_colour_manual(
values = tintin_pal(option = "the black island")(2),
name = ""
) +
# center title and subtitle
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)
Per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US
Reusing the same code as above, we can create a similar plot for the per capita consumption of mozzarella cheese and civil engineering doctorates awarded in the US:
engineering_doctorates <- filter(spurious_correlations, var2_short == "Engineering doctorates")
cor_val <- cor(engineering_doctorates$var1_value, engineering_doctorates$var2_value)
v1 <- engineering_doctorates$var1_value
v2 <- engineering_doctorates$var2_value
adjust <- fun_adjust(v1, v2)
scale_a <- adjust["a"]
scale_b <- adjust["b"]
y1_title <- as.character(unique(engineering_doctorates$var1))
y2_title <- as.character(unique(engineering_doctorates$var2))
engineering_doctorates_long <- engineering_doctorates %>%
select(year, var1_value, var2_value) %>%
pivot_longer(
cols = c(var1_value, var2_value),
names_to = "variable",
values_to = "value"
) %>%
mutate(
variable_label = case_when(
variable == "var1_value" ~ y1_title,
variable == "var2_value" ~ y2_title
),
# apply transform to var2 for plotting: plot_value = a * var2 + b
plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value)
)
# make a double y axis plot with year on the x axis
ggplot(engineering_doctorates_long, aes(x = year)) +
geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
geom_point(aes(y = plot_value, color = variable_label), size = 3) +
labs(
x = "Year",
y = y1_title,
title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
subtitle = sprintf("Correlation: %.2f", cor_val),
color = ""
) +
# display all years on the x axis
scale_x_continuous(breaks = nic_cage$year) +
# primary y axis is the var1 scale
# secondary shows var2 original scale by inverse-transforming
scale_y_continuous(
sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
) +
theme_minimal(base_size = 13) +
theme(legend.position = "top") +
# use tintin color palette
scale_colour_manual(
values = tintin_pal(option = "the black island")(2),
name = ""
) +
# center title and subtitle
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)
All the correlations in the package
Instead of repeating the same code for each correlation, we can create a function purrr::walk() to iterate over all the correlations in the package.
It is important to include a print(p) statement inside the function to ensure that each plot is displayed when using walk().
Happy plotting!
library(purrr)
# All variable 2 in the dataset
var2_all <- unique(spurious_correlations$var2_short)
# Function to plot a single correlation
plot_correlation <- function(var2_name) {
data <- filter(spurious_correlations, var2_short == var2_name)
v1 <- data$var1_value
v2 <- data$var2_value
adjust <- fun_adjust(v1, v2)
scale_a <- adjust["a"]
scale_b <- adjust["b"]
cor_val <- cor(v1, v2)
y1_title <- as.character(unique(data$var1))
y2_title <- as.character(unique(data$var2))
data_long <- data %>%
select(year, var1_value, var2_value) %>%
pivot_longer(
cols = c(var1_value, var2_value),
names_to = "variable",
values_to = "value"
) %>%
mutate(
variable_label = case_when(
variable == "var1_value" ~ y1_title,
variable == "var2_value" ~ y2_title
),
plot_value = ifelse(variable == "var2_value", value * scale_a + scale_b, value)
)
p <- ggplot(data_long, aes(x = year)) +
geom_line(aes(y = plot_value, color = variable_label, group = variable_label), linewidth = 1.5) +
geom_point(aes(y = plot_value, color = variable_label), size = 3) +
labs(
x = "Year",
y = y1_title,
title = sprintf("%s\nvs\n%s\n", y1_title, y2_title),
subtitle = sprintf("Correlation: %.2f", cor_val),
color = ""
) +
scale_x_continuous(breaks = data$year) +
scale_y_continuous(
sec.axis = sec_axis(~ (. - scale_b) / scale_a, name = y2_title)
) +
theme_minimal(base_size = 13) +
theme(legend.position = "top") +
scale_colour_manual(
values = tintin_pal(option = "the black island")(2),
name = ""
) +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)
print(p)
}
walk(var2_all, plot_correlation)
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
