Trends in Baby Names: Professional Data Visualization in R
In this post, we combine data-wrangling and visualization skills to create a professional-looking data visualization using the ggplot2
package in R. We will be working with the babynames
data set, which is available through the babynames
package in R.
The solution file is accessible here.
Visualization 1
First, we visualize the trend for the name Jessie alone, as it has been the most gender-neutral name over the years.
library(babynames)
library(dplyr)
library(tidyr)
library(ggplot2)
data(babynames)
# data wrangling: yearly percentage of babies named Jessie by sex, long format
# (columns: year, sex = "prop_F" / "prop_M", prop)
prop_Jessie <- babynames %>%
  select(-prop) %>%
  # keep only the name Jessie from 1930 to 2012
  filter(name == "Jessie", year >= 1930, year <= 2012) %>%
  # one count column per sex; fill = 0 so that a year with births recorded for
  # only one sex yields 0% / 100% instead of NA (same as Visualization 2)
  spread(key = sex, value = n, fill = 0) %>%
  # percentage of female and male babies named Jessie each year
  transmute(year, prop_F = 100 * F / (F + M), prop_M = 100 - prop_F) %>%
  # back to long format: one row per (year, sex)
  gather(key = "sex", value = "prop", -year)
# female rows of prop_Jessie only (sex == "prop_F"); used for the line layer
prop_female <- filter(prop_Jessie, sex == "prop_F")
# find the year when the name Jessie was the most gender-neutral, i.e. the
# year whose female share is closest to 50%
neutral_Jessie <- prop_Jessie %>%
  # prop_F and prop_M mirror each other around 50, so using both would make
  # every year tie with itself; keep the female rows only so the selected
  # point always lies on the female-share line drawn in the plot
  filter(sex == "prop_F") %>%
  mutate(diff = abs(50 - prop)) %>%
  arrange(diff) %>%
  head(1)
# change the font for the visualization
# windowsFonts() is only available on Windows, so register the "Gothic"
# family only there; calling it on another platform stops with an error
if (.Platform$OS.type == "windows") {
  windowsFonts(Gothic = windowsFont("Century Gothic"))
}
# draw the Jessie chart: stacked areas per sex, the female-share line, a marker
# on the most gender-neutral year, and manually positioned annotations
ggplot() +
  # stacked areas; fill levels come from reorder(sex, desc(prop))
  geom_area(
    data = prop_Jessie,
    aes(x = year, y = prop, fill = reorder(sex, desc(prop))),
    show.legend = FALSE
  ) +
  # female share drawn as a line
  geom_line(data = prop_female, aes(x = year, y = prop)) +
  # white-filled circle on the most gender-neutral year
  geom_point(
    data = neutral_Jessie,
    aes(x = year, y = prop),
    shape = 21,
    fill = "white",
    size = 10,
    stroke = 1.1
  ) +
  # two-segment connector for the "Most unisex year" label
  geom_segment(aes(x = 1940, y = 40, xend = 1940, yend = 50)) +
  geom_segment(aes(x = 1940, y = 50, xend = 1947, yend = 50)) +
  # blank layer at y = 102 so the y range extends slightly past 100
  annotate("blank", y = 102) +
  # text labels
  annotate(
    "text",
    x = 1934, y = 24, hjust = 0,
    label = "Most \nunisex year",
    family = "Gothic", fontface = "italic", size = 10
  ) +
  annotate(
    "text",
    x = 1997, y = c(20, 80),
    label = c("GIRLS", "BOYS"),
    family = "Gothic", size = 12, color = "white"
  ) +
  # fill colors
  scale_fill_manual(values = c("#92bbcf", "#e8aca1")) +
  # axes: fixed breaks, custom labels, no padding around the data
  scale_x_continuous(
    breaks = c(1940, 1960, 1980, 2000),
    labels = c("1940", "’60", "’80", "2000"),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = c(0, 50, 100),
    labels = c("0%", "50%", "100%"),
    expand = c(0, 0)
  ) +
  # theme: no axis titles or y ticks, white panel, large black axis text
  theme(
    axis.title = element_blank(),
    axis.text = element_text(family = "Gothic", size = 30, color = "black"),
    axis.text.y = element_text(margin = margin(r = -10)),
    axis.ticks.y = element_blank(),
    axis.ticks.length = unit(0.5, "cm"),
    panel.background = element_rect(fill = "white")
  )
Visualization 2
Here we create a graph similar to the one above, but for the 35 most gender-neutral names.
# rank names by how close their yearly female share stayed to 50% over
# 1930-2012 and keep the 35 best-ranked ones
neutral_names <- babynames %>%
  select(-prop) %>%
  # restrict to the years 1930 through 2012
  filter(year >= 1930 & year <= 2012) %>%
  # one row per (year, name) with female and male counts; missing counts are 0
  spread(key = sex, value = n, fill = 0) %>%
  # yearly female percentage and its squared distance from 50
  mutate(prop_F = 100 * F / (F + M)) %>%
  mutate(se = (50 - prop_F)^2) %>%
  # per name: number of yearly rows, totals by sex, and mean squared distance
  group_by(name) %>%
  summarise(
    n = n(),
    female = sum(F),
    male = sum(M),
    total = sum(F + M),
    mse = mean(se)
  ) %>%
  # keep names with a row for each of the 83 years and more than 9000 babies
  filter(n == 83 & total > 9000) %>%
  # smallest mean squared distance first, then take the first 35 names
  arrange(mse) %>%
  head(n = 35)
names <- neutral_names[["name"]]
# yearly percentage of female and male babies for each selected name, long
# format (columns: year, name, sex = "prop_F" / "prop_M", prop)
prop_neutral <- babynames %>%
  select(-prop) %>%
  filter(name %in% names, year >= 1930, year <= 2012) %>%
  spread(key = sex, value = n, fill = 0) %>%
  # prop_F is the female share and prop_M the male share, as in prop_Jessie
  # (the labels used to be swapped: prop_M held 100 * F / (F + M))
  transmute(year, name, prop_F = 100 * F / (F + M), prop_M = 100 - prop_F) %>%
  gather(key = "sex", value = "prop", -year, -name)
# proportion of babies of each name that are female; this is the boundary
# between the two stacked areas and is drawn as a line, as in Visualization 1
prop_female_neutral <- prop_neutral %>%
  filter(sex == "prop_F")
ggplot(
  data = prop_neutral,
  # list prop_M first so the boys' area is stacked on top of the girls' area;
  # the level order is pinned here instead of relying on alphabetical order
  aes(x = year, y = prop, fill = factor(sex, levels = c("prop_M", "prop_F")))
) +
  geom_area(show.legend = FALSE) +
  geom_line(data = prop_female_neutral, aes(x = year, y = prop)) +
  # one panel per name, 5 rows
  facet_wrap(~ name, scales = "free_x", nrow = 5) +
  # adjust axes
  scale_x_continuous(
    breaks = c(1940, 1960, 1980, 2000),
    labels = c("1940", "’60", "’80", "2000"),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    breaks = c(0, 50, 100),
    labels = c("0%", "50%", "100%"),
    expand = c(0, 0)
  ) +
  theme(
    axis.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text = element_text(family = "Gothic", color = "black"),
    panel.background = element_rect(fill = "white"),
    axis.ticks.length = unit(0.1, "cm"),
    strip.background = element_rect(fill = "white")
  ) +
  # change colors; values are matched to the sex labels by name
  scale_fill_manual(values = c(prop_M = "#92bbcf", prop_F = "#e8aca1"))