Intro
This document presents possible solutions to the assignment given during the workshop on dataviz, Ph.D. retreat 2022.
Preparation
Libraries
library(tidyverse)
library(cowplot)
library(gghighlight)
library(knitr)
library(MetBrewer)
Loading data
music <- read_csv("https://raw.githubusercontent.com/lescai-teaching/dataviz-rstudio/datasets/music_sales_history.csv",
col_names = c("format", "metric", "year", "records", "value"),
col_types = c(col_character(),
col_character(),
col_double(),
col_double(),
col_double()),
skip = 1)
Tidying the data
simplifying the column name
music$metric <- ifelse(
music$metric == "Value (Adjusted)",
"adjusted_value",
music$metric
)
some data are duplicated or missing
music[duplicated(music),]
## # A tibble: 39 x 5
## format metric year records value
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Paid Subscriptions Units 1973 1 NA
## 2 Paid Subscriptions Units 1974 1 NA
## 3 Paid Subscriptions Units 1975 1 NA
## 4 Paid Subscriptions Units 1976 1 NA
## 5 Paid Subscriptions Units 1977 1 NA
## 6 Paid Subscriptions Units 1978 1 NA
## 7 Paid Subscriptions Units 1979 1 NA
## 8 Paid Subscriptions Units 1980 1 NA
## 9 Paid Subscriptions Units 1981 1 NA
## 10 Paid Subscriptions Units 1982 1 NA
## # … with 29 more rows
Despite check, some data still seem to be duplicated, so when we group we’re just gonna choose the first value
music_tidy = music %>%
filter(!is.na(value)) %>%
pivot_wider(
names_from = metric,
values_from = value,
values_fn = first
)
the categories are pretty broad
unique(music_tidy$format)
## [1] "CD" "CD Single"
## [3] "Cassette" "Cassette Single"
## [5] "LP/EP" "Vinyl Single"
## [7] "8 - Track" "Other Tapes"
## [9] "Music Video (Physical)" "DVD Audio"
## [11] "SACD" "Download Single"
## [13] "Download Album" "Kiosk"
## [15] "Download Music Video" "Ringtones & Ringbacks"
## [17] "Paid Subscriptions" "Limited Tier Paid Subscription"
## [19] "On-Demand Streaming (Ad-Supported)" "Other Ad-Supported Streaming"
## [21] "Other Digital" "Paid Subscription"
## [23] "SoundExchange Distributions" "Synchronization"
In order to reduce them, we need to create a new category where we can simplify the existing media type descriptions:
music_tidy = music_tidy %>%
mutate(
media = case_when(
grepl("CD", format) ~ "CD_DVD",
grepl("DVD", format) ~ "CD_DVD",
grepl("LP", format) ~ "Vinyl",
grepl("Vinyl", format) ~ "Vinyl",
grepl("Cassette", format) ~ "Cassette",
grepl("Track", format) ~ "Vinyl",
grepl("Download", format) ~ "Digital",
grepl("Subscription", format) ~ "Digital",
grepl("Ring", format) ~ "Digital",
grepl("Streaming", format) ~ "Digital",
grepl("Digital", format) ~ "Digital",
grepl("Sync", format) ~ "Digital",
grepl("SoundExchange", format) ~ "Digital",
grepl("Tapes", format) ~ "Cassette",
grepl("Physical", format) ~ "CD_DVD",
grepl("Kiosk", format) ~ "Vinyl"
)
)
Plotting
Stacked Area chart
The most common way to represent these data is a stacked area chart, to show both evolution and relative proportions of sales of different media types over the years.
music_tidy %>%
group_by(media, year) %>%
summarise(total_value = sum(adjusted_value, na.rm = TRUE)) %>%
ggplot(aes(x=year, y=total_value, fill=media))+
geom_area(position = "stack")
Beeswarm plot
We can also reproduce a beeswarm-like plot this way:
music_tidy %>%
group_by(media, year) %>%
summarise(total_value = sum(adjusted_value, na.rm = TRUE)) %>%
ggplot(aes(x=year, y=0, colour=media, size = total_value))+
geom_jitter(width = 0.005, alpha = 0.5)+
ylim(-3,3)+
scale_size(range = c(0.5, 10), name="Total Value by Year")+
labs(
x = "Year",
y = "",
colour = "Media type"
)+
theme(axis.text.y = element_blank())