library(spotifyr) # to access spotify API
library(tidyverse) # for all things workflow + stringr
library(knitr) # for pretty-ish tables
library(lubridate) # for nice date handling
library(ggdist) # for distribution plotting
library(scales) # for plot label troubles
Can I ask you a question? Have you ever wondered which of Taylor Swift’s songs are the most danceable? Which songs are saddest? Slowest? Lowest? Well, if the answer is no, you’re playing a stupid game and you’ve won a stupid prize – leave now. Assuming the answer to any of those questions was an incredibly enthusiastic YES, then you’re the lucky one.
So, R you ready for it?
DISCLAIMER: Obviously I’m a very busy important person so I’m not going to answer ALL of your questions today, but I’ll try my very best to keep coming back and working on this, so please come back and check it out.
Here are the packages we’ll be using.
Spotify API
Using (Spotify’s Web API)[https://developer.spotify.com/documentation/web-api], we can access all sorts of fun information about an artist. If you’re following along at home and wondering if this is the explicit version with all those Xs below, no. I’m just avoiding giving you my super secret client_id and client_secret. You can get your own by visiting your Spotify developer dashboard.
# albums we/I don't want included in the data
<- c("Live From Clear Channel Stripped 2008",
ew_david "Speak Now World Tour Live",
"reputation Stadium Tour Surprise Song Playlist")
#Sys.setenv(SPOTIFY_CLIENT_ID = 'XXXXXXXXXXXX')
#Sys.setenv(SPOTIFY_CLIENT_SECRET = 'XXXXXXXXXXXX')
<- get_spotify_access_token()
access_token <- get_artist_audio_features('taylor swift') |>
swifty # filter out albums we don't want
filter(!album_name %in% ew_david) |>
# clean up album names
mutate(album_name_og = album_name,
album_edition = str_extract_all(album_name,
"\\([^()]+\\)|Platinum Edition|\\:+(\\D+)",
simplify = TRUE),
album_edition = if_else(nchar(album_edition) != 0,
str_remove_all(album_edition, "\\(|\\)|\\:"),
"original"),
album_name = str_squish(str_remove_all(album_name,
"\\([^()]+\\)|Platinum Edition|\\:+(\\D+)")))
To give you an idea of the sort of information we now have at our fingertips, here’s an example row of data, for everyone’s favorite song:
|>
swifty filter(track_name ==
"All Too Well (10 Minute Version) (Taylor's Version) (From The Vault)") |>
kable()
artist_name | artist_id | album_id | album_type | album_images | album_release_date | album_release_year | album_release_date_precision | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | track_id | analysis_url | time_signature | artists | available_markets | disc_number | duration_ms | explicit | track_href | is_local | track_name | track_preview_url | track_number | type | track_uri | external_urls.spotify | album_name | key_name | mode_name | key_mode | album_name_og | album_edition |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Taylor Swift | 06HL4z0CvFAxyc27GXpf02 | 6kZ42qRrzov54LcAk4onW9 | album | 640 , 300 , 64 , https://i.scdn.co/image/ab67616d0000b273318443aab3531a0558e79a4d, https://i.scdn.co/image/ab67616d00001e02318443aab3531a0558e79a4d, https://i.scdn.co/image/ab67616d00004851318443aab3531a0558e79a4d, 640 , 300 , 64 | 2021-11-12 | 2021 | day | 0.631 | 0.518 | 0 | -8.771 | 1 | 0.0303 | 0.274 | 0 | 0.088 | 0.205 | 93.023 | 5enxwA8aAbwZbf5qCHORXi | https://api.spotify.com/v1/audio-analysis/5enxwA8aAbwZbf5qCHORXi | 4 | https://api.spotify.com/v1/artists/06HL4z0CvFAxyc27GXpf02, 06HL4z0CvFAxyc27GXpf02 , Taylor Swift , artist , spotify:artist:06HL4z0CvFAxyc27GXpf02 , https://open.spotify.com/artist/06HL4z0CvFAxyc27GXpf02 | AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, CY, CZ, DK, DO, DE, EC, EE, SV, FI, FR, GR, GT, HN, HK, HU, IS, IE, IT, LV, LT, LU, MY, MT, MX, NL, NZ, NI, NO, PA, PY, PE, PH, PL, PT, SG, SK, ES, SE, CH, TW, TR, UY, US, GB, AD, LI, MC, ID, JP, TH, VN, RO, IL, ZA, SA, AE, BH, QA, OM, KW, EG, MA, DZ, TN, LB, JO, PS, IN, BY, KZ, MD, UA, AL, BA, HR, ME, MK, RS, SI, KR, BD, PK, LK, GH, KE, NG, TZ, UG, AG, AM, BS, BB, BZ, BT, BW, BF, CV, CW, DM, FJ, GM, GE, GD, GW, GY, HT, JM, KI, LS, LR, MW, MV, ML, MH, FM, NA, NR, NE, PW, PG, WS, SM, ST, SN, SC, SL, SB, KN, LC, VC, SR, TL, TO, TT, TV, VU, AZ, BN, BI, KH, CM, TD, KM, GQ, SZ, GA, GN, KG, LA, MO, MR, MN, NP, RW, TG, UZ, ZW, BJ, MG, MU, MZ, AO, CI, DJ, ZM, CD, CG, IQ, LY, TJ, VE, ET, XK | 1 | 613026 | TRUE | https://api.spotify.com/v1/tracks/5enxwA8aAbwZbf5qCHORXi | FALSE | All Too Well (10 Minute Version) (Taylor’s Version) (From The Vault) | NA | 30 | track | spotify:track:5enxwA8aAbwZbf5qCHORXi | https://open.spotify.com/track/5enxwA8aAbwZbf5qCHORXi | Red | C | major | C major | Red (Taylor’s Version) | Taylor’s Version |
Summary
You may already know this, but here’s a summary of Taylor’s discography.
name | value | Fast Fact |
---|---|---|
Number of Songs | 258 | Number of Songs |
Albums Released | 10 | Albums Released |
Earliest Release | 2006 | Earliest Release |
Most Recent Release | 2023 | Most Recent Release |
Songs Featuring Other Artists | 25 | Songs Featuring Other Artists |
Songs with Explicit Labels | 54 | Songs with Explicit Labels |
Shortest Song Minutes | 1.79 | Shortest Song Minutes |
Longest Song Minutes | 10.22 | Longest Song Minutes |
Average Song Minutes | 3.96 | Average Song Minutes |
Album | Number of Songs | Release Year | Songs Featuring Other Artists | Songs with Explicit Labels | Shortest Song (Minutes) | Longest Song (Minutes) | Average Song (Minutes) |
---|---|---|---|---|---|---|---|
Taylor Swift | 15 | 2006 | 0 | 0 | 2.88 | 4.14 | 3.57 |
Fearless | 13 | 2008 | 0 | 0 | 3.34 | 4.91 | 4.12 |
Fearless Platinum Edition | 19 | 2008 | 0 | 0 | 3.34 | 5.18 | 4.18 |
Speak Now | 14 | 2010 | 0 | 0 | 3.62 | 6.73 | 4.79 |
Speak Now (Deluxe Edition) | 20 | 2010 | 0 | 0 | 3.62 | 6.73 | 4.59 |
Red | 16 | 2012 | 0 | 0 | 3.20 | 5.46 | 4.06 |
Red (Deluxe Edition) | 22 | 2012 | 0 | 0 | 3.20 | 5.46 | 4.10 |
1989 | 13 | 2014 | 0 | 0 | 3.22 | 4.52 | 3.75 |
1989 (Deluxe Edition) | 19 | 2014 | 0 | 0 | 1.79 | 4.52 | 3.62 |
reputation | 15 | 2017 | 0 | 0 | 3.39 | 4.08 | 3.72 |
Lover | 18 | 2019 | 2 | 0 | 2.51 | 4.89 | 3.44 |
evermore | 15 | 2020 | 3 | 6 | 3.01 | 5.25 | 4.05 |
folklore | 16 | 2020 | 1 | 5 | 3.18 | 4.91 | 3.98 |
folklore (deluxe version) | 17 | 2020 | 1 | 5 | 3.18 | 4.91 | 3.95 |
folklore: the long pond studio sessions (from the Disney+ special) [deluxe edition] | 34 | 2020 | 2 | 9 | 3.06 | 4.92 | 3.96 |
evermore (deluxe version) | 17 | 2021 | 3 | 6 | 3.01 | 5.25 | 4.06 |
Fearless (Taylor’s Version) | 26 | 2021 | 3 | 0 | 3.16 | 5.20 | 4.10 |
Red (Taylor’s Version) | 30 | 2021 | 5 | 2 | 3.22 | 10.22 | 4.36 |
Midnights | 13 | 2022 | 1 | 6 | 2.75 | 4.27 | 3.40 |
Midnights (3am Edition) | 20 | 2022 | 1 | 6 | 2.48 | 4.34 | 3.47 |
Midnights (The Til Dawn Edition) | 23 | 2023 | 3 | 9 | 2.48 | 4.34 | 3.50 |
Musical Metrics
Metric Name | Definition | Possible Values |
---|---|---|
danceability | how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity | 0 – 1 |
energy | perceptual measure of intensity and activity by way of dynamic range, perceived loudness, timbre, onset rate, and general entropy | 0 – 1 |
key | key the track is in | standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on |
loudness | overall loudness of a track in decibels (dB) averaged across the entire track | -60 – 0 db. |
mode | modality (major or minor) of a track; the type of scale from which its melodic content is derived | major = 1; minor = 0 |
speechiness | the presence of spoken words in a track | speech-like recordings are closer to 1.0 (e.g., > 0.66 = tracks made entirely of spoken words, between 0.33 and 0.66 = tracks containing music and speech such as rap music, < 0.33 = tracks mostly music and other non-speech-like sounds |
acousticness | confidence measure from 0.0 to 1.0 of whether the track is acoustic | 0 – 1 |
instrumentalness | predicts whether a track contains no vocals; “ooh” and “aah” sounds are treated as instrumental with rap and spoken word tracks clearly “vocal” | 0 – 1 (more vocals to less vocals) |
liveness | detects the presence of an audience in the recording | 0 – 1 (> 0.8 indicates strong likelihood of live track) |
valence | musical positiveness conveyed by a track | 0 – 1 (low valence (e.g., sad, depressed, angry) to high valence (e.g., happy, cheerful, euphoric)) |
tempo | overall estimated tempo of a track in beats per minute (BPM) | 40 – 200+ |
Albums and Metrics
Each album defines an era. What defines the album? Let’s find out.
First, to make things a little bit easier, you ought to know that I am collapsing albums. That is, even though there are 2, 3, sometimes 4 versions of each album (looking at you Midnights w/ less Lana, medium Lana, and more Lana), we will be looking at each album as a collection of all of the songs. This isn’t the best strategy, but it definitely makes my figures a lot nicer.
Each album/era is defined by a color. Let’s define those here (courtesy of my best RA Dr. Gabrielle Pfund):
■ Taylor Swift
■ Fearless
■ Speak Now
■ Red
■ 1989
■ reputation
■ Lover
■ folklore
■ evermore
■ Midnights
<- c(
album_colors `Taylor Swift` = "#99FFCC",
`Fearless` = "#FFCC66",
`Speak Now` = "#CC66FF",
`Red` = "#990000",
`1989` = "#99FFFF",
`reputation` = "#000000",
`Lover` = "#FF99FF",
`folklore` = "#CCCCCC",
`evermore` = "#FFCC99",
`Midnights` = "#000066")
Danceability
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = danceability, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Danceability", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Key
???
Loudness
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = loudness, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Loudness", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Mode
???
Speechiness
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = speechiness, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Speechiness", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Acousticness
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = acousticness, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Acousticness", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Instrumentalness
Liveness
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = liveness, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Liveness", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Valence
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = valence, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Valence", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
tempo
|>
swifty ggplot(aes(y = fct_reorder(album_name, album_release_year),
x = tempo, fill = album_name)) +
stat_slab() +
scale_y_discrete(labels = label_wrap(20)) +
labs(x = "Distribution of Song Tempo", y = NULL) +
scale_fill_manual(values=album_colors) +
guides(fill = "none", color = "none") +
theme_minimal()
Taylor’s Version vs. Not(?) Taylor’s Version
For my next bit of exploration, I’m wondering how much a difference Taylor made in how Taylory Taylor is.
First, let’s add a boolean that tells us if a song is Taylor’s Version. Dirty data problem 1 of ???: tick marks vs. apostrophes. And this is why we use unicode, friends.
<- swifty |>
swifty mutate(isTV = str_detect(track_name, "Taylor's Version|Taylor’s Version"))
Now that we have TV as it’s own variable, we can make our data one step closer to some sort of normal form by making sure that track name contains one and only one type of information: the track name.
Stringr is one of my favorite tools in the tidyverse, especially for dealing with any sort of text data. Here I used:
str_remove()
to remove (Taylor’s Version) from song titles to better group them laterstr_squish()
to remove any whitespace from the start and end, and reduce any double spaces within a song to a single spacestr_to_lower()
is the cute little sister ofstr_to_upper()
and converts characters to lowercasestr_replace_all()
replaces all of the target string (in this case all punctuation) with the replacement string (in this case nothingness)
<- swifty |>
swifty mutate(track_name = str_remove(track_name, "\\(Taylor's Version\\)|\\(Taylor’s Version\\)"),
track_name = str_squish(track_name),
track_name = str_to_lower(track_name),
track_name = str_replace_all(track_name, "[[:punct:]]", ""))
Now we can grab just our songs that have a TV and a no-TV.
<- swifty |>
songs_with_tvs group_by(track_name) |>
summarize(n = n(),
hasTV = sum(isTV)) |>
filter(n > 1 & hasTV > 0)
Let’s take a look at danceability first. Looks like there is a general trend for Taylor’s Version of songs to be less danceable.
Note: Apparently it would require fundamental changes to ggplot2 (according to Sir Hadley), so we cannot use album colors for each facet label without some super hacky workarounds. Maybe ggplot3 will serve us better.
|>
swifty filter(track_name %in% songs_with_tvs$track_name) |>
group_by(track_name, isTV, Album = album_name) |>
summarize(danceability = mean(danceability)) |>
ggplot(aes(x = isTV, y = danceability, group = track_name, color = Album)) +
scale_color_manual(values=album_colors) +
geom_point() +
geom_line(method = "lm") +
facet_grid(~Album) +
labs(x = "Taylor's Version") +
theme_minimal() +
guides(color = "none") +
theme(strip.background.x = element_rect(fill="gray95")) # this is where I wish fill would take a list of colors
Energy
|>
swifty filter(track_name %in% songs_with_tvs$track_name) |>
group_by(track_name, isTV, Album = album_name) |>
summarize(energy = mean(energy)) |>
ggplot(aes(x = isTV, y = energy, group = track_name, color = Album)) +
scale_color_manual(values=album_colors) +
geom_point() +
geom_line(method = "lm") +
facet_grid(~Album) +
labs(x = "Taylor's Version") +
theme_minimal() +
guides(color = "none") +
theme(strip.background.x = element_rect(fill="gray95")) # this is where I wish fill would take a list of colors
Loudness
|>
swifty filter(track_name %in% songs_with_tvs$track_name) |>
group_by(track_name, isTV, Album = album_name) |>
summarize(loudness = mean(loudness)) |>
ggplot(aes(x = isTV, y = loudness, group = track_name, color = Album)) +
scale_color_manual(values=album_colors) +
geom_point() +
geom_line(method = "lm") +
facet_grid(~Album) +
labs(x = "Taylor's Version") +
theme_minimal() +
guides(color = "none") +
theme(strip.background.x = element_rect(fill="gray95")) # this is where I wish fill would take a list of colors
Valence
|>
swifty filter(track_name %in% songs_with_tvs$track_name) |>
group_by(track_name, isTV, Album = album_name) |>
summarize(valence = mean(valence)) |>
ggplot(aes(x = isTV, y = valence, group = track_name, color = Album)) +
scale_color_manual(values=album_colors) +
geom_point() +
geom_line(method = "lm") +
facet_grid(~Album) +
labs(x = "Taylor's Version") +
theme_minimal() +
guides(color = "none") +
theme(strip.background.x = element_rect(fill="gray95")) # this is where I wish fill would take a list of colors
References:
while I’m pretty confident in my brain’s ability to store endless lyrics, this website made possible all of the lyrics and references found throughout this project