vankesteren
diff --git a/‎.gitignore
Lines changed: 9 additions & 0 deletions b/‎.gitignore
Lines changed: 9 additions & 0 deletions
diff --git a/‎01_prep_data.R
Lines changed: 28 additions & 2 deletions b/‎01_prep_data.R
Lines changed: 28 additions & 2 deletions
diff --git a/‎02_eda.R
Lines changed: 64 additions & 0 deletions b/‎02_eda.R
Lines changed: 64 additions & 0 deletions
diff --git a/‎02_process_data.R
Lines changed: 0 additions & 91 deletions b/‎02_process_data.R
Lines changed: 0 additions & 91 deletions
diff --git a/‎03_model.R
Lines changed: 33 additions & 60 deletions b/‎03_model.R
Lines changed: 33 additions & 60 deletions
diff --git a/‎04_compare.R
Lines changed: 24 additions & 19 deletions b/‎04_compare.R
Lines changed: 24 additions & 19 deletions
@@ -34,3 +34,12 @@ vignettes/*.pdf
 
 # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
 rsconnect/
+
+# Stan executables
+stan_models/*.exe
+model_comparison/stan_models/*.exe
+model_comparison/stan_models/old/*.exe
+
+# stan fits
+model_comparison/fits/*.rds
+
@@ -1,5 +1,5 @@
 # Code accompanying the manuscript "Bayesian Analysis of Formula One Race Results"
-# Last edited 2022-02-17 by @vankesteren
+# Last edited 2022-12-11 by @vankesteren
 # Contents: data preparation, data joining from database f1db_csv.
 library(tidyverse)
 library(lubridate)
@@ -115,7 +115,7 @@ results_dat <-
   left_join(tab_status, by = "statusId") %>%
   select(raceId, positionText, positionOrder, fastestLapTime, driverRef, constructorRef, status)
 
-# Joining & cleaning ----
+# Joining, cleaning & saving ----
 f1_dat <-
   race_dat %>%
   left_join(results_dat, by = "raceId") %>%
@@ -125,4 +125,30 @@ f1_dat <-
   select(driver, constructor, year, round, circuit, position, weather_type, circuit_type, status) %>%
   mutate(year = as.integer(year), round = as.integer(round), position = as.integer(position))
 
+# convert the categorical columns to factors in a single across() call
+f1_dat <-
+  f1_dat %>%
+  mutate(
+    across(
+      c(status, constructor, driver, weather_type, circuit_type),
+      as_factor
+    )
+  )
+
+
+# Adding a finished indicator
+# compute_classified(): flag the classified entries of one race.
+#   status: status labels of a single race (character or factor); presumably
+#           ordered by finishing position - TODO confirm against the caller.
+#   Returns a logical vector as long as `status`: TRUE for every entry up to
+#   and including the last one still running ("Finished" or "+n laps").
+compute_classified <- function(status) {
+  # as.character() so the literal prefix test also works on factors
+  running <- which(status == "Finished" | startsWith(as.character(status), "+"))
+  # 0L fallback: with no runner at all (or empty input) max() would be -Inf
+  # and indexing with 1:-Inf would error; now everything is simply FALSE
+  seq_along(status) <= max(running, 0L)
+}
+
+f1_dat <-
+  f1_dat %>%  # NOTE(review): rows must already be in finishing order per race
+  group_by(year, round) %>%  # one group per race
+  mutate(finished = compute_classified(status)) %>%  # TRUE = classified
+  ungroup()  # drop the grouping before the data is saved
+
 write_rds(f1_dat, "dat/f1_dat.rds")
@@ -0,0 +1,64 @@
+# Code accompanying the manuscript "Bayesian Analysis of Formula One Race Results"
+# Last edited 2022-12-11 by @vankesteren
+# Contents: status filtering, some EDA
+library(tidyverse)
+library(firatheme)
+
+# Data loading ----
+f1_dat <- read_rds("dat/f1_dat.rds")
+f1_dat_finished <- f1_dat %>% filter(finished)
+
+# Some EDA ----
+# finish position: how often each position occurs among the classified entries
+f1_dat_finished %>%
+  ggplot(aes(x = factor(position))) +  # factor() makes position a discrete axis
+  geom_bar(fill = firaCols[4]) +
+  theme_fira() +
+  labs(
+    title = "Distribution of finish positions",
+    subtitle = "F1 hybrid era (2014-2021)",
+    x = "Finish position",
+    y = "Count"
+  )
+
+ggsave("img/eda_finish_position.png", width = 9, height = 6, bg = "white")  # no plot given: writes the last plot created
+
+# basic plot: finish positions of three selected drivers, one panel per season after 2015
+f1_dat_finished %>%
+  filter(driver %in% c("hamilton", "raikkonen", "giovinazzi"), year > 2015) %>%
+  ggplot(aes(x = factor(position), fill = driver)) +
+  geom_bar(position = position_dodge(preserve = "single")) +  # bars side by side, one per driver
+  theme_fira() +
+  scale_fill_fira() +
+  labs(
+    x = "Finish position",
+    y = "Count",
+    title = "Different drivers' finish positions",
+    subtitle = "Conditional on finishing the race",
+    fill = ""  # empty legend title
+  ) +
+  theme(legend.position = "top") +  # placed after theme_fira(), presumably so it is not overridden
+  facet_wrap(~year)
+
+ggsave("img/eda_finish_drivers.png", width = 9, height = 6, bg = "white")  # no plot given: writes the last plot created
+
+# average finish positions for 2021 season
+# per driver: mean position with an interval of +/- 2 standard errors,
+# drivers ordered by their mean position
+f1_dat_finished %>%
+  filter(year == 2021) %>%
+  group_by(driver) %>%
+  summarize(
+    mean_position = mean(position, na.rm = TRUE),
+    # divide by the number of non-missing positions so the standard error
+    # matches the na.rm = TRUE used for the mean and sd
+    sem = sd(position, na.rm = TRUE) / sqrt(sum(!is.na(position)))
+  ) %>%
+  mutate(driver = fct_reorder(driver, -mean_position)) %>%
+  ggplot(aes(y = driver,
+             x = mean_position,
+             xmin = mean_position - 2 * sem,
+             xmax = mean_position + 2 * sem)) +
+  geom_pointrange(size = .4) +
+  theme_fira() +
+  labs(
+    y = "",
+    x = "Position (mean ± 2⋅se)",
+    title = "2021 Season Finish Positions",
+    subtitle = "Conditional on finishing the race"
+  )
+
+# no plot given: ggsave() writes the last plot created
+ggsave("img/eda_finish_2021.png", width = 9, height = 6, bg = "white")
@@ -2,71 +2,44 @@
 # Last edited 2021-05-16 by @vankesteren
 # Contents: Creating and estimating models
 library(tidyverse)
-library(brms)
-
-# read data
-f1_dat_finished  <- read_rds("dat/f1_dat_finished.rds")
-
-# basic model
-fit_basic <- brm(
-  formula = prop_trans ~ 0 + (1 | driver) + (1 | driver:year) + (1 | constructor) + (1 | constructor:year),
-  family  = Beta(),
-  data    = f1_dat_finished,
-  backend = "cmdstanr",
-  chains  = 4,
-  cores   = 4,
-  threads = 3,
-  warmup  = 1000,
-  iter    = 3500
+library(cmdstanr)
+
+# read & prepare stan data
+# only the classified entries (finished == TRUE) enter the models
+f1_dat <- filter(read_rds("dat/f1_dat.rds"), finished)
+
+# NOTE(review): the per-race vectors below rely on summarize() returning one
+# row per race in (year, round) order, so that they line up with the
+# arrange(year, round, position) ordering of the ranked ids - confirm.
+# NOTE(review): wet_weather and prm_circuit are factor codes minus one, so
+# which category maps to 0 depends on the factor level order - confirm.
+stan_data <- list(
+  # sizes
+  num_obs     = nrow(f1_dat),
+  num_drivers = nlevels(pull(f1_dat, driver)),
+  num_teams   = nlevels(pull(f1_dat, constructor)),
+  num_races   = n_groups(group_by(f1_dat, year, round)),
+  num_seasons = n_groups(group_by(f1_dat, year)),
+  # integer driver / team codes, race by race, sorted by position
+  ranked_driver_ids = f1_dat %>%
+    arrange(year, round, position) %>%
+    pull(driver) %>%
+    as.integer(),
+  ranked_team_ids = f1_dat %>%
+    arrange(year, round, position) %>%
+    pull(constructor) %>%
+    as.integer(),
+  # one value per race
+  num_entrants = f1_dat %>%
+    group_by(year, round) %>%
+    summarize(count = n()) %>%
+    pull(count),
+  season_id = f1_dat %>%
+    group_by(year, round) %>%
+    summarize(y = factor(first(year))) %>%
+    pull(y) %>%
+    as.integer(),
+  wet_weather = f1_dat %>%
+    group_by(year, round) %>%
+    summarize(w = first(weather_type)) %>%
+    pull(w) %>%
+    as.integer() - 1L,
+  prm_circuit = f1_dat %>%
+    group_by(year, round) %>%
+    summarize(c = first(circuit_type)) %>%
+    pull(c) %>%
+    as.integer() - 1L
 )
 
-summary(fit_basic)
-write_rds(fit_basic, "fit/fit_basic.rds")
+# basic model: compile the Stan program, sample 8 chains in parallel, save the fit
+mod_basic <- cmdstan_model("stan_models/basic_model.stan")
+fit_basic <- mod_basic$sample(stan_data, chains = 8, parallel_chains = 8, iter_sampling = 1000)
+fit_basic$save_object("fit/basic.rds")  # read back in 04_compare.R
 
 # weather model
-fit_weather <- brm(
-  formula = prop_trans ~ 0 + (1 + weather_type | driver) + (1 | driver:year) + (1 | constructor) + (1 | constructor:year),
-  family  = Beta(),
-  data    = f1_dat_finished,
-  backend = "cmdstanr",
-  chains  = 4,
-  cores   = 4,
-  threads = 3,
-  warmup  = 1000,
-  iter    = 3500
-)
-
-summary(fit_weather)
-write_rds(fit_weather, "fit/fit_weather.rds")
+mod_weather <- cmdstan_model("stan_models/weather_model.stan")  # same data and sampler settings as the basic model
+fit_weather <- mod_weather$sample(stan_data, chains = 8, parallel_chains = 8, iter_sampling = 1000)
+fit_weather$save_object("fit/weather.rds")  # read back in 04_compare.R
 
 # circuit type model
-fit_circuit <- brm(
-  formula = prop_trans ~ 0 + (1 | driver) + (1 | driver:year) + (1 + circuit_type | constructor) + (1 | constructor:year),
-  family  = Beta(),
-  data    = f1_dat_finished,
-  backend = "cmdstanr",
-  chains  = 4,
-  cores   = 4,
-  threads = 3,
-  warmup  = 1000,
-  iter    = 3500
-)
-
-summary(fit_circuit)
-write_rds(fit_circuit, "fit/fit_circuit.rds")
+mod_circuit <- cmdstan_model("stan_models/circuit_model.stan")  # same data and sampler settings as the basic model
+fit_circuit <- mod_circuit$sample(stan_data, chains = 8, parallel_chains = 8, iter_sampling = 1000)
+fit_circuit$save_object("fit/circuit.rds")  # read back in 04_compare.R
 
 # weather + circuit type model
-fit_weather_circuit <- brm(
-  formula = prop_trans ~ 0 + (1 + weather_type | driver) + (1 | driver:year) + (1 + circuit_type | constructor) + (1 | constructor:year),
-  family  = Beta(),
-  data    = f1_dat_finished,
-  backend = "cmdstanr",
-  chains  = 4,
-  cores   = 4,
-  threads = 3,
-  warmup  = 1000,
-  iter    = 3500
-)
-
-summary(fit_weather_circuit)
-write_rds(fit_weather_circuit, "fit/fit_weather_circuit.rds")
+# compile, sample and save with the same settings as the models above
+mod_all <- cmdstan_model("stan_models/weather_circuit_model.stan")
+fit_all <- mod_all$sample(stan_data, chains = 8, parallel_chains = 8, iter_sampling = 1000)
+fit_all$save_object("fit/weather_circuit.rds")
@@ -1,24 +1,31 @@
 # Code accompanying the manuscript "Bayesian Analysis of Formula One Race Results"
-# Last edited 2021-05-16 by @vankesteren
+# Last edited 2022-12-12 by @vankesteren
 # Contents: Performing model comparison
 library(tidyverse)
-library(brms)
+library(cmdstanr)
+library(loo)
 library(xtable)
 
-options(mc.cores = 12)
 
 # which model is best? Compare using LOO
-fit_basic   <- read_rds("fit/fit_basic.rds") %>% add_criterion("loo")
-fit_circuit <- read_rds("fit/fit_circuit.rds") %>% add_criterion("loo")
-fit_weather <- read_rds("fit/fit_weather.rds") %>% add_criterion("loo")
-fit_all     <- read_rds("fit/fit_weather_circuit.rds") %>% add_criterion("loo")
+fit_basic   <- read_rds("fit/basic.rds")  # fit objects saved by 03_model.R
+fit_weather <- read_rds("fit/weather.rds")
+fit_circuit <- read_rds("fit/circuit.rds")
+fit_all     <- read_rds("fit/weather_circuit.rds")
+
+loo_basic   <- fit_basic$loo(cores = 11)  # NOTE(review): cores is hard-coded to 11 - adjust to the machine
+loo_weather <- fit_weather$loo(cores = 11)
+loo_circuit <- fit_circuit$loo(cores = 11)
+loo_all     <- fit_all$loo(cores = 11)  # presumably needs a log_lik quantity in each Stan program - confirm
+
 
 loo_results <- loo_compare(
-  fit_basic,
-  fit_circuit,
-  fit_weather,
-  fit_all,
-  model_names = c("Basic", "Circuit", "Weather", "Circuit + Weather")
+  list(
+    "Basic" = loo_basic,
+    "Weather" = loo_weather,
+    "Circuit" = loo_circuit,
+    "Circuit + Weather" = loo_all
+  )
 )
 
 loo_results
@@ -29,12 +36,10 @@ write_rds(loo_results, "fit/loo_results.rds")
 xtable::xtable(loo_results)
 
 #                   elpd_diff se_diff
-# Basic              0.0       0.0
-# Circuit           -1.0       0.5
-# Weather           -1.2       1.3
-# Circuit + Weather -1.9       1.4
+# Circuit            0.0       0.0
+# Basic             -2.3       6.1
+# Circuit + Weather -2.7       2.2
+# Weather           -4.7       6.3
 
-# basic works best
+# Circuit works best, but not that different from basic
 
-# comparing basic to next best model
-bayes_factor(fit_basic, fit_circuit)