NYT COVID Forecast Aggregation (Box-Cox)
New York Times Data on COVID
I will grab the dataset from the NYT COVID data github. I will create two variables, New Cases and New Deaths to model. The final line use aggregation to create the national data.
NYT.COVIDN <- read.csv(url("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
# Define a tsibble; the date is imported as character so mutate that first.
NYT.COVID <- NYT.COVIDN %>%
mutate(date = as.Date(date)) %>%
as_tsibble(index = date, key = state) %>%
group_by(state) %>%
mutate(New.Cases = difference(cases), New.Deaths = difference(deaths)) %>%
filter(!(state %in% c("Guam", "Puerto Rico", "Virgin Islands", "Northern Mariana Islands")))
NYT.COVID <- NYT.COVID %>%
mutate(New.Cases = (New.Cases >= 0) * New.Cases, New.Deaths = (New.Deaths >=
0) * New.Deaths)
NYTAgg.COVID <- NYT.COVID %>%
aggregate_key(state, New.Cases = sum(New.Cases, na.rm = TRUE), New.Deaths = sum(New.Deaths,
na.rm = TRUE)) %>%
filter(date > as.Date("2020-03-31")) %>%
mutate(Day.of.Week = as.factor(wday(date, label = TRUE)))
lambdaC <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
features(New.Cases, features = guerrero) %>%
pull(lambda_guerrero)
P1 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(New.Cases)
P2 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(box_cox(New.Cases, lambdaC)) + labs(y = "Box-Cox(New.Cases)")
library(patchwork)
P1/P2
The \(\lambda\) for deaths…
lambdaD <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
features(New.Deaths, features = guerrero) %>%
pull(lambda_guerrero)
P1 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(New.Deaths)
P2 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(box_cox(New.Deaths, lambdaD)) + labs(y = "Box-Cox(New.Deaths)")
library(patchwork)
P1/P2
Plot the two aggregates.
plot1 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(box_cox(New.Cases, lambdaC)) + labs(y = "Box-Cox(New.Cases)")
plot2 <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
autoplot(box_cox(New.Deaths, lambdaD)) + labs(y = "Box-Cox(New.Deaths)")
plot1/plot2
Are there day of week effects?
NYTAgg.COVID %>%
model(TSLM(box_cox(New.Cases, lambdaC) ~ Day.of.Week)) %>%
glance() %>%
filter(p_value < 0.05)
# A tibble: 15 x 16
state .model r_squared adj_r_squared sigma2 statistic p_value df
<chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Connecticu… TSLM(box… 0.0785 0.0592 0.742 4.06 6.31e- 4 7
2 District o… TSLM(box… 0.0409 0.0255 0.248 2.65 1.57e- 2 7
3 Georgia … TSLM(box… 0.0409 0.0255 0.235 2.65 1.57e- 2 7
4 Idaho … TSLM(box… 0.0610 0.0453 0.928 3.88 9.21e- 4 7
5 Iowa … TSLM(box… 0.0478 0.0325 0.306 3.13 5.26e- 3 7
6 Kansas … TSLM(box… 0.538 0.530 0.654 67.9 1.01e-55 7
7 Kentucky … TSLM(box… 0.0417 0.0263 0.669 2.71 1.37e- 2 7
8 Louisiana … TSLM(box… 0.0821 0.0649 0.282 4.78 1.09e- 4 7
9 Michigan … TSLM(box… 0.320 0.309 0.490 29.3 8.89e-29 7
10 Mississipp… TSLM(box… 0.0868 0.0719 0.287 5.84 7.75e- 6 7
11 Nebraska … TSLM(box… 0.0392 0.0238 0.368 2.54 2.03e- 2 7
12 Rhode Isla… TSLM(box… 0.0718 0.0513 0.463 3.51 2.33e- 3 7
13 South Dako… TSLM(box… 0.0672 0.0520 0.540 4.42 2.51e- 4 7
14 Washington… TSLM(box… 0.124 0.110 0.311 8.83 5.10e- 9 7
15 Wisconsin … TSLM(box… 0.0364 0.0208 0.427 2.34 3.13e- 2 7
# … with 8 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
# CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>
In 12 states, there appear to be and in a few of them, those effects are large. Modelling this covariate seems useful. At the same time, this does nothing about the broad trends.
NYTAgg.COVID %>%
model(TSLM(box_cox(New.Deaths, lambdaD) ~ Day.of.Week)) %>%
glance() %>%
filter(p_value < 0.05)
# A tibble: 47 x 16
state .model r_squared adj_r_squared sigma2 statistic p_value df
<chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 Alabama TSLM(box… 0.278 0.267 12.4 24.0 4.80e-24 7
2 Alaska TSLM(box… 0.0711 0.0562 14.6 4.77 1.06e- 4 7
3 Arizona TSLM(box… 0.461 0.452 6.87 53.2 2.72e-47 7
4 Arkansas TSLM(box… 0.0454 0.0301 7.54 2.97 7.67e- 3 7
5 California TSLM(box… 0.187 0.174 2.89 14.3 1.09e-14 7
6 Colorado TSLM(box… 0.109 0.0948 9.33 7.63 9.59e- 8 7
7 Connecticut TSLM(box… 0.407 0.398 16.3 42.8 1.07e-39 7
8 Delaware TSLM(box… 0.0470 0.0317 16.5 3.07 6.00e- 3 7
9 Florida TSLM(box… 0.0749 0.0601 3.49 5.05 5.41e- 5 7
10 Georgia TSLM(box… 0.356 0.346 3.35 34.5 3.80e-33 7
# … with 37 more rows, and 8 more variables: log_lik <dbl>, AIC <dbl>,
# AICc <dbl>, BIC <dbl>, CV <dbl>, deviance <dbl>, df.residual <int>,
# rank <int>
AN STL
NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
model(STLC = STL(box_cox(New.Cases, lambdaC) ~ trend(window = 14) + season(window = 21),
robust = TRUE)) %>%
components() %>%
autoplot()
NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
model(STLC = STL(box_cox(New.Deaths, lambdaD) ~ trend(window = 14) + season(window = 21),
robust = TRUE)) %>%
components() %>%
autoplot()
Training and Testing
That gives me the data that I want; now let me slice it up into training and test sets.
COVID.Agg.Test <- NYTAgg.COVID %>%
as_tibble() %>%
group_by(state) %>%
filter(date > as.Date("2021-03-31")) %>%
ungroup() %>%
as_tsibble(index = date, key = state)
COVID.Agg.Train <- anti_join(NYTAgg.COVID, COVID.Agg.Test) %>%
as_tsibble(index = date, key = state)
Model Fitting
Fit some models to the data.
COVID.Models <- COVID.Agg.Train %>%
filter(is_aggregated(state)) %>%
model(`K = 2` = ARIMA(box_cox(New.Cases, lambdaC) ~ fourier(K = 2) + PDQ(0, 0,
0)), ARIMA = ARIMA(box_cox(New.Cases, lambdaC)), ETS = ETS(box_cox(New.Cases,
lambdaC)), NNET = NNETAR(box_cox(New.Cases, lambdaC)), NNETX = NNETAR(box_cox(New.Cases,
lambdaC) ~ Day.of.Week), ARIMAX = ARIMA(box_cox(New.Cases, lambdaC) ~ Day.of.Week),
ETSlog = ETS(log(New.Cases + 1)), ETSLevel = ETS(New.Cases)) %>%
mutate(Combo1 = (`K = 2` + ARIMA + ETS)/3, Combo2 = (`K = 2` + ARIMA + ETS +
NNET)/4, Combo3 = (NNETX + ARIMAX)/2, Combo4 = (ARIMAX + ETS)/2, Combo5 = (ETS +
ETSlog + ETSLevel)/3)
Forecast the models.
COVIDFC <- COVID.Models %>%
filter(is_aggregated(state)) %>%
forecast(h = 14, new_data = COVID.Agg.Test)
COVIDFC %>%
filter(is_aggregated(state)) %>%
accuracy(COVID.Agg.Test %>%
filter(is_aggregated(state)))
# A tibble: 13 x 11
.model state .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ARIMA <aggregat… Test 2468. 6554. 5297. 1.82 8.32 NaN NaN -0.0535
2 ARIMAX <aggregat… Test 271. 7164. 6030. -1.80 10.1 NaN NaN -0.0579
3 Combo1 <aggregat… Test 947. 6607. 5306. -0.657 8.83 NaN NaN -0.186
4 Combo2 <aggregat… Test 1439. 7411. 6150. -0.294 10.2 NaN NaN -0.0978
5 Combo3 <aggregat… Test NaN NaN NaN NaN NaN NaN NaN NA
6 Combo4 <aggregat… Test 373. 6094. 4951. -1.33 8.26 NaN NaN -0.169
7 Combo5 <aggregat… Test 291. 5800. 4389. -1.38 7.42 NaN NaN -0.216
8 ETS <aggregat… Test 475. 5408. 4153. -0.870 6.85 NaN NaN -0.264
9 ETSLe… <aggregat… Test 1051. 6337. 4996. -0.471 8.31 NaN NaN -0.124
10 ETSlog <aggregat… Test -653. 5923. 4708. -2.81 8.03 NaN NaN -0.234
11 K = 2 <aggregat… Test -102. 8815. 7176. -2.92 12.3 NaN NaN -0.211
12 NNET <aggregat… Test 2952. 10648. 9300. 0.850 15.2 NaN NaN 0.116
13 NNETX <aggregat… Test NaN NaN NaN NaN NaN NaN NaN NA
Look at the aggregates.
COVIDFC %>%
filter(is_aggregated(state)) %>%
autoplot(NYTAgg.COVID %>%
filter(date > as.Date("2021-03-01")), level = NULL)
Plot the Winner
COVIDFC %>%
filter(is_aggregated(state) & .model == "ETS") %>%
autoplot(NYTAgg.COVID %>%
filter(is_aggregated(state) & date > as.Date("2021-03-01")))
Assess the Winner
COVID.Models %>%
select(ETS) %>%
augment() %>%
ACF(.resid) %>%
autoplot()
The Median Forecast?
COVID.Model.ETS <- COVID.Agg.Train %>%
filter(is_aggregated(state)) %>%
model(ETS = ETS(New.Cases))
Med.FC <- COVID.Model.ETS %>%
forecast(h = 14, point_forecast = list(.median = median), simulate = TRUE, bootstrap = TRUE,
times = 10000)
Med.FC %>%
accuracy(COVID.Agg.Test)
# A tibble: 1 x 11
.model state .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ETS <aggregated> Test 1169. 6803. 5630. -0.568 9.39 NaN NaN -0.113
Med.FC %>%
autoplot(NYTAgg.COVID %>%
filter(date > as.Date("2021-03-01")))
What Does the Future Hold?
ETSFA <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
model(ETS = ETS(New.Cases)) %>%
forecast(h = 14, point_forecast = list(.median = median), simulate = TRUE, bootstrap = TRUE,
times = 10000)
ETSFA %>%
autoplot(NYTAgg.COVID %>%
filter(date > as.Date("2021-03-01")))
Deaths
COVID.ModelsD <- COVID.Agg.Train %>%
filter(is_aggregated(state)) %>%
model(`K = 2` = ARIMA(box_cox(New.Deaths, lambdaD) ~ fourier(K = 2) + PDQ(0,
0, 0)), ARIMA = ARIMA(box_cox(New.Deaths, lambdaD)), ETS = ETS(box_cox(New.Deaths,
lambdaD)), NNET = NNETAR(box_cox(New.Deaths, lambdaD)), NNETX = NNETAR(box_cox(New.Deaths,
lambdaD) ~ Day.of.Week), ARIMAX = ARIMA(box_cox(New.Deaths, lambdaD) ~ Day.of.Week),
ETSlog = ETS(log(New.Deaths + 1)), ETSLevel = ETS(New.Deaths)) %>%
mutate(Combo1 = (`K = 2` + ARIMA + ETS)/3, Combo2 = (`K = 2` + ARIMA + ETS +
NNET)/4, Combo3 = (NNETX + ARIMAX)/2, Combo4 = (ARIMAX + ETS)/2, Combo5 = (ETS +
ETSlog + ETSLevel)/3)
Forecast the models.
COVIDFCD <- COVID.ModelsD %>%
filter(is_aggregated(state)) %>%
forecast(h = 14, new_data = COVID.Agg.Test)
COVIDFCD %>%
filter(is_aggregated(state)) %>%
accuracy(COVID.Agg.Test %>%
filter(is_aggregated(state)))
# A tibble: 13 x 11
.model state .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ARIMA <aggregat… Test -95.8 390. 263. -22.7 29.3 NaN NaN -4.13e-2
2 ARIMAX <aggregat… Test -60.8 375. 231. -19.0 25.7 NaN NaN -5.67e-2
3 Combo1 <aggregat… Test -105. 389. 270. -24.0 30.4 NaN NaN -3.25e-2
4 Combo2 <aggregat… Test -63.2 396. 242. -18.3 25.3 NaN NaN -1.42e-1
5 Combo3 <aggregat… Test NaN NaN NaN NaN NaN NaN NaN NA
6 Combo4 <aggregat… Test -86.3 380. 252. -21.3 27.8 NaN NaN -5.53e-2
7 Combo5 <aggregat… Test -107. 390. 271. -23.9 30.3 NaN NaN -4.60e-2
8 ETS <aggregat… Test -112. 389. 273. -23.6 29.9 NaN NaN -4.63e-2
9 ETSLev… <aggregat… Test -76.2 378. 244. -20.6 27.1 NaN NaN -8.83e-2
10 ETSlog <aggregat… Test -132. 409. 297. -27.6 34.0 NaN NaN -3.23e-4
11 K = 2 <aggregat… Test -106. 392. 273. -25.6 32.1 NaN NaN -6.94e-3
12 NNET <aggregat… Test 61.0 480. 212. -1.19 16.2 NaN NaN -3.96e-1
13 NNETX <aggregat… Test NaN NaN NaN NaN NaN NaN NaN NA
Look at the aggregates.
COVIDFCD %>%
filter(is_aggregated(state)) %>%
autoplot(NYTAgg.COVID %>%
filter(date > as.Date("2021-03-01")), level = NULL)
## Diagnostics
COVID.ModelsD %>%
select(ARIMAX) %>%
report()
Series: New.Deaths
Model: LM w/ ARIMA(1,1,3)(1,0,1)[7] errors
Transformation: box_cox(New.Deaths, lambdaD)
Coefficients:
ar1 ma1 ma2 ma3 sar1 sma1 Day.of.Week.L
0.0683 -0.5906 -0.1000 -0.0419 0.5341 -0.3426 1.3388
s.e. 0.4418 0.4385 0.2394 0.0936 0.3471 0.3860 0.1001
Day.of.Week.Q Day.of.Week.C Day.of.Week^4 Day.of.Week^5 Day.of.Week^6
-1.5666 -0.0362 0.3687 -0.4623 0.1221
s.e. 0.1084 0.0892 0.0831 0.0772 0.0731
sigma^2 estimated as 0.2794: log likelihood=-278.8
AIC=583.6 AICc=584.64 BIC=634.26
COVID.ModelsD %>%
select(ARIMAX) %>%
augment() %>%
gg_tsdisplay(.resid)
Plot the Winner
COVIDFCD %>%
filter(is_aggregated(state) & .model == "ARIMAX") %>%
autoplot(NYTAgg.COVID %>%
filter(is_aggregated(state) & date > as.Date("2021-03-01")))
The Median Forecast?
COVID.Model.ARIMAX <- COVID.Agg.Train %>%
filter(is_aggregated(state)) %>%
model(ARIMAX = ARIMA(box_cox(New.Deaths, lambdaD) ~ Day.of.Week))
Med.D.FC <- COVID.Model.ARIMAX %>%
forecast(new_data = COVID.Agg.Test, point_forecast = list(.median = median),
simulate = TRUE, bootstrap = TRUE, times = 10000)
Med.D.FC %>%
accuracy(COVID.Agg.Test)
# A tibble: 1 x 11
.model state .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ARIMAX <aggregated> Test -58.6 374. 228. -18.8 25.4 NaN NaN -0.0603
The Future
Day.of.Week <- data.frame(date = Sys.Date() + days(0:13)) %>%
mutate(Day.of.Week = wday(as.Date(date), label = TRUE)) %>%
as_tsibble(., index = date)
AggD <- NYTAgg.COVID %>%
filter(is_aggregated(state)) %>%
as_tsibble(., index = date, key = NULL)
FutureFC <- AggD %>%
model(ARIMAX = ARIMA(box_cox(New.Deaths, lambdaD) ~ Day.of.Week)) %>%
forecast(new_data = Day.of.Week, point_forecast = list(.median = median), simulate = TRUE,
bootstrap = TRUE, times = 10000)
FutureFC %>%
autoplot(AggD)