---
title: "A Bayesian Latent Variable Model for the Optimal Identification of Disease Incidence Rates Given Information Constraints"
date: "April 5th, 2024"
author:
- Robert Kubinec:
email: [email protected]
institute: nyuad
correspondence: true
- Luiz Max Carvalho:
institute: gvf
- Joan Barceló:
institute: nyuad
- Cindy Cheng:
institute: tum
- Luca Messerschmidt:
institute: tum
- Matthew Sean Cottrell:
institute: ucr
institute:
- gvf: School of Applied Mathematics, Getulio Vargas Foundation, Brazil
- tum: Hochschule für Politik at the Technical University of Munich (TUM) and the TUM School of Governance, Munich, Germany
- nyuad: Social Science Division, New York University Abu Dhabi, Abu Dhabi, United Arab Emirates
- ucr: University of California Riverside, United States of America
toc: false
output:
bookdown::pdf_document2:
keep_tex: true
includes:
in_header:
preamble2.tex
pandoc_args:
- '--lua-filter=scholarly-metadata.lua'
- '--lua-filter=author-info-blocks.lua'
bibliography: BibTexDatabase.bib
abstract: "We present an original approach for measuring infections as a latent variable, making use of serological and expert surveys to provide ground-truth identification during the early pandemic period. Compared to existing approaches, our model relies more on empirical information than on strong structural forms, permitting inference about cumulative infections with relatively few assumptions. We also incorporate a range of political, economic, and social covariates to richly parameterize the relationship between epidemic spread and human behavior. To show the utility of the model, we provide robust estimates of total infections that account for biases in COVID-19 case and test counts in the United States from March to July of 2020, a period when accurate data about the nature of the SARS-CoV-2 virus was of limited availability. In addition, we show how sociopolitical factors like the Black Lives Matter protests and support for President Donald Trump are associated with the spread of the virus via changes in fear of the virus and cell phone mobility.\\footnote{A reproducible version of this paper is available as an Rmarkdown file at \\url{https://github.com/CoronaNetDataScience/covid_model}}."
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE,warning=FALSE,message=FALSE,fig.width=6,fig.asp=0.618,dpi=300)
require(dplyr)
require(tidyr)
require(ggplot2)
require(stringr)
require(lubridate)
require(bayesplot)
require(historydata)
library(posterior)
require(readr)
require(datasets)
require(extraDistr)
require(patchwork)
require(RcppRoll)
require(readxl)
require(ggrepel)
require(missRanger)
require(cmdstanr)
require(viridis)
library(ggthemes)
# update this package /w data
set.seed(662817)
knitr::opts_chunk$set(warning=F,message=F,dev="png")
# NEED THESE GITHUB REPOS IN YOUR HOME FOLDER:
# https://github.com/COVID19Tracking/covid-tracking-data
# https://github.com/nytimes/covid-19-data
#system2("git",args=c("-C ~/covid-tracking-data","pull"))
#system2("git",args=c("-C ~/covid-19-data","pull"))
# whether to use a subset of states (for testing purposes)
state_filter <- c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'NA', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY')
# what type of model to fit
# one of prior_expert, prior_only or full (includes likelihood)
model_type <- "prior_expert"
# set treedepth (determines sampler complexity)
treedepth <- 10
nchains <- 2
nsamples <- 2000
adapt_delta <- 0.90
# whether to run model (it will take a few hours) or load saved model from disk
run_model <- F
# whether to use fresher coronanet policy data
new_policy <- F
# whether to pull new cases/tests from NYT/COVID project
new_cases <- F
# whether to re-calculate QOIs from posterior distribution
# can take quite a while
calc_qoi <- F
```
\newpage
The COVID-19 pandemic has led to a significant increase in disease modeling as the demands of a worldwide emergency spurred substantial innovation. Accurately modeling COVID-19 and similar diseases is no simple feat, however, due to multiple forms of selection and measurement bias that are difficult to overcome. The approach we present in this paper differs from existing work by conceptualizing the relative level of infections as a time-varying latent variable and applying Bayesian techniques to obtain a posterior distribution over likely estimates. Our model's framework is similar in spirit to election forecasting models that produce latent estimates of candidate support from biased time-varying information, i.e., polls [@linzer2013; @lock2010]. Instead of polling data, we directly incorporate serological and expert survey estimates into the model in a way that allows us to identify the scale of the latent variable without having to make extensive assumptions about transmission dynamics.
This approach has two advantages for applied research. First, the minimal set of assumptions makes the model more robust than traditional compartmental models by incorporating as much uncertainty as possible about the infection rate. Existing approaches based on the SIR/SEIR framework often require multiple hard-coded values due to the large number of dynamic compartments that must be estimated. When there is a lack of quality empirical information about disease transmission dynamics, such as in the early pandemic period, these assumptions can be difficult to justify. By employing a Bayesian model with informative priors, we can obtain estimates that have realistic uncertainty about the state of the world while also incorporating the best available prior information.
For example, in one of the earliest studies of COVID-19 transmission in the Wuhan area of China by @kucharski2020, the authors had to use strongly informative prior distributions for the "incubation period of COVID-19 cases" (5.2 days, SD 3.7), the "delay from onset to isolation" (6.1 days, SD 2.5), the "delay from onset to reporting" (6.1 days, SD 2.5), as well as additional assumptions about the contact mixing rate and travel between Chinese regions. We note that these are assumptions required by the model, and even with clever efforts to collect data on all of these variables, such as estimating infection duration from the *Diamond Princess* cruise ship COVID-19 outbreak in February of 2020 [@riou2020], there are clear limitations to relying on assumptions about disease transmission and reporting dynamics without a large base of empirical evidence.
Second, conceptualizing the infection rate as a latent variable within a generative modeling framework permits us to examine covariate relationships in a manner that is more nuanced than traditional regression modeling. As we show in this paper, we can employ a wide range of covariates that predict the infection rate and consequently dramatically reduce the uncertainty of our estimates without having to assume any prior relationship between covariates and the infection rate (also known as the attack rate). Furthermore, we are able to examine mediation relationships between covariates and the infection rate, which permits us to be more specific about the pathways through which covariates may be associated with infections. Especially given how difficult it is to achieve causal identification with observational epidemiological data, we believe that employing causal graphs to test for plausible mediation relationships can improve our understanding of potential disease transmission dynamics.
To apply the model, we estimate the infection rate in the United States from March to July of 2020, a crucial time in the pandemic as substantial uncertainty existed about how to respond to the disease [@perra2021]. This uncertainty manifested itself in rapidly changing patterns of human behavior and also political conflict as leaders diverged over their understanding of the disease's threat. While we have increasing evidence about the relationship between partisan identities and individual beliefs about COVID-19 [@fan2020; @Grossman202007835], data about partisan identity, along with other social and economic covariates, are rarely included in efforts to model and predict the spread of the disease in the population [@flaxman2020; @sharma2020; @haug2020]. As a result, we have difficulty understanding the causal pathways through which political and social variables affect the course of the pandemic [@gadarian2022], and in turn are affected by it. In this paper we disaggregate the effects of partisanship and demographic factors on disease outcomes by examining the way that these variables are mediated by individual mobility, reported masking and fear of the virus. Even when we cannot make exclusive claims of causal identification, we can still learn in much more detail about time-varying associations between covariates and the disease and the plausible pathways through which beliefs affected actions.
We show with this model that political partisanship in the United States had a very strong association with the spread of the pandemic by increasing or reducing people's fear of the virus and also by changing their mobility patterns. A 2% increase in a state's vote share for U.S. President Donald Trump in 2016 is associated with a 0.5% to 0.7% increase in a state's infections mediated by unsafe changes in mobility patterns.
We also find evidence that in-person political activity is positively associated with the spread of the disease: states that saw a 1-SD increase in social justice protests following the death of George Floyd experienced an increase in infections as high as 0.7% over time, although only among states with continuous protest activity. On the other hand, we do not find that the protests reduced people's fears of the disease or noticeably changed mobility patterns, suggesting that the spread of the disease happened solely through increased personal contact at the protests and subsequent chains of transmission rather than by changing behavioral patterns over the long term.
# Background
As more and more data has become available on observed case counts of the SARS-CoV-2 coronavirus, there have been increasing attempts to infer how contextual factors like government policies, partisanship, and temperature affect viral spread [@carleton2020; @sajadi2020; @dudel2020; @tansim2020; @brze2020]. The temptation to make inferences from the observed data, however, can result in misleading conclusions. Modeling approaches that fully account for disease dynamics, like the SIR/SEIR specifications, are very powerful but also require more information than is known about disease progression in the population--especially in its early stages--forcing researchers to rely on assumptions that are difficult to justify with complete confidence [@flaxman2020; @ferguson2020]. For this reason, in this paper we present a retrospective Bayesian model that can adjust for testing bias by estimating the unobservable infection rate up to an unidentified constant. Furthermore, by incorporating information from serological and expert surveys of infection prevalence, it is possible to put an informative prior on the unobserved infection rate and estimate both recent disease trends and the association of covariates with the historical spread of the disease. By employing a fully Bayesian approach, we allow our uncertainty about this prior information to propagate through the model, ensuring that we are not over-confident in our predictions of the latent infection rate.
<!--# We can summarize the problem of modeling COVID-19 (and diseases more generally) in terms of two main challenges. The first is the challenge of modeling the spread of the disease given the limitations of testing and reported deaths, which could obfuscate the effect of any covariates with data reporting issues. Second, employing observational data requires nuanced comparisons to be made. To learn the effect of a stay-at-home order, for instance, we would want to compare two regions with similar demographic, social and political characteristics as these factors could be also influencing human behavior, masking the effect of the stay-at-home order. For example, regions with less political partisanship may be more likely to take prudent behaviors to mitigate COVID-19 and also may be more likely to see stay-at-home orders implemented. -->
An additional advantage of our method is that it permits us to make more precise statements about whether theoretically important covariates for the spread of disease are associated with the latent infection rate. A vast and expanding literature documents connections between many political, economic, and social factors and human behavior related to the COVID-19 pandemic [@abouk2021a; @adolph2021; @allcott2020a; @ashraf2020; @barceló2022; @bo2021; @brauner2020a; @courtemanche2020a; @dave2020b; @fellows2020; @flaxman2020b; @haug2020a; @islam2020; @barceló2020; @sebhatu2020; @sharma2021; @zheng2020]. While existing studies have shown these associations primarily through surveys and other individual-level analyses, it is difficult to test whether these factors jointly have any effect on COVID-19 infections. The difficulty arises because these variables affect human behavior in general equilibrium. For example, non-pharmaceutical interventions (NPIs) like stay-at-home orders have been associated with reduced infections, but stay-at-home orders were also implemented in a rapidly changing environment as public health policies, new suppression practices like masking, and the health of the economy varied. People faced myriad influences on their choices during the pandemic, and even if we have a strong reason to believe that a certain factor should influence their behavior, estimating that effect when many other contravening and contrasting factors were likely at play is challenging.
At the same time, estimating these general equilibrium effects even within the limitations of available data is very important to learn what factors are associated with the spread of COVID-19 in realistic conditions. For example, some argued that masking would lead to increased infections because it would reduce concern over the risk of infection [@abaluck2020]. Evaluating this hypothesis ultimately requires general equilibrium analysis as it involves competing influences on human behavior. In other words, is the moral hazard of being falsely protected a greater threat than the positive benefits of reducing infections via masking? Being able to sort, rank and understand socio-economic, political and healthcare-related factors behind the disease's spread is crucial to better understand why and how COVID-19 overwhelmed countries' disease control systems even when we lack a means of causal identification.
In this paper, we seek to address these questions by collecting a set of important covariates, implementing models that adjust for bias in COVID-19 data, and employing mediation analysis to understand the pathways through which covariates affect the spread of the pandemic. We believe that pathway analysis allows us to uncover meaningful associations that do not obfuscate different time-varying processes. While causal identification in the pandemic is a non-trivial endeavor, employing models that can more realistically evaluate available data is arguably the best path forward.
With this model, we are also able to address important empirical debates about the sociopolitical factors behind the spread of the virus such as political partisanship. Political scientists have investigated to what extent partisanship has inhibited preventive measures against the COVID-19 pandemic as President Trump argued against public health policies like face masks [@gadarian2022]. Research has already shown that Republicans are less likely than Democrats to practice public health behaviors like hand washing [@gadarian2020], to practice social distancing [@andersen2020; @alcott2020; @qiu2020], and to comply with policies targeted against COVID-19 [@fan2020; @Grossman202007835].
While partisanship in favor of President Trump and the Republican party has received the most attention, other types of political mobilization have also come under scrutiny. Of particular note were the protest movements against police brutality that spread across the United States in the summer and fall of 2020. Existing research suggests the protests have not had an adverse effect on COVID-19 infections [@protest2020], though the finding is again limited by the measurement bias we describe later. As such, it is clear that political motivations on both the left and the right have been associated with reduced compliance with COVID-19 precautions, though it is not clear through which potential pathways these variables could be affecting disease outcomes.
<!-- The most important of these, which we also study in this article, are the role of government policies to prevent close personal interaction, which are often classified under the umbrella of non-pharmaceutical interventions (NPIs). Some of the most sophisticated of these studies, which employ state-of-the-art epidemiological models of COVID-19, have examined how country-level differences in the implementation of stay-at-home orders and business closures affected the spread of the pandemic in the critical early period [@flaxman2020; @sharma2020; @haug2020]. While these studies have emphasized the difficult inference issues involved with modeling COVID-19 data, they have largely avoided adjusting NPI estimates with sociopolitical covariates like partisanship, making an implicit assumption that the effect of NPI estimates is independent of these types of human behaviors and identities. For this reason, while these studies help us know much more precisely how NPIs affected the spread of the pandemic through exact measures like the reproduction number, they are more limited in making stronger claims of identification of the NPIs and even through what channels NPIs affect human behavior [@sharma2020a].-->
<!-- Studies of NPIs and other factors arising in the social sciences, on the other hand, tend to address socio-political issues more directly, though they also generally opt for simpler models such as event studies that are easier to estimate with richer covariate sets [@allcott2020; @dave2020]. These studies sometimes also employ cell phone mobility data as a proxy for infections with the assumption that reduced mobility will reduce human contact and consequently the spread of COVID-19. These studies contain more realism in terms of the factors that could explain COVID-19 at the expense of the more sophisticated methods for modeling the spread of the disease. However, even these studies do not normally include relatively novel factors such as partisanship, and do not generally examine the mediation pathways through which policies and demographics may influence behavior. Furthermore, it is difficult to assess whether the more conventional models employed are able to appropriately address the limitations of COVID-19 data. -->
<!-- Our aim in this paper is to include variables of interest to both the epidemiological and social-scientific literatures, with a particular emphasis on decomposing the channels through which covariates influence human behavior. By doing so, we hope to point out what are in fact the most important factors in the myriad of possible influences on human behavior. One of our central arguments in doing so is that we cannot afford to ignore unconventional factors like partisanship even in models that focus solely on the spread of the disease. -->
<!-- Our argument about the importance of partisanship as a covariate can be expressed with the following two hypotheses: -->
<!-- > H1: Higher levels of partisanship as measured by President Trump's 2016 vote share, Trump daily approval ratings and participation in racial justice protests are associated with increased COVID-19 infections even when accounting for government policies and demographic factors. -->
<!-- > H2: Increased partisanship in favor of President Trump is associated with increased COVID-19 infections by reducing fears of the severity of the disease and by encouraging risky patterns of mobility. -->
<!-- In the next section, we discuss our statistical method for testing these hypotheses and estimating the effect of NPIs and other factors. -->
# Methods
<!-- As more and more data has become available on observed case counts of the SARS-CoV2 coronavirus, there have been increasing attempts to infer how contextual factors like government policies, partisanship, and temperature affect the disease's spread [@carleton2020;@sajadi2020;@dudel2020;@tansim2020;@flaxman2020;@brze2020]. The temptation to make inferences from the observed data, however, can result in misleading conclusions. For example, some policy makers have publicly questioned whether the predictions of epidemiological models are far worse than the observed case count.^[See article available at https://www.realclearpolitics.com/video/2020/03/26/dr_birx_coronavirus_data_d] By contrast, in this paper we show that the unobserved infection rate obscures any estimates of covariates because the infection rate influences counts of both COVID-19 cases and tests. For this reason, in this paper we present a retrospective Bayesian model that can adjust for this bias by estimating the unseen infection rate up to an unidentified constant. Furthermore, by incorporating informative priors based on serological surveys of infection prevalence, it is possible to put an informative prior on the unobserved infection rate and estimate both recent disease trends and the effect of covariates on the historical spread of the disease. -->
<!-- In this section we present a formal definition of the model. We refer the reader to the supplementary materials for details of Monte Carlo simulations showing recovery of the latent infection rate. -->
<!-- Existing models that address these questions generally fall into one of two categories. In the first, models by social scientists investigate how social, economic and political variables affect and are affected by the spread of COVID-19. These models generally attempt to avoid data measurement errors through implementing existing methods, such as time-to-event models and difference-in-difference estimation [@dave2020; @courtemanche2020; @protest2020; @abouk2021]. In a different vein, a growing literature examines the role of NPIs, or government policies attempting to control the pandemic. These studies tend to use more complicated epidemiological models, known as compartmental models, that are more suited to the complexity of the data, but also tend to have much less sophistication in terms of the range of covariates employed for adjustment [@flaxman2020; @ferguson2020; @haug2020; @brauner2020]. The more robust modeling of the disease comes at the cost of being able to employ more extensive and more nuanced covariate adjustment, such as the mediation analysis we employ in this paper. -->
Fitting models that realistically capture disease trends during the early pandemic period, when data are limited, is quite difficult. To address this crucial problem, we present a new Bayesian latent variable model that shares the aim of epidemiological disease-tracking models in that it is designed explicitly to model disease dynamics. However, our model is a significant simplification of the compartmental models employed by epidemiologists to study viruses, and in particular SARS-CoV-2 [@peak2020; @riou2020; @verity2020; @perkins2020; @lourenco2020; @li2020; @ferguson2020; @carleton2020; @sajadi2020; @dudel2020; @tansim2020; @flaxman2020; @brze2020]. These models suppose different classes (compartments) of individuals in the population, denoted $S$ for susceptible, $I$ for infectious, and $R$ for removed (other compartments may be added, such as $E$ for exposed).
While these models are a powerful expression of the progress of a disease in the population, they often struggle to provide robust estimates when relying on limited and possibly biased data about disease transmission dynamics. COVID-19 data, especially in the early pandemic period, had serious flaws, including limited testing and under-reporting of hospitalizations and deaths [@larremore2020; @sánchez-romero2021; @moein2021; @bertozzi2020]. When such data are unavailable, modelers can compensate by simulating plausible random values or using informative prior distributions, but this ties the model estimates to the particular set of values used [@grinsztajn2021]. As a result, the challenges of estimating compartmental models with empirical data restrict the ability to interpret covariate adjustment. Any estimated associations are tied to the particular values used to identify the models, and it is not always possible to marginalize over all possible (or even a reasonably broad range of) hard-coded values via simulations.
By contrast, this paper endeavors to estimate a more limited quantity than each dynamic measure of infected, recovered, and susceptible persons. We believe that many researchers and the general public often only want to learn about what has already happened, or the *empirical* infection rate (also called the attack rate in the epidemiological literature). For a number of time points $t \in T$ since the outbreak's start and states $c \in C$, we aim to identify the following quantity:
$$
f_t \left (\frac{I_{ct}}{S_{ct}+R_{ct}} \right )
$$
<!-- Assuming a fixed population size, this quantity is simply the marginal rate of infections in the population up to the present. The function $f_t$ determines the historical time trend of the rate of infection (which is assumed to be same across countries/regions) in the population up to time $T$, the present. Because the denominator is shifting over time due to disease progression dynamics, this model is only useful for retrospection, i.e., to examine factors that may be influencing the empirical time trend $f_t$. As $S_{ct}$ and $R_{ct}$ are exogenous to the model, it cannot predict future prevalence of the disease given that it does not determine these crucial factors. -->
<!-- In other words, this model can be seen as a local linear approximation to the $I_{ct}$ curve from an SIR model. -->
where $I_{ct}$ denotes the number infected with SARS-CoV-2 at time $t$ and $S_{ct}$ and $R_{ct}$ denote those who remain susceptible to the virus and those who have either died or recovered. In our model, we collapse $S_{ct}$ and $R_{ct}$ to a single quantity--those who are not infected--so we can focus exclusively on identifying $I_{ct}$.
However, even with this simplification, we do not have estimates of the actual infection rate $I_{ct}$, only positive COVID-19 cases $a_{ct}$ and numbers of COVID-19 tests $q_{ct}$, due to the aforementioned measurement issues. Given this limitation, the aim of the model is to backwards infer the infection rate $I_{ct}$ as a latent process given observed test and case counts. Modeling the latent process is necessary to avoid bias in using only observed case counts as a proxy for $I_{ct}$. The reason for this is shown in Figure 1, in which a covariate $X_{ct}$, such as a stay-at-home order, is hypothesized to affect the infection rate $I_{ct}$. Unfortunately, increasing infection rates can cause both increasing numbers of observed cases $a_{ct}$ and tests $q_{ct}$. As more people are infected, more tests are likely to be done, which will increase the number of cases independently of the infection rate. As a result, due to the back-door path from the infection rate $I_{ct}$ to case counts $a_{ct}$ via the number of tests $q_{ct}$, it is impossible to infer the association of $X_{ct}$ with $I_{ct}$ from the observed data alone without modeling the latent infection rate.
```{=tex}
\begin{figure}
\label{tikzfig}
\caption{Directed Acyclic Graph Showing Confounding of Covariate $X_{ct}$ on Observed Tests $q_{ct}$ and Cases $a_{ct}$ Due to Unobserved Infection Rate $I_{ct}$}
\ctikzfig{policy_dag}
\footnotesize{Figure shows the relationship between a covariate $X_{ct}$ representing a policy or social factor influencing the infection rate $I_{ct}$. Because the infection rate $I_{ct}$ influences both the number of reported tests $q_{ct}$ and reported cases $a_{ct}$, any regression of a covariate $X_{ct}$ on the reported data will be biased. Latent variables are shown as circles and observed variables as rectangles.}
\end{figure}
```
To estimate the process in Figure 1, we assume that the unobserved state-specific cumulative infection rate $I_{ct}$ can be modeled as a time-varying Beta-distributed random variable with a mean parameter $\mu \in (0,1)$ and shape parameter $\phi>0$. We employ the Beta distribution for this parameter because the cumulative infection rate must lie between 0 and 1 (i.e., a proportion) once the pandemic has started.
It is important to understand how the model incorporates time. We estimate one cumulative infection rate for each state for each time point $t$. In order to take into account time dependence, we include a shared third-order polynomial time trend that is a function of the number of post-outbreak time periods $T_O < T$, where an outbreak begins at the first reported case in a given state. In other words, we expect the residual time process to continue at the same rate across states once the pandemic begins in a given state.
We employ a cubic time function based on theoretical considerations. In our model, the polynomial represents the rate of infection increase in the absence of any other covariates, or equivalently the *counterfactual* rate of infections. We know from the SIR/SEIR simulations that, in the absence of any countervailing measures, epidemics occur in ever-increasing waves until the herd immunity threshold is reached, although the curve is unlikely to be symmetric as a quadratic function would require. As such, we employ this function because it represents a credible baseline for what the epidemic would do if no other factors impeded its spread.
For the same reason, we use a shared polynomial function because, during this early pandemic period, we know that the states are experiencing infections from the same virus. As such, we expect that the counterfactual or residual trajectory of the pandemic to be the same across states. If we were to allow this time trend to vary across states in a multilevel structure, we would mistakenly absorb the effect of covariates as heterogeneity in the viral strain.
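As a concrete illustration of this shared time trend, the short sketch below (using a hypothetical toy data frame rather than the merged data built later in this document) counts post-outbreak days since each state's first reported case and forms the cubic terms.

```{r, eval=FALSE}
library(dplyr)
# Hypothetical toy data: one row per state-day with reported cumulative cases
toy <- tibble(state = rep(c("NY", "WA"), each = 5),
              day   = rep(1:5, 2),
              cases = c(0, 1, 3, 7, 12, 0, 0, 2, 5, 9))
toy <- toy %>%
  group_by(state) %>%
  # post-outbreak time t_o: days since the first reported case, zero beforehand
  mutate(t_o  = pmax(day - min(day[cases > 0]), 0),
         t_o2 = t_o^2,     # quadratic term of the shared trend
         t_o3 = t_o^3) %>% # cubic term of the shared trend
  ungroup()
```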
We define the conditional distribution of the unobserved infection rate $I_{ct}$ as:
```{=tex}
\begin{align}
\operatorname{Pr}(I_{ct} \mid t=T) &\sim \operatorname{Beta}(\mu \phi, (1 - \mu)\phi)\\
\mu = & g^{-1}(\alpha_1 + \beta_{I1}t_o + \beta_{I2}t_o^2 + \beta_{I3}t_o^3 +
\beta_C X_{ct})
(\#eq:binom)
\end{align}
```
This parameterization of the Beta distribution in terms of $\mu$ and $\phi$ follows from the Beta regression literature [@ferrari2010] so that we can model the expected value $E[I_{ct}]$ directly via $\mu$. As such, we use $g^{-1}(\cdot)$, the inverse logit function, to scale the linear model in $\mu$ to the $(0,1)$ interval. The three $\beta_{Ii}$ are polynomial coefficients of the number of post-outbreak time periods $t_o$.
The parameter vector $\beta_C$ represents the effect of independent covariate matrix $X_{ct}$ on the latent infection rate. These are our main variables of interest, and have effects in addition to the polynomial time trends. Finally, the parameter $\phi$ becomes a dispersion parameter which, intuitively, equals the effective sample size of the beta-binomial model. For each day $t$, $\phi$ is equal to the amount of information in the linear model about cases and tests expressed as the size of a random sample from the population.
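To make the mean-precision parameterization concrete, the sketch below (all numbers hypothetical) draws from a Beta distribution specified through $\mu$ and $\phi$ and checks that the draws center on the intended expected value.

```{r, eval=FALSE}
mu  <- plogis(-3 + 0.05 * 30)   # inverse logit of an illustrative linear predictor
phi <- 50                       # illustrative dispersion ("effective sample size")
draws <- rbeta(1e4, mu * phi, (1 - mu) * phi)
c(expected = mu, sample_mean = mean(draws))  # the two should agree closely
```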
<!-- The second way suppression measures enter the model is through $\beta_{S2}X_c$, which can increase over time as the disease increases. This parameter reflects possible measures which will grow more effective as domestic transmission of the disease increases (i.e. as the polynomial time trend takes off). -->
<!-- As such, it is assumed that any deviation from the common domestic transmission pattern is due to these time-varying suppression measures. -->
Because we do not have measures of $I_{ct}$, we need to use the observed data, tests $q_{ct}$ and cases $a_{ct}$, to infer $I_{ct}$. First, we propose that the number of infections will almost certainly increase the number of tests as states try to stop the disease's spread via surveillance. Second, we can assume that a rising infection rate is associated with a higher ratio of positive results (reported cases) conditional on the number of tests, that is, COVID-19 is causing positive test results. We model both of these observed indicators, tests and cases, jointly to simultaneously adjust for the infection rate's influence on both factors. It is this joint modeling that permits us to directly incorporate testing bias. In fact, our model learns about the infection rate from the absolute number of tests.
To model the number of tests, we assume that each state has an unobserved level of testing capacity, which increases at a non-linear rate during the course of the epidemic. We employ a quadratic function of testing capacity to express the concept of diminishing marginal returns. States were able to ramp up testing once PCR tests were approved by the FDA, but faced constraints due to shortages of supplies, personnel and labs. The cumulative number of observed tests $q_{ct}$ for a given time point $t$ and state $c$, expressed as a fraction of the state's population $c_{p}$, then has a binomial distribution:
```{=tex}
\begin{equation}
q_{ct} \sim \operatorname{Binomial}(c_{p}, g^{-1}(\alpha_2 + \beta_b I_{ct} +
\beta_{cq1}L_t + \beta_{cq2} L_t^2)).
(\#eq:binom2)
\end{equation}
```
The parameters $\beta_{cq1}$ and $\beta_{cq2}$ represent the quadratic increase in testing capacity that varies by state $c$ for each post-outbreak time point $L_t$. We similarly allow for partial pooling of these coefficients as testing capacity will show a limited level of variability across states. The parameter $\beta_b$ then represents the independent contribution of the level of infections $I_{ct}$ to the total number of tests demanded, marginal of testing capacity. The intercept $\alpha_2$ indicates how many tests would be performed in a state with an infection rate of zero and at time $t=0$, and as such is likely to be very low.
<!-- Given the parameters $\beta_{cq1}$ and , a state could test almost no one or test far more than are actually infected depending on their willingness to impose tests. -->
<!-- Because the capacity to test changed significantly over time, we include a linear time interaction (denoted $L_t$) to allow testing capacity to adjust accordingly.^[For a very compelling visualization of this process with empirical data from the COVID-19 pandemic, we refer the reader to this website: https://ourworldindata.org/grapher/covid-19-tests-cases-scatter-with-comparisons.] -->
The binomial model for the number of observed tests $q_{ct}$ provides some information about $I_{ct}$, but not enough for useful estimates. We can learn much more about $I_{ct}$ by also modeling the number of observed cases $a_{ct}$ as another binomial random variable expressed as a proportion of the state population, $c_p$:
```{=tex}
\begin{equation}
a_{ct} \sim \operatorname{Binomial}(c_p, g^{-1}(\alpha_3 + \beta_a I_{ct})),
(\#eq:binom3)
\end{equation}
```
where $g^{-1}(\cdot)$ is again the inverse logit function, $\alpha_3$ is an intercept that indicates how many cases would test positive with a cumulative infection rate of 50% (i.e., zero on the logit scale), and $\beta_a$ is a scaling parameter that reflects how much information about case counts comes from the latent infection process. The multiplication of this parameter and the infection rate determines the cumulative number of cases, $a_{ct}$, as a proportion of the state population, $c_p$.
To summarize the model, infection rates determine how many tests a state is likely to undertake and also the number of positive tests (confirmed cases). This simultaneous adjustment helps avoid misinterpreting the observed data by taking into account varying testing levels, which have made it hard to generalize findings about the disease across health jurisdictions. It also allows us to learn the likely location of the infection rate conditional on what we observe in terms of tests and cases.
Because sampling from a model with a hierarchical Beta parameter can be difficult, we simplify the likelihood by combining the beta distribution and the binomial counts into a beta-binomial model for tests:
```{=tex}
\begin{align}
q_{ct} & \sim \operatorname{Beta-Binomial}(c_p, \mu_q \phi_q, (1-\mu_q) \phi_q)\\
\mu_q &= g^{-1}(\alpha_2 + \beta_b I_{ct} +
\beta_{cq1}L_t + \beta_{cq2}L_t^2)
(\#eq:binom4)
\end{align}
```
and cases:
```{=tex}
\begin{align}
a_{ct} &\sim \operatorname{Beta-Binomial}(q_{ct}, \mu_a \phi_a, (1 - \mu_a) \phi_a) \\
\mu_a &= g^{-1}(\alpha_3 + \beta_a I_{ct}).
(\#eq:binom5)
\end{align}
```
where $I_{ct}$ is now equal to the linear model shown in \@ref(eq:binom) and implicitly mapped to $(0,1)$ as a component of $\mu_a$.
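As a rough generative check of this likelihood, the sketch below simulates tests and cases for a single state-day from the two beta-binomial distributions using `extraDistr` (loaded in the setup chunk); all parameter values are hypothetical stand-ins rather than the estimates reported later.

```{r, eval=FALSE}
library(extraDistr)
c_p  <- 5e6                      # hypothetical state population
I_ct <- 0.03                     # hypothetical latent cumulative infection rate
phi  <- 200                      # hypothetical dispersion shared by both outcomes
# Tests: beta-binomial draw out of the state population (model for q_ct)
mu_q <- plogis(-6 + 4 * I_ct)    # illustrative alpha_2 and beta_b, omitting the time terms
q_ct <- rbbinom(1, c_p, mu_q * phi, (1 - mu_q) * phi)
# Cases: beta-binomial draw out of the simulated tests (model for a_ct)
mu_a <- plogis(-2 + 10 * I_ct)   # illustrative alpha_3 and beta_a
a_ct <- rbbinom(1, q_ct, mu_a * phi, (1 - mu_a) * phi)
c(tests = q_ct, cases = a_ct)
```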
## Identifiability
This model contains an unobserved latent process $I_{ct}$, and as such the model as presented is not identified from the data alone without further information. For example, the parameters that control the influence of the infection rate on tests and cases could increase and the latent infection rate could decrease without the probability of the observed data changing.
We take two further steps to identify this model that we believe represent very limited additional assumptions, especially compared to existing modeling approaches. First, we impose the assumption that $I_{ct}$ is a non-decreasing quantity. The number of infected people cannot decrease in an epidemic without a significant virus mutation, but the model as expressed does not require that to be true. We can eliminate that possibility from the model by imposing an ordered constraint on $I_{ct}$:
```{=tex}
\begin{equation}
I_{ct} = \begin{cases}
I_{ct} & \text{if } t=1\\
I_{ct-1} + e^{I_{ct}} & \text{if } 1 < t \leq T
\end{cases}
(\#eq:trans)
\end{equation}
```
This transformation forces $I_{ct}$ to be no less than $I_{ct-1}$. At the same time, we do not need to impose any constraints on the covariates themselves, allowing us to sample those in an unconstrained space before we transform $I_{ct}$.
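A minimal numerical sketch of this transformation, with made-up unconstrained values, shows how adding exponentiated increments guarantees a non-decreasing series.

```{r, eval=FALSE}
raw  <- c(-4, -2.5, 0.3, -1, 0.8)   # hypothetical unconstrained draws for one state
# The first element passes through; each later element adds exp() of the raw value
I_ct <- Reduce(function(prev, x) prev + exp(x), raw[-1],
               init = raw[1], accumulate = TRUE)
all(diff(I_ct) >= 0)                # TRUE: the transformed series can never decrease
```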
However, we also need some information about the empirical scale of testing bias to produce identified estimates of $I_{ct}$. We could do so by adding a prior to the model about the plausible ratio of total infections to reported cases, though we prefer to use information that is more precise. Our information about the possible level of infections comes from two sources. First, the Centers for Disease Control's serology surveys conducted during the pandemic represent an empirical way of relating $I_{ct}$ to plausible estimates of infections at varying time points. We include a list of these surveys for the time period under study in the supplementary information. Second, we incorporate expert survey data from @mcandrew2022, who surveyed epidemiologists in the early weeks of the pandemic to obtain their best estimates of the total level of infections. Helpfully, this survey provided a robust estimate of uncertainty by eliciting distributions over infections. Furthermore, this type of empirical data can be collected relatively rapidly, which increases this model's utility for future pandemics.
By including this information as informative priors, we also implicitly account for many of the variables explicitly parameterized in compartmental models such as reporting delays. Because we have an estimate of the number infected at time $t$ that is independent of reported cases and tests, the model will find the parameter estimates that are most likely given the observed differences between the surveys and the reported data.
Because we model the infection rate as a cumulative count, we can directly parameterize this information in the model. For the serological surveys, given a state $c$ and time point $t$ for which we have survey information, we model the count of infected $S_{ct}^P$ as a proportion of the total subjects in each serology survey $S^N_{ct}$ with the Binomial distribution:
```{=tex}
\begin{equation}
S^P_{ct} \sim \operatorname{Binomial}(S^N_{ct},g^{-1}(I_{ct}))
(\#eq:fix)
\end{equation}
```
For the expert survey data, which is expressed as a distribution of proportions $E_{ct}$, we use the Beta distribution to represent our uncertainty:
```{=tex}
\begin{equation}
E_{ct} \sim \operatorname{Beta}(g^{-1}(I_{ct})\phi_E,(1-g^{-1}(I_{ct}))\phi_E)
(\#eq:expert)
\end{equation}
```
where $E_{ct}$ is the average expert estimate and $\phi_E$ is an estimated shape parameter from the empirical distribution reported in @mcandrew2022. This empirical distribution is further stratified by state as the original estimates are for national totals; to do so we divide $E_{t}$ by the proportion of cases and tests that a given state $c$ had reported by that time $t$.
It is important that the serology and expert surveys enter the model in this fashion so that we can model the survey count stochastically and propagate our uncertainty from sample size and expert judgment through to our estimates of $I_{ct}$. This uncertainty matters as well because the serology surveys exhibit random noise and do not always increase over time, as can be seen in the supplementary information. By modeling the relationship as a probabilistic one, we are making the weaker assumption that the infected rate is probably close to the serology estimate, but the two do not need to be identical. The combined posterior estimates for $I_{ct}$ will then be weighted with the case and test likelihoods to produce the most credible estimate of $I_{ct}$.
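For intuition, a single serology survey enters the joint posterior as one binomial term; the sketch below (a hypothetical survey with 150 positives out of 2,000 subjects) shows its log-likelihood contribution at a given value of the latent infection rate.

```{r, eval=FALSE}
I_ct_logit <- -2.6   # hypothetical latent infection rate on the logit scale
# Binomial log-likelihood of 150 positives out of 2,000 sampled, given g^{-1}(I_ct)
dbinom(150, size = 2000, prob = plogis(I_ct_logit), log = TRUE)
```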
As we show in the supplemental information with simulations, no other identification restrictions are necessary to estimate the model beyond weakly informative priors assigned to parameters.
These are:
```{=tex}
\begin{align}
\beta_a &\sim \operatorname{Normal}(0,5),\\
\beta_{cqi} &\sim \operatorname{Normal}(\mu_{qi},\sigma_{qi}),\\
\sigma_{qi} &\sim \operatorname{Exponential}(100),\\
\mu_{qi} &\sim \operatorname{Normal}(0, 50),\\
\beta_{C} &\sim \operatorname{Normal}(0, 5),\\
\beta_{Ii} &\sim \operatorname{Normal}(0, 50),\\
\alpha_1 &\sim \operatorname{Normal} (0,10),\\
\alpha_2 &\sim \operatorname{Normal}(0,10),\\
\alpha_3 &\sim \operatorname{Normal}(0,10),
(\#eq:binom6)
\end{align}
```
where the Normal distribution is parameterized in terms of mean and standard deviation. The testing effect variance parameter $\sigma_{qi}$ receives a relatively tight prior to reflect that while we expect there to be some variability in inter-state testing production, this variability should not explain all of the variation in tests as infection rates also play an important role.
<!-- We note that a crucial advantage of this framework is providing a way to measured the count of infected adjusting for known biases in the number of tests. By comparing numbers of tests per capita and growth rates in cases across regions, the model is able to backwards infer a likely number of infected individuals in a given area. As such it exploits both within-area and between-area variance to adjust for the biases of imperfect testing. The wide variety of covariates we add to the model, which we describe in the next section, provide the mechanism through which the model can infer test/case relationships even in states which have not had a CDC serology survey. -->
```{=tex}
\begin{figure}
\label{tikzfig2}
\caption{Directed Acyclic Graph for Latent Infection Rate with Mediators}
\ctikzfig{policy_dag_mediate}
\footnotesize{This figure adds mediators $M_{ct}$ (mobility data) and $F_{ct}$ (fear of COVID-19) that mediate the relationship between state-level covariates $X'_{ct}$ and the latent infection rate $I_{ct}$. Because beliefs precede actions, $F_{ct}$ is causally prior to $M_{ct}$ and can affect infections both via reducing mobility (path $abd$) and directly apart from mobility (path $ae$), such as by encouraging individuals to remain socially distant. Latent variables are shown as circles and observed variables as rectangles.}
\end{figure}
```
We also extend this model to analyze the mediation of a subset of covariates $X'_{ct}$ by adding mediators $M_{ct}$ for mobility and $F_{ct}$ for fear of the disease to the causal diagram, as shown in Figure 2. Figure 2 has several paths because the covariates $X'_{ct}$ affect the two mediators differently. Given that beliefs and preferences precede actions, the covariates $X'_{ct}$ first influence $I_{ct}$ along the $ae$ and $abd$ paths through perceptions of how dangerous the disease is. These beliefs affect the chance of an individual becoming infected, and thus $I_{ct}$, directly on the path $ae$, such as by causing an individual to adopt social distancing behaviors, and also act along the indirect path $abd$, by which an increase in people's fear of the disease reduces mobility as people prefer to stay home.
In addition to pathways through the fear mediator $F_{ct}$, a causal factor like NPIs could influence infections along the pathway through mobility $ed$ without increasing or decreasing fear. This situation could arise if government policies forced people to stay at home against their will and despite their lack of concern about the disease. Finally, a covariate could have an unmediated direct effect $g$ on the infection rate. The total effect of a covariate $X'_{ct}$ on the spread of the disease is then the sum of all the paths, $abd + af + ed + g$. To calculate the indirect and direct effects given the use of the inverse logit function $g^{-1}(\cdot)$, we employ the chain rule as in @winship1983 to calculate the marginal effect of covariates with respect to different pathways to $I_{ct}$.
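The sketch below illustrates this chain-rule calculation with hypothetical path coefficients (none taken from our estimates): each mediated effect is a product of path coefficients scaled by the derivative of the inverse logit evaluated at the linear predictor, and the total effect is their sum.

```{r, eval=FALSE}
# Hypothetical path coefficients on the linear-predictor scale
x_to_fear   <- 0.40    # covariate -> fear of the virus
fear_to_mob <- -0.30   # fear -> mobility
x_to_mob    <- 0.15    # covariate -> mobility (no change in fear)
mob_to_inf  <- 0.20    # mobility -> infection linear predictor
fear_to_inf <- 0.10    # fear -> infection linear predictor (apart from mobility)
x_to_inf    <- 0.05    # unmediated direct effect
eta <- -2.5                                    # hypothetical linear predictor for I_ct
d_invlogit <- plogis(eta) * (1 - plogis(eta))  # derivative of the inverse logit (chain rule)
via_fear_mobility <- x_to_fear * fear_to_mob * mob_to_inf * d_invlogit
via_fear          <- x_to_fear * fear_to_inf * d_invlogit
via_mobility      <- x_to_mob * mob_to_inf * d_invlogit
direct            <- x_to_inf * d_invlogit
total <- via_fear_mobility + via_fear + via_mobility + direct
```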
Adding the mediators to the model does not require significant additional parametric assumptions as they can be included as Normal distributions (i.e., OLS regression) as in @Yuan2009. It should be noted that there are in fact five mobility covariates as explained in the following section, and so we explicitly model the covariance in mobility via a multivariate Normal distribution with a covariance matrix parameter $\Sigma_m$.
To add our mediation covariates $M_{ct}$ and $F_{ct}$, which we describe in more detail in the next section, we multiply the following likelihoods with the joint posterior:
```{=tex}
\begin{align}
M_{ct} &\sim MVN(\alpha_m + \beta_m X'_{ct},\Sigma_m)\\
F_{ct} &\sim N(\alpha_f + \beta_f X'_{ct}, \sigma_f)
(\#eq:mediate)
\end{align}
```
We also include all of $M_{ct}$ and $F_{ct}$ as linear predictors in \@ref(eq:binom).
We fit this model using Markov Chain Monte Carlo in the Stan software package [@carpenter2017]. We run the sampler for 4,000 iterations (2,000 used for warmup) across four independent chains to assess convergence.
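For reference, a minimal sketch of the estimation call with `cmdstanr` is shown below; the Stan file name and the `stan_data` list are placeholders rather than the actual objects built in this document, while `adapt_delta` and `treedepth` are set in the setup chunk.

```{r, eval=FALSE}
library(cmdstanr)
mod <- cmdstan_model("covid_latent_model.stan")   # placeholder Stan file name
fit <- mod$sample(
  data          = stan_data,    # placeholder: list of cases, tests, covariates, and surveys
  chains        = 4,
  iter_warmup   = 2000,
  iter_sampling = 2000,
  adapt_delta   = adapt_delta,  # defined in the setup chunk
  max_treedepth = treedepth     # defined in the setup chunk
)
```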
```{r munge_data,include=F}
if(calc_qoi) {
# vote share
# MIT Election Lab
load("data/mit_1976-2016-president.rdata")
vote_share <- filter(x,candidate=="Trump, Donald J.",
party=="republican",
writein=="FALSE") %>%
mutate(trump=candidatevotes/totalvotes)
# state GDP
state_gdp <- readxl::read_xlsx("data/qgdpstate0120_0_bea.xlsx",sheet="Table 3") %>%
mutate(gdp=Q1 + Q2 + Q3 +Q4)
# state-level unemployment (week-varying)
# too old to be of much use
unemp <- read_csv("data/simulation/unemployment/unemployment.csv")
# US Census data - population & percent foreign-born
# note: you need a Census API key loaded to use this -- see package tidycensus docs and use
# function census_api_key with the key number and install=T
# census_api_key(Sys.getenv("CENSUS_API_KEY"))
#
# acs_data <- get_acs("state",variables=c("B01003_001","B05002_013"),year=2018,survey="acs1") %>%
# select(-moe) %>%
# mutate(variable=recode(variable,
# B01003_001="state_pop",
# B05002_013="foreign_born")) %>%
# spread("variable","estimate") %>%
# mutate(prop_foreign=foreign_born/state_pop)
# saveRDS(acs_data, "data/census_data.rds")
acs_data <- readRDS("data/census_data.rds")
# population state density
area <- read_csv("data/pop_density.csv") %>% select(state_id,area)
acs_data <- left_join(acs_data,area,by=c("NAME"="state_id")) %>%
mutate(density=state_pop/area)
# health data
health <- read_csv("data/2019-Annual.csv") %>%
filter(`Measure Name` %in% c("Air Pollution","Cardiovascular Deaths","Dedicated Health Care Provider",
"Population under 18 years", "Public Health Funding","Smoking")) %>%
select(`Measure Name`,state="State Name",Value) %>%
distinct %>%
spread(key="Measure Name",value="Value")
merge_names <- tibble(state.abb,
state=state.name)
if(new_cases) {
# google mobility data
goog_mobile <- read_csv("https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=b8b0c30cbee5f341",
col_types = cols(sub_region_2=col_character())) %>% filter(sub_region_1 %in% merge_names$state,
is.na(sub_region_2),
country_region=="United States") %>%
rename(state=sub_region_1,
retail="retail_and_recreation_percent_change_from_baseline",
grocery="grocery_and_pharmacy_percent_change_from_baseline",
parks="parks_percent_change_from_baseline",
transit="transit_stations_percent_change_from_baseline",
workplaces="workplaces_percent_change_from_baseline",
residential="residential_percent_change_from_baseline")
# impute some of this data with random forests / some missingness in parks and retail
goog_mobile <- missRanger(goog_mobile,pmm.k=5L)
saveRDS(goog_mobile,"goog_mobile.rds")
nyt_data <- read_csv("~/covid-19-data/us-states.csv") %>%
complete(date,state,fill=list(cases=0,deaths=0,fips=0)) %>%
mutate(month_day=ymd(date)) %>%
group_by(state) %>%
arrange(state,date) %>%
mutate(cases=floor(c(cases[1:6],roll_mean(cases,n=7))),
Difference=coalesce(cases,0)) %>%
left_join(merge_names,by="state")
saveRDS(nyt_data,"nyt_data.rds")
tests <- read_csv("~/covid-tracking-data/data/states_daily_4pm_et.csv") %>%
mutate(month_day=ymd(date)) %>%
arrange(state,month_day) %>%
group_by(state) %>%
# 7-day moving average of total tests
mutate(total=floor(c(total[1:6],roll_mean(total,n=7)))) %>%
mutate(tests_diff=total) %>%
select(month_day,tests="tests_diff",total,state.abb="state",recovered)
saveRDS(tests,"tests.rds")
} else {
goog_mobile <- readRDS("goog_mobile.rds")
nyt_data <- readRDS("nyt_data.rds")
tests <- readRDS("tests.rds")
}
# recode bad testing information
# merge cases and tests
combined <- left_join(nyt_data,tests,by=c("state.abb","month_day")) %>%
left_join(acs_data,by=c("state"="NAME")) %>%
filter(!is.na(state_pop),
state.abb %in% state_filter)
# add protest data
# need to impute
prot_data <- read_csv("data/simulation/protest/Protest_Data_Final_Merged.csv") %>%
mutate(state=recode(state,
NewYork="New York",
SouthDakota="South Dakota",
NewJersey="New Jersey",
Connecticuticut="Connecticut",
DistrictofColumbia="District of Columbia",
NewHampshire="New Hampshire",
NewMexico="New Mexico",
NorthCarolina="North Carolina",
NorthDakota="North Dakota",
PuertoRico="Puerto Rico",
RhodeIsland="Rhode Island",
SouthCarolina="South Carolina",
SouthDakota="South Dakota",
WestVirginia="West Virginia")) %>%
missRanger(pmm.k=10L) %>%
group_by(state,date) %>%
summarize(sum_prot=sum(avgsize))
# protests by day
# group_by(prot_data,state,Date) %>%
# summarize(n_prot=sum(Attendees)) %>%
# ggplot(aes(y=n_prot,x=Date)) +
# geom_line(aes(group=state))
# add polling data
if(new_cases) {
source("luca_scraping_code.R")
} else {
approval <- read_csv("data/simulation/Civiqs/approve_president_trump.csv") %>% select(date,state="state_col",trendline_approve)
concern <- read_csv("data/simulation/Civiqs/coronavirus_concern.csv") %>%
select(state="state_col",date,trendline_extremely_concerned)
economy <- read_csv("data/simulation/Civiqs/economy_family_retro.csv") %>%
select(date,state="state_col",trendline_gotten_worse)
local_gov_response <- read_csv("data/simulation/Civiqs/coronavirus_response_local.csv") %>%
select(date,state="state_col",trendline_not_very_satisfied)
}
masks <- read_csv("data/simulation/masks/masks_yougov.csv") %>%
select(state="X.1",
mask_wear="% that selected")
#min_mask <- min(masks$mask_wear)
# add suppression data
if(new_policy) {
coronanet <- read_csv("data/CoronaNet/coronanet_release.csv") %>%
filter(country=="United States of America",
type %in% c("Health Testing","Lockdown","Quarantine","Restriction and Regulation of Government Services",
"Restriction and Regulation of Businesses",
"Restrictions of Mass Gatherings",
"Social Distancing","Health Resources")) %>%
# keep Health Resources records only when they concern masks (vectorized OR, not `||`)
filter(type_sub_cat=="Masks" | type!="Health Resources")
# fill in missing end dates: use the recorded end date (or the start date) for ended policies,
# the latest recorded end date otherwise, and today's date for policies still in force
coronanet <- group_by(coronanet,country,province,policy_id) %>%
mutate(date_end=case_when(update_type=="End of Policy" & !is.na(date_end)~date_end,
update_type=="End of Policy" & is.na(date_end)~date_start,
any(!is.na(date_end[date_start==max(date_start,na.rm=T)]))~unique(max(date_end,na.rm=T)),
TRUE~lubridate::today())) %>%
ungroup %>%
mutate(type_sub_cat=coalesce(type_sub_cat,"General"),
type=recode(type,Lockdown="Quarantine"))
write_csv(coronanet,"coronanet_data.csv")
} else {
coronanet <- read_csv("recode_us_data.csv") %>%
distinct(event_description,type,type_sub_cat,.keep_all = T) %>%
filter(!grepl(x=`Error/Needs Review`,pattern="duplicate")) %>%
mutate(policy_count=ifelse(grepl(x=policy_type,pattern="More"),
1,-1)) %>%
group_by(event_description) %>%
arrange(event_description,date_start) %>%
fill(policy_type,.direction="down") %>%
fill(policy_type,.direction="up") %>%
mutate(type=case_when((grepl(x=event_description,pattern="[Mm]ask|[Ff]ace covering") |
type_sub_cat=="Masks") & type=="Social Distancing"~"Mask Restrictions",
TRUE~type),
policy_type=coalesce(policy_type,"More Restrictions and/or More Supply"))
# load COVID-AMP data: this is preferred as it is more complete at the US province level
covid_amp <- read_excel("data/covid_amp_state_policy_data.xlsx",skip = 6) %>% rename(level="Authorizing level of government",
id="Unique ID",
country="Authorizing country name",
state="Authorizing state/province, if applicable",
policy_change="Policy relaxing or restricting",
type="Policy category",
start_date="Issued date",
end_date="Actual end date",
description="Policy/law name") %>%
select(id, country, state, level, type, start_date, end_date,
description,
policy_change,
target="Policy target") %>%
filter(start_date < ymd("2020-10-01"),
level=="State / Province",
country=="United States of America (USA)",
type %in% c("Emergency declarations",
"Contact tracing/Testing",
"Social distancing",
"Support for public health and clinical capacity",
"Face mask",
"Enabling and relief measures",
"Travel restrictions")) %>%
mutate(start_date=ymd(start_date),
end_date=ymd(end_date),
policy_count=case_when(grepl(x=policy_change,pattern="Restricting|Other")& target=="General population (inclusive)"~1,
grepl(x=policy_change,pattern="Relaxing") & target=="General population (inclusive)" ~ -1,
grepl(x=policy_change,pattern="Restricting|Other") & target!="General population (inclusive)"~ 0.5,
grepl(x=policy_change,pattern="Relaxing") & target!="General population (inclusive)" ~ -0.5,
TRUE~NA_real_)) %>%
filter(!is.na(policy_count))
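# policy_count codes each record's direction and scope: +1 (restricting/other) and -1 (relaxing)
# for general-population policies, +0.5/-0.5 for policies targeting specific groups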
}
# for each province and day, count how many policies of each type are still in effect
if(new_policy) {
count_pol <- parallel::mclapply(unique(coronanet$province), function(p) {
# loop over policies
these_pol <- unique(coronanet$policy_id[coronanet$province==p])
lapply(these_pol, function(t) {
if(!is.na(p)) {
this_chunk <- filter(coronanet,policy_id==t,province==p)
} else {
this_chunk <- filter(coronanet,policy_id==t)
}
# loop over days
lapply(seq(ymd("2019-12-30"),ymd("2020-07-14"),by=1), function(d) {
this_day_pol <- group_by(this_chunk,type_sub_cat) %>%
filter(d>date_start,d<date_end) %>%
summarize(tot_pol=sum(policy_count))
if(nrow(this_day_pol)==0) {
tibble(month_day=d,
type=paste0(unique(this_chunk$type),collapse=";"),
type_sub_cat=unique(this_chunk$type_sub_cat),
count_pol_eff=rep(0,length(unique(this_chunk$type_sub_cat))),
province=p)
} else {
tibble(month_day=d,
type=paste0(unique(this_chunk$type),collapse=";"),
type_sub_cat=this_day_pol$type_sub_cat,
count_pol_eff=this_day_pol$tot_pol,
province=p)
}
}) %>% bind_rows
}) %>% bind_rows
},mc.cores=15) %>% bind_rows
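# count_pol: for every province, policy record, day, and policy sub-category, the net number of
# policy provisions in effect that day (0 when the policy is inactive)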
saveRDS(count_pol,"count_pol.rds")
# repeat same procedure for COVID amp data
count_pol_covidamp <- parallel::mclapply(unique(covid_amp$state), function(p) {
# loop over policies
these_pol <- unique(covid_amp$id[covid_amp$state==p])
lapply(these_pol, function(t) {
if(!is.na(p)) {
this_chunk <- filter(covid_amp,id==t,state==p)
} else {
this_chunk <- filter(covid_amp,id==t)
}
# loop over days
lapply(seq(ymd("2019-12-30"),ymd("2020-07-14"),by=1), function(d) {
this_day_pol <- group_by(this_chunk,type) %>%
filter(d>start_date,d<end_date) %>%
summarize(tot_pol=sum(policy_count))
if(nrow(this_day_pol)==0) {
tibble(month_day=d,
type=paste0(unique(this_chunk$type),collapse=";"),
count_pol_eff=rep(0,length(unique(this_chunk$type))),
province=p)
} else {
tibble(month_day=d,
type=paste0(unique(this_chunk$type),collapse=";"),
count_pol_eff=this_day_pol$tot_pol,
province=p)
}
}) %>% bind_rows
}) %>% bind_rows
},mc.cores=15) %>% bind_rows
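# count_pol_covidamp mirrors count_pol but counts COVID-AMP policies by category rather than sub-category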
saveRDS(count_pol_covidamp,"count_pol_covidamp.rds")
} else {
count_pol <- readRDS("count_pol.rds")
count_pol_covidamp <- readRDS("count_pol_covidamp.rds")
}
# need to remove negative policy counts
count_pol <- mutate(count_pol,
count_pol_eff=replace(count_pol_eff,count_pol_eff<0,0),
type=recode(type,`Restriction and Regulation of Businesses;Restrictions of Mass Gatherings`="Restriction and Regulation of Businesses",
`Restriction of Mass Gatherings`="Restrictions of Mass Gatherings",
`Restriction and Regulation of Mass Gatherings`="Restrictions of Mass Gatherings",
`Restrictions of Mass Gatherings;Restriction and Regulation of Businesses`="Restriction and Regulation of Businesses"))
# sum over multiple overlapping policies
count_pol_sum <- group_by(count_pol,month_day,type,province) %>%
summarize(sum_pol=sum(count_pol_eff))
count_pol_covidamp_sum <- group_by(count_pol_covidamp,month_day,type,province) %>%
summarize(sum_pol=sum(count_pol_eff)) %>%
mutate(sum_pol=ifelse(sum_pol<0, 0, sum_pol))
## Recoding count_pol_covidamp_sum$type into count_pol_covidamp_sum$type_rec
count_pol_covidamp_sum$type_rec <- dplyr::recode(count_pol_covidamp_sum$type,
`Contact tracing/Testing`="Contact Tracing",
`Emergency declarations`="Emergency",
`Enabling and relief measures`="Welfare",
`Face mask`="Masking",
`Social distancing`="Social Distancing",
`Support for public health and clinical capacity`="Health Resources",
`Travel restrictions`="Travel")
count_pol_sum <- spread(count_pol_sum,key="type",value="sum_pol") %>%
mutate_at(vars(`Health Resources`:`Social Distancing`),~ifelse(month_day==min(month_day),
coalesce(.,0),
.)) %>%
fill(`Health Resources`:`Social Distancing`,.direction=c("down"))
count_pol_covidamp_wide <- spread(select(ungroup(count_pol_covidamp_sum),-type),
key="type_rec",value="sum_pol") %>%
mutate_at(vars(`Contact Tracing`:Welfare),~ifelse(month_day==min(month_day),
coalesce(.,0),
.)) %>%
fill(`Contact Tracing`:Welfare,.direction=c("down"))
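# both wide tables carry the most recent policy count forward over days where a type has no recorded value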
saveRDS(count_pol_covidamp_sum, "data/count_pol_covidamp_sum.rds")
# combined <- left_join(combined, count_pol_sum,by=c("state"="province","month_day"))
combined <- left_join(combined, count_pol_covidamp_wide,
by=c("state"="province","month_day"))
# add in civiqs
combined <- left_join(combined,approval,by=c("state","month_day"="date")) %>%
left_join(concern,by=c("state","month_day"="date")) %>%
left_join(economy,by=c("state","month_day"="date")) %>%
left_join(local_gov_response,by=c("state","month_day"="date"))
# add in other datasets
combined <- left_join(combined,health,by="state")
combined <- left_join(combined,select(state_gdp,state,gdp),by="state")
combined <- left_join(combined,select(vote_share,state,trump))
combined <- left_join(combined,select(goog_mobile,state,month_day="date",retail:residential))
combined <- left_join(combined,select(prot_data,state,date,sum_prot),by=c(month_day="date",
"state")) %>%
mutate(sum_prot=sum_prot/state_pop,
sum_prot=coalesce(sum_prot,0))
combined <- left_join(combined,select(masks,state,mask_wear),by="state")
# impute missing or implausible test counts using each state's overall test-to-case ratio
combined <- group_by(combined,state) %>%
mutate(test_case_ratio=sum(tests,na.rm=T)/sum(Difference,na.rm=T)) %>%
ungroup %>%
mutate(test_case_ratio=ifelse(test_case_ratio<1 | is.na(test_case_ratio),
mean(test_case_ratio[test_case_ratio>1],na.rm=T),test_case_ratio)) %>%
group_by(state) %>%
mutate(tests=case_when(Difference>0 & is.na(tests)~Difference*test_case_ratio,
Difference==0~0,
Difference>tests~Difference*test_case_ratio,
Difference==tests~Difference*test_case_ratio,
TRUE~tests),
gdp=gdp/state_pop,
Difference=ifelse(Difference<0,0,Difference)) %>%
filter(state!="Puerto Rico")
combined <- group_by(combined,state) %>%
arrange(state,month_day) %>%
mutate(outbreak=as.numeric(cases>1),
lin_counter=(1:n())/n()) %>%
fill(outbreak,.direction="down") %>%
mutate(outbreak_time=cumsum(outbreak)) %>%
ungroup %>%
mutate(max_time=max(outbreak_time),
outbreak_time=outbreak_time/max_time) %>%
group_by(state) %>%
arrange(state,month_day) %>%
mutate(world_infect=Difference - coalesce(dplyr::lag(Difference),0),
trendline_approve = trendline_approve - mean(trendline_approve,na.rm=T)) %>%
group_by(month_day) %>%
mutate(world_infect=sum(world_infect)) %>%
group_by(state) %>%
arrange(state,month_day) %>%
mutate(cases_per_cap=Difference/(state_pop),
cases_per_cap=ifelse(cases_per_cap==0,.00000001,cases_per_cap)) %>%
mutate_at(c("grocery",
"parks",
"residential",
"retail",
"transit",
"workplaces",
"Health Resources",
"Contact Tracing",
"Masking",
"Travel",
"Social Distancing",
"sum_prot",
"trendline_approve",
"world_infect",
"trendline_gotten_worse",
"trendline_extremely_concerned",
"trendline_not_very_satisfied"), ~dplyr::lag(.,n=14)) %>%
ungroup %>%
mutate(trump_int=trendline_approve*trump) %>%
filter(!is.na(grocery),!is.na(trendline_extremely_concerned),!is.na(trendline_gotten_worse),!(Difference==0 & tests==0)) %>%
group_by(state) %>%
arrange(state,month_day) %>%
mutate(test_max=coalesce(tests - dplyr::lag(tests),tests),
test_max=c(test_max[1:6],roll_mean(test_max,n=7)),
test_max=cummax(test_max)) %>%
ungroup %>%
mutate(test_max=test_max / max(test_max,na.rm=T))
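# test_max: each state's running maximum of the 7-day-smoothed daily increase in tests,
# rescaled by the largest value observed across all states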
if(!new_cases && !new_policy) {
combined <- filter(combined, month_day<ymd("2020-07-15"))
}
min_mask <- min(combined$mask_wear)
combined <- combined %>%
group_by(state) %>%
mutate(mask_wear=ifelse(ymd("2020-04-03")<month_day,mask_wear,min_mask/2)) %>%
ungroup
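# before the YouGov mask item begins (early April 2020), assume mask wearing at half the minimum observed rate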
# filter(state %in% c("New York",
# "Florida",
# "California",
# "Utah",
# "Connecticut",
# "Michigan",
# "Minnesota",
# "Hawaii",
# "Vermont",
# "Texas",
# "Lousiana",
# "Pennsylvania",
# "Washington",
# "Missouri",
# "Montana",
# "Georgia",
# "North Carolina"))
# cumulative test counts should never decline; flag decreases as missing and interpolate
combined <- group_by(combined,state) %>%
arrange(state,month_day) %>%
mutate(tests2=ifelse(tests < cummax(coalesce(tests,0)),NA,tests),
tests3=imputeTS::na_interpolation(tests2,option="linear"))
# make a key
combined <- mutate(ungroup(combined),key=1:n())
state_id <- distinct(combined,state,state_pop) %>%
ungroup %>%
mutate(state_num=as.numeric(factor(state)))
saveRDS(state_id, "data/state_id.rds")
# normalize the national daily-infection trend (already lagged 14 days above) for use as a predictor
world_infect <- select(ungroup(combined),world_infect,month_day) %>% distinct
world_infect <- arrange(world_infect,month_day) %>%
mutate(world_infect=world_infect/max(world_infect))
combined <- left_join(select(combined,-world_infect),world_infect,by="month_day")
# include serology data
cdc_sero <- read_csv("all_cdc_sero.csv") %>%
select(date_range="Date Range of Specimen Collection",
site_abbr="Site Name Abbr",
site="Site",
round="Round",everything()) %>%
fill(date_range:round,.direction="down") %>%
mutate(date_begin=mdy(paste0(str_extract(date_range,"[A-Z][a-z]+ [0-9]+"), " 2020")),
date_end=mdy(str_extract(date_range,"[A-Z][a-z]+ [0-9]+ [0-9]+"))) %>%
select(-date_range) %>%
gather(key="date",value="value",`3/27/2020`:`8/8/2020`) %>%
filter(!is.na(value)) %>%
mutate(value=ifelse(grepl(x=value,pattern="\\%"),as.numeric(str_remove(value,"\\%"))/100,value),
value=str_remove(value,",")) %>%
spread(key="variable",value="value") %>%
select(-date)
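# cdc_sero: one row per site and collection round, with one column per reported seroprevalence quantity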
# n_infect="n [Cumulative Prevalence]",
# rate="Avg. Cumulative Prevalence Rate (%)",) %>%
# mutate(survey_size=floor(n_infect/(rate/100)))
# add in sample sizes
cdc_samples <- read_csv("cdc_sample_sizes.csv")
cdc_sero <- left_join(cdc_sero,cdc_samples)
cdc_sero <- select(cdc_sero,
inf_pr="Avg. Cumulative Prevalence Rate (%)",
inf_pr_high="Avg. Cumulative Prevalence Upper CI",
inf_pr_low="Avg. Cumulative Prevalence Lower CI",
cases_local="Cases reported by date of last specimen collection",
cum_inf="Estimated cumulative infections Count",
everything()) %>%
mutate(cum_inf=str_remove(cum_inf,","),
inf_pr=as.numeric(inf_pr),
cases_local=as.numeric(cases_local),
catch_pop=as.numeric(cum_inf)/as.numeric(inf_pr),
ratio=inf_pr/(cases_local/catch_pop),
state=site,
state=recode(site,
`South Florida`="Florida",
`New York City Metro Area`="New York",
`Philadelphia Metro Area`="Pennsylvania",
`Western Washington Region`="Washington",
`San Francisco Bay Area`="California"))
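# catch_pop: implied catchment-area population (cumulative infections / prevalence);
# ratio: estimated infections per reported case within the catchment area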
# add in full pop/cases
cdc_sero <- left_join(cdc_sero, distinct(select(combined,month_day,cases,state,
state_pop)),by=c("state",
"date_end"="month_day"))
# project catchment area to whole state by relative case distribution
cdc_sero <- mutate(cdc_sero,
cum_inf=as.numeric(str_remove(cum_inf,",")),
cases_pr=cases_local/catch_pop,
inf_pr=ifelse(grepl(x=site,pattern="Area|Region|South"),
(inf_pr*(cases/state_pop))/(cases_local/catch_pop),inf_pr),
cum_inf=ifelse(grepl(x=site,pattern="Area|Region|South"),
(cum_inf*(cases/state_pop))/(cases_local/catch_pop),
cum_inf))
cdc_sero_indata <- filter(cdc_sero,!is.na(state_pop))
# add in expert estimate for March 23rd
# use beta distribution to calculate correct number
expert_survey_2023 <- read_csv("data/consensusForecastsDB.csv") %>%
filter(questionLabel=="QF4",
surveyIssued==ymd("2020-03-23"))
# need to get estimates for effective sample size
expert_code <- cmdstan_model("estimate_beta_priors_v2.stan")
# data = random sample from empirical distribution
prop_expert <- sample(expert_survey_2023$bin,
size=5000,
replace=T,
prob=expert_survey_2023$prob)
# stratify by state
state_strat <- select(state_id,
state_pop,
state) %>%
left_join(select(filter(combined,
month_day==ymd("2020-03-21")),
cases,state,tests)) %>%
mutate(counter_test=floor((sum(tests)/sum(state_pop))*state_pop),
# adjust for below/above average test rates
case_adj = (cases + cases * (counter_test/tests)),
case_prop=cases/sum(cases),
case_prop_adj=case_adj/sum(case_adj))
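# case_adj scales reported cases by (1 + national-rate tests / actual tests), so states testing below the
# national per-capita rate are adjusted upward more; case_prop_adj is each state's share of the adjusted total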
# (prop_expert*((state_strat$case_prop_adj[state_strat$state==s]*state_strat$state_pop[state_strat$state==s])/(sum(state_strat$state_pop*state_strat$case_prop_adj))))/state_strat$state_pop[state_strat$state==s]
# loop over states