-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsum_stats_death_by_covid.Rmd
127 lines (87 loc) · 4.84 KB
/
sum_stats_death_by_covid.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
---
title: "death_from_COVID"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(leaps)
library(MASS)
library(xtable)
```
```{r}
data <- read.table("full_data_no_na.csv", sep = ",", header = TRUE)
deaths <- data$deaths
sapply(data, class)
deaths_omit_0 <- data[which(data$deaths != 0),]
log_death_omit <- log(deaths_omit_0$deaths)
hist(log_death_omit)
full_model_death_omit_log <- lm(log_death_omit ~ mask_score + ratio_pop_to_physician + unemploy_rate
+ HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size
+ X._rural + life_expectancy + sim_diversity_index + area + X._democrat
+ sim_diversity_index + life_expectancy + pop_density, data = deaths_omit_0)
summary(full_model_death_omit_log)
#per population size
deaths_per_pop <- deaths_omit_0$deaths/deaths_omit_0$pop_size
full_model_pop_death_omit <- lm(deaths_per_pop ~ mask_score + ratio_pop_to_physician + unemploy_rate
+ HS_grad_rate + flu_vacc_. + uninsured + median_income
+ X._rural + life_expectancy + sim_diversity_index + area + X._democrat
+ sim_diversity_index + life_expectancy, data = deaths_omit_0)
boxcox(full_model_pop_death_omit)
#need to take the log
```
```{r}
log_deaths_pop_omit <- log((deaths_per_pop))
hist(log_deaths_pop_omit, breaks = 200)
#full model where the number of deaths is divided by the population and zeros are omitted.
#issue is that taking the log of something (to correct that boxcox) that is between 0 and 1 will lead to negative response variables.
#state included as greatly improves model
full_model_log_pop_death_omit <- lm(log_deaths_pop_omit ~ mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + sim_diversity_index + life_expectancy+factor(state), data = deaths_omit_0)
#boxcox(full_model_log_pop_death_omit)
summary(full_model_log_pop_death_omit)
anova(full_model_log_pop_death_omit)
#decided on more relevant to do deaths per pop, even though the summary is less good. This model omits those places where there were 0 deaths (consider, not all counties have hospitals where someone in critical care would go to die... )
par(mfrow = c(2,2))
plot(full_model_log_pop_death_omit)
```
```{r}
#reg_subsets
sum_sub = regsubsets(log_deaths_pop_omit ~ mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + sim_diversity_index + life_expectancy+factor(state), data = deaths_omit_0,nvmax=20, nbest=1, method='forward')
#not looking like any combinations of these variables are a particularly good predictor of covid death.
#potentially a missing componant: death over time
res.sum <- summary(sum_sub)
summary(res.sum,all.best=TRUE,matrix=T,matrix.logical=F,df=NULL)
max(res.sum$adjr2)
res.sum$outmat[20,]
labels(which(res.sum$outmat[20,]=='*'))
# n = length(log_deaths_pop_omit)
# p.m = as.integer(rownames(res.sum$which))+1#number of coefficients in each model: p
# ssto = sum((log_deaths_pop_omit-mean(log_deaths_pop_omit))^2)
# sse = (1-res.sum$rsq)*ssto
# aic = n*log(sse/n)+2*p.m
# bic = n*log(sse/n)+log(n)*p.m
# res_sub = cbind(sse, res.sum$rsq, res.sum$adjr2,res.sum$cp, bic, aic)
# head(res_sub, n=1)
```
```{r}
#looking at the diversity index as the separated ethnicity proportions in each county
diversity_as_indv <- lm(log_deaths_pop_omit ~ mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + sim_diversity_index + life_expectancy+factor(state) + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = deaths_omit_0)
summary(diversity_as_indv)
#remove states (it is not really relevant-- see discussion)
diversity_as_indv_nostate <- lm(log_deaths_pop_omit ~ mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + sim_diversity_index + life_expectancy+ non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = deaths_omit_0)
# coefficient of multiple determination for normal model...
r2_normal <- function(model){
test <- anova(model)
sse <- test[,2]
names <- rownames(test)
total_sse <- sum(sse)
output <- cbind(names,sse/total_sse)
colnames(output) <- c("variables", "r2")
return(output)
}
summary(diversity_as_indv_nostate)
r2_normal(diversity_as_indv_nostate)
#printing a table with this model
#print(xtable(as.data.frame(diversity_as_indv_nostate$coefficients), type = "latex"))
```