-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep_datasets.R
87 lines (58 loc) · 2.42 KB
/
prep_datasets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
library(dplyr)
library(tibble)
library("tidyr")
# Load the QCovid comorbidity extract and reduce it to a single row per
# (EAVE_LINKNO, cluster) pair:
#   1. sort so the newest EventDate comes first,
#   2. keep the first row met for each person/cluster pair, i.e. the one with
#      the most recent EventDate,
#   3. retain only the person id, the comorbidity type (cluster) and its Value,
#   4. order the result by EAVE_LINKNO.
df_comorbid <-
  readRDS("/conf/EAVE/GPanalysis/data/cleaned_data/Qcovid_update_Jan21.rds") %>%
  arrange(desc(EventDate)) %>%
  distinct(EAVE_LINKNO, cluster, .keep_all = TRUE) %>%
  select(EAVE_LINKNO, cluster, Value) %>%
  arrange(EAVE_LINKNO)

# Report how many person/cluster rows remain after de-duplication.
nrow(df_comorbid)
# Note:
# - can we handle duplicated comorbidity measurements better?
# - some comorbidities are longer-term than others:
#   can we assume that someone recorded with a condition in 2015 still has it
#   at the time the serology test was taken?
# Reshape to wide form: one row per EAVE_LINKNO, with a separate column for
# each comorbidity cluster.
#   * 0 means no record of that cluster exists for the person (fill = 0);
#   * any other value is the recorded comorbidity Value (e.g. BMI).
# NOTE(review): a genuinely recorded Value of 0 cannot be told apart from
# "no record" -- confirm that no cluster uses 0 as a real measurement.
#
# spread() already produces one row per remaining identifier column
# (EAVE_LINKNO), so no group_by() is required. as_tibble() is used instead so
# that the result is an ungrouped tibble (compact printing, and no grouping
# left behind for later steps to trip over).
# NOTE(review): spread() is superseded by pivot_wider(); it is kept here
# because of convert = TRUE -- revisit if tidyr is upgraded.
df_comorbid_flat <- df_comorbid %>%
  as_tibble() %>%
  spread(cluster, Value, fill = 0, convert = TRUE)

# Quick look at the reshaped table and its number of rows.
df_comorbid_flat
nrow(df_comorbid_flat)
# Read the EAVE-II demographics table and keep only the person identifier
# together with the Sex and ageYear columns.
df_demographics <-
  readRDS("/conf/EAVE/GPanalysis/data/EAVE_demographics_SK.rds") %>%
  as_tibble() %>%
  select(EAVE_LINKNO, Sex, ageYear)
# Attach the wide comorbidity columns to every person in the demographics
# table. The join key is given explicitly: without `by`, left_join() matches
# on every column name the two tables share, so a cluster column that
# happened to be called Sex or ageYear would silently become part of the key.
# People present in the demographics but absent from the comorbidity table
# will be NA in every cluster column.
df_comorbid_flat <- df_demographics %>%
  left_join(df_comorbid_flat, by = "EAVE_LINKNO") %>%
  arrange(EAVE_LINKNO)

# Row count after the join.
nrow(df_comorbid_flat)
# Persist the combined demographics + comorbidity table to disk.
saveRDS(
  object = df_comorbid_flat,
  file = "/home/calumm09/data/comorbidities.rds"
)
# Load further source tables as tibbles and print each one for a quick look
# at its contents.

# all_deaths.rds
df_deaths <- as_tibble(readRDS("/conf/EAVE/GPanalysis/data/all_deaths.rds"))
df_deaths

# CDW_deduped.rds
df_cdw <- as_tibble(readRDS("/conf/EAVE/GPanalysis/data/CDW_deduped.rds"))
df_cdw

# WGS_latest.rds
df_wgs <- as_tibble(readRDS("/conf/EAVE/GPanalysis/data/WGS_latest.rds"))
df_wgs

# cleaned_data/C19vaccine.rds
df_vac <- as_tibble(
  readRDS("/conf/EAVE/GPanalysis/data/cleaned_data/C19vaccine.rds")
)
df_vac

# serology_full_2021-09-16.rds
df_serology <- as_tibble(
  readRDS("/conf/EAVE/GPanalysis/data/serology_full_2021-09-16.rds")
)
df_serology
# Re-read the comorbidity and demographics files in full. This replaces the
# reduced df_comorbid / df_demographics objects built earlier in this script
# with the complete tables as stored on disk, converted to tibbles.
df_comorbid <- as_tibble(
  readRDS("/conf/EAVE/GPanalysis/data/cleaned_data/Qcovid_update_Jan21.rds")
)
df_demographics <- as_tibble(
  readRDS("/conf/EAVE/GPanalysis/data/EAVE_demographics_SK.rds")
)