-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-load-data.R
163 lines (132 loc) · 6.73 KB
/
01-load-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
## Load data from several sources, including:
## o IRS form 990s for basic financial info
## o Bureau of Labor Statistics (for CPI-based inflation adjustment)
## o NIH Exporter for NIH grant funding
#################### Start with ParseIRS990::
########## Pull data from 990s
## Get xml files and pull financial data from recent 990s.
## Parsing xml is slow, so do it once and stash it for later.
if(FALSE) {
ein <- "131628174" ## Haskins Labs
years <- 2014:2019; names(years) <- paste(years) ## years before 2014 are not available in xml format
get.financials <- function(year, ein){
xml <- get_990(ein, year) ## pull info from xml files on the IRS AWS bucket without downloading the files
## xml2::write_xml(xml, here("irs990", paste(ein, year, "irs990.xml", sep="-")))
return(get_all_financial_data(xml))
}
financials <-
map_dfr(years, get.financials, ein, .id = "year") %>%
mutate(across(.fns = as.integer)) %>%
mutate(net_assets = assets_total_assets-assets_total_liabilities)
save(financials, file="financials.rdata")
}
load("financials.rdata")
######### adjust purchasing power for inflation
## Get agreggate cpi data for all urban consumers (cu series) from the Bureau of Labor Statistics.
## An index of available data sets is here: https://download.bls.gov/pub/time.series/overview.txt.
## A list of available "series" (see series_id variable) is here: https://download.bls.gov/pub/time.series/cu/cu.series
## Data dictionary for cu series is here: https://download.bls.gov/pub/time.series/cu/cu.txt.
tmp <- tempfile()
download.file("http://download.bls.gov/pub/time.series/cu/cu.data.1.AllItems",tmp)
## "CUUR0000SA0" == "All items in U.S. city average, all urban consumers, not seasonally adjusted"
cpi <-
read_delim(tmp,
delim = "\t", trim_ws = TRUE) %>%
filter(year %in% 2014:2019,
period == "M13", ## M13 is the annual average
series_id == "CUUR0000SA0") %>% ## "All items in U.S. city average, all urban consumers, not seasonally adjusted"
rename(cpi = value) %>%
mutate(year = as.integer(year)) %>%
select(year, cpi)
deflate.currency <- function(old.values, price.index) {
## old.values: vector of currency values to 'deflate'
## price.index: price index that reflects rate of inflation
## arguments must be of equal length
## base period is always the first in the series (so, 1st period remains constant after adj)
base.cpi <- price.index[1]
(old.values * base.cpi) / price.index
}
## Merge cpi and financial data
financials <-
left_join(financials, cpi) %>%
mutate(net_assets_adj = deflate.currency (net_assets, cpi),
revenue_total_adj = deflate.currency (revenue_total, cpi),
expenses_total_adj = deflate.currency (expenses_total, cpi))
######### Pull NIH grant data for Haskins Labs
## Download some NIH Exporter "project" info as csv files and stash them locally
if(FALSE){
fy <- 1990:2012
fnames <- paste0("RePORTER_PRJ_C_FY", fy, ".zip")
localdatadir <- "nih-exporter"
fy <- 1990:2012
fnames <- paste0("RePORTER_PRJ_C_FY", fy, ".zip")
localdatadir <- "nih-exporter"
url0 <- "https://exporter.nih.gov/CSVs/final"
dest <- here::here(localdatadir, fnames)
urls <- paste(url0, fnames, sep="/")
purrr::map2(urls,
dest,
~ downloader::download(url=.x, destfile=.y))
}
## read Project files, scrape Haskins data, and tidy it up
duns <- c("60010147", "060010147") ## Haskins Labs' proper DUNS is 060010147. NIH sometimes drops the leading "0"
fy <- 1990:2020
fnames <- paste0("RePORTER_PRJ_C_FY", fy, ".zip")
localdatadir <- "nih-exporter"
infiles <- here::here(localdatadir, fnames)
read.nih.projects <- function(infiles, duns){
retval <-
read_csv(infiles,
col_types=cols(.default="c",
FY="i",
SUPPORT_YEAR = "i",
DIRECT_COST_AMT = "i", ## this and the next col don't exist before 2012
INDIRECT_COST_AMT = "i",
TOTAL_COST = "i")) %>%
rename_with(tolower) %>%
filter(str_detect(org_name, "^HASKINS LABORATORIES")) %>%
## filter(org_duns %in% duns) %>% ## organizations to retain
filter(is.na(subproject_id)) %>% ## drop subprojects; may need to revist this
select(-c(subproject_id, total_cost_sub_project)) %>%
select(-c(phr)) %>% ## problematic column (I don't remember why)
mutate(award_notice_date = mdy(award_notice_date), ## fixup some date variables
budget_start = mdy(budget_start),
budget_end = mdy(budget_end),
project_start = mdy(project_start),
project_end = mdy(project_end))
if (unique(retval$fy) < 2012) {
return(retval)
}
retval %>%
mutate(indirect_rate = indirect_cost_amt/direct_cost_amt) ## calc effective indirect rates for each grant/year where possible
}
D0_nih <-
map_dfr(infiles, read.nih.projects, duns) %>%
mutate(grant = str_remove(core_project_num, "^[A-Z][0-9]{2}")) %>% ## make unique grant ID
## fixup PI Names
mutate(pi_names = str_replace(pi_names, ";$", ""),
pi_names = str_replace(pi_names, "\\.", ""),
pi_names = str_replace(pi_names, "\\(contact\\)", ""),
pi_names = str_squish(pi_names)) %>%
mutate(pi_names = recode_factor(pi_names,
`LILLO-MARTIN, DIANE CAROLYN` = "LILLO-MARTIN, DIANE C",
`LILLO-MARTIN, DIANE` = "LILLO-MARTIN, DIANE C",
`SHANKWEILER, DONALD PAUL` = "SHANKWEILER, DONALD P",
`SHANKWEILER, DONALD` = "SHANKWEILER, DONALD P",
`SHADLE, CHRISTINE HELEN` = "SHADLE, CHRISTINE H",
`WHALEN, DOUG H` = "WHALEN, DOUGLAS H"))
D_nih_recent <-
D0_nih %>%
filter(fy %in% 2013:2020) %>%
group_by(fy) %>%
summarize(nih_direct_cost = sum(direct_cost_amt),
nih_indirect_cost = sum(indirect_cost_amt, na.rm=TRUE), ## sometimes indirect costs are missing ("NA"). Need to sort out why.
nih_total_cost = sum(total_cost),
nih_effective_indirect_rate = sum(indirect_cost_amt, na.rm=TRUE)/sum(direct_cost_amt, na.rm=TRUE))
## join some NIH data into the i990 dataset from
D_financials <-
right_join(financials, D_nih_recent, by = c("year" = "fy"))
######### Another take at 990 data, this time using hand-rolled code wrapping xml2::
## load 990s from local xml files
infiles <- dir_ls(here("irs990", "xml-files"), regexp="xml$")
xml.990 <- map(infiles, read_xml)