Skip to content

Commit a8e6c6a

Browse files
committed
put back standardized curations and convenience files to the pipeline
1 parent 7cbf6eb commit a8e6c6a

20 files changed

+2970
-3
lines changed

README.md

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ The repository contents are as follows:
3636
- **./data/source_curations**: source curation spreadsheets, as
3737
provided by curators.
3838

39+
- **./data/standardized_curations**: curation spreadsheets in denormalized form, after
40+
undergoing processing for quality control and standardization. This is the state from
41+
which all other files are created, i.e. the convenience files and release files.
42+
3943
- **./data/reference_files**: resources used to support the
4044
standardization process, including controlled vocabularies from
4145
external sources and mapping files maintained by the project.
@@ -45,6 +49,9 @@ The repository contents are as follows:
4549
curations. They are formatted as needed for uploading to the
4650
Dashboard.
4751

52+
- **./data/convenience_files**: the standardized curations conveniently reformatted to support
53+
human inspection and downstream computational analysis.
54+
4855
- **./docs**: curation templates and column specification.
4956

5057
The overall workflow of the project is depicted below.
@@ -151,17 +158,23 @@ and are updated with the latest versions prior to a Dashboard release.
151158
### Release Preparation
152159

153160
The quality control and standardization process produces new, denormalized versions of
154-
the curation data.
155-
These data then undergo further processing to generate the
161+
the source spreadsheets which are placed under **./data/standardized_curations**.
162+
These files then undergo further processing to generate the
156163
final data release files which will be uploaded to the Dashboard. This
157164
processing is essentially a straightforward repackaging of the
158165
spreadsheets into a format appropriate for the upload scripts. It
159166
involves splitting immune signatures into individual spreadsheet files, one
160167
signature per file. The final release files are stored under **./data/release_files** and
161-
have the same columns as the standardized curation data, with some additions
168+
have the same columns as the standardized curation files, with some additions
162169
to support the requirements of the Dashboard itself and to preserve
163170
original curated values for fields updated by the pipeline.
164171

172+
The release preparation process also generates a number of convenience files, i.e.,
173+
partially re-normalized versions of the standardized curations. These files are available
174+
in spreadsheet format, to facilitate human inspection, and in the Broad GMT tab-delimited file format,
175+
to support downstream computational processing. The files are stored under
176+
**./data/convenience_files**.
177+
165178
Additional details about the processing pipeline can be found under the
166179
**./data** directory.
167180

code/main.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash -x
# Run the four curation pipelines (infection/vaccine x cell-type
# frequency/gene) one after another, timing each run.
#
# The R scripts source a sibling file ("standardized_and_convenience.R")
# and write to paths relative to this directory ("../data/..."), so the
# working directory is switched to the directory containing this script.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Every pipeline is still run even if an earlier one fails, but a failure
# is remembered so the overall exit status is non-zero instead of being
# masked by the status of the last command.
status=0
for script in main_inf_ctf.R main_inf_gene.R main_vac_ctf.R main_vac_gene.R; do
  time Rscript "$script" || status=1
done
exit "$status"

code/main_inf_ctf.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
309309
df2 <- as.data.frame(df2)
310310
df2$comparison <- trimws(df2$comparison)
311311

312+
source("standardized_and_convenience.R")
313+
save_standardized_curations(df2, base_filename)
314+
312315
#############################################################
313316
#### Recreate original spreadsheet with all corrections #####
314317
#############################################################
@@ -355,3 +358,5 @@ if(!is.null(s)) {
355358
write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_inf_ctf", titles_and_dates_df,
356359
resp_components_full_sig, unmatched_symbols_map = NULL,
357360
"CELLTYPE_FREQUENCY", "INFECTION", "Immune cell-type frequency response to infection")
361+
362+
save_convenience_files(df2, header_rows, base_filename, "INFECTION", "CELLTYPE_FREQUENCY")

code/main_inf_gene.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
322322
df2 <- as.data.frame(df2)
323323
df2$comparison <- trimws(df2$comparison)
324324

325+
source("standardized_and_convenience.R")
326+
save_standardized_curations(df2, base_filename)
327+
325328
#############################################################
326329
#### Recreate original spreadsheet with all corrections #####
327330
#############################################################
@@ -380,3 +383,5 @@ df2$exposure_material <- NULL
380383
write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_inf_gene", titles_and_dates_df,
381384
resp_components_collected, unmatched_symbols_map,
382385
"GENE", "INFECTION", "Gene expression response to infection")
386+
387+
save_convenience_files(df2, header_rows, base_filename, "INFECTION", "GENE")

code/main_vac_ctf.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
338338
df2 <- as.data.frame(df2)
339339
df2$comparison <- trimws(df2$comparison)
340340

341+
source("standardized_and_convenience.R")
342+
save_standardized_curations(df2, base_filename)
343+
341344
#############################################################
342345
#### Recreate original spreadsheet with all corrections #####
343346
#############################################################
@@ -390,3 +393,5 @@ header_rows <- header_rows[!colnames(header_rows) %in% del_cols]
390393
write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_vac_ctf", titles_and_dates_df,
391394
resp_components_full_sig, unmatched_symbols_map = NULL,
392395
"CELLTYPE_FREQUENCY", "VACCINE", "Immune cell-type frequency response to vaccine exposure")
396+
397+
save_convenience_files(df2, header_rows, base_filename, "VACCINE", "CELLTYPE_FREQUENCY")

code/main_vac_gene.R

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ df2 <- cSplit(df2, "comparison", sep = ";", direction = "long")
341341
df2 <- as.data.frame(df2)
342342
df2$comparison <- trimws(df2$comparison)
343343

344+
source("standardized_and_convenience.R")
345+
save_standardized_curations(df2, base_filename)
346+
344347
#############################################################
345348
#### Recreate original spreadsheet with all corrections #####
346349
#############################################################
@@ -403,3 +406,5 @@ header_rows <- header_rows[!colnames(header_rows) %in% del_cols]
403406
write_submission_template(df2, header_rows, "../data/release_files", NULL, "hipc_vac_gene", titles_and_dates_df,
404407
resp_components_collected, unmatched_symbols_map,
405408
"GENE", "VACCINE", "Gene expression response to vaccine exposure")
409+
410+
save_convenience_files(df2, header_rows, base_filename, "VACCINE", "GENE")

code/standardized_and_convenience.R

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# this file is adapted from the original generate_HIPC_submissions.R
2+
3+
library(R.utils) # for gzip
4+
5+
save_standardized_curations <- function(df2, base_filename) {
  # Write the standardized, denormalized curation table as a gzipped TSV
  # under ../data/standardized_curations/.
  #
  # df2:           data frame holding the standardized curations.
  # base_filename: prefix of the output file name; the suffix
  #                "-standardized_denormalized.tsv.gz" is appended.
  #
  # The value of the final gzip() call is passed through as the result.

  # Submission bookkeeping columns are not part of the saved table.
  bookkeeping_cols <- c(
    "submission_name", "submission_date",
    "template_name", "short_comment", "process_note"
  )
  is_kept <- !(colnames(df2) %in% bookkeeping_cols)
  curations <- df2[is_kept]
  # The first remaining column is dropped as well.
  curations <- curations[-1]

  tsv_path <- paste0(
    "../data/standardized_curations/", base_filename,
    "-standardized_denormalized.tsv"
  )
  gz_path <- paste0(tsv_path, ".gz")

  write.table(
    curations,
    file = tsv_path, sep = "\t",
    row.names = FALSE, col.names = TRUE
  )
  # Compress in place: an existing .gz is replaced and the plain TSV removed.
  gzip(
    tsv_path,
    destname = gz_path, overwrite = TRUE,
    remove = TRUE
  )
}
26+
27+
save_convenience_files <- function(
    df2, header_rows, base_filename,
    exposure_type, response_type) {
  # Write the convenience files for one curation set under
  # ../data/convenience_files/:
  #   * <base_filename>-standardized_curation_template.tsv
  #       header_rows followed by one row per signature (sig_row_id), with
  #       the multi-valued columns re-joined using "; ".
  #   * <base_filename>-response_components.gmt.txt
  #       one tab-delimited line per signature: row name, description, then
  #       the unique response components.
  #
  # df2:           denormalized data frame with one row per signature /
  #                response component combination.
  # header_rows:   data frame of template header rows; its column names must
  #                match those of the recreated per-signature rows.
  # base_filename: prefix used for both output file names.
  # exposure_type: "VACCINE" or "INFECTION".
  # response_type: "GENE" or "CELLTYPE_FREQUENCY".
  #
  # Stops with an error on an unknown exposure/response type or when the
  # header columns do not match the recreated template columns.
  if (length(exposure_type) != 1 ||
    !exposure_type %in% c("VACCINE", "INFECTION")) {
    stop("Incorrect exposure type encountered")
  }
  if (length(response_type) != 1 ||
    !response_type %in% c("GENE", "CELLTYPE_FREQUENCY")) {
    stop("Incorrect response type encountered")
  }

  is_gene <- response_type == "GENE"
  if (is_gene) {
    response_behavior_type_var <- "gene expression"
  } else {
    response_behavior_type_var <- "cell-type frequency"
  }

  convenience_files <- "../data/convenience_files/"

  # Join the distinct values of a column into one "; "-separated string.
  collapse_unique <- function(values) {
    paste(unique(values), collapse = "; ")
  }

  uniq_sig_row_ids <- unique(df2$sig_row_id)
  resp_components_annotated <- vector("list", length(uniq_sig_row_ids))
  recreated_template <- vector("list", length(uniq_sig_row_ids))

  for (i in seq_along(uniq_sig_row_ids)) {
    # which() drops NA comparisons; a bare logical index would inject
    # all-NA rows for every row whose sig_row_id is NA.
    sig_rows <- df2[which(df2$sig_row_id == uniq_sig_row_ids[i]), ]
    # Recreate a full signature in one row, starting from its first row.
    base_row <- sig_rows[1, ]

    response_rowname <- paste(base_row$publication_reference_id,
      base_row$sig_subm_id, uniq_sig_row_ids[i],
      sep = "_"
    )
    response_description <- paste("PMID", base_row$publication_reference_id,
      response_behavior_type_var, base_row$sig_subm_id,
      sep = " "
    )

    # Use the full original set of response components
    # rather than just those for which a valid symbol was found.
    base_row$response_component_original <-
      collapse_unique(sig_rows$response_component_original)
    base_row$exposure_material_id <-
      collapse_unique(sig_rows$exposure_material_id)
    base_row$tissue_type_term_id <-
      collapse_unique(sig_rows$tissue_type_term_id)

    if (is_gene) {
      components <- unique(sig_rows$response_component)
      base_row$response_component <- paste(components, collapse = "; ")
    } else {
      # For cell-type frequency the fully qualified component is what gets
      # reported, both in the template and in the GMT line.
      components <- unique(sig_rows$fully_qualified_response_component)
      base_row$response_component <- paste(components, collapse = "; ")
      base_row$response_component_id <-
        collapse_unique(sig_rows$response_component_id)
      base_row$proterm_and_extra <-
        collapse_unique(sig_rows$proterm_and_extra)
      base_row$fully_qualified_response_component <-
        paste(components, collapse = "; ")
      # The pro_ontology_id values are already separated by semicolons,
      # so change to commas before potentially joining two lists of
      # pro-terms. gsub() is vectorized, so no per-element apply is needed.
      pro_ids <- gsub(";", ",", sig_rows$pro_ontology_id, fixed = TRUE)
      base_row$pro_ontology_id <- collapse_unique(pro_ids)
    }
    resp_components_annotated[[i]] <- c(
      response_rowname, response_description, components
    )

    # Reconstitute target_pathogen_taxonid (vaccine curations only).
    if (exposure_type == "VACCINE") {
      base_row$target_pathogen_taxonid <-
        collapse_unique(sig_rows$target_pathogen_taxonid)
    }

    recreated_template[[i]] <- base_row
  }

  names(resp_components_annotated) <- uniq_sig_row_ids

  # Consolidate to a single data.frame and put the header rows on top.
  if (length(recreated_template) > 0) {
    # NOTE(review): rbindlist is not loaded by this file; presumably the
    # calling script attaches data.table -- confirm.
    recreated_template_df <- as.data.frame(rbindlist(recreated_template))
    header_cols <- colnames(header_rows)
    template_cols <- colnames(recreated_template_df)
    # Compare lengths first so that `!=` never recycles a shorter vector.
    if (length(header_cols) != length(template_cols) ||
      any(header_cols != template_cols)) {
      stop("mismatch between header rows and recreated_template_df rows")
    }
    recreated_template_df <- rbind(header_rows, recreated_template_df)
  } else {
    # No signatures: the template consists of the header rows only.
    recreated_template_df <- header_rows
  }

  # Drop the submission bookkeeping columns.
  del_cols <- c("submission_name", "submission_date", "template_name")
  recreated_template_df <- recreated_template_df[
    !colnames(recreated_template_df) %in% del_cols
  ]

  # Set that first column name back to blank. This happens before the
  # signature id columns are removed, so the order of these steps matters.
  colnames(recreated_template_df)[1] <- ""

  del_cols <- c("sig_subm_id", "sig_row_id")
  recreated_template_df <- recreated_template_df[
    !colnames(recreated_template_df) %in% del_cols
  ]
  write.table(recreated_template_df,
    file = paste0(
      convenience_files,
      base_filename, "-standardized_curation_template.tsv"
    ),
    sep = "\t", row.names = FALSE
  )

  gmt_file <- paste0(
    convenience_files,
    base_filename, "-response_components.gmt.txt"
  )
  gmt_lines <- vapply(
    resp_components_annotated,
    function(x) paste(x, collapse = "\t"),
    character(1)
  )
  # Remove any stale file, then write all lines in one pass instead of
  # re-opening the file in append mode once per signature.
  if (file.exists(gmt_file)) file.remove(gmt_file)
  if (length(gmt_lines) > 0) {
    writeLines(gmt_lines, gmt_file)
  }
  message("Finished creating convenience files")
}

0 commit comments

Comments
 (0)