-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
223 changed files
with
3,136 additions
and
9,977 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
This is the very first draft of predictability as part of conseq in the new analysis pipeline. | ||
|
||
You just need conseq installed since everything else that is required is installed inside the us.gcr.io/broad-achilles/daintree-sparkles:v4 image where predictability is run. | ||
|
||
Note that there is a model-config.yaml file which has the config of all the models. | ||
Once conseq is installed, you can run `conseq run fit.conseq` to start. | ||
|
||
The `fit.conseq` works as follows: | ||
|
||
1. It first creates model input json files based on the `model-config.yaml` file. | ||
2. Once the input json file is created, daintree is run to produce the output for predictability. There are 3 different files that are uploaded to taiga for each model: predictions.csv, ensemble.csv, and feature_metadata.csv. Running daintree also creates an `output_config.json` file which has the input config as well as the taiga ids of the 3 uploaded files. | ||
3. The per-model `output_config.json` files are then combined into a single `combined_daintree_output_config.json`, where each screen is a key and its value maps each model name to that model's output config. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
include "predictability_inputs.conseq" | ||
|
||
# Three Steps: | ||
# 1. Generate a daintree input config file for each model and screen | ||
# 2. Run the model fitting | ||
# 3. Combine the output config files | ||
|
||
# Step 1 of the pipeline: dump the resolved rule inputs to
# daintree_input_config.json, then hand that file and the model config to
# scripts/generate_daintree_input_configs.py.
rule process_model_config:
    inputs:
        # YAML file holding the configuration of all models
        model_config=fileref("model-config.yaml"),

        # Target matrices (2)
        crispr_gene_effect={"type": "target_matrix", "label": "crispr_gene_effect"},
        rnai={"type": "target_matrix", "label": "rnai"},

        # Feature matrices (12)
        lineage={"type": "feature", "label": "lineage"},
        crispr_confounder={"type": "feature", "label": "crispr_confounder"},
        rnai_confounder={"type": "feature", "label": "rnai_confounder"},
        driver_events={"type": "feature", "label": "driver_events"},
        armlevel_cna={"type": "feature", "label": "armlevel_cna"},
        cytoband_cn={"type": "feature", "label": "cytoband_cn"},
        genetic_signature={"type": "feature", "label": "genetic_signature"},
        mutations_hotspot={"type": "feature", "label": "mutations_hotspot"},
        mutations_damaging={"type": "feature", "label": "mutations_damaging"},
        gene_cn={"type": "feature", "label": "gene_cn"},
        loh={"type": "feature", "label": "loh"},
        rnaseq={"type": "feature", "label": "rnaseq"},

        # Helper script that generates the daintree input config files
        script=fileref("scripts/generate_daintree_input_configs.py"),

    run "python" with """
        import json

        # The template below is rendered by conseq before this script runs.
        resolved_inputs = {{inputs}}
        with open("daintree_input_config.json", 'w') as handle:
            handle.write(json.dumps(resolved_inputs, indent=2))

    """
    run "python {{ inputs.script.filename }} --model_config {{ inputs.model_config.filename }} --input_config 'daintree_input_config.json'"
|
||
|
||
# Step 2 of the pipeline: for each daintree input config, run daintree inside
# the daintree-sparkles docker image and publish the output config it wrote
# as a "daintree_output_config" artifact.
rule run_fit_models:
    resources: {'slots': "0.05"} # let up to 20 of these run in parallel
    inputs:
        daintree_input_config={
            "type": "daintree_input_config"
        },
        sparkles_config=fileref("sparkles-config", copy_to="sparkles-config")
    outputs:
        {
            "type": "daintree_output_config",
            "name": "{{ inputs.daintree_input_config.label }}",
            "filename": {"$filename": "daintree_output_config.json"}
        }
    run "python" with """
        import glob
        import json
        import logging
        import os
        import subprocess

        # The original script called logger.error() without ever defining
        # logger, which turned a JSON error into a NameError.
        logger = logging.getLogger("run_fit_models")

        # Bind mounts given to "docker run -v" need an absolute host path;
        # abspath is a no-op when the rendered filename is already absolute.
        input_config_filepath = os.path.abspath("{{ inputs.daintree_input_config.filename }}")
        input_config_filename = "{{ inputs.daintree_input_config.label }}.json"

        work_dir = os.getcwd()

        # NOTE(review): "--test True" and the taiga dataset id are hard-coded
        # here; confirm they are still wanted beyond this first draft.
        docker_command = [
            "docker", "run",
            "--rm",
            "-v", f"{input_config_filepath}:/daintree/{input_config_filename}",
            "-v", f"{work_dir}/output_data:/daintree/output_data",
            "-v", f"{work_dir}/sparkles-config:/daintree/sparkles-config",
            "us.gcr.io/broad-achilles/daintree-sparkles:v4",
            "/install/depmap-py/bin/python3.9", "-u", "run_fit_models.py",
            "collect-and-fit-generate-config",
            "--input-files", input_config_filename,
            "--sparkles-config", "/daintree/sparkles-config",
            "--save-dir", "/daintree/output_data",
            "--test", "True",
            "--upload-to-taiga", "predictability-76d5",
        ]

        # check=True makes a failed container run fail this rule.
        subprocess.run(docker_command, check=True)

        # Locate the output config written by daintree. glob returns matches
        # in arbitrary order, so sort to make the choice deterministic.
        output_config_files = sorted(
            glob.glob(os.path.join(work_dir, "output_data", "output_config_files", "*.json"))
        )
        if not output_config_files:
            raise FileNotFoundError("No output config files found")
        if len(output_config_files) > 1:
            logger.warning(
                "Expected one output config file, found %d; using %s",
                len(output_config_files),
                output_config_files[0],
            )

        # Use the first, there should only be one matching file
        output_config_file = output_config_files[0]

        # Round-trip through json so an invalid file fails here instead of
        # being published as an artifact.
        try:
            with open(output_config_file, 'r') as f:
                output_config = json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in output config: {e}")
            raise

        with open("daintree_output_config.json", 'w') as f:
            json.dump(output_config, f, indent=2)
    """
|
||
|
||
# Step 3 of the pipeline: merge every per-model daintree output config into a
# single combined_daintree_output_config.json keyed by screen name, then by
# model name.
rule combine_output_configs:
    inputs:
        daintree_output_config = all{
            "type": "daintree_output_config"
        }
    outputs:
        {
            "type": "combined_daintree_output_config",
            "filename": {"$filename": "combined_daintree_output_config.json"}
        }
    run "python" with """
        import json

        def merge_json_files(json_files):
            # Returns a dict of screen_name -> model_name -> model output config.
            # Each file is read as a mapping whose first key is the model name
            # and whose value carries ["input"]["screen_name"].
            combined = {}

            for file_path in json_files:
                with open(file_path, 'r') as f:
                    data = json.load(f)

                if not data:
                    # The original indexed list(data.keys())[0] and failed
                    # with an opaque IndexError on an empty file.
                    raise ValueError(f"No model entry found in {file_path}")

                # NOTE(review): only the first top-level key is used, as in
                # the original; confirm each file holds exactly one model.
                model_name = next(iter(data))
                screen_name = data[model_name]["input"]["screen_name"]

                # Create the per-screen dict on first use, then add the model.
                combined.setdefault(screen_name, {})[model_name] = data[model_name]

            return combined

        # Rendered by conseq into a list of artifact dicts.
        artifacts = {{ inputs.daintree_output_config }}
        list_of_files = [artifact['filename'] for artifact in artifacts]

        combined_output_config = merge_json_files(list_of_files)

        # json.dump never raises JSONDecodeError, so the original try/except
        # (which also used an undefined logger) was dead code and is removed.
        with open("combined_daintree_output_config.json", 'w') as f:
            json.dump(combined_output_config, f, indent=2)
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
CellContext: | ||
Features: | ||
- lineage | ||
- confounder # Will map to crispr_confounder or rnai_confounder based on screen | ||
Required: | ||
- lineage | ||
- confounder | ||
Relation: All | ||
|
||
DriverEvents: | ||
Features: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
Required: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
Relation: All | ||
|
||
GeneticDerangement: | ||
Features: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
Required: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
Relation: All | ||
|
||
DNA: | ||
Features: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
- mutations_hotspot | ||
- mutations_damaging | ||
- gene_cn | ||
- loh | ||
Required: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
- mutations_hotspot | ||
- mutations_damaging | ||
- gene_cn | ||
Relation: All | ||
|
||
RNASeq: | ||
Features: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
- mutations_hotspot | ||
- mutations_damaging | ||
- gene_cn | ||
- loh | ||
- rnaseq | ||
Required: | ||
- lineage | ||
- confounder | ||
- driver_events | ||
- armlevel_cn | ||
- cytoband_cn | ||
- genetic_signature | ||
- mutations_hotspot | ||
- mutations_damaging | ||
- gene_cn | ||
- rnaseq | ||
Relation: All |
97 changes: 97 additions & 0 deletions
97
analysis-pipeline/predictability/predictability_inputs.conseq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# Target Matrices | ||
add-if-missing { | ||
"type": "target_matrix", | ||
"label": "crispr_gene_effect", | ||
"source_dataset_id": "internal-24q2-3719.82/CRISPRGeneEffect" | ||
} | ||
|
||
add-if-missing { | ||
"type": "target_matrix", | ||
"label": "rnai", | ||
"source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiDep" | ||
} | ||
|
||
# Feature Matrices | ||
add-if-missing { | ||
"type": "feature", | ||
"label": "lineage", | ||
"category": "lineage", | ||
"source_dataset_id": "predictability-76d5.94/PredictabilityLineageTransformed" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "crispr_confounder", | ||
"category": "confounder", | ||
"source_dataset_id": "predictability-76d5.111/PredictabilityCRISPRConfoundersTransformed" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "rnai_confounder", | ||
"category": "confounder", | ||
"source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiConfounders" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "driver_events", | ||
"category": "driverevents", | ||
"source_dataset_id": "predictability-76d5.99/DriverEvents" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "armlevel_cna", | ||
"category": "armlevel", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsArmLevelCNA" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "cytoband_cn", | ||
"category": "cytoband", | ||
"source_dataset_id": "predictability-76d5.99/PredictabilityGenticDerangementTransformed" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "genetic_signature", | ||
"category": "geneticsignature", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsSignatures" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "mutations_hotspot", | ||
"category": "gene", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixHotspot" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "mutations_damaging", | ||
"category": "gene", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixDamaging" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "gene_cn", | ||
"category": "gene", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsCNGene" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "loh", | ||
"category": "gene", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsLoH" | ||
} | ||
|
||
add-if-missing { | ||
"type": "feature", | ||
"label": "rnaseq", | ||
"category": "gene", | ||
"source_dataset_id": "internal-24q2-3719.82/OmicsExpressionProteinCodingGenesTPMLogp1" | ||
} |
Oops, something went wrong.