Skip to content

Commit

Permalink
Merge branch 'master' into qa
Browse files Browse the repository at this point in the history
  • Loading branch information
naquib314 committed Feb 6, 2025
2 parents a70ce1a + 70ad8cb commit 3bceaf7
Show file tree
Hide file tree
Showing 223 changed files with 3,136 additions and 9,977 deletions.
12 changes: 12 additions & 0 deletions analysis-pipeline/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
This is the very first draft of predictability as part of conseq in the new analysis pipeline.

You only need conseq (and Docker, which `fit.conseq` uses to launch the image) installed, since everything else that is required is installed inside the `us.gcr.io/broad-achilles/daintree-sparkles:v4` image where predictability is run.

Note that there is a model-config.yaml file which has the config of all the models.
Once conseq is installed, you can run `conseq run fit.conseq` to start.

The `fit.conseq` works as follows:

1. It first creates model input json files based on the `model-config.yaml` file.
2. Once the input json file is created, daintree is run to produce the output for predictability. Three files are uploaded to taiga for each model: `predictions.csv`, `ensemble.csv`, and `feature_metadata.csv`. Running daintree also creates an `output_config.json` file which has the input config as well as the taiga ids of the 3 uploaded files.
3. The `output_config.json` files are then combined into a single `combined_daintree_output_config.json`, where the screen name is the key and the value maps each model name to that model's output config.
151 changes: 151 additions & 0 deletions analysis-pipeline/predictability/fit.conseq
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
include "predictability_inputs.conseq"

# Three Steps:
# 1. Generate a daintree input config file for each model and screen
# 2. Run the model fitting
# 3. Combine the output config files

# Step 1: write every input artifact of this rule to
# daintree_input_config.json, then hand that file and model-config.yaml to
# scripts/generate_daintree_input_configs.py.
# NOTE(review): the artifacts this rule publishes are not declared here;
# presumably the script emits the "daintree_input_config" artifacts that
# run_fit_models consumes -- confirm against the script.
rule process_model_config:
inputs:
# Model config yaml file
model_config=fileref("model-config.yaml"),

# Two target matrices
crispr_gene_effect={"type": "target_matrix", "label": "crispr_gene_effect"},
rnai={"type": "target_matrix", "label": "rnai"},

# Twelve features
lineage={"type": "feature", "label": "lineage"},
crispr_confounder={"type": "feature", "label": "crispr_confounder"},
rnai_confounder={"type": "feature", "label": "rnai_confounder"},
driver_events={"type": "feature", "label": "driver_events"},
# NOTE(review): model-config.yaml spells this feature "armlevel_cn" while the
# label here is "armlevel_cna" -- confirm the script maps one to the other.
armlevel_cna={"type": "feature", "label": "armlevel_cna"},
cytoband_cn={"type": "feature", "label": "cytoband_cn"},
genetic_signature={"type": "feature", "label": "genetic_signature"},
mutations_hotspot={"type": "feature", "label": "mutations_hotspot"},
mutations_damaging={"type": "feature", "label": "mutations_damaging"},
gene_cn={"type": "feature", "label": "gene_cn"},
loh={"type": "feature", "label": "loh"},
rnaseq={"type": "feature", "label": "rnaseq"},

# Script to generate daintree input config file
script=fileref("scripts/generate_daintree_input_configs.py"),

run "python" with """
import json

# Template placeholder: presumably expanded by conseq into a dict literal of
# all the inputs declared above before this script runs.
config_dict = {{inputs}}
# Persist the inputs so the generator script below can read them from disk.
with open("daintree_input_config.json", 'w') as f:
json.dump(config_dict, f, indent=2)

"""
run "python {{ inputs.script.filename }} --model_config {{ inputs.model_config.filename }} --input_config 'daintree_input_config.json'"


# Step 2: fit one model. Runs the daintree-sparkles image against a single
# daintree_input_config artifact, then republishes the output config that
# daintree wrote as a daintree_output_config artifact named after the input's
# label.
rule run_fit_models:
    resources: {'slots': "0.05"} # let up to 20 of these run in parallel
    inputs:
        daintree_input_config={
            "type": "daintree_input_config"
        },
        sparkles_config=fileref("sparkles-config", copy_to="sparkles-config")
    outputs:
        {
            "type": "daintree_output_config",
            "name": "{{ inputs.daintree_input_config.label }}",
            "filename": {"$filename": "daintree_output_config.json"}
        }
    run "python" with """
        import glob
        import json
        import logging
        import os
        import subprocess

        # Defined explicitly so the error paths below can log; without it the
        # handler itself would fail with NameError and hide the real problem.
        logger = logging.getLogger("run_fit_models")

        # Docker bind mounts need an absolute host path; abspath is a no-op
        # when the rendered filename is already absolute.
        input_config_filepath = os.path.abspath("{{ inputs.daintree_input_config.filename }}")
        input_config_filename = "{{ inputs.daintree_input_config.label }}.json"

        # NOTE(review): "--test True" is hard-coded; presumably this requests
        # a reduced test run -- confirm before relying on the results.
        docker_command = [
            "docker", "run",
            "--rm",
            "-v", f"{input_config_filepath}:/daintree/{input_config_filename}",
            "-v", f"{os.getcwd()}/output_data:/daintree/output_data",
            "-v", f"{os.getcwd()}/sparkles-config:/daintree/sparkles-config",
            "us.gcr.io/broad-achilles/daintree-sparkles:v4",
            "/install/depmap-py/bin/python3.9", "-u", "run_fit_models.py",
            "collect-and-fit-generate-config",
            "--input-files", input_config_filename,
            "--sparkles-config", "/daintree/sparkles-config",
            "--save-dir", "/daintree/output_data",
            "--test", "True",
            "--upload-to-taiga", "predictability-76d5",
        ]

        # check=True: a non-zero exit from the container fails this rule.
        subprocess.run(
            docker_command,
            check=True
        )

        # Find the output config file using glob. Sorted so that the choice is
        # deterministic if more than one file ever matches.
        output_config_files = sorted(
            glob.glob(os.path.join(os.getcwd(), "output_data", "output_config_files", "*.json"))
        )
        if not output_config_files:
            raise FileNotFoundError("No output config files found")

        # Use the first, there should only be one matching file
        if len(output_config_files) > 1:
            logger.warning(
                "Expected one output config file, found %d; using %s",
                len(output_config_files),
                output_config_files[0],
            )
        output_config_file = output_config_files[0]

        # Only the read can raise JSONDecodeError, so only the read is guarded.
        try:
            with open(output_config_file, 'r') as f:
                output_config = json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in output config: {e}")
            raise

        # Re-serialise under the fixed filename declared in this rule's outputs.
        with open("daintree_output_config.json", 'w') as f:
            json.dump(output_config, f, indent=2)
    """


# Step 3: merge every daintree_output_config artifact into one JSON file,
# combined_daintree_output_config.json, shaped as
#   screen_name -> model_name -> that model's output config
rule combine_output_configs:
    inputs:
        daintree_output_config = all{
            "type": "daintree_output_config"
        }
    outputs:
        {
            "type": "combined_daintree_output_config",
            "filename": {"$filename": "combined_daintree_output_config.json"}
        }
    run "python" with """
        import json
        import logging

        # Defined explicitly so the error path below can log; without it the
        # handler itself would fail with NameError and hide the real problem.
        logger = logging.getLogger("combine_output_configs")

        def merge_json_files(json_files):
            # Merge per-model output configs into a dict keyed by screen name,
            # then by model name.
            #
            # json_files: iterable of paths to output config JSON files. The
            #   first top-level key of each file is taken as the model name and
            #   its value must hold ["input"]["screen_name"].
            # Returns: dict of screen_name -> dict of model_name -> config.
            # Raises: json.JSONDecodeError for a malformed file, ValueError for
            #   a file with no top-level entries, KeyError if the expected
            #   nested keys are absent.
            combined = {}

            for file_path in json_files:
                # Reading is the step that can hit malformed JSON, so this is
                # where JSONDecodeError is caught, logged and re-raised.
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(f"Invalid JSON in output config {file_path}: {e}")
                    raise

                # Fail with a message naming the file rather than a bare
                # IndexError when a config has no entries.
                if not data:
                    raise ValueError(f"Output config {file_path} contains no models")

                model_name = next(iter(data))
                screen_name = data[model_name]["input"]["screen_name"]

                # Create the screen's bucket on first sight, then file the
                # model's config under it.
                combined.setdefault(screen_name, {})[model_name] = data[model_name]

            return combined

        artifacts = {{ inputs.daintree_output_config }}
        list_of_files = [artifact['filename'] for artifact in artifacts]

        combined_output_config = merge_json_files(list_of_files)

        with open("combined_daintree_output_config.json", 'w') as f:
            json.dump(combined_output_config, f, indent=2)
    """
86 changes: 86 additions & 0 deletions analysis-pipeline/predictability/model-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Model definitions for the predictability pipeline.
# Each top-level key (CellContext, DriverEvents, GeneticDerangement, DNA,
# RNASeq) names one model and carries three fields:
#   Features: the feature names the model uses
#   Required: the subset of feature names listed as required
#   Relation: "All" for every model defined here
# NOTE(review): this file says "armlevel_cn", but the matching feature artifact
# in fit.conseq / predictability_inputs.conseq is labelled "armlevel_cna" --
# confirm generate_daintree_input_configs.py reconciles the two spellings.
# NOTE(review): "loh" appears under Features for DNA and RNASeq but not under
# Required -- presumably intentional; confirm.
CellContext:
Features:
- lineage
- confounder # Will map to crispr_confounder or rnai_confounder based on screen
Required:
- lineage
- confounder
Relation: All

DriverEvents:
Features:
- lineage
- confounder
- driver_events
Required:
- lineage
- confounder
- driver_events
Relation: All

GeneticDerangement:
Features:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
Required:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
Relation: All

DNA:
Features:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
- mutations_hotspot
- mutations_damaging
- gene_cn
- loh
Required:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
- mutations_hotspot
- mutations_damaging
- gene_cn
Relation: All

RNASeq:
Features:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
- mutations_hotspot
- mutations_damaging
- gene_cn
- loh
- rnaseq
Required:
- lineage
- confounder
- driver_events
- armlevel_cn
- cytoband_cn
- genetic_signature
- mutations_hotspot
- mutations_damaging
- gene_cn
- rnaseq
Relation: All
97 changes: 97 additions & 0 deletions analysis-pipeline/predictability/predictability_inputs.conseq
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Input artifacts for the predictability pipeline (included by fit.conseq).
# Each add-if-missing statement declares one artifact with a "type"
# ("target_matrix" or "feature"), a "label" and a "source_dataset_id";
# feature artifacts additionally carry a "category".
# The labels here are the ones process_model_config in fit.conseq selects on.
# NOTE(review): source_dataset_id values look like taiga ids of the form
# <dataset>.<version>/<file> -- confirm.

# Target Matrices
add-if-missing {
"type": "target_matrix",
"label": "crispr_gene_effect",
"source_dataset_id": "internal-24q2-3719.82/CRISPRGeneEffect"
}

add-if-missing {
"type": "target_matrix",
"label": "rnai",
"source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiDep"
}

# Feature Matrices
add-if-missing {
"type": "feature",
"label": "lineage",
"category": "lineage",
"source_dataset_id": "predictability-76d5.94/PredictabilityLineageTransformed"
}

add-if-missing {
"type": "feature",
"label": "crispr_confounder",
"category": "confounder",
"source_dataset_id": "predictability-76d5.111/PredictabilityCRISPRConfoundersTransformed"
}

add-if-missing {
"type": "feature",
"label": "rnai_confounder",
"category": "confounder",
"source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiConfounders"
}

add-if-missing {
"type": "feature",
"label": "driver_events",
"category": "driverevents",
"source_dataset_id": "predictability-76d5.99/DriverEvents"
}

add-if-missing {
"type": "feature",
"label": "armlevel_cna",
"category": "armlevel",
"source_dataset_id": "internal-24q2-3719.82/OmicsArmLevelCNA"
}

# NOTE(review): "Gentic" in the id below looks like a misspelling of "Genetic",
# but the id must match the stored dataset name exactly -- do not "fix" it here
# without confirming the name in the data store.
add-if-missing {
"type": "feature",
"label": "cytoband_cn",
"category": "cytoband",
"source_dataset_id": "predictability-76d5.99/PredictabilityGenticDerangementTransformed"
}

add-if-missing {
"type": "feature",
"label": "genetic_signature",
"category": "geneticsignature",
"source_dataset_id": "internal-24q2-3719.82/OmicsSignatures"
}

add-if-missing {
"type": "feature",
"label": "mutations_hotspot",
"category": "gene",
"source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixHotspot"
}

add-if-missing {
"type": "feature",
"label": "mutations_damaging",
"category": "gene",
"source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixDamaging"
}

add-if-missing {
"type": "feature",
"label": "gene_cn",
"category": "gene",
"source_dataset_id": "internal-24q2-3719.82/OmicsCNGene"
}

add-if-missing {
"type": "feature",
"label": "loh",
"category": "gene",
"source_dataset_id": "internal-24q2-3719.82/OmicsLoH"
}

add-if-missing {
"type": "feature",
"label": "rnaseq",
"category": "gene",
"source_dataset_id": "internal-24q2-3719.82/OmicsExpressionProteinCodingGenesTPMLogp1"
}
Loading

0 comments on commit 3bceaf7

Please sign in to comment.