Merge branch 'master' into qa

broadinstitute · Feb 6, 2025 · 3bceaf7 · 3bceaf7
2 parents a70ce1a + 70ad8cb
commit 3bceaf7
Show file tree

Hide file tree

Showing 223 changed files with 3,136 additions and 9,977 deletions.
diff --git a/analysis-pipeline/README.md b/analysis-pipeline/README.md
@@ -0,0 +1,12 @@
+This is the very first draft of predictability as part of conseq in the new analysis pipeline.
+
+You only need conseq installed; everything else that is required is already installed inside the us.gcr.io/broad-achilles/daintree-sparkles:v4 image where predictability is run.
+
+Note that there is a model-config.yaml file which has the config of all the models.
+Once conseq is installed, you can run `conseq run fit.conseq` to start.
+
+The `fit.conseq` works as follows:
+
+1. It first creates model input json files based on the `model-config.yaml` file.
+2. Once the input json file is created, daintree is run to produce the output for predictability. Three files are uploaded to taiga for each model: predictions.csv, ensemble.csv, and feature_metadata.csv. Running daintree also creates an `output_config.json` file which has the input config as well as the taiga ids of the 3 uploaded files.
+3. The `output_config.json` files are then combined into a single `combined_daintree_output_config.json` where the screen is the key and the value maps each model name to the output config for that model.
diff --git a/analysis-pipeline/predictability/fit.conseq b/analysis-pipeline/predictability/fit.conseq
@@ -0,0 +1,151 @@
+include "predictability_inputs.conseq"
+
+# Three Steps:
+# 1. Generate a daintree input config file for each model and screen
+# 2. Run the model fitting
+# 3. Combine the output config files
+
+# Step 1: write this rule's resolved inputs to daintree_input_config.json, then
+# pass that file and model-config.yaml to
+# scripts/generate_daintree_input_configs.py.
+# NOTE(review): this rule declares no outputs block; presumably the generator
+# script is what publishes the "daintree_input_config" artifacts consumed by
+# run_fit_models -- confirm against the script.
+rule process_model_config:
+    inputs:
+        # Model config yaml file
+        model_config=fileref("model-config.yaml"),
+
+        # Two target matrices
+        crispr_gene_effect={"type": "target_matrix", "label": "crispr_gene_effect"},
+        rnai={"type": "target_matrix", "label": "rnai"},
+
+        # Twelve features
+        lineage={"type": "feature", "label": "lineage"},
+        crispr_confounder={"type": "feature", "label": "crispr_confounder"},
+        rnai_confounder={"type": "feature", "label": "rnai_confounder"},
+        driver_events={"type": "feature", "label": "driver_events"},
+        armlevel_cna={"type": "feature", "label": "armlevel_cna"},
+        cytoband_cn={"type": "feature", "label": "cytoband_cn"},
+        genetic_signature={"type": "feature", "label": "genetic_signature"},
+        mutations_hotspot={"type": "feature", "label": "mutations_hotspot"},
+        mutations_damaging={"type": "feature", "label": "mutations_damaging"},
+        gene_cn={"type": "feature", "label": "gene_cn"},
+        loh={"type": "feature", "label": "loh"},
+        rnaseq={"type": "feature", "label": "rnaseq"},
+
+        # Script to generate daintree input config file
+        script=fileref("scripts/generate_daintree_input_configs.py"),
+
+    # First command: an inline Python script that serializes the inputs above
+    # to daintree_input_config.json (presumably the inputs placeholder renders
+    # as a Python dict literal -- confirm).
+    run "python" with """
+    import json
+
+    config_dict = {{inputs}}
+    with open("daintree_input_config.json", 'w') as f:
+        json.dump(config_dict, f, indent=2)   
+
+    """
+    # Second command: run the generator script on the model config and the
+    # JSON file written by the first command.
+    run "python {{ inputs.script.filename }} --model_config {{ inputs.model_config.filename }} --input_config 'daintree_input_config.json'"
+
+
+# Step 2: for each "daintree_input_config" artifact, run daintree inside the
+# daintree-sparkles docker image and publish the output config it writes as a
+# "daintree_output_config" artifact named after the input config's label.
+rule run_fit_models:
+    resources: {'slots': "0.05"} # let up to 20 of these run in parallel
+    inputs:
+        daintree_input_config={
+          "type": "daintree_input_config"
+          },
+        sparkles_config=fileref("sparkles-config", copy_to="sparkles-config")
+    outputs:
+        {
+          "type": "daintree_output_config",
+          "name": "{{ inputs.daintree_input_config.label }}",
+          "filename": {"$filename": "daintree_output_config.json"}
+        }
+    run "python" with """
+      import subprocess
+      import os
+      import glob
+      import json
+      import logging
+
+      # The error path below logs before re-raising; without this definition
+      # it would die with a NameError and hide the real JSON error.
+      logger = logging.getLogger(__name__)
+
+      input_config_filepath = "{{ inputs.daintree_input_config.filename }}"
+      input_config_filename = "{{ inputs.daintree_input_config.label }}.json"
+
+      # The input config, an output directory and the sparkles config are
+      # bind-mounted under /daintree inside the container.
+      # NOTE(review): "--test True" is hard-coded; confirm this is intended
+      # outside of draft runs.
+      docker_command = [
+        "docker", "run",
+        "--rm",
+        "-v", f"{input_config_filepath}:/daintree/{input_config_filename}",
+        "-v", f"{os.getcwd()}/output_data:/daintree/output_data",
+        "-v", f"{os.getcwd()}/sparkles-config:/daintree/sparkles-config",
+        "us.gcr.io/broad-achilles/daintree-sparkles:v4",
+        "/install/depmap-py/bin/python3.9", "-u", "run_fit_models.py",
+        "collect-and-fit-generate-config",
+        "--input-files", input_config_filename,
+        "--sparkles-config", "/daintree/sparkles-config",
+        "--save-dir", "/daintree/output_data",
+        "--test", "True",
+        "--upload-to-taiga", "predictability-76d5",
+      ]
+
+      # check=True makes a non-zero docker exit fail this rule
+      subprocess.run(
+          docker_command,
+          check=True
+      )
+
+      # Find the output config file using glob; sorted so the choice is
+      # deterministic even if more than one file ever matches
+      output_config_files = sorted(
+          glob.glob(os.path.join(os.getcwd(), "output_data", "output_config_files", "*.json"))
+      )
+      if not output_config_files:
+          raise FileNotFoundError("No output config files found")
+
+      # Use the first, there should only be one matching file
+      output_config_file = output_config_files[0]
+
+      # Round-trip through json so a malformed file fails here rather than in
+      # the combine step.
+      try:
+          with open(output_config_file, 'r') as f:
+              output_config = json.load(f)
+          with open("daintree_output_config.json", 'w') as f:
+              json.dump(output_config, f, indent=2)
+      except json.JSONDecodeError as e:
+          logger.error(f"Invalid JSON in output config: {e}")
+          raise
+    """
+
+
+# Step 3: merge every "daintree_output_config" artifact into a single
+# combined_daintree_output_config.json, keyed by screen name and then by
+# model name.
+rule combine_output_configs:
+    inputs:
+        daintree_output_config = all{
+          "type": "daintree_output_config"
+          }
+    outputs:
+        {
+          "type": "combined_daintree_output_config",
+          "filename": {"$filename": "combined_daintree_output_config.json"}
+        }
+    run "python" with """
+        import json
+        import logging
+
+        # Needed by the error path in merge_json_files; it was previously
+        # referenced without ever being defined.
+        logger = logging.getLogger(__name__)
+
+        def merge_json_files(json_files):
+            # Returns a dict of screen name -> model name -> output config.
+            combined = {}
+
+            for file_path in json_files:
+                # json.load is the call that can raise JSONDecodeError, so the
+                # handler belongs here and names the offending file.
+                try:
+                    with open(file_path, 'r') as f:
+                        data = json.load(f)
+                except json.JSONDecodeError as e:
+                    logger.error(f"Invalid JSON in output config {file_path}: {e}")
+                    raise
+
+                # Only the first top-level key is read; it is treated as the
+                # model name.
+                model_name = list(data.keys())[0]
+                screen_name = data[model_name]["input"]["screen_name"]
+
+                # Create the per-screen dict on first use, then add the model
+                combined.setdefault(screen_name, {})[model_name] = data[model_name]
+
+            return combined
+
+        artifacts = {{ inputs.daintree_output_config }}
+        list_of_files = [artifact['filename'] for artifact in artifacts]
+
+        combined_output_config = merge_json_files(list_of_files)
+
+        # json.dump never raises JSONDecodeError, so no handler is needed here
+        with open("combined_daintree_output_config.json", 'w') as f:
+            json.dump(combined_output_config, f, indent=2)
+    """
diff --git a/analysis-pipeline/predictability/model-config.yaml b/analysis-pipeline/predictability/model-config.yaml
@@ -0,0 +1,86 @@
+# Model definitions. Each top-level key names a model and gives its list of
+# Features, the subset of those listed as Required, and a Relation.
+# NOTE(review): the feature names are presumably resolved against the
+# "feature" artifact labels declared in predictability_inputs.conseq by
+# scripts/generate_daintree_input_configs.py -- confirm. "armlevel_cn" below
+# has no identically named label there (the artifact label is "armlevel_cna");
+# verify that the script maps it.
+CellContext:
+  Features:
+    - lineage
+    - confounder # Will map to crispr_confounder or rnai_confounder based on screen
+  Required:
+    - lineage
+    - confounder
+  Relation: All
+
+DriverEvents:
+  Features:
+    - lineage
+    - confounder
+    - driver_events
+  Required:
+    - lineage
+    - confounder
+    - driver_events
+  Relation: All
+
+GeneticDerangement:
+  Features:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+  Required:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+  Relation: All
+
+DNA:
+  Features:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+    - mutations_hotspot
+    - mutations_damaging
+    - gene_cn
+    - loh # listed as a feature but not under Required for this model
+  Required:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+    - mutations_hotspot
+    - mutations_damaging
+    - gene_cn
+  Relation: All
+
+RNASeq:
+  Features:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+    - mutations_hotspot
+    - mutations_damaging
+    - gene_cn
+    - loh # listed as a feature but not under Required for this model
+    - rnaseq
+  Required:
+    - lineage
+    - confounder
+    - driver_events
+    - armlevel_cn
+    - cytoband_cn
+    - genetic_signature
+    - mutations_hotspot
+    - mutations_damaging
+    - gene_cn
+    - rnaseq
+  Relation: All
diff --git a/analysis-pipeline/predictability/predictability_inputs.conseq b/analysis-pipeline/predictability/predictability_inputs.conseq
@@ -0,0 +1,97 @@
+# Target Matrices
+# Two "target_matrix" artifacts, one per screen. The process_model_config rule
+# in fit.conseq selects them by type and label. Each is only added if missing.
+add-if-missing {
+    "type": "target_matrix",
+    "label": "crispr_gene_effect",
+    "source_dataset_id": "internal-24q2-3719.82/CRISPRGeneEffect"
+}
+
+add-if-missing {
+    "type": "target_matrix",
+    "label": "rnai",
+    "source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiDep"
+}
+
+# Feature Matrices
+# Twelve "feature" artifacts. The process_model_config rule in fit.conseq
+# selects each one by type and label; every artifact also carries a category
+# and the source dataset id it comes from.
+add-if-missing {
+    "type": "feature",
+    "label": "lineage",
+    "category": "lineage",
+    "source_dataset_id": "predictability-76d5.94/PredictabilityLineageTransformed"
+}
+
+# The two confounder artifacts share the "confounder" category, one per screen.
+add-if-missing {
+    "type": "feature",
+    "label": "crispr_confounder",
+    "category": "confounder",
+    "source_dataset_id": "predictability-76d5.111/PredictabilityCRISPRConfoundersTransformed"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "rnai_confounder",
+    "category": "confounder",
+    "source_dataset_id": "predictability-legacy-datasets-8c54.14/RNAiConfounders"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "driver_events",
+    "category": "driverevents",
+    "source_dataset_id": "predictability-76d5.99/DriverEvents"
+}
+
+# NOTE(review): model-config.yaml refers to this feature as "armlevel_cn",
+# while the label here is "armlevel_cna" -- confirm the names are reconciled.
+add-if-missing {
+    "type": "feature",
+    "label": "armlevel_cna",
+    "category": "armlevel",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsArmLevelCNA"
+}
+
+# NOTE(review): the dataset name below is spelled "Gentic"; confirm it matches
+# the actual dataset id before changing it.
+add-if-missing {
+    "type": "feature",
+    "label": "cytoband_cn",
+    "category": "cytoband",
+    "source_dataset_id": "predictability-76d5.99/PredictabilityGenticDerangementTransformed"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "genetic_signature",
+    "category": "geneticsignature",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsSignatures"
+}
+
+# The remaining five features all use the "gene" category.
+add-if-missing {
+    "type": "feature",
+    "label": "mutations_hotspot",
+    "category": "gene",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixHotspot"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "mutations_damaging",
+    "category": "gene",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsSomaticMutationsMatrixDamaging"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "gene_cn",
+    "category": "gene",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsCNGene"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "loh",
+    "category": "gene",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsLoH"
+}
+
+add-if-missing {
+    "type": "feature",
+    "label": "rnaseq",
+    "category": "gene",
+    "source_dataset_id": "internal-24q2-3719.82/OmicsExpressionProteinCodingGenesTPMLogp1"
+}