Commit
Merge pull request #451 from nextstrain/benchmarks-stats
Add benchmark and `--stats`
joverlee521 authored Jun 18, 2024
2 parents de35468 + eb63e29 commit f5f238b
Showing 8 changed files with 91 additions and 23 deletions.
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-genbank-master.yml
@@ -108,5 +108,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/genbank.yaml \
$CONFIG_OVERRIDES
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-gisaid-master.yml
@@ -109,5 +109,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/gisaid.yaml \
$CONFIG_OVERRIDES
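Both production workflows now pass `--stats snakemake_stats.json` through `nextstrain build` to Snakemake. A minimal sketch of summarizing that file after a run, assuming the stats JSON layout of recent Snakemake releases (`total_runtime` plus per-rule `mean-runtime`/`min-runtime`/`max-runtime`); key names may differ by version:

```python
# Sketch: summarize the per-rule runtimes recorded by `--stats snakemake_stats.json`.
# Assumes the stats JSON layout emitted by recent Snakemake releases
# ("total_runtime", "rules" -> {"mean-runtime", "min-runtime", "max-runtime"}).
import json

with open("snakemake_stats.json") as fh:
    stats = json.load(fh)

print(f"total runtime: {stats['total_runtime']:.1f}s")

# Rank rules by mean runtime so the slowest ingest steps surface first.
slowest = sorted(
    stats["rules"].items(),
    key=lambda item: item[1]["mean-runtime"],
    reverse=True,
)
for rule, timings in slowest[:10]:
    print(f"{rule:45s} mean {timings['mean-runtime']:8.1f}s  max {timings['max-runtime']:8.1f}s")
```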
10 changes: 10 additions & 0 deletions workflow/snakemake_rules/curate.smk
@@ -29,6 +29,8 @@ rule transform_rki_data:
output:
fasta="data/rki_sequences.fasta",
metadata="data/rki_metadata_transformed.tsv",
benchmark:
"benchmarks/transform_rki_data.txt"
params:
subsampled=config.get("subsampled", False),
shell:
@@ -45,6 +47,8 @@ rule transform_biosample:
biosample = "data/biosample.ndjson"
output:
biosample = "data/genbank/biosample.tsv"
benchmark:
"benchmarks/transform_biosample.txt"
shell:
"""
./bin/transform-biosample {input.biosample} \
@@ -85,6 +89,8 @@ rule merge_open_data:
output:
metadata="data/genbank/metadata_transformed.tsv",
sequences="data/genbank/sequences.fasta",
benchmark:
"benchmarks/merge_open_data.txt"
shell:
"""
./bin/merge-open \
@@ -105,6 +111,8 @@ rule transform_gisaid_data:
metadata = "data/gisaid/metadata_transformed.tsv",
flagged_annotations = temp("data/gisaid/flagged-annotations"),
additional_info = "data/gisaid/additional_info.tsv"
benchmark:
"benchmarks/transform_gisaid_data.txt"
shell:
"""
./bin/transform-gisaid {input.ndjson} \
@@ -120,6 +128,8 @@ rule flag_metadata:
metadata = "data/gisaid/metadata.tsv"
output:
metadata = "data/gisaid/flagged_metadata.txt"
benchmark:
"benchmarks/flag_metadata.txt"
resources:
# Memory use scales primarily with the size of the metadata file.
mem_mb=20000
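Each curate rule now writes a Snakemake benchmark file under `benchmarks/`. A quick sketch for inspecting one of them, e.g. the `transform_rki_data` output added above; the column names (`s`, `max_rss`, ...) are the usual Snakemake benchmark columns and are an assumption that may shift between Snakemake versions:

```python
# Sketch: read the benchmark file written by the transform_rki_data rule above.
# Snakemake benchmark output is a small TSV; the column names used here
# (s, h:m:s, max_rss, ...) are assumptions that may vary between versions.
import csv

with open("benchmarks/transform_rki_data.txt") as fh:
    rows = list(csv.DictReader(fh, delimiter="\t"))

for row in rows:  # one row per run; more if a rule used `repeat`
    print(f"wall clock: {float(row['s']):.1f}s   peak RSS: {row['max_rss']} MB")
```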
43 changes: 33 additions & 10 deletions workflow/snakemake_rules/fetch_sequences.smk
@@ -21,6 +21,8 @@ Produces different final outputs for GISAID vs GenBank/RKI:
rule fetch_main_gisaid_ndjson:
output:
ndjson = temp(f"data/gisaid.ndjson")
benchmark:
"benchmarks/fetch_main_gisaid_ndjson.txt"
retries: 5
shell:
"""
@@ -124,10 +126,11 @@ rule create_genbank_ndjson:
"""

rule fetch_biosample:
- message:
- """Fetching BioSample data (GenBank only)"""
+ """Fetching BioSample data (GenBank only)"""
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample.txt"
retries: 5
shell:
"""
@@ -136,10 +139,11 @@ rule fetch_biosample:


rule fetch_cog_uk_accessions:
- message:
- """Fetching COG-UK sample accesions (GenBank only)"""
+ """Fetching COG-UK sample accesions (GenBank only)"""
output:
cog_uk_accessions = temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions.txt"
retries: 5
shell:
"""
@@ -148,10 +152,11 @@ rule fetch_cog_uk_accessions:


rule fetch_cog_uk_metadata:
- message:
- """Fetching COG-UK metadata (GenBank only)"""
+ """Fetching COG-UK metadata (GenBank only)"""
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/fetch_cog_uk_metadata.txt"
retries: 5
shell:
"""
@@ -164,13 +169,17 @@ rule uncompress_cog_uk_metadata:
"data/cog_uk_metadata.csv.gz"
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/uncompress_cog_uk_metadata.txt"
shell:
"gunzip -c {input} > {output}"


rule fetch_rki_sequences:
output:
rki_sequences=temp("data/rki_sequences.fasta.xz"),
benchmark:
"benchmarks/fetch_rki_sequences.txt"
retries: 5
shell:
"""
@@ -181,6 +190,8 @@
rule fetch_rki_metadata:
output:
rki_metadata=temp("data/rki_metadata.tsv.xz"),
benchmark:
"benchmarks/fetch_rki_metadata.txt"
retries: 5
shell:
"""
@@ -194,6 +205,8 @@ rule transform_rki_data_to_ndjson:
rki_metadata="data/rki_metadata.tsv.xz"
output:
ndjson="data/rki.ndjson",
benchmark:
"benchmarks/transform_rki_data_to_ndjson.txt"
shell:
"""
./bin/transform-rki-data-to-ndjson \
@@ -227,29 +240,31 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: fetch_main_ndjson_from_s3 > create_genbank_ndjson

rule fetch_main_ndjson_from_s3:
- message:
- """Fetching main NDJSON from AWS S3"""
+ """Fetching main NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/{database}.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/{database}.ndjson.zst",
lines = config.get("subsample",{}).get("main_ndjson", 0)
output:
ndjson = temp(f"data/{database}.ndjson")
benchmark:
"benchmarks/fetch_main_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \
./vendored/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines}
"""

rule fetch_biosample_from_s3:
- message:
- """Fetching BioSample NDJSON from AWS S3"""
+ """Fetching BioSample NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/biosample.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/biosample.ndjson.zst",
lines = config.get("subsample",{}).get("biosample", 0)
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -263,6 +278,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("rki_ndjson", 0)
output:
rki_ndjson = temp("data/rki.ndjson")
benchmark:
"benchmarks/fetch_rki_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \
@@ -275,6 +292,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_accessions", 0)
output:
biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -288,6 +307,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_metadata", 0)
output:
biosample = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/fetch_cog_uk_metadata_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -299,5 +320,7 @@ if config.get("s3_dst") and config.get("s3_src"):
"data/cog_uk_metadata.csv"
output:
cog_uk_metadata = "data/cog_uk_metadata.csv.gz" if config.get("keep_temp",False) else temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/compress_cog_uk_metadata.txt"
shell:
"gzip -c {input} > {output}"
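With every fetch rule benchmarked, the individual TSVs can be folded into one summary for comparing the S3 downloads against the direct GenBank/GISAID/RKI fetches. A sketch under the assumption that each file in `benchmarks/` is a single-row TSV with an `s` (wall-clock seconds) column:

```python
# Sketch: fold all per-rule benchmark TSVs into one sorted table so the fetch
# steps (S3 downloads, GenBank/GISAID/RKI fetches) can be compared at a glance.
# Assumes each benchmark file is a single-row TSV with an "s" column.
import csv
from pathlib import Path

timings = {}
for path in sorted(Path("benchmarks").glob("*.txt")):
    with path.open() as fh:
        row = next(csv.DictReader(fh, delimiter="\t"), None)
    if row is not None:
        timings[path.stem] = float(row["s"])

for name, seconds in sorted(timings.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name:45s} {seconds:10.1f}s")
```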
39 changes: 28 additions & 11 deletions workflow/snakemake_rules/nextclade.smk
@@ -27,30 +27,30 @@ Produces the following outputs:
alignment = f"data/{database}/aligned.fasta"
"""

- from shlex import quote as shellquote


wildcard_constraints:
reference="|_21L",
seqtype="aligned|translation_[^.]+",


rule create_empty_nextclade_info:
- message:
- """Creating empty NextClade info cache file"""
+ """Creating empty NextClade info cache file"""
output:
touch(f"data/{database}/nextclade{{reference}}_old.tsv"),
benchmark:
f"benchmarks/create_empty_nextclade_info_{database}{{reference}}.txt"


rule create_empty_nextclade_aligned:
- message:
- """Creating empty NextClade aligned cache file"""
+ """Creating empty NextClade aligned cache file"""
output:
touch(f"data/{database}/nextclade.aligned.old.fasta"),
*[
touch(f"data/{database}/nextclade.translation_{gene}.old.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/create_empty_nextclade_aligned_{database}.txt"


# Only include rules to fetch from S3 if S3 config params are provided
@@ -74,6 +74,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
@@ -95,6 +97,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
@@ -132,6 +136,8 @@ rule download_nextclade_executable:
"""Download Nextclade"""
output:
nextclade="nextclade",
benchmark:
f"benchmarks/download_nextclade_executable_{database}.txt"
shell:
"""
if [ "$(uname)" = "Darwin" ]; then
@@ -148,7 +154,7 @@ rule download_nextclade_executable:
fi
NEXTCLADE_VERSION="$(./nextclade --version)"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
"""


@@ -158,6 +164,8 @@ rule download_nextclade_dataset:
"nextclade",
output:
dataset="data/nextclade_data/{dataset_name}.zip",
benchmark:
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
shell:
"""
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
@@ -185,6 +193,8 @@ rule run_wuhan_nextclade:
temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/run_wuhan_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
@@ -214,6 +224,8 @@ rule run_21L_nextclade:
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
output:
info=f"data/{database}/nextclade_21L_new_raw.tsv",
benchmark:
f"benchmarks/run_21L_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
@@ -235,6 +247,8 @@ rule nextclade_tsv_concat_versions:
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
output:
tsv=f"data/{database}/nextclade{{reference}}_new.tsv",
benchmark:
f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt"
shell:
"""
if [ -s {input.tsv} ]; then
@@ -261,15 +275,16 @@ rule nextclade_tsv_concat_versions:


rule nextclade_info:
- message:
- """
- Generates nextclade info TSV for all sequences (new + old)
- """
+ """
+ Generates nextclade info TSV for all sequences (new + old)
+ """
input:
old_info=f"data/{database}/nextclade{{reference}}_old.tsv",
new_info=rules.nextclade_tsv_concat_versions.output.tsv,
output:
nextclade_info=f"data/{database}/nextclade{{reference}}.tsv",
benchmark:
f"benchmarks/nextclade_info_{database}{{reference}}.txt"
shell:
"""
tsv-append -H {input.old_info} {input.new_info} \
@@ -286,6 +301,8 @@ rule combine_alignments:
new_alignment=f"data/{database}/nextclade.{{seqtype}}.upd.fasta",
output:
alignment=f"data/{database}/{{seqtype}}.fasta",
benchmark:
f"benchmarks/combine_alignments_{database}{{seqtype}}.txt"
params:
keep_temp=config.get("keep_temp", "false"),
shell:
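The benchmark paths in this file mix an f-string substitution (`{database}`) with escaped Snakemake wildcards (`{{reference}}`, `{{seqtype}}`). A small sketch of how one of them expands, using `genbank` as an illustrative database value and the `reference` values allowed by the `wildcard_constraints` above:

```python
# Sketch: how the doubled braces in the benchmark paths above expand.
# The f-string fills {database} when the Snakefile is parsed and leaves
# {reference} as a Snakemake wildcard; "genbank" is an illustrative value.
database = "genbank"

template = f"benchmarks/nextclade_info_{database}{{reference}}.txt"
print(template)  # -> benchmarks/nextclade_info_genbank{reference}.txt

# The wildcard_constraints above allow reference to be "" or "_21L".
for reference in ("", "_21L"):
    print(template.format(reference=reference))
# -> benchmarks/nextclade_info_genbank.txt
# -> benchmarks/nextclade_info_genbank_21L.txt
```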
6 changes: 6 additions & 0 deletions workflow/snakemake_rules/slack_notifications.smk
@@ -30,6 +30,8 @@ rule notify_on_record_change:
ndjson_on_s3 = f"{config['s3_src']}/{database}.ndjson.xz"
output:
touch(f"data/{database}/notify-on-record-change.done")
benchmark:
f"benchmarks/notify_on_record_change_{database}.txt"
shell:
"""
./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database}
@@ -46,6 +48,8 @@ rule notify_gisaid:
s3_bucket = config["s3_src"]
output:
touch("data/gisaid/notify.done")
benchmark:
"benchmarks/notify_gisaid.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz")
@@ -60,6 +64,8 @@ rule notify_genbank:
s3_bucket = config["s3_src"]
output:
touch("data/genbank/notify.done")
benchmark:
"benchmarks/notify_genbank.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
# TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script)
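The notification rules get benchmarks too, so their wall-clock times can be cross-checked against the aggregate `--stats` output. A sketch combining the two sources, with the same assumptions about file layout and key names as the sketches above:

```python
# Sketch: cross-check the notification rules' benchmark wall-clock times against
# the per-rule means in the `--stats` JSON. File layout and key names are the
# same assumptions as in the sketches above.
import csv
import json
from pathlib import Path

with open("snakemake_stats.json") as fh:
    per_rule_stats = json.load(fh)["rules"]

for path in sorted(Path("benchmarks").glob("notify_*.txt")):
    with path.open() as fh:
        row = next(csv.DictReader(fh, delimiter="\t"), None)
    if row is None:
        continue
    # Benchmark files are named after their rule (notify_on_record_change also
    # carries a database suffix, so it may not match a stats key exactly).
    mean = per_rule_stats.get(path.stem, {}).get("mean-runtime")
    mean_txt = f"{mean:8.1f}s" if mean is not None else "     n/a"
    print(f"{path.stem:35s} benchmark {float(row['s']):8.1f}s   stats mean {mean_txt}")
```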