Commit
Merge pull request #451 from nextstrain/benchmarks-stats
Add benchmark and `--stats`
joverlee521 authored Jun 18, 2024
2 parents de35468 + eb63e29 commit f5f238b
Showing 8 changed files with 91 additions and 23 deletions.
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-genbank-master.yml
@@ -108,5 +108,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/genbank.yaml \
$CONFIG_OVERRIDES
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-gisaid-master.yml
@@ -109,5 +109,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/gisaid.yaml \
$CONFIG_OVERRIDES
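Both production workflows now pass `--stats snakemake_stats.json` through `nextstrain build` to Snakemake. A minimal sketch of summarizing that file after a run, assuming the stats JSON layout of recent Snakemake releases (`total_runtime` plus per-rule `mean-runtime`/`min-runtime`/`max-runtime`); key names may differ by version:

```python
# Sketch: summarize the per-rule runtimes recorded by `--stats snakemake_stats.json`.
# Assumes the stats JSON layout emitted by recent Snakemake releases
# ("total_runtime", "rules" -> {"mean-runtime", "min-runtime", "max-runtime"}).
import json

with open("snakemake_stats.json") as fh:
    stats = json.load(fh)

print(f"total runtime: {stats['total_runtime']:.1f}s")

# Rank rules by mean runtime so the slowest ingest steps surface first.
slowest = sorted(
    stats["rules"].items(),
    key=lambda item: item[1]["mean-runtime"],
    reverse=True,
)
for rule, timings in slowest[:10]:
    print(f"{rule:45s} mean {timings['mean-runtime']:8.1f}s  max {timings['max-runtime']:8.1f}s")
```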
10 changes: 10 additions & 0 deletions workflow/snakemake_rules/curate.smk
@@ -29,6 +29,8 @@ rule transform_rki_data:
output:
fasta="data/rki_sequences.fasta",
metadata="data/rki_metadata_transformed.tsv",
benchmark:
"benchmarks/transform_rki_data.txt"
params:
subsampled=config.get("subsampled", False),
shell:
@@ -45,6 +47,8 @@ rule transform_biosample:
biosample = "data/biosample.ndjson"
output:
biosample = "data/genbank/biosample.tsv"
benchmark:
"benchmarks/transform_biosample.txt"
shell:
"""
./bin/transform-biosample {input.biosample} \
@@ -85,6 +89,8 @@ rule merge_open_data:
output:
metadata="data/genbank/metadata_transformed.tsv",
sequences="data/genbank/sequences.fasta",
benchmark:
"benchmarks/merge_open_data.txt"
shell:
"""
./bin/merge-open \
@@ -105,6 +111,8 @@ rule transform_gisaid_data:
metadata = "data/gisaid/metadata_transformed.tsv",
flagged_annotations = temp("data/gisaid/flagged-annotations"),
additional_info = "data/gisaid/additional_info.tsv"
benchmark:
"benchmarks/transform_gisaid_data.txt"
shell:
"""
./bin/transform-gisaid {input.ndjson} \
@@ -120,6 +128,8 @@ rule flag_metadata:
metadata = "data/gisaid/metadata.tsv"
output:
metadata = "data/gisaid/flagged_metadata.txt"
benchmark:
"benchmarks/flag_metadata.txt"
resources:
# Memory use scales primarily with the size of the metadata file.
mem_mb=20000
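Each curate rule now writes a Snakemake benchmark file under `benchmarks/`. A quick sketch for inspecting one of them, e.g. the `transform_rki_data` output added above; the column names (`s`, `max_rss`, ...) are the usual Snakemake benchmark columns and are an assumption that may shift between Snakemake versions:

```python
# Sketch: read the benchmark file written by the transform_rki_data rule above.
# Snakemake benchmark output is a small TSV; the column names used here
# (s, h:m:s, max_rss, ...) are assumptions that may vary between versions.
import csv

with open("benchmarks/transform_rki_data.txt") as fh:
    rows = list(csv.DictReader(fh, delimiter="\t"))

for row in rows:  # one row per run; more if a rule used `repeat`
    print(f"wall clock: {float(row['s']):.1f}s   peak RSS: {row['max_rss']} MB")
```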
43 changes: 33 additions & 10 deletions workflow/snakemake_rules/fetch_sequences.smk
@@ -21,6 +21,8 @@ Produces different final outputs for GISAID vs GenBank/RKI:
rule fetch_main_gisaid_ndjson:
output:
ndjson = temp(f"data/gisaid.ndjson")
benchmark:
"benchmarks/fetch_main_gisaid_ndjson.txt"
retries: 5
shell:
"""
@@ -124,10 +126,11 @@ rule create_genbank_ndjson:
"""

rule fetch_biosample:
- message:
- """Fetching BioSample data (GenBank only)"""
+ """Fetching BioSample data (GenBank only)"""
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample.txt"
retries: 5
shell:
"""
@@ -136,10 +139,11 @@ rule fetch_biosample:


rule fetch_cog_uk_accessions:
- message:
- """Fetching COG-UK sample accesions (GenBank only)"""
+ """Fetching COG-UK sample accesions (GenBank only)"""
output:
cog_uk_accessions = temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions.txt"
retries: 5
shell:
"""
@@ -148,10 +152,11 @@ rule fetch_cog_uk_accessions:


rule fetch_cog_uk_metadata:
- message:
- """Fetching COG-UK metadata (GenBank only)"""
+ """Fetching COG-UK metadata (GenBank only)"""
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/fetch_cog_uk_metadata.txt"
retries: 5
shell:
"""
@@ -164,13 +169,17 @@ rule uncompress_cog_uk_metadata:
"data/cog_uk_metadata.csv.gz"
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/uncompress_cog_uk_metadata.txt"
shell:
"gunzip -c {input} > {output}"


rule fetch_rki_sequences:
output:
rki_sequences=temp("data/rki_sequences.fasta.xz"),
benchmark:
"benchmarks/fetch_rki_sequences.txt"
retries: 5
shell:
"""
@@ -181,6 +190,8 @@
rule fetch_rki_metadata:
output:
rki_metadata=temp("data/rki_metadata.tsv.xz"),
benchmark:
"benchmarks/fetch_rki_metadata.txt"
retries: 5
shell:
"""
@@ -194,6 +205,8 @@ rule transform_rki_data_to_ndjson:
rki_metadata="data/rki_metadata.tsv.xz"
output:
ndjson="data/rki.ndjson",
benchmark:
"benchmarks/transform_rki_data_to_ndjson.txt"
shell:
"""
./bin/transform-rki-data-to-ndjson \
@@ -227,29 +240,31 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: fetch_main_ndjson_from_s3 > create_genbank_ndjson

rule fetch_main_ndjson_from_s3:
- message:
- """Fetching main NDJSON from AWS S3"""
+ """Fetching main NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/{database}.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/{database}.ndjson.zst",
lines = config.get("subsample",{}).get("main_ndjson", 0)
output:
ndjson = temp(f"data/{database}.ndjson")
benchmark:
"benchmarks/fetch_main_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \
./vendored/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines}
"""

rule fetch_biosample_from_s3:
- message:
- """Fetching BioSample NDJSON from AWS S3"""
+ """Fetching BioSample NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/biosample.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/biosample.ndjson.zst",
lines = config.get("subsample",{}).get("biosample", 0)
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -263,6 +278,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("rki_ndjson", 0)
output:
rki_ndjson = temp("data/rki.ndjson")
benchmark:
"benchmarks/fetch_rki_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \
@@ -275,6 +292,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_accessions", 0)
output:
biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -288,6 +307,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_metadata", 0)
output:
biosample = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/fetch_cog_uk_metadata_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
@@ -299,5 +320,7 @@ if config.get("s3_dst") and config.get("s3_src"):
"data/cog_uk_metadata.csv"
output:
cog_uk_metadata = "data/cog_uk_metadata.csv.gz" if config.get("keep_temp",False) else temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/compress_cog_uk_metadata.txt"
shell:
"gzip -c {input} > {output}"
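With every fetch rule benchmarked, the individual TSVs can be folded into one summary for comparing the S3 downloads against the direct GenBank/GISAID/RKI fetches. A sketch under the assumption that each file in `benchmarks/` is a single-row TSV with an `s` (wall-clock seconds) column:

```python
# Sketch: fold all per-rule benchmark TSVs into one sorted table so the fetch
# steps (S3 downloads, GenBank/GISAID/RKI fetches) can be compared at a glance.
# Assumes each benchmark file is a single-row TSV with an "s" column.
import csv
from pathlib import Path

timings = {}
for path in sorted(Path("benchmarks").glob("*.txt")):
    with path.open() as fh:
        row = next(csv.DictReader(fh, delimiter="\t"), None)
    if row is not None:
        timings[path.stem] = float(row["s"])

for name, seconds in sorted(timings.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name:45s} {seconds:10.1f}s")
```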
39 changes: 28 additions & 11 deletions workflow/snakemake_rules/nextclade.smk
@@ -27,30 +27,30 @@ Produces the following outputs:
alignment = f"data/{database}/aligned.fasta"
"""

- from shlex import quote as shellquote


wildcard_constraints:
reference="|_21L",
seqtype="aligned|translation_[^.]+",


rule create_empty_nextclade_info:
- message:
- """Creating empty NextClade info cache file"""
+ """Creating empty NextClade info cache file"""
output:
touch(f"data/{database}/nextclade{{reference}}_old.tsv"),
benchmark:
f"benchmarks/create_empty_nextclade_info_{database}{{reference}}.txt"


rule create_empty_nextclade_aligned:
- message:
- """Creating empty NextClade aligned cache file"""
+ """Creating empty NextClade aligned cache file"""
output:
touch(f"data/{database}/nextclade.aligned.old.fasta"),
*[
touch(f"data/{database}/nextclade.translation_{gene}.old.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/create_empty_nextclade_aligned_{database}.txt"


# Only include rules to fetch from S3 if S3 config params are provided
@@ -74,6 +74,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
@@ -95,6 +97,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
@@ -132,6 +136,8 @@ rule download_nextclade_executable:
"""Download Nextclade"""
output:
nextclade="nextclade",
benchmark:
f"benchmarks/download_nextclade_executable_{database}.txt"
shell:
"""
if [ "$(uname)" = "Darwin" ]; then
@@ -148,7 +154,7 @@ rule download_nextclade_executable:
fi
NEXTCLADE_VERSION="$(./nextclade --version)"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
"""


@@ -158,6 +164,8 @@ rule download_nextclade_dataset:
"nextclade",
output:
dataset="data/nextclade_data/{dataset_name}.zip",
benchmark:
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
shell:
"""
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
@@ -185,6 +193,8 @@ rule run_wuhan_nextclade:
temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/run_wuhan_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
@@ -214,6 +224,8 @@ rule run_21L_nextclade:
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
output:
info=f"data/{database}/nextclade_21L_new_raw.tsv",
benchmark:
f"benchmarks/run_21L_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
@@ -235,6 +247,8 @@ rule nextclade_tsv_concat_versions:
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
output:
tsv=f"data/{database}/nextclade{{reference}}_new.tsv",
benchmark:
f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt"
shell:
"""
if [ -s {input.tsv} ]; then
@@ -261,15 +275,16 @@ rule nextclade_tsv_concat_versions:


rule nextclade_info:
- message:
- """
- Generates nextclade info TSV for all sequences (new + old)
- """
+ """
+ Generates nextclade info TSV for all sequences (new + old)
+ """
input:
old_info=f"data/{database}/nextclade{{reference}}_old.tsv",
new_info=rules.nextclade_tsv_concat_versions.output.tsv,
output:
nextclade_info=f"data/{database}/nextclade{{reference}}.tsv",
benchmark:
f"benchmarks/nextclade_info_{database}{{reference}}.txt"
shell:
"""
tsv-append -H {input.old_info} {input.new_info} \
@@ -286,6 +301,8 @@ rule combine_alignments:
new_alignment=f"data/{database}/nextclade.{{seqtype}}.upd.fasta",
output:
alignment=f"data/{database}/{{seqtype}}.fasta",
benchmark:
f"benchmarks/combine_alignments_{database}{{seqtype}}.txt"
params:
keep_temp=config.get("keep_temp", "false"),
shell:
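The benchmark paths in this file mix an f-string substitution (`{database}`) with escaped Snakemake wildcards (`{{reference}}`, `{{seqtype}}`). A small sketch of how one of them expands, using `genbank` as an illustrative database value and the `reference` values allowed by the `wildcard_constraints` above:

```python
# Sketch: how the doubled braces in the benchmark paths above expand.
# The f-string fills {database} when the Snakefile is parsed and leaves
# {reference} as a Snakemake wildcard; "genbank" is an illustrative value.
database = "genbank"

template = f"benchmarks/nextclade_info_{database}{{reference}}.txt"
print(template)  # -> benchmarks/nextclade_info_genbank{reference}.txt

# The wildcard_constraints above allow reference to be "" or "_21L".
for reference in ("", "_21L"):
    print(template.format(reference=reference))
# -> benchmarks/nextclade_info_genbank.txt
# -> benchmarks/nextclade_info_genbank_21L.txt
```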
6 changes: 6 additions & 0 deletions workflow/snakemake_rules/slack_notifications.smk
@@ -30,6 +30,8 @@ rule notify_on_record_change:
ndjson_on_s3 = f"{config['s3_src']}/{database}.ndjson.xz"
output:
touch(f"data/{database}/notify-on-record-change.done")
benchmark:
f"benchmarks/notify_on_record_change_{database}.txt"
shell:
"""
./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database}
@@ -46,6 +48,8 @@ rule notify_gisaid:
s3_bucket = config["s3_src"]
output:
touch("data/gisaid/notify.done")
benchmark:
"benchmarks/notify_gisaid.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz")
@@ -60,6 +64,8 @@ rule notify_genbank:
s3_bucket = config["s3_src"]
output:
touch("data/genbank/notify.done")
benchmark:
"benchmarks/notify_genbank.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
# TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script)
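The notification rules get benchmarks too, so their wall-clock times can be cross-checked against the aggregate `--stats` output. A sketch combining the two sources, with the same assumptions about file layout and key names as the sketches above:

```python
# Sketch: cross-check the notification rules' benchmark wall-clock times against
# the per-rule means in the `--stats` JSON. File layout and key names are the
# same assumptions as in the sketches above.
import csv
import json
from pathlib import Path

with open("snakemake_stats.json") as fh:
    per_rule_stats = json.load(fh)["rules"]

for path in sorted(Path("benchmarks").glob("notify_*.txt")):
    with path.open() as fh:
        row = next(csv.DictReader(fh, delimiter="\t"), None)
    if row is None:
        continue
    # Benchmark files are named after their rule (notify_on_record_change also
    # carries a database suffix, so it may not match a stats key exactly).
    mean = per_rule_stats.get(path.stem, {}).get("mean-runtime")
    mean_txt = f"{mean:8.1f}s" if mean is not None else "     n/a"
    print(f"{path.stem:35s} benchmark {float(row['s']):8.1f}s   stats mean {mean_txt}")
```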