Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add benchmark and --stats #451

Merged
merged 4 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-genbank-master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/genbank.yaml \
$CONFIG_OVERRIDES
1 change: 1 addition & 0 deletions .github/workflows/fetch-and-ingest-gisaid-master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,5 +109,6 @@ jobs:
--env SLACK_CHANNELS \
--env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
. \
--stats snakemake_stats.json \
--configfile config/gisaid.yaml \
$CONFIG_OVERRIDES
10 changes: 10 additions & 0 deletions workflow/snakemake_rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ rule transform_rki_data:
output:
fasta="data/rki_sequences.fasta",
metadata="data/rki_metadata_transformed.tsv",
benchmark:
"benchmarks/transform_rki_data.txt"
params:
subsampled=config.get("subsampled", False),
shell:
Expand All @@ -45,6 +47,8 @@ rule transform_biosample:
biosample = "data/biosample.ndjson"
output:
biosample = "data/genbank/biosample.tsv"
benchmark:
"benchmarks/transform_biosample.txt"
shell:
"""
./bin/transform-biosample {input.biosample} \
Expand Down Expand Up @@ -85,6 +89,8 @@ rule merge_open_data:
output:
metadata="data/genbank/metadata_transformed.tsv",
sequences="data/genbank/sequences.fasta",
benchmark:
"benchmarks/merge_open_data.txt"
shell:
"""
./bin/merge-open \
Expand All @@ -105,6 +111,8 @@ rule transform_gisaid_data:
metadata = "data/gisaid/metadata_transformed.tsv",
flagged_annotations = temp("data/gisaid/flagged-annotations"),
additional_info = "data/gisaid/additional_info.tsv"
benchmark:
"benchmarks/transform_gisaid_data.txt"
shell:
"""
./bin/transform-gisaid {input.ndjson} \
Expand All @@ -120,6 +128,8 @@ rule flag_metadata:
metadata = "data/gisaid/metadata.tsv"
output:
metadata = "data/gisaid/flagged_metadata.txt"
benchmark:
"benchmarks/flag_metadata.txt"
resources:
# Memory use scales primarily with the size of the metadata file.
mem_mb=20000
Expand Down
43 changes: 33 additions & 10 deletions workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Produces different final outputs for GISAID vs GenBank/RKI:
rule fetch_main_gisaid_ndjson:
output:
ndjson = temp(f"data/gisaid.ndjson")
benchmark:
"benchmarks/fetch_main_gisaid_ndjson.txt"
retries: 5
shell:
"""
Expand Down Expand Up @@ -124,10 +126,11 @@ rule create_genbank_ndjson:
"""

rule fetch_biosample:
message:
"""Fetching BioSample data (GenBank only)"""
"""Fetching BioSample data (GenBank only)"""
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample.txt"
retries: 5
shell:
"""
Expand All @@ -136,10 +139,11 @@ rule fetch_biosample:


rule fetch_cog_uk_accessions:
message:
"""Fetching COG-UK sample accesions (GenBank only)"""
"""Fetching COG-UK sample accesions (GenBank only)"""
output:
cog_uk_accessions = temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions.txt"
retries: 5
shell:
"""
Expand All @@ -148,10 +152,11 @@ rule fetch_cog_uk_accessions:


rule fetch_cog_uk_metadata:
message:
"""Fetching COG-UK metadata (GenBank only)"""
"""Fetching COG-UK metadata (GenBank only)"""
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/fetch_cog_uk_metadata.txt"
retries: 5
shell:
"""
Expand All @@ -164,13 +169,17 @@ rule uncompress_cog_uk_metadata:
"data/cog_uk_metadata.csv.gz"
output:
cog_uk_metadata = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/uncompress_cog_uk_metadata.txt"
shell:
"gunzip -c {input} > {output}"


rule fetch_rki_sequences:
output:
rki_sequences=temp("data/rki_sequences.fasta.xz"),
benchmark:
"benchmarks/fetch_rki_sequences.txt"
retries: 5
shell:
"""
Expand All @@ -181,6 +190,8 @@ rule fetch_rki_sequences:
rule fetch_rki_metadata:
output:
rki_metadata=temp("data/rki_metadata.tsv.xz"),
benchmark:
"benchmarks/fetch_rki_metadata.txt"
retries: 5
shell:
"""
Expand All @@ -194,6 +205,8 @@ rule transform_rki_data_to_ndjson:
rki_metadata="data/rki_metadata.tsv.xz"
output:
ndjson="data/rki.ndjson",
benchmark:
"benchmarks/transform_rki_data_to_ndjson.txt"
shell:
"""
./bin/transform-rki-data-to-ndjson \
Expand Down Expand Up @@ -227,29 +240,31 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: fetch_main_ndjson_from_s3 > create_genbank_ndjson

rule fetch_main_ndjson_from_s3:
message:
"""Fetching main NDJSON from AWS S3"""
"""Fetching main NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/{database}.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/{database}.ndjson.zst",
lines = config.get("subsample",{}).get("main_ndjson", 0)
output:
ndjson = temp(f"data/{database}.ndjson")
benchmark:
"benchmarks/fetch_main_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \
./vendored/download-from-s3 {params.file_on_s3_src} {output.ndjson} {params.lines}
"""

rule fetch_biosample_from_s3:
message:
"""Fetching BioSample NDJSON from AWS S3"""
"""Fetching BioSample NDJSON from AWS S3"""
params:
file_on_s3_dst=f"{config['s3_dst']}/biosample.ndjson.zst",
file_on_s3_src=f"{config['s3_src']}/biosample.ndjson.zst",
lines = config.get("subsample",{}).get("biosample", 0)
output:
biosample = temp("data/biosample.ndjson")
benchmark:
"benchmarks/fetch_biosample_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
Expand All @@ -263,6 +278,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("rki_ndjson", 0)
output:
rki_ndjson = temp("data/rki.ndjson")
benchmark:
"benchmarks/fetch_rki_ndjson_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \
Expand All @@ -275,6 +292,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_accessions", 0)
output:
biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv")
benchmark:
"benchmarks/fetch_cog_uk_accessions_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
Expand All @@ -288,6 +307,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines = config.get("subsample",{}).get("cog_uk_metadata", 0)
output:
biosample = temp("data/cog_uk_metadata.csv")
benchmark:
"benchmarks/fetch_cog_uk_metadata_from_s3.txt"
shell:
"""
./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \
Expand All @@ -299,5 +320,7 @@ if config.get("s3_dst") and config.get("s3_src"):
"data/cog_uk_metadata.csv"
output:
cog_uk_metadata = "data/cog_uk_metadata.csv.gz" if config.get("keep_temp",False) else temp("data/cog_uk_metadata.csv.gz")
benchmark:
"benchmarks/compress_cog_uk_metadata.txt"
shell:
"gzip -c {input} > {output}"
39 changes: 28 additions & 11 deletions workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,30 @@ Produces the following outputs:
alignment = f"data/{database}/aligned.fasta"
"""

from shlex import quote as shellquote


wildcard_constraints:
reference="|_21L",
seqtype="aligned|translation_[^.]+",


rule create_empty_nextclade_info:
message:
"""Creating empty NextClade info cache file"""
"""Creating empty NextClade info cache file"""
output:
touch(f"data/{database}/nextclade{{reference}}_old.tsv"),
benchmark:
f"benchmarks/create_empty_nextclade_info_{database}{{reference}}.txt"


rule create_empty_nextclade_aligned:
message:
"""Creating empty NextClade aligned cache file"""
"""Creating empty NextClade aligned cache file"""
output:
touch(f"data/{database}/nextclade.aligned.old.fasta"),
*[
touch(f"data/{database}/nextclade.translation_{gene}.old.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/create_empty_nextclade_aligned_{database}.txt"


# Only include rules to fetch from S3 if S3 config params are provided
Expand All @@ -74,6 +74,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
Expand All @@ -95,6 +97,8 @@ if config.get("s3_dst") and config.get("s3_src"):
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
Expand Down Expand Up @@ -132,6 +136,8 @@ rule download_nextclade_executable:
"""Download Nextclade"""
output:
nextclade="nextclade",
benchmark:
f"benchmarks/download_nextclade_executable_{database}.txt"
shell:
"""
if [ "$(uname)" = "Darwin" ]; then
Expand All @@ -148,7 +154,7 @@ rule download_nextclade_executable:
fi

NEXTCLADE_VERSION="$(./nextclade --version)"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
"""


Expand All @@ -158,6 +164,8 @@ rule download_nextclade_dataset:
"nextclade",
output:
dataset="data/nextclade_data/{dataset_name}.zip",
benchmark:
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
shell:
"""
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
Expand Down Expand Up @@ -185,6 +193,8 @@ rule run_wuhan_nextclade:
temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta")
for gene in GENE_LIST
],
benchmark:
f"benchmarks/run_wuhan_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
Expand Down Expand Up @@ -214,6 +224,8 @@ rule run_21L_nextclade:
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
output:
info=f"data/{database}/nextclade_21L_new_raw.tsv",
benchmark:
f"benchmarks/run_21L_nextclade_{database}.txt"
shell:
"""
# If there are no sequences to run Nextclade on, create empty output files
Expand All @@ -235,6 +247,8 @@ rule nextclade_tsv_concat_versions:
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
output:
tsv=f"data/{database}/nextclade{{reference}}_new.tsv",
benchmark:
f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt"
shell:
"""
if [ -s {input.tsv} ]; then
Expand All @@ -261,15 +275,16 @@ rule nextclade_tsv_concat_versions:


rule nextclade_info:
message:
"""
Generates nextclade info TSV for all sequences (new + old)
"""
"""
Generates nextclade info TSV for all sequences (new + old)
"""
input:
old_info=f"data/{database}/nextclade{{reference}}_old.tsv",
new_info=rules.nextclade_tsv_concat_versions.output.tsv,
output:
nextclade_info=f"data/{database}/nextclade{{reference}}.tsv",
benchmark:
f"benchmarks/nextclade_info_{database}{{reference}}.txt"
shell:
"""
tsv-append -H {input.old_info} {input.new_info} \
Expand All @@ -286,6 +301,8 @@ rule combine_alignments:
new_alignment=f"data/{database}/nextclade.{{seqtype}}.upd.fasta",
output:
alignment=f"data/{database}/{{seqtype}}.fasta",
benchmark:
f"benchmarks/combine_alignments_{database}{{seqtype}}.txt"
params:
keep_temp=config.get("keep_temp", "false"),
shell:
Expand Down
6 changes: 6 additions & 0 deletions workflow/snakemake_rules/slack_notifications.smk
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ rule notify_on_record_change:
ndjson_on_s3 = f"{config['s3_src']}/{database}.ndjson.xz"
output:
touch(f"data/{database}/notify-on-record-change.done")
benchmark:
f"benchmarks/notify_on_record_change_{database}.txt"
shell:
"""
./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database}
Expand All @@ -46,6 +48,8 @@ rule notify_gisaid:
s3_bucket = config["s3_src"]
output:
touch("data/gisaid/notify.done")
benchmark:
"benchmarks/notify_gisaid.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz")
Expand All @@ -60,6 +64,8 @@ rule notify_genbank:
s3_bucket = config["s3_src"]
output:
touch("data/genbank/notify.done")
benchmark:
"benchmarks/notify_genbank.txt"
run:
shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}")
# TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script)
Expand Down
Loading