Remove version columns from Nextclade TSV

joverlee521 · joverlee521 · commit 3e86a9fc1cc1 · 2024-07-26T17:33:17.000-07:00
We now check the Nextclade versions using the separate version JSON, so
we no longer need to track the versions in every row of the TSV.

This is a breaking change for the cache, so the workflow will need to be
run with the renew flag to manually force a full rerun.
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
@@ -218,7 +218,7 @@ rule run_wuhan_nextclade:
             f"--output-translations=data/{database}/nextclade.translation_{{cds}}.upd.fasta"
         ),
     output:
-        info=f"data/{database}/nextclade_new_raw.tsv",
+        info=f"data/{database}/nextclade_new.tsv",
         alignment=temp(f"data/{database}/nextclade.aligned.upd.fasta"),
         translations=[
             temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta")
@@ -249,7 +249,7 @@ rule run_21L_nextclade:
         dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
         sequences=f"data/{database}/nextclade_21L.sequences.fasta",
     output:
-        info=f"data/{database}/nextclade_21L_new_raw.tsv",
+        info=f"data/{database}/nextclade_21L_new.tsv",
     threads:
         workflow.cores * 0.5
     benchmark:
@@ -264,47 +264,13 @@ rule run_21L_nextclade:
         """
 
 
-rule nextclade_tsv_concat_versions:
-    input:
-        nextclade="data/nextclade",
-        tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
-        dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
-    output:
-        tsv=f"data/{database}/nextclade{{reference}}_new.tsv",
-    benchmark:
-        f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt"
-    shell:
-        """
-        if [ -s {input.tsv} ]; then
-            # Get version numbers
-            nextclade_version="$({input.nextclade:q} --version)"
-            dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
-            timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
-
-            # Combine input file with version numbers and write to output
-            printf "%s\tnextclade_version\tdataset_version\trun_timestamp\n" \
-                "$(head -n 1 {input.tsv})" \
-                > {output.tsv}
-
-            tail -n +2 {input.tsv} | \
-            awk -v v1="$nextclade_version" \
-                -v v2="$dataset_version" \
-                -v v3="$timestamp" \
-                -v OFS='\t' '{{print $0, v1, v2, v3}}' \
-                >> {output.tsv}
-        else
-            cp {input.tsv} {output.tsv}
-        fi
-        """
-
-
 rule nextclade_info:
     """
     Generates nextclade info TSV for all sequences (new + old)
     """
     input:
         old_info=f"data/{database}/nextclade{{reference}}_old.tsv",
-        new_info=rules.nextclade_tsv_concat_versions.output.tsv,
+        new_info=f"data/{database}/nextclade{{reference}}_new.tsv",
     output:
         nextclade_info=f"data/{database}/nextclade{{reference}}.tsv",
     benchmark: