feat: prepared wf to run test data; removed unused conf options; snakefmt + linting
MPUSP · Jul 23, 2024 · d351622 · d351622
1 parent 4b31486
commit d351622
Show file tree

Hide file tree

Showing 11 changed files with 169 additions and 159 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ resources/**
 # Custom additions
 Notes.md
 .vscode/*
-.snakemake-workflow-catalog.yml
+.snakemake-workflow-catalog.yml
+.test/results/*
diff --git a/.test/config/config.yml b/.test/config/config.yml
@@ -1,10 +1,4 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
-samplesheet: "./test/config/samples.tsv"
+samplesheet: "config/samples.tsv"
 
 get_genome:
   database: "ncbi"
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
@@ -1,4 +1,3 @@
 sample	condition	replicate	lib_prep 	data_folder	fq1
-RPF-RTP1	RPF-RTP	1	mpusp	.test/data	RPF-RTP1_R1_001.fastq.gz
-RPF-RTP2	RPF-RTP	2	mpusp	.test/data	RPF-RTP2_R1_001.fastq.gz
-
+RPF-RTP1	RPF-RTP	1	mpusp	data	RPF-RTP1_R1_001.fastq.gz
+RPF-RTP2	RPF-RTP	2	mpusp	data	RPF-RTP2_R1_001.fastq.gz
diff --git a/.test/config/shift_table/shift_table.csv b/.test/config/shift_table/shift_table.csv
@@ -0,0 +1,20 @@
+fraction,offsets_start
+27,-11
+28,-12
+29,-13
+30,-14
+31,-15
+32,-16
+33,-17
+34,-18
+35,-19
+36,-20
+37,-21
+38,-22
+39,-23
+40,-24
+41,-25
+42,-26
+43,-27
+44,-28
+45,-29
diff --git a/config/config.yml b/config/config.yml
@@ -1,9 +1,3 @@
-
-# optional: define output folder here
-# default: "./results"
-output: null
-
-# define samplesheet here
 samplesheet: "config/samples.tsv"
 
 get_genome:
@@ -28,18 +22,20 @@ star:
   multi: 10
   sam_multi: 1
   intron_max: 1
-  default: [
-    "--readFilesCommand zcat ",
-    "--outSAMstrandField None ",
-    "--outSAMattributes All ",
-    "--outSAMattrIHstart 0 ",
-    "--outFilterType Normal ",
-    "--outFilterMultimapScoreRange 1 ",
-    "-o STARmappings ",
-    "--outSAMtype BAM Unsorted ",
-    "--outStd BAM_Unsorted ",
-    "--outMultimapperOrder Random ",
-    "--alignEndsType EndToEnd"]
+  default:
+    [
+      "--readFilesCommand zcat ",
+      "--outSAMstrandField None ",
+      "--outSAMattributes All ",
+      "--outSAMattrIHstart 0 ",
+      "--outFilterType Normal ",
+      "--outFilterMultimapScoreRange 1 ",
+      "-o STARmappings ",
+      "--outSAMtype BAM Unsorted ",
+      "--outStd BAM_Unsorted ",
+      "--outMultimapperOrder Random ",
+      "--alignEndsType EndToEnd",
+    ]
 
 extract_features:
   biotypes: ["rRNA", "tRNA"]
@@ -54,26 +50,25 @@ deeptools:
   normalize: "CPM"
 
 annotate_orfs:
-    window_size: 30
-    sorf_max_length: 300
-    sorf_min_length: 45
-    orf_start_codon_table: 11
-    orf_stop_codon: ["TAA", "TAG", "TGA"]
-    orf_longest_only: False
+  window_size: 30
+  sorf_max_length: 300
+  sorf_min_length: 45
+  orf_start_codon_table: 11
+  orf_stop_codon: ["TAA", "TAG", "TGA"]
+  orf_longest_only: False
 
 shift_reads:
-    window_size: 30
-    read_length: [27, 45]
-    # rpf_read_length: [30, 45]
-    # qti_read_length: [27, 45]
-    rnaseq_read_length: [0, 1000]
-    end_alignment: "3prime"
-    shift_table: "config/shift_table/shift_table.csv"
-    export_bam: False
-    export_bigwig: True
-    skip_shifting: False
-    skip_length_filter: True
-
+  window_size: 30
+  read_length: [27, 45]
+  # rpf_read_length: [30, 45]
+  # qti_read_length: [27, 45]
+  rnaseq_read_length: [0, 1000]
+  end_alignment: "3prime"
+  shift_table: "config/shift_table/shift_table.csv"
+  export_bam: False
+  export_bigwig: True
+  skip_shifting: False
+  skip_length_filter: True
 
 multiqc:
   config: "config/multiqc_config.yml"
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -13,10 +13,10 @@ import pandas as pd
 from datetime import date
 from snakemake.utils import min_version
 
-#min_version("7.0")
+# min_version("7.0")
 
 __author__ = "Rina Ahmed-Begrich, Michael Jahn"
-__year__ = str(date.today()).split('-')[0]
+__year__ = str(date.today()).split("-")[0]
 
 bold = "\033[1m"
 green = "\033[92m"
@@ -27,13 +27,14 @@ msg = f"""{cyan}Bacterial-Riboseq: A Snakemake workflow
 for the analysis of riboseq data in bacteria.{end}
 """
 
-epilog=f"""
+epilog = f"""
 {cyan}Written by {__author__}.
 Max Planck Unit for the Science of Pathogens. Copyright (c) {__year__}.
 Copyright Holder All Rights Reserved.{end}
 
 """
 
+
 # load configuration
 # -----------------------------------------------------
 configfile: "config/config.yml"
@@ -51,35 +52,37 @@ include: "rules/postprocessing.smk"
 shell.executable("bash")
 shell.prefix(f"set -eo pipefail; ")
 
-if config.get('output') is None:
-    config['output'] =  os.path.join(os.getcwd(), "./results")
 
 onstart:
     print("\n--- Analysis started...\n")
     print()
     print("--- Analysis parameters --------------------------------------------\n")
-    print(f"Current working directory: {os.path.join(os.getcwd())}")
-    print(f"Output directory:", {config['output']})
+    print(f"Current working directory: {os.getcwd()}")
+    print(f"Output directory: {os.path.join(os.getcwd(), 'results')}")
     print()
     print(f"Riboseq samples: {list(samples.index)}")
     print()
 
+
 onsuccess:
     print()
     print(msg)
     print(epilog)
     print("--- Workflow finished, no error! -----------------------------------")
     print()
-    debug = os.path.join(config['output'], "workflow.log")
-    shell("cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}")
+    debug = os.path.join(os.getcwd(), "results/workflow.log")
+    shell(
+        "cat {log} > {debug} && echo -e '\nWorkflow finished, no error!\n' >> {debug}"
+    )
+
 
 onerror:
     print()
     print(msg)
     print(epilog)
     print("--- An error occurred! ---------------------------------------------")
     print()
-    error = os.path.join(config['output'], "error.log")
+    error = os.path.join(os.getcwd(), "results/error.log")
     shell("cat {log} > {error} && echo -e '\nAn error occurred!' >> {error}")
 
 

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -10,47 +10,52 @@ samples = (
     .sort_index()
 )
 
-#TODO: write validation schema
+# TODO: write validation schema
 # validate(SAMPLES, schema="../config/schemas/samples.schema.yml")
 
 
 def get_final_output():
     targets = []
     targets.append("results/multiqc/multiqc_report.html")
     targets.append(
-        expand("results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
+        expand(
+            "results/{mapping_status}/length_dist/{sample}_length_dist.tsv",
             mapping_status=["mapped", "deduplicated", "filtered_bam"],
-            sample=samples.index)
+            sample=samples.index,
+        )
     )
     targets.append("results/get_genome/mRNA_features.gff")
     targets.append(
-        expand("results/shift_reads/{sample}_shift.csv",
-            sample=samples.index)
+        expand("results/shift_reads/{sample}_shift.csv", sample=samples.index)
     )
     return targets
 
 
 # get fastq files
 def get_fastq(wildcards):
-    if wildcards.status == 'raw':
+    if wildcards.status == "raw":
         return expand(
             "{input_dir}/{sample}",
             input_dir=samples.loc[wildcards.sample]["data_folder"],
-            sample=samples.loc[wildcards.sample]["fq1"])
-    if wildcards.status == 'clipped':
-        return expand(
-            "results/clipped/{sample}.fastq.gz",
-            sample=samples.index)
+            sample=samples.loc[wildcards.sample]["fq1"],
+        )
+    if wildcards.status == "clipped":
+        return expand("results/clipped/{sample}.fastq.gz", sample=samples.index)
 
 
 # get bam files
 def get_bam(wildcards):
-    if wildcards.mapping_status == 'mapped':
-        return expand(os.path.join("results",  "mapped", "{sample}.bam"),
-            sample=wildcards.sample)
-    if wildcards.mapping_status == 'deduplicated':
-        return expand(os.path.join("results",  "deduplicated", "{sample}.bam"),
-            sample=wildcards.sample)
-    if wildcards.mapping_status == 'filtered_bam':
-        return expand(os.path.join("results",  "filtered_bam", "{sample}.bam"),
-            sample=wildcards.sample)
+    if wildcards.mapping_status == "mapped":
+        return expand(
+            os.path.join("results", "mapped", "{sample}.bam"), sample=wildcards.sample
+        )
+    if wildcards.mapping_status == "deduplicated":
+        return expand(
+            os.path.join("results", "deduplicated", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
+    if wildcards.mapping_status == "filtered_bam":
+        return expand(
+            os.path.join("results", "filtered_bam", "{sample}.bam"),
+            sample=wildcards.sample,
+        )
diff --git a/workflow/rules/postprocessing.smk b/workflow/rules/postprocessing.smk
@@ -2,6 +2,7 @@
 # Riboseq postprocessing:                               #
 # ----------------------------------------------------- #
 
+
 # module to extract selected biotypes from gff file
 # -----------------------------------------------------
 rule extract_mRNA_features:
@@ -60,7 +61,7 @@ rule shift_reads:
         """--- Shifting Ribo-Seq reads."""
     params:
         config["shift_reads"],
-    threads: workflow.cores,
+    threads: workflow.cores
     log:
         path="results/shift_reads/log/{sample}.log",
     script: