Simplify hello-gatk #364 (Draft)

wants to merge 19 commits into base: master

Changes from 4 commits
311 changes: 115 additions & 196 deletions docs/hello_nextflow/03_hello_gatk.md

Large diffs are not rendered by default.

48 changes: 21 additions & 27 deletions docs/hello_nextflow/04_hello_modules.md
@@ -15,13 +15,6 @@ GATK (equivalent to `scripts/hello-gatk-6.nf`).

 Note: This is a basic variant calling pipeline consisting of three processes. You can find a complete description of the pipeline in the previous section of this training.
 
-This workflow relies on reference files that are provided in compressed form in the Gitpod environment. If you completed the previous parts of the training course, then you already have everything you need in the working directory. However, if you're picking this up here, you need to run the following command to expand the reference files:
-
-```bash
-cd /workspace/gitpod/hello-nextflow
-tar -zxvf data/ref.tar.gz -C data/
-```
-
 ### 0.1 Run the workflow to verify that it produces the expected outputs
 
 ```bash
@@ -41,18 +34,14 @@ The pipeline takes in three BAM files, each one containing sequencing data for o
 * Pipeline parameters
 */
 
-// Execution environment setup
-params.projectDir = "/workspace/gitpod/hello-nextflow"
-$projectDir = params.projectDir
-
 // Primary input (samplesheet in CSV format with ID and file path, one sample per line)
-params.reads_bam = "${projectDir}/data/samplesheet.csv"
+params.reads_bam = "./data/samplesheet.csv"
 
 // Accessory files
-params.genome_reference = "${projectDir}/data/ref/ref.fasta"
-params.genome_reference_index = "${projectDir}/data/ref/ref.fasta.fai"
-params.genome_reference_dict = "${projectDir}/data/ref/ref.dict"
-params.calling_intervals = "${projectDir}/data/intervals.list"
+params.reference = "./data/ref/ref.fasta"
+params.reference_index = "./data/ref/ref.fasta.fai"
+params.reference_dict = "./data/ref/ref.dict"
+params.calling_intervals = "./data/intervals.list"
Review comment (Collaborator, author): workflow.projectDir doesn't exist in the configuration, so we just use a relative path. We could wrap it in closures but that makes it complicated.


 // Base name for final output file
 params.cohort_name = "family_trio"
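The closure option weighed in the review comment above can be sketched in plain Groovy (variable names are hypothetical; this illustrates the trade-off, it is not code from the PR):

```groovy
// A closure defers resolving the directory until it is invoked, whereas a
// plain relative path is fixed at declaration time. The cost of the closure
// is that every consumer must remember to call it.
def projectDir = null   // not yet known when the parameter is declared

def readsBamLazy = { -> "${projectDir}/data/samplesheet.csv" }
def readsBamRelative = './data/samplesheet.csv'

projectDir = '/workspace/gitpod/hello-nextflow'   // resolved later

assert readsBamLazy() == '/workspace/gitpod/hello-nextflow/data/samplesheet.csv'
assert readsBamRelative == './data/samplesheet.csv'
```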
@@ -106,16 +95,18 @@ This creates an empty file named `main.nf` under the appropriate directory struc
 */
 process SAMTOOLS_INDEX {
 
-    container 'quay.io/biocontainers/samtools:1.19.2--h50ea8bc_1'
+    container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
+    conda "bioconda::samtools=1.19.2"
 
     input:
-        tuple val(id), path(input_bam)
+        path input_bam
 
     output:
-        tuple val(id), path(input_bam), path("${input_bam}.bai")
+        tuple path(input_bam), path("${input_bam}.bai")
 
     """
     samtools index '$input_bam'
+
     """
 }
 ```
@@ -177,21 +168,22 @@ Move this code to `modules/local/gatk/haplotypecaller/main.nf`:

 ```groovy
 /*
- * Call variants with GATK HaplotypeCaller in GVCF mode
+ * Call variants with GATK HapolotypeCaller in GVCF mode

adamrtalbot marked this conversation as resolved.

  */
 process GATK_HAPLOTYPECALLER {
 
-    container "docker.io/broadinstitute/gatk:4.5.0.0"
+    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
+    conda "bioconda::gatk4=4.5.0.0"
 
     input:
-        tuple val(id), path(input_bam), path(input_bam_index)
+        tuple path(input_bam), path(input_bam_index)
         path ref_fasta
         path ref_index
         path ref_dict
         path interval_list
 
     output:
-        tuple val(id), path("${input_bam}.g.vcf"), path("${input_bam}.g.vcf.idx")
+        tuple path("${input_bam}.g.vcf"), path("${input_bam}.g.vcf.idx")
 
     """
     gatk HaplotypeCaller \
@@ -212,11 +204,11 @@ And move this code to `modules/local/gatk/jointgenotyping/main.nf`:
 */
 process GATK_JOINTGENOTYPING {
 
-    container "docker.io/broadinstitute/gatk:4.5.0.0"
+    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
+    conda "bioconda::gatk4=4.5.0.0"
 
     input:
-        path(sample_map)
-        val(cohort_name)
+        tuple val(cohort_name), path(vcfs), path(idxs)
         path ref_fasta
         path ref_index
         path ref_dict
@@ -226,9 +218,11 @@ process GATK_JOINTGENOTYPING {
         path "${cohort_name}.joint.vcf"
         path "${cohort_name}.joint.vcf.idx"
 
+    script:
+    def inputs = vcfs.collect { "-V ${it}" }.join(' ')
     """
     gatk GenomicsDBImport \
-        --sample-name-map ${sample_map} \
+        ${inputs} \
         --genomicsdb-workspace-path ${cohort_name}_gdb \
         -L ${interval_list}
 
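The new `script:` block replaces the sample map by building the GATK `-V` arguments directly from the list of input GVCFs. In isolation, that Groovy idiom behaves like this (the file names are illustrative):

```groovy
// collect{} maps each VCF name to a "-V <name>" flag; join(' ') flattens the
// list into the single argument string interpolated into the gatk command.
def vcfs = ['reads_mother.bam.g.vcf', 'reads_father.bam.g.vcf', 'reads_son.bam.g.vcf']
def inputs = vcfs.collect { "-V ${it}" }.join(' ')
assert inputs == '-V reads_mother.bam.g.vcf -V reads_father.bam.g.vcf -V reads_son.bam.g.vcf'
println inputs
```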
54 changes: 24 additions & 30 deletions docs/hello_nextflow/05_hello_nf-test.md
@@ -20,12 +20,6 @@ cp scripts/nextflow.config .
 cp -r scripts/modules .
 ```
 
-You also need to unzip the reference data files:
-
-```
-tar -zxvf data/ref.tar.gz -C data/
-```
-
 ### 0.1 Run the workflow to verify that it produces the expected outputs
 
 ```bash
@@ -169,7 +163,7 @@ _After:_
 ```groovy title="modules/local/samtools/index/tests/main.nf.test" linenums="14"
     process {
         """
-        input[0] = [ [id: 'NA12882' ], file("${projectDir}/data/bam/reads_son.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_son.bam", checkIfExists: true)
         """
     }
 ```
@@ -271,7 +265,7 @@ test("reads_mother [bam]") {
     }
     process {
         """
-        input[0] = [ [id: 'NA12878' ], file("${projectDir}/data/bam/reads_mother.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_mother.bam", checkIfExists: true)
         """
     }
 }
@@ -295,7 +289,7 @@ test("reads_father [bam]") {
     }
     process {
         """
-        input[0] = [ [id: 'NA12877' ], file("${projectDir}/data/bam/reads_father.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_father.bam", checkIfExists: true)
         """
     }
 }
@@ -447,7 +441,7 @@ test("reads_son [bam]") {
     script "../../../samtools/index/main.nf"
     process {
         """
-        input[0] = [ [id: 'NA12882' ], file("${projectDir}/data/bam/reads_son.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_son.bam", checkIfExists: true)
         """
     }
 }
@@ -469,7 +463,7 @@ Then we can refer to the output of that process in the `when` block where we spe
         input[1] = file("${projectDir}/data/ref/ref.fasta")
         input[2] = file("${projectDir}/data/ref/ref.fasta.fai")
         input[3] = file("${projectDir}/data/ref/ref.dict")
-        input[4] = file("${projectDir}/data/intervals.list")
+        input[4] = file("${projectDir}/data/ref/intervals.bed")
         """
     }
 }
@@ -578,8 +572,8 @@ _After:_
 ```console title="modules/local/gatk/haplotypecaller/tests/main.nf.test" linenums="35"
 then {
     assert process.success
-    assert path(process.out[0][0][1]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12882')
-    assert path(process.out[0][0][1]).readLines().contains('20 10040001 . T <NON_REF> . . END=10040048 GT:DP:GQ:MIN_DP:PL 0/0:40:99:37:0,99,1150')
+    assert path(process.out[0][0]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT reads_son')
+    assert path(process.out[0][0]).readLines().contains('20:10037292-10066351 3277 . G <NON_REF> . . END=3282 GT:DP:GQ:MIN_DP:PL 0/0:25:72:24:0,72,719')
 }
 ```
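The assertion paths change because the output tuple no longer carries the sample ID. A toy Groovy model of the indexing (the values are hypothetical stand-ins for what nf-test exposes):

```groovy
// Before: each emission was [ meta, vcf, idx ], so the VCF sat at index 1.
// After:  each emission is  [ vcf, idx ], so the VCF sits at index 0 and the
// test can address it one level higher.
def withId    = [[[id: 'NA12882'], 'reads_son.bam.g.vcf', 'reads_son.bam.g.vcf.idx']]
def withoutId = [['reads_son.bam.g.vcf', 'reads_son.bam.g.vcf.idx']]
assert withId[0][1]    == 'reads_son.bam.g.vcf'   // VCF was at index 1
assert withoutId[0][0] == 'reads_son.bam.g.vcf'   // now at index 0
```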

@@ -625,7 +619,7 @@ test("reads_mother [bam]") {
     script "../../../samtools/index/main.nf"
     process {
         """
-        input[0] = [ [id: 'NA12882' ], file("${projectDir}/data/bam/reads_mother.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_mother.bam", checkIfExists: true)
         """
     }
 }
@@ -641,15 +635,15 @@ test("reads_mother [bam]") {
         input[1] = file("${projectDir}/data/ref/ref.fasta")
         input[2] = file("${projectDir}/data/ref/ref.fasta.fai")
         input[3] = file("${projectDir}/data/ref/ref.dict")
-        input[4] = file("${projectDir}/data/intervals.list")
+        input[4] = file("${projectDir}/data/ref/intervals.bed")
         """
     }
 }
 
 then {
     assert process.success
-    assert path(process.out[0][0][1]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878')
-    assert path(process.out[0][0][1]).readLines().contains('20 10040001 . T <NON_REF> . . END=10040013 GT:DP:GQ:MIN_DP:PL 0/0:28:81:27:0,81,829')
+    assert path(process.out[0][0]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT reads_mother')
+    assert path(process.out[0][0]).readLines().contains('20:10037292-10066351 3277 . G <NON_REF> . . END=3278 GT:DP:GQ:MIN_DP:PL 0/0:38:99:37:0,102,1530')
 }
 }
 ```
@@ -664,7 +658,7 @@ test("reads_father [bam]") {
     script "../../../samtools/index/main.nf"
     process {
         """
-        input[0] = [ [id: 'NA12882' ], file("${projectDir}/data/bam/reads_father.bam") ]
+        input[0] = file("${projectDir}/data/bam/reads_father.bam", checkIfExists: true)
         """
     }
 }
@@ -680,15 +674,15 @@ test("reads_father [bam]") {
         input[1] = file("${projectDir}/data/ref/ref.fasta")
         input[2] = file("${projectDir}/data/ref/ref.fasta.fai")
         input[3] = file("${projectDir}/data/ref/ref.dict")
-        input[4] = file("${projectDir}/data/intervals.list")
+        input[4] = file("${projectDir}/data/ref/intervals.bed")
         """
     }
 }
 
 then {
     assert process.success
-    assert path(process.out[0][0][1]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12877')
-    assert path(process.out[0][0][1]).readLines().contains('20 10040001 . T <NON_REF> . . END=10040011 GT:DP:GQ:MIN_DP:PL 0/0:30:81:29:0,81,1025')
+    assert path(process.out[0][0]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT reads_father')
+    assert path(process.out[0][0]).readLines().contains('20:10037292-10066351 3277 . G <NON_REF> . . END=3281 GT:DP:GQ:MIN_DP:PL 0/0:44:99:42:0,120,1800')
 }
 }
 ```
@@ -735,7 +729,6 @@ This should be the result:

 ```bash
 modules/local/gatk/jointgenotyping/tests/inputs/
-├── family_trio_map.tsv
 ├── reads_father.bam.g.vcf
 ├── reads_father.bam.g.vcf.idx
 ├── reads_mother.bam.g.vcf
@@ -825,12 +818,13 @@ test("family_trio [vcf] [idx]") {
     }
     process {
         """
-        input[0] = file("${projectDir}/modules/local/gatk/jointgenotyping/tests/inputs/family_trio_map.tsv")
-        input[1] = "family_trio"
-        input[2] = file("${projectDir}/data/ref/ref.fasta")
-        input[3] = file("${projectDir}/data/ref/ref.fasta.fai")
-        input[4] = file("${projectDir}/data/ref/ref.dict")
-        input[5] = file("${projectDir}/data/intervals.list")
+        input[0] = Channel.fromPath("${projectDir}/modules/local/gatk/jointgenotyping/tests/inputs/*.bam")
+        input[1] = Channel.fromPath("${projectDir}/modules/local/gatk/jointgenotyping/tests/inputs/*.bam.bai")
+        input[2] = "family_trio"
+        input[3] = file("${projectDir}/data/ref/ref.fasta")
+        input[4] = file("${projectDir}/data/ref/ref.fasta.fai")
+        input[5] = file("${projectDir}/data/ref/ref.dict")
+        input[6] = file("${projectDir}/data/ref/intervals.bed")
         """
     }
 }
@@ -843,8 +837,8 @@ The output of the joint genotyping step is another VCF file, so we're going to u
 ```groovy title="modules/local/gatk/jointgenotyping/tests/main.nf.test" linenums="25"
 then {
     assert process.success
-    assert path(process.out[0][0]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12877 NA12878 NA12882')
-    assert path(process.out[0][0]).readLines().contains('20 10040772 . C CT 1568.89 . AC=5;AF=0.833;AN=6;BaseQRankSum=0.399;DP=82;ExcessHet=0.0000;FS=4.291;MLEAC=5;MLEAF=0.833;MQ=60.00;MQRankSum=0.00;QD=21.79;ReadPosRankSum=-9.150e-01;SOR=0.510 GT:AD:DP:GQ:PL 0/1:14,16:30:99:370,0,348 1/1:0,17:17:51:487,51,0 1/1:0,25:25:75:726,75,0')
+    assert path(process.out[0][0]).readLines().contains('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT reads_son reads_mother reads_father')
+    assert path(process.out[0][0]).readLines().contains('20:10037292-10066351 3480 . C CT 1625.89 . AC=5;AF=0.833;AN=6;BaseQRankSum=0.220;DP=85;ExcessHet=0.0000;FS=2.476;MLEAC=5;MLEAF=0.833;MQ=60.00;MQRankSum=0.00;QD=21.68;ReadPosRankSum=-1.147e+00;SOR=0.487 GT:AD:DP:GQ:PL 0/1:15,16:31:99:367,0,375 1/1:0,18:18:54:517,54,0 1/1:0,26:26:78:756,78,0')
 }
 ```

Binary file modified hello-nextflow/data/bam/reads_father.bam
Binary file not shown.
Binary file modified hello-nextflow/data/bam/reads_mother.bam
Binary file not shown.
Binary file modified hello-nextflow/data/bam/reads_son.bam
Binary file not shown.
4 changes: 0 additions & 4 deletions hello-nextflow/data/intervals.list

This file was deleted.

Binary file removed hello-nextflow/data/ref.tar.gz
Binary file not shown.
3 changes: 3 additions & 0 deletions hello-nextflow/data/ref/intervals.bed
@@ -0,0 +1,3 @@
+20:10037292-10066351 3276 5495
+20:10037292-10066351 7535 9859
+20:10037292-10066351 12911 14737
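The BED rows above are relative to the subset contig `20:10037292-10066351`, so start and end are offsets within that 29,059 bp slice. A quick sanity check of the interval sizes (a sketch, not part of the PR):

```groovy
// end - start gives each interval's length in bp (BED uses 0-based,
// half-open coordinates).
def intervals = [[3276, 5495], [7535, 9859], [12911, 14737]]
def lengths = intervals.collect { start, end -> end - start }
assert lengths == [2219, 2324, 1826]
assert intervals.every { start, end -> end <= 29059 }  // all fit inside the contig
```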
2 changes: 2 additions & 0 deletions hello-nextflow/data/ref/ref.dict
@@ -0,0 +1,2 @@
+@HD VN:1.6
+@SQ SN:20:10037292-10066351 LN:29059 M5:d9a7bb8816cea7d1e5c55e00d1faeeb4 UR:reads_mother_20_10037292_10066351.region.fasta
2 changes: 2 additions & 0 deletions hello-nextflow/data/ref/ref.fasta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions hello-nextflow/data/ref/ref.fasta.fai
@@ -0,0 +1 @@
+20:10037292-10066351 29059 22 29059 29060
3 changes: 0 additions & 3 deletions hello-nextflow/data/sample_bams.txt

This file was deleted.

4 changes: 0 additions & 4 deletions hello-nextflow/data/samplesheet.csv

This file was deleted.
