version 1.0

import "tasks/CheckInputs.wdl" as CheckInputs
import "tasks/TrimAdapters.wdl" as TrimAdapters
import "tasks/StarAlign.wdl" as StarAlign
import "tasks/Picard.wdl" as Picard
import "tasks/FeatureCounts.wdl" as CountAlignments
import "tasks/LoomUtils.wdl" as LoomUtils
import "tasks/Utilities.wdl" as Utils
workflow MultiSampleSmartSeq2SingleNucleus {
  meta {
    description: "The MultiSampleSmartSeq2SingleNucleus pipeline runs multiple snSS2 samples in a single pipeline invocation"
    allowNestedInputs: true
  }
  input {
    # Cloud provider
    String cloud_provider = "gcp"

    # Reference genome fasta
    File genome_ref_fasta

    # Reference index information
    File tar_star_reference

    # Annotation file
    File annotations_gtf

    # Adapter list file
    File adapter_list

    # Sample information
    Array[String] input_ids
    Array[String]? input_names
    Array[String] fastq1_input_files
    Array[String] fastq2_input_files
    String batch_id
    String? batch_name
    Array[String]? project_id
    Array[String]? project_name
    Array[String]? library
    Array[String]? species
    Array[String]? organ
    String? input_name_metadata_field
    String? input_id_metadata_field
  }
  # Version of this pipeline
  String pipeline_version = "1.2.25"

  # Docker images
  String picard_cloud_docker = "picard-cloud:2.26.10"
  String alpine_docker = "alpine-bash:latest"
  String ubuntu_docker = "ubuntu_16_0_4:latest"
  String ea_utils_docker = "ea-utils:1.0.0-1.04.807-1659990665"
  String star_docker = "star:1.0.0-2.7.9a-1658781884"
  String subread_docker = "subread:1.0.0-2.0.1-1689097353"
  String pytools_docker = "pytools:1.0.0-1661263730"
  # Docker registry prefixes, selected per cloud provider
  #TODO how do we handle these?
  String gcp_alpine_docker_prefix = "bashell/"
  String acr_alpine_docker_prefix = "dsppipelinedev.azurecr.io/"
  String alpine_docker_prefix = if cloud_provider == "gcp" then gcp_alpine_docker_prefix else acr_alpine_docker_prefix

  String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/"
  String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/"
  String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix

  String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
  String acr_docker_prefix = "dsppipelinedev.azurecr.io/"

  # Choose docker prefix based on cloud provider
  String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix
  # Make sure either "gcp" or "azure" is supplied as the cloud_provider input
  if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
    call Utils.ErrorWithMessage as ErrorMessageIncorrectInput {
      input:
        message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
    }
  }
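  # WDL 1.0 idiom: a declaration inside "if (false)" never evaluates, so "none" is an
  # optional String with no value; it is used below to fill in absent optional metadata.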
  if (false) {
    String? none = "None"
  }
  # Parameter metadata information
  parameter_meta {
    genome_ref_fasta: "Genome reference in fasta format"
    tar_star_reference: "STAR reference index tar file"
    annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)"
    input_ids: "Array of input ids"
    input_names: "Array of input names"
    input_id_metadata_field: "String that describes the metadata field containing the input_ids"
    input_name_metadata_field: "String that describes the metadata field containing the input_names"
    fastq1_input_files: "Array of fastq1 files; order must match the order in input_ids."
    fastq2_input_files: "Array of fastq2 files for paired end runs; order must match fastq1_input_files and input_ids."
    batch_id: "Identifier for the batch"
  }
  # Check that all input arrays are the same length
  call CheckInputs.checkInputArrays as checkArrays {
    input:
      input_ids = input_ids,
      input_names = input_names,
      fastq1_input_files = fastq1_input_files,
      fastq2_input_files = fastq2_input_files,
      paired_end = true,
      alpine_docker_path = alpine_docker_prefix + alpine_docker
  }
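  # Read the genomic reference version out of the STAR reference tar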
  call StarAlign.STARGenomeRefVersion as ReferenceCheck {
    input:
      tar_star_reference = tar_star_reference,
      ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker
  }
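  # Trim adapter sequences from the fastq files (ea-utils)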
  call TrimAdapters.TrimAdapters as TrimAdapters {
    input:
      input_ids = input_ids,
      fastq1_input_files = fastq1_input_files,
      fastq2_input_files = fastq2_input_files,
      adapter_list = adapter_list,
      ea_utils_docker_path = docker_prefix + ea_utils_docker
  }
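  # Align the trimmed fastq files to the genome with STAR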
  call StarAlign.StarAlignFastqMultisample as StarAlign {
    input:
      input_ids = input_ids,
      fastq1_input_files = TrimAdapters.trimmed_fastq1_files,
      fastq2_input_files = TrimAdapters.trimmed_fastq2_files,
      tar_star_reference = tar_star_reference,
      star_docker_path = docker_prefix + star_docker
  }
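  # Remove duplicate reads from the aligned bams with Picard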
  call Picard.RemoveDuplicatesFromBam as RemoveDuplicatesFromBam {
    input:
      input_ids = input_ids,
      aligned_bam_inputs = StarAlign.output_bam,
      picard_docker_path = docker_prefix + picard_cloud_docker
  }
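  # Collect alignment summary and GC-bias QC metrics with Picard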
  call Picard.CollectMultipleMetricsMultiSample {
    input:
      aligned_bam_inputs = RemoveDuplicatesFromBam.output_bam,
      genome_ref_fasta = genome_ref_fasta,
      input_ids = input_ids,
      picard_docker_path = docker_prefix + picard_cloud_docker
  }
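  # Count reads overlapping introns and exons with featureCounts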
  call CountAlignments.CountAlignments as CountAlignments {
    input:
      input_ids = input_ids,
      aligned_bam_inputs = RemoveDuplicatesFromBam.output_bam,
      annotation_gtf = annotations_gtf,
      subread_docker_path = docker_prefix + subread_docker
  }
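  # Build a per-sample Loom file from the counts and QC metrics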
  call LoomUtils.SingleNucleusSmartSeq2LoomOutput as LoomOutput {
    input:
      input_ids = input_ids,
      input_names = input_names,
      pipeline_version = "SmartSeq2SingleNucleus_v~{pipeline_version}",
      input_id_metadata_field = input_id_metadata_field,
      input_name_metadata_field = input_name_metadata_field,
      alignment_summary_metrics = CollectMultipleMetricsMultiSample.alignment_summary_metrics,
      dedup_metrics = RemoveDuplicatesFromBam.dedup_metrics,
      gc_bias_summary_metrics = CollectMultipleMetricsMultiSample.gc_bias_summary_metrics,
      introns_counts = CountAlignments.intron_counts_out,
      exons_counts = CountAlignments.exon_counts_out,
      annotation_introns_added_gtf = annotations_gtf,
      pytools_docker_path = docker_prefix + pytools_docker
  }
  ### Aggregate the Loom Files Directly ###
  call LoomUtils.AggregateSmartSeq2Loom as AggregateLoom {
    input:
      loom_input = LoomOutput.loom_output,
      batch_id = batch_id,
      batch_name = batch_name,
      project_id = if defined(project_id) then select_first([project_id])[0] else none,
      project_name = if defined(project_name) then select_first([project_name])[0] else none,
      library = if defined(library) then select_first([library])[0] else none,
      species = if defined(species) then select_first([species])[0] else none,
      organ = if defined(organ) then select_first([organ])[0] else none,
      pipeline_version = "MultiSampleSmartSeq2SingleNucleus_v~{pipeline_version}",
      pytools_docker_path = docker_prefix + pytools_docker
  }
  ### Pipeline output ###
  output {
    # Loom output, exon/intron count tsv files and the aligned bam files
    File loom_output = AggregateLoom.loom_output_file
    File genomic_reference_version = ReferenceCheck.genomic_ref_version
    Array[File] exon_intron_count_files = LoomOutput.exon_intron_counts
    Array[File] bam_files = RemoveDuplicatesFromBam.output_bam
    String pipeline_version_out = pipeline_version
  }
}