forked from garber-lab/inDrop_Processing
-
Notifications
You must be signed in to change notification settings - Fork 1
/
SingleCell-Indrop-Hisat.nf
402 lines (325 loc) · 11.5 KB
/
SingleCell-Indrop-Hisat.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
// Output directory for all publishDir copies.
params.outdir = 'results'
// Force every expected parameter to exist (empty string when not supplied) so
// later `params.*` reads never fail on an undefined property; "" is falsy in
// Groovy, so downstream truth-tests still treat these as unset.
if (!params.cellBarcodeFile){params.cellBarcodeFile = ""}
if (!params.mate_split){params.mate_split = ""}
if (!params.determined_fastq){params.determined_fastq = ""}
if (!params.genomeIndexPath){params.genomeIndexPath = ""}
if (!params.countUniqueAlignedBarcodes_fromFile_filePath){params.countUniqueAlignedBarcodes_fromFile_filePath = ""}
if (!params.filter_lowCountBC_bam_print_py_filePath){params.filter_lowCountBC_bam_print_py_filePath = ""}
if (!params.cutoff_reads_for_valid_cell){params.cutoff_reads_for_valid_cell = ""}
if (!params.ESAT_path){params.ESAT_path = ""}
if (!params.gene_to_transcript_mapping_file){params.gene_to_transcript_mapping_file = ""}
if (!params.cleanLowEndUmis_path){params.cleanLowEndUmis_path = ""}
if (!params.extractValidReadsPath){params.extractValidReadsPath = ""}
if (!params.hisat2_path){params.hisat2_path = ""}
// Materialise parameters as the files / values / channels consumed by the
// processes below. The g_<id> names follow the DolphinNext generator
// convention (process-graph node ids) — presumably auto-generated; keep them
// in sync with the `from`/`into` references in each process.
g_7_cellBarcodeFile = file(params.cellBarcodeFile)
g_13_mate = params.mate_split
// All demultiplexed fastq paths as one sorted list; sorted so related files
// end up adjacent when re-buffered in triples downstream (assumes the
// R1/R2/R3 naming sorts the three mates together — TODO confirm).
g_27_fastq_set_g_75 = Channel.fromPath(params.determined_fastq).toSortedList()
g_34_genomeIndexPath = params.genomeIndexPath
g_41_script = file(params.countUniqueAlignedBarcodes_fromFile_filePath)
g_43_script = file(params.filter_lowCountBC_bam_print_py_filePath)
g_44_cutoff = params.cutoff_reads_for_valid_cell
g_50_script = file(params.ESAT_path)
g_51_trans2gene_file = file(params.gene_to_transcript_mapping_file)
g_52_script = file(params.cleanLowEndUmis_path)
g_70_g_extractValid = file(params.extractValidReadsPath)
g_79_script_path = params.hisat2_path
// Derive the sample name from a file name by returning the text matched up to
// the end of the last capture group of a glob template (i.e. everything the
// leading '*' wildcard covers, plus the literal prefix before it).
// e.g. readPrefix('/some/data/file_alpha_S1_002_1.fa', 'file*_??_{001,002}_{1,2}.fa')
//      returns 'file_alpha' (when given a Path; a plain String is used as-is).
// Returns null when the file name does not match the template at all.
def readPrefix( actual, template ) {
    def baseName = actual instanceof Path ? actual.name : actual.toString()
    def globPattern = template.toString()
    // Only the file-name part of the template is relevant for matching.
    int sep = globPattern.lastIndexOf('/')
    if( sep != -1 )
        globPattern = globPattern.substring(sep + 1)
    // A template without wildcards is treated as a suffix pattern.
    if( !globPattern.contains('*') && !globPattern.contains('?') )
        globPattern = '*' + globPattern
    // Translate the glob into a regex. Order matters: '.' must be escaped
    // before the later replacements introduce regex metacharacters of their own.
    def regex = globPattern
            .replace('.','\\.')
            .replace('*','(.*)')
            .replace('?','(?:.?)')
            .replace('{','(?:')
            .replace('}',')')
            .replace(',','|')
    def matcher = (baseName =~ /$regex/)
    if( !matcher.matches() )
        return null
    // Keep everything up to the end of the last capture group, then trim any
    // trailing separator characters left over from the match.
    def prefix = baseName.substring(0, matcher.end(matcher.groupCount()))
    while( prefix.endsWith('-') || prefix.endsWith('_') || prefix.endsWith('.') )
        prefix = prefix[0..-2]
    return prefix
}
// Extract reads carrying a valid (whitelisted) cell barcode from each
// demultiplexed fastq triple, writing surviving reads under validfastq/.
process extractValidReads {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "fastq" is now escaped; the previous pattern
	// /validfastq\/.*.fastq$/ let '.' match any character. All real outputs
	// end in ".fastq", so the set of published files is unchanged.
	if (filename =~ /validfastq\/.*\.fastq$/) "valid_fastq/$filename"
}

input:
 file extractVcode from g_70_g_extractValid
 file cellBarcode from g_7_cellBarcodeFile
 set file(fastq1), file(fastq2), file(fastq3) from g_27_fastq_set_g_75.flatMap().buffer(size:3)

output:
 set val(sampleName), file("validfastq/*.fastq") into g_75_valid_fastq_g_73

script:
// readPrefix function returns the sampleName by matching asterisk(*) of a glob pattern starting from the beginning of the name.
// note: params.determined_fastq is pipeline input where reads are defined.
sampleName = readPrefix(fastq1, params.determined_fastq)
// Output stem: fastq1 name with any .fastq/.fq (optionally .gz) suffix removed.
name = fastq1.toString() - ~/(\.fastq.gz)?(\.fq.gz)?(\.fastq)?(\.fq)?$/
"""
mkdir -p validfastq
python ${extractVcode} -i ${fastq1} -o ${name} -d validfastq -b ${cellBarcode} -u 8
"""
}
// Concatenate all per-chunk valid fastq files belonging to one sample
// (grouped by sample name) into a single <name>_mergeValid.fastq under reads/.
process Merge_Reads {
input:
// NOTE(review): `mate` is declared but not used in the script body —
// presumably kept for DolphinNext wiring; confirm before removing.
val mate from g_13_mate
set val(name), file(reads) from g_75_valid_fastq_g_73.groupTuple()
output:
set val(name), file("reads/*") into g_73_reads_g_68
script:
"""
mkdir reads
cat ${reads} > ${name}_mergeValid.fastq
mv ${name}_mergeValid.fastq reads/.
"""
}
// Number of reads per split chunk handed to the aligner.
readsPerFile = 5000000 //* @input @description:"The number of reads per file"
//Since splitFastq operator requires flat file structure, first convert grouped structure to flat, execute splitFastq, and then return back to original grouped structure
//.map(flatPairsClosure).splitFastq(splitFastqParams).map(groupPairsClosure)
//Mapping grouped read structure to flat structure
def flatPairsClosure = {row -> (row[1] instanceof Collection) ? tuple(row[0], file(row[1][0]), file(row[1][1])) : tuple(row[0], file(row[1]))}
//Mapping flat read structure to grouped read structure
// (row[2] is only present for paired reads; single-end rows re-wrap one file)
def groupPairsClosure = {row -> tuple(row[0], (row[2]) ? [file(row[1]), file(row[2])] : [file(row[1])])}
// if mate of split process different than rest of the pipeline, use "mate_split" as input parameter. Otherwise use default "mate" as input parameter
def mateParamName = (params.mate_split) ? "mate_split" : "mate"
def splitFastqParams;
// pe:true (paired-end aware splitting) only when the effective mate setting
// is "pair"; otherwise plain single-file splitting.
if (params[mateParamName] != "pair"){
splitFastqParams = [by: readsPerFile, file:true]
}else {
splitFastqParams = [by: readsPerFile, pe:true, file:true]
}
// Split each sample's merged fastq into chunks of readsPerFile reads so
// alignment can run on smaller units. The grouped channel is flattened for
// splitFastq and re-grouped afterwards (see the closures defined above).
process SplitFastq {
input:
set val(name), file(reads) from g_73_reads_g_68.map(flatPairsClosure).splitFastq(splitFastqParams).map(groupPairsClosure)
// NOTE(review): `mate` is declared but not used in the script body.
val mate from g_13_mate
output:
set val(name), file("split/*") into g_68_split_fastq_g_77
"""
mkdir -p split
mv ${reads} split/.
"""
}
// Extra command-line options passed to every hisat2 invocation.
params_HISAT2 = "-p 4"

// Align each fastq split with HISAT2 and convert the SAM to BAM. Emits the
// per-split BAM, the hisat2 alignment summary, and samtools flagstat output.
process Map_HISAT2 {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "bam" is now escaped; previously /${newName}.bam$/
	// let '.' match any character. Published files are unchanged in practice.
	if (filename =~ /${newName}\.bam$/) "mapped_reads/$filename"
}

input:
 val index from g_34_genomeIndexPath
 set val(name), file(reads) from g_68_split_fastq_g_77
 val mate from g_13_mate
 val script from g_79_script_path

output:
 set val(name), file("${newName}.bam") into g_77_mapped_reads_g_56
 set val(name), file("${newName}.align_summary.txt") into g_77_outputFileTxt_g_78
 set val(name), file("${newName}.flagstat.txt") into g_77_outputFileOut

// Skip only when run_HISAT2 is defined and not "yes" (undefined → run).
when:
(params.run_HISAT2 && (params.run_HISAT2 == "yes")) || !params.run_HISAT2

script:
nameAll = reads.toString()
nameArray = nameAll.split(' ')
def file2;
// Gzipped inputs are decompressed in the work dir first. `newName` is the
// split's basename without the fastq/fq(.gz) extension; it is referenced by
// the saveAs closure above as well as the output declarations.
if (nameAll.contains('.gz')) {
    newName = nameArray[0] - ~/(\.fastq.gz)?(\.fq.gz)?$/
    file1 = nameArray[0] - '.gz'
    if (mate == "pair") {file2 = nameArray[1] - '.gz'}
    runGzip = "ls *.gz | xargs -i echo gzip -df {} | sh"
} else {
    newName = nameArray[0] - ~/(\.fastq)?(\.fq)?$/
    file1 = nameArray[0]
    if (mate == "pair") {file2 = nameArray[1]}
    runGzip = ''
}
"""
$runGzip
if [ "${mate}" == "pair" ]; then
    ${script} ${params_HISAT2} -x ${index} -1 ${file1} -2 ${file2} -S ${newName}.sam &> ${newName}.align_summary.txt
else
    ${script} ${params_HISAT2} -x ${index} -U ${file1} -S ${newName}.sam &> ${newName}.align_summary.txt
fi
samtools view -bS ${newName}.sam > ${newName}.bam
samtools flagstat ${newName}.bam > ${newName}.flagstat.txt
"""
}
// Merge all per-split BAMs of a sample back into a single BAM. When a sample
// produced only one split, the lone BAM is simply renamed (samtools merge
// requires at least two inputs).
process Merge_Bam {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "bam" is now escaped; previously /${oldname}.bam$/
	// let '.' match any character. Published files are unchanged in practice.
	if (filename =~ /${oldname}\.bam$/) "merged_bam/$filename"
}

input:
 set val(oldname), file(bamfiles) from g_77_mapped_reads_g_56.groupTuple()

output:
 set val(oldname), file("${oldname}.bam") into g_56_merged_bams_g_57

shell:
'''
num=$(echo "!{bamfiles.join(" ")}" | awk -F" " '{print NF-1}')
if [ "${num}" -gt 0 ]; then
samtools merge !{oldname}.bam !{bamfiles.join(" ")}
else
mv !{bamfiles.join(" ")} !{oldname}.bam
fi
'''
}
// Coordinate-sort each merged BAM and build its .bai index.
process sort_index {
input:
set val(name), file(initial_alignment) from g_56_merged_bams_g_57
output:
set val(name), file("*.bam") into g_57_bam_file_g_76
set val(name), file("*.bai") into g_57_bam_index_g_76
// NOTE(review): `samtools sort <in> <out-prefix>` is the legacy (pre-1.3)
// calling convention; modern samtools requires `-o <out>.bam`. Confirm the
// samtools version deployed with this pipeline before upgrading.
"""
samtools sort ${initial_alignment} ${name}_sorted
samtools index ${name}_sorted.bam
"""
}
// Count reads per cell barcode from the sorted BAM, and restage the BAM plus
// its index under bam/ for the downstream low-count filtering step.
process Count_cells {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "txt" is now escaped; the previous pattern
	// /.*_count.txt$/ let '.' match any character. All real outputs end in
	// "_count.txt", so the set of published files is unchanged.
	if (filename =~ /.*_count\.txt$/) "cell_counts/$filename"
}

input:
 set val(oldname), file(sorted_bams) from g_57_bam_file_g_76
 set val(oldname), file(bams_index) from g_57_bam_index_g_76
 file script_path from g_41_script

output:
 set val(oldname), file("bam/*.bam") into g_76_sorted_bam_g_47
 set val(oldname), file("bam/*.bam.bai") into g_76_bam_index_g_47
 set val(oldname), file("*_count.txt") into g_76_count_file_g_47

"""
find -name "*.bam" > filelist.txt
python ${script_path} -i filelist.txt -o ${oldname}_count.txt
mkdir bam
mv $sorted_bams bam/.
mv $bams_index bam/.
"""
}
// Remove reads whose cell barcode has fewer than `cutoff` aligned reads,
// using the per-barcode counts produced by Count_cells.
process filter_lowCount {
input:
file script_path from g_43_script
set val(oldname), file(sorted_bams) from g_76_sorted_bam_g_47
val cutoff from g_44_cutoff
// NOTE(review): `name` and `oldname` both carry the sample name (all three
// set channels originate from the same Count_cells emission), so the script
// below can mix the two and `${name}_count.txt` matches the staged count
// file. Presumably always equal — TODO confirm.
set val(name), file(count_file) from g_76_count_file_g_47
set val(oldname), file(bam_index) from g_76_bam_index_g_47
output:
set val(name), file("*_filtered.bam") into g_47_filtered_bam_g_48
"""
python ${script_path} -i ${sorted_bams} -b ${name}_count.txt -o ${name}_filtered.bam -n ${cutoff}
"""
}
// Run ESAT on the filtered BAM to produce per-cell UMI distributions.
// filelist.txt maps the sample name to each BAM path, as ESAT's -alignments
// input expects.
process ESAT {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: dots escaped; the previous /.*umi.distributions.txt$/ let each
	// '.' match any character. Published files are unchanged in practice.
	if (filename =~ /.*umi\.distributions\.txt$/) "UMI_distributions/$filename"
}

input:
 file esat_path from g_50_script
 file trans2gene_filepath from g_51_trans2gene_file
 set val(name), file(filtered_bam) from g_47_filtered_bam_g_48

output:
 set val(name), file("*umi.distributions.txt") into g_48_UMI_distributions_g_49

"""
find -name "*.bam" | awk '{print "${name}\t"\$1 }' > filelist.txt
java \
-Xmx40g \
-jar ${esat_path} \
-alignments filelist.txt \
-out ${name}_esat.txt \
-geneMapping ${trans2gene_filepath} \
-task score3p \
-wLen 100 \
-wOlap 50 \
-wExt 1000 \
-sigTest .01 \
-multimap ignore \
-scPrep
"""
}
// Clean low-count UMIs (likely sequencing-error singletons) from the ESAT
// UMI distribution table, producing the final per-gene UMI counts.
process UMI_Trim {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "txt" is now escaped; the previous pattern
	// /.*_umiClean.txt$/ let '.' match any character. Published files are
	// unchanged in practice.
	if (filename =~ /.*_umiClean\.txt$/) "UMI_count_final/$filename"
}

input:
 file cleanLowEndUmis_path from g_52_script
 set val(name), file(umi_dist) from g_48_UMI_distributions_g_49

output:
 set val(name), file("*_umiClean.txt") into g_49_UMI_clean

"""
python ${cleanLowEndUmis_path} \
-i ${umi_dist} \
-o ${name}_umiClean.txt \
-n 2
"""
}
// Aggregate the per-split hisat2 alignment summaries of a sample into a
// single TSV (total / multimapped / uniquely-aligned read counts), via an
// embedded Perl script (the !{...} placeholders are Nextflow shell-block
// interpolation, not Perl).
process HISAT2_Summary {

publishDir params.outdir, mode: 'copy',
	saveAs: {filename ->
	// Fixed: the dot before "tsv" is now escaped; the previous /.*.tsv$/ let
	// '.' match any character. Published files are unchanged in practice.
	if (filename =~ /.*\.tsv$/) "hisat_summary/$filename"
}

input:
 set val(name), file(alignSum) from g_77_outputFileTxt_g_78.groupTuple()

output:
 file "*.tsv" into g_78_outputFile

shell:
'''
#!/usr/bin/env perl
use List::Util qw[min max];
use strict;
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper;
my %tsv;
my @headers = ();
my $name = "!{name}";
alteredAligned();
my @keys = keys %tsv;
my $summary = "$name"."_hisat_sum.tsv";
my $header_string = join("\\t", @headers);
`echo "$header_string" > $summary`;
foreach my $key (@keys){
my $values = join("\\t", @{ $tsv{$key} });
`echo "$values" >> $summary`;
}
sub alteredAligned
{
my @files = qw(!{alignSum});
my $multimappedSum;
my $alignedSum;
my $inputCountSum;
push(@headers, "Sample");
push(@headers, "Total Reads");
push(@headers, "Multimapped Reads Aligned Hisat2");
push(@headers, "Unique Reads Aligned Hisat2");
foreach my $file (@files){
my $multimapped;
my $aligned;
my $inputCount;
chomp($inputCount = `cat $file | grep 'reads; of these:' | awk '{sum+=\\$1} END {print sum}'`);
chomp($aligned = `cat $file | grep 'aligned.*exactly 1 time' | awk '{sum+=\\$1} END {print sum}'`);
chomp($multimapped = `cat $file | grep 'aligned.*>1 times' | awk '{sum+=\\$1} END {print sum}'`);
$multimappedSum += int($multimapped);
$alignedSum += (int($aligned) - int($multimapped));
$inputCountSum += int($inputCount);
if ($alignedSum < 0){
$alignedSum = 0;
}
}
$tsv{$name} = [$name, $inputCountSum];
push(@{$tsv{$name}}, $multimappedSum);
push(@{$tsv{$name}}, $alignedSum);
}
'''
}
// Print a short run summary (status, timing, exit code) when the workflow
// finishes, whether it succeeded or failed.
workflow.onComplete {
println "##Pipeline execution summary##"
println "---------------------------"
println "##Completed at: $workflow.complete"
println "##Duration: ${workflow.duration}"
println "##Success: ${workflow.success ? 'OK' : 'failed' }"
println "##Exit status: ${workflow.exitStatus}"
}