forked from rbpisupati/nf-haplocaller
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_filter_vcf.nf
107 lines (86 loc) · 2.65 KB
/
get_filter_vcf.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
provide all the GVCFs together, the script can split the samples separately
nextflow run get_filter_vcf.nf --reads "001.plate.raw.vcfs/*vcf" --fasta ref_seq/TAIR10_wholeGenome.fasta --outdir filter_vcfs
*/
params.project = "the1001genomes"
build_index = false
params.outdir = './snpcall'
params.fasta = false
params.tmpdir = "/lustre/scratch/users/rahul.pisupati/tempFiles/"
if ( params.fasta ){
genome = file(params.fasta)
reffol = genome.parent
refid = genome.baseName
if( !genome.exists() ) exit 1, "Reference fasta file not found: ${params.fasta}"
bwa_indices = Channel
.fromPath( "$reffol/${refid}.fasta.b*" )
.ifEmpty { build_index = true }
} else {
exit 1, "Provide reference fasta file. Ex., --fasta file_path"
}
input_gvcfs = Channel
.fromPath( "${params.input}" )
.map { it -> [ file("$it").getExtension(), file("$it"), file("$it" + ".idx") ] }
/*
* 1. Create a channel for checking bwa index for genome ref
*/
if (build_index == true){
process makeBWAindex {
publishDir "${reffol}", mode: 'copy'
input:
file genome
output:
file "${refid}.fasta.*" into fasta_index
file "${refid}.dict" into fasta_dict
script:
"""
samtools faidx ${genome}
bwa index $genome
java -jar \$EBROOTPICARD/picard.jar CreateSequenceDictionary R=$genome O=${refid}.dict
"""
}
} else {
fasta_index = Channel
.fromPath( "$reffol/${refid}.fasta.*" )
}
process getSamples {
input:
set val(file_ext), file(gvcf), file(gvcf_index) from input_gvcfs
output:
set file("$gvcf"), stdout into sample_names
script:
if (file_ext == "gz"){
"""
tabix -H $gvcf | grep -m 1 "^#CHROM" | awk '{for(i=10;i<=NF;i++) print \$i}'
"""
} else if (file_ext == "vcf"){
"""
grep -m 1 "^#CHROM" $gvcf | awk '{for(i=10;i<=NF;i++) print \$i}'
"""
}
}
input_names = sample_names
.splitText(elem: 1)
.map{ row -> [file("${row[0]}"), file("${row[0]}.idx"), "${row[1]}".replace('\n','') ] }
process selectSNPs {
tag "$name"
publishDir "${params.outdir}/$out_fol", mode: 'copy'
input:
set file(gvcf), file(gvcf_index), val(name) from input_names
file fasta_index
when:
file("${params.outdir}/${out_fol}/${name}.filter.vcf").isEmpty()
output:
set file("${name}.filter.vcf"), file("${name}.filter.vcf.idx") into filter_vcf
script:
out_fol = file("$gvcf").baseName.replace( /.vcf/, '')
"""
java -Djava.io.tmpdir=${params.tmpdir} -jar \$EBROOTGATK/GenomeAnalysisTK.jar\
-T SelectVariants -R $reffol/${refid}.fasta\
-nt ${task.cpus} \
-V $gvcf \
-o ${name}.filter.vcf \
-se "${name}" \
-selectType SNP -restrictAllelesTo BIALLELIC
"""
}