-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adapted contamination estimation workflow and BAMutils workflow for s…
…hort-read data contamination analysis
- Loading branch information
1 parent
7d36a93
commit b090a4a
Showing
3 changed files
with
2,264 additions
and
0 deletions.
There are no files selected for viewing
91 changes: 91 additions & 0 deletions
91
wdl/pipelines/TechAgnostic/Utility/ShortReadsContaminationEstimation.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
version 1.0 | ||
|
||
import "../../../tasks/Utility/Utils.wdl" | ||
import "../../../tasks/Utility/BAMutils.wdl" as BU | ||
import "../../../tasks/QC/Contamination.wdl" | ||
|
||
# This workflow estimates contamination levels for short-read BAM files | ||
struct VBID2_config { | ||
File genotyping_sites | ||
|
||
Boolean is_hgdp_sites | ||
Boolean is_100k_sites | ||
|
||
Boolean disable_baq | ||
|
||
Int max_retries | ||
|
||
String? tech | ||
} | ||
|
||
workflow ShortReadsContaminationEstimation { | ||
meta { | ||
description: | ||
"Estimate the cross-individual contamination level of a GRCh38 BAM file for short-read data." | ||
} | ||
|
||
input { | ||
File bam | ||
File bai | ||
String tech # Short-read technology, e.g., NovaSeq | ||
File ref_map_file | ||
|
||
File gt_sites_bed | ||
Boolean is_hgdp_sites | ||
Boolean is_100k_sites | ||
|
||
Boolean disable_baq | ||
|
||
String disk_type | ||
|
||
Int max_retries | ||
} | ||
|
||
parameter_meta { | ||
gt_sites_bed: "Bed file holding the genotyping sites for short-read data." | ||
is_hgdp_sites: "Provided BED is HGDP genotyping sites." | ||
is_100k_sites: "Provided BED is 100k genotyping sites, suitable for short-read workflows." | ||
disable_baq: "If turned on, BAQ computation will be disabled (faster operation). Recommended for short-read data." | ||
|
||
tech: "Technology used for generating the data; accepted values: [NovaSeq, HiSeq]" | ||
|
||
max_retries: "Number of retries for transient errors (e.g., disk or network issues)." | ||
} | ||
|
||
# Thresholds adjusted for short-read technologies | ||
Map[String, Int] bam_threshold_per_tech = {'NovaSeq': 50, 'HiSeq': 100} # Adjusted for short-read data | ||
Int bam_file_threshold = bam_threshold_per_tech[tech] | ||
|
||
if (bam_file_threshold > ceil(size(bam, "MiB"))) { | ||
Float extreme_low_cov_val = 50.0 # Adjusted low-coverage value for short reads | ||
} | ||
|
||
if (bam_file_threshold <= ceil(size(bam, "MiB"))) { | ||
# Prepare reference map and proceed with pileup | ||
Map[String, String] ref_map = read_map(ref_map_file) | ||
|
||
Int scaleup_factor = 20 | ||
call BU.BamToRelevantPileup as Pileup { | ||
input: | ||
bam = bam, | ||
bai = bai, | ||
bed = gt_sites_bed, | ||
ref_fasta = ref_map['fasta'], | ||
disable_baq = disable_baq, | ||
disk_type = disk_type, | ||
max_retries = max_retries | ||
} | ||
|
||
call Contamination.VerifyBamID { | ||
input: | ||
pileup = Pileup.pileups, | ||
ref_fasta = ref_map['fasta'], | ||
is_hgdp_sites = is_hgdp_sites, | ||
is_100k_sites = is_100k_sites | ||
} | ||
} | ||
|
||
output { | ||
Float contamination_est = select_first([VerifyBamID.contamination_est, extreme_low_cov_val]) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
version 1.0 | ||
|
||
task VerifyBamID { | ||
meta { | ||
desciption: "Uses VerifyBamID2 for human cross-individual contamination estimation. Assumes GRCh38." | ||
} | ||
|
||
input { | ||
File pileup | ||
File ref_fasta | ||
Boolean is_hgdp_sites | ||
Boolean is_100k_sites | ||
} | ||
|
||
String a = if is_hgdp_sites then 'hgdp' else '1000g.phase3' | ||
String b = if is_100k_sites then '100k' else '10k' | ||
String resource_prefix = '~{a}.~{b}.b38.vcf.gz.dat' | ||
|
||
command <<< | ||
set -eux | ||
|
||
export VERIFY_BAM_ID_HOME='/VerifyBamID' | ||
|
||
time \ | ||
${VERIFY_BAM_ID_HOME}/bin/VerifyBamID \ | ||
--SVDPrefix ${VERIFY_BAM_ID_HOME}/resource/~{resource_prefix} \ | ||
--Reference ~{ref_fasta} \ | ||
--PileupFile ~{pileup} \ | ||
--NumThread 4 \ | ||
> vbid2.out \ | ||
2> vbid2.log | ||
|
||
cat vbid2.out | ||
tail -1 vbid2.out | awk -F ':' '{print $2}' | awk '{$1=$1};1' > "est_contam.txt" | ||
>>> | ||
|
||
output { | ||
File vbid2_log = "vbid2.log" | ||
File vbid2_out = "vbid2.out" | ||
Float contamination_est = read_float("est_contam.txt") | ||
} | ||
|
||
Int disk_size = 10 + ceil(size(pileup, "GiB")) | ||
runtime { | ||
cpu: 4 | ||
memory: "8 GiB" | ||
disks: "local-disk ~{disk_size} SSD" | ||
docker: "us.gcr.io/broad-dsp-lrma/verifybamid2:v2.0.1" | ||
} | ||
} |
Oops, something went wrong.