Skip to content

Commit

Permalink
Adapted contamination estimation workflow and BAMutils workflow for s…
Browse files Browse the repository at this point in the history
…hort-read data contamination analysis
  • Loading branch information
shadizaheri committed Jan 3, 2025
1 parent 7d36a93 commit b090a4a
Show file tree
Hide file tree
Showing 3 changed files with 2,264 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
version 1.0

import "../../../tasks/Utility/Utils.wdl"
import "../../../tasks/Utility/BAMutils.wdl" as BU
import "../../../tasks/QC/Contamination.wdl"

# This workflow estimates contamination levels for short-read BAM files
struct VBID2_config {
File genotyping_sites

Boolean is_hgdp_sites
Boolean is_100k_sites

Boolean disable_baq

Int max_retries

String? tech
}

workflow ShortReadsContaminationEstimation {
meta {
description:
"Estimate the cross-individual contamination level of a GRCh38 BAM file for short-read data."
}

input {
File bam
File bai
String tech # Short-read technology, e.g., NovaSeq
File ref_map_file

File gt_sites_bed
Boolean is_hgdp_sites
Boolean is_100k_sites

Boolean disable_baq

String disk_type

Int max_retries
}

parameter_meta {
gt_sites_bed: "Bed file holding the genotyping sites for short-read data."
is_hgdp_sites: "Provided BED is HGDP genotyping sites."
is_100k_sites: "Provided BED is 100k genotyping sites, suitable for short-read workflows."
disable_baq: "If turned on, BAQ computation will be disabled (faster operation). Recommended for short-read data."

tech: "Technology used for generating the data; accepted values: [NovaSeq, HiSeq]"

max_retries: "Number of retries for transient errors (e.g., disk or network issues)."
}

# Thresholds adjusted for short-read technologies
Map[String, Int] bam_threshold_per_tech = {'NovaSeq': 50, 'HiSeq': 100} # Adjusted for short-read data
Int bam_file_threshold = bam_threshold_per_tech[tech]

if (bam_file_threshold > ceil(size(bam, "MiB"))) {
Float extreme_low_cov_val = 50.0 # Adjusted low-coverage value for short reads
}

if (bam_file_threshold <= ceil(size(bam, "MiB"))) {
# Prepare reference map and proceed with pileup
Map[String, String] ref_map = read_map(ref_map_file)

Int scaleup_factor = 20
call BU.BamToRelevantPileup as Pileup {
input:
bam = bam,
bai = bai,
bed = gt_sites_bed,
ref_fasta = ref_map['fasta'],
disable_baq = disable_baq,
disk_type = disk_type,
max_retries = max_retries
}

call Contamination.VerifyBamID {
input:
pileup = Pileup.pileups,
ref_fasta = ref_map['fasta'],
is_hgdp_sites = is_hgdp_sites,
is_100k_sites = is_100k_sites
}
}

output {
Float contamination_est = select_first([VerifyBamID.contamination_est, extreme_low_cov_val])
}
}
50 changes: 50 additions & 0 deletions wdl/tasks/QC/Contamination.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
version 1.0

task VerifyBamID {
meta {
desciption: "Uses VerifyBamID2 for human cross-individual contamination estimation. Assumes GRCh38."
}

input {
File pileup
File ref_fasta
Boolean is_hgdp_sites
Boolean is_100k_sites
}

String a = if is_hgdp_sites then 'hgdp' else '1000g.phase3'
String b = if is_100k_sites then '100k' else '10k'
String resource_prefix = '~{a}.~{b}.b38.vcf.gz.dat'

command <<<
set -eux

export VERIFY_BAM_ID_HOME='/VerifyBamID'

time \
${VERIFY_BAM_ID_HOME}/bin/VerifyBamID \
--SVDPrefix ${VERIFY_BAM_ID_HOME}/resource/~{resource_prefix} \
--Reference ~{ref_fasta} \
--PileupFile ~{pileup} \
--NumThread 4 \
> vbid2.out \
2> vbid2.log

cat vbid2.out
tail -1 vbid2.out | awk -F ':' '{print $2}' | awk '{$1=$1};1' > "est_contam.txt"
>>>

output {
File vbid2_log = "vbid2.log"
File vbid2_out = "vbid2.out"
Float contamination_est = read_float("est_contam.txt")
}

Int disk_size = 10 + ceil(size(pileup, "GiB"))
runtime {
cpu: 4
memory: "8 GiB"
disks: "local-disk ~{disk_size} SSD"
docker: "us.gcr.io/broad-dsp-lrma/verifybamid2:v2.0.1"
}
}
Loading

0 comments on commit b090a4a

Please sign in to comment.