Skip to content

Commit

Permalink
Changing utility WDLs:
Browse files Browse the repository at this point in the history
  * make BAM split by readgroup more efficient
  * make the ONT duplicated reads verification step resilient to http transient errors
  * new task to verify primrose was run on PacBio BAM
  * new task to get basecall model from ONT BAM
  • Loading branch information
SHuang-Broad committed Jan 22, 2024
1 parent da5bfb0 commit f6f8475
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 28 deletions.
89 changes: 61 additions & 28 deletions wdl/tasks/Utility/BAMutils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -716,53 +716,76 @@ task GetDuplicateReadnamesInQnameSortedBam {
}
parameter_meta {
qns_bam: {
description: "Query name sorted BAM to be de-duplicated",
localization_optional: true
}
trial_idx: "the n-th time this is being tried for (start from 1), if this value is >= trial_max, the BAM will be localized and the task will use a persistent SSD instead of persistent HDD."
trial_max: "the max number of attempt to perform the duty by streaming in the BAM; this design together with trial_idx is to prevent call-caching preventing retries."
}
input {
File qns_bam
Int trial_idx = 1
Int trial_max = 3
}
output {
File dup_names_txt = "dup_read_names.txt"
Boolean result_may_be_corrupted = read_boolean("samtools.failed.txt")
}
Boolean localize_bam = trial_idx >= trial_max
command <<<
# the way this works is the following:
# 0) relying on the re-auth.sh script to export the credentials
# 1) perform the remote sam-view subsetting in the background
# 2) listen to the PID of the background process, while re-auth every 1200 seconds
source /opt/re-auth.sh
set -euxo pipefail
# assumption
sort_order=$(samtools view -H ~{qns_bam} | grep "^@HD" | tr '\t' '\n' | grep "^SO:" | awk -F ':' '{print $2}')
if [[ "queryname" != "${sort_order}" ]]; then echo -e "Sort order ${sort_order} isn't the expected 'queryname'." && exit 1; fi
# remote grab read names
echo "false" > samtools.failed.txt
samtools view ~{qns_bam} \
| awk -F '\t' '{print $1}' \
| uniq -d \
> "dup_read_names.txt" \
|| { echo "true" > samtools.failed.txt; exit 77; } &
pid=$!
if ~{localize_bam}; then
time \
gcloud storage cp ~{qns_bam} name_does_not_matter.bam
set +e
count=1
while true; do
sleep 1200 && date && source /opt/re-auth.sh
if [[ ${count} -gt 2 ]]; then exit 0; fi
if ! pgrep -x -P $pid; then exit 0; fi
count=$(( count+1 ))
done
samtools view name_does_not_matter.bam \
| awk -F '\t' '{print $1}' \
| uniq -d \
> "dup_read_names.txt"
echo "false" > samtools.failed.txt
else
# the way this works is the following:
# 0) relying on the re-auth.sh script to export the credentials
# 1) perform the remote sam-view operation in the background
# 2) listen to the PID of the background process, while re-auth every 1200 seconds
# remote grab read names
echo "false" > samtools.failed.txt
samtools view ~{qns_bam} \
| awk -F '\t' '{print $1}' \
| uniq -d \
> "dup_read_names.txt" \
|| { echo "true" > samtools.failed.txt; exit 77; } &
pid=$!
set +e
count=1
while true; do
sleep 1200 && date && source /opt/re-auth.sh
if [[ ${count} -gt 2 ]]; then exit 0; fi
if ! pgrep -x -P $pid; then exit 0; fi
count=$(( count+1 ))
done
fi
>>>
Int disk_size = 5 + (if (localize_bam) then ceil(size(qns_bam, "Gib")) else 0)
String disk_type = if (localize_bam) then "SSD" else "HDD"
runtime {
cpu: 1
memory: "4 GiB"
disks: "local-disk 10 HDD"
disks: "local-disk ~{disk_size} ~{disk_type}"
preemptible: 2
maxRetries: 1
docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
Expand Down Expand Up @@ -1727,31 +1750,38 @@ task SplitByRG {
}
parameter_meta {
bam: "BAM to be split"
bam: {
description: "BAM to be split",
localization_optional: true
}
out_prefix: "prefix for output bam and bai file names"
retain_rgless_records: "flag to save the reads that have no RG tag"
sort_and_index: "if the user wants to (pos-)sort and index the resulting BAMs; this indicates the input BAM is mapped"
split_bam: "the resulting BAMs, each having reads only in a single read group"
split_bai: "the accompanying BAIs, if possible and explicit requested"
}
Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+3*ceil(size([bam], "GB"))
Array[String] extra_args = if (retain_rgless_records) then ["-u", "~{out_prefix}_noRG.bam"] else [""]
String local_bam = basename(bam)
command <<<
set -eux
time \
gcloud storage cp ~{bam} ~{local_bam}
samtools view -H ~{bam} | grep "^@RG" > "read_groups_header.txt"
samtools view -H ~{local_bam} | grep "^@RG" > "read_groups_header.txt"
cat "read_groups_header.txt" | tr '\t' '\n' | grep "^ID:" | awk -F ':' '{print $2}' > "RG_ids.txt"
samtools split -@3 \
-f "~{out_prefix}_%#.bam" \
~{sep=" " extra_args} \
~{bam}
~{local_bam}
rm ~{local_bam}
if ~{sort_and_index} ;
then
# cleanup space for the sorting
rm ~{bam}
for split_bam in "~{out_prefix}_"*.bam;
do
mv "${split_bam}" temp.bam
Expand All @@ -1771,6 +1801,9 @@ task SplitByRG {
}
#########################
Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+3*ceil(size([bam], "GB"))
String disk_type = if defined(num_ssds) then "LOCAL" else "SSD" # IO-bound operation, no HDD please
RuntimeAttr default_attr = object {
cpu_cores: 4,
mem_gb: 16,
Expand All @@ -1783,7 +1816,7 @@ task SplitByRG {
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}"
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
docker: select_first([runtime_attr.docker, default_attr.docker])
Expand Down
43 changes: 43 additions & 0 deletions wdl/tasks/Utility/ONTUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,46 @@ task DeduplicateBam {
docker: select_first([runtime_attr.docker, default_attr.docker])
}
}
task GetBasecallModel {
    meta {
        description: "Getting the basecall model string of an ONT BAM"
    }
    parameter_meta {
        bam: {
            description: "BAM to operate on",
            localization_optional: true
        }
        runid_2_model: "The basecall model for each run (keyed by runid)."
    }
    input {
        File bam
    }
    output {
        Map[String, String] runid_2_model = read_map("results.tsv")
    }

    command <<<
        set -eux

        # Split export from assignment so that a gcloud failure is not masked by
        # the always-zero exit status of 'export'; the token lets samtools
        # stream the header of the remote (localization-optional) BAM.
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN

        # One @RG header line per ONT run.
        samtools view -H ~{bam} | grep "^@RG" > one_rg_per_line.txt

        # Ensure the output exists for read_map even in degenerate cases.
        touch results.tsv
        while IFS= read -r line
        do
            # The DS tag of an ONT @RG line holds space-separated key=value
            # pairs; put one pair per line so the fields can be grepped out.
            # Note: an @RG line missing runid= or basecall_model= makes the
            # grep below fail and (via set -e) fails the task, which is the
            # desired signal that the BAM lacks basecalling provenance.
            echo "$line" | tr '\t' '\n' | grep "^DS:" | sed "s/^DS://" | tr ' ' '\n' > tmp.txt
            runid=$(grep "^runid=" tmp.txt | awk -F '=' '{print $2}')
            model=$(grep "^basecall_model=" tmp.txt | awk -F '=' '{print $2}')
            echo -e "${runid}\t${model}" >> results.tsv
        done < one_rg_per_line.txt
    >>>

    runtime {
        cpu: 1
        memory: "4 GiB"
        disks: "local-disk 10 HDD"
        preemptible: 2
        maxRetries: 1
        docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}
43 changes: 43 additions & 0 deletions wdl/tasks/Utility/PBUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -1280,3 +1280,46 @@ task SummarizePBI {
docker: select_first([runtime_attr.docker, default_attr.docker])
}
}
# todo: primrose is rebranded as jasmine, take care of that later
task VerifyPacBioBamHasAppropriatePrimroseRuns {
    meta {
        description: "Verify that a PacBio's BAM has primrose run on all its read groups"
    }
    parameter_meta {
        bam: "GCS path to the PacBio BAM to check; only the header is streamed remotely, the BAM is not localized"
        readgroups_missing_primrose: "Movie names (PU tags of read groups) for which no primrose @PG record was found"
    }
    input {
        String bam
    }

    output {
        Array[String] readgroups_missing_primrose = read_lines("movies_without_primrose.txt")
    }

    command <<<
        set -eux

        # Split export from assignment so a gcloud failure is not masked; the
        # token lets samtools stream the remote BAM's header.
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN
        samtools view -H ~{bam} > header.txt

        # get read groups' movies (the PU tag of each @RG line)
        grep "^@RG" header.txt | tr '\t' '\n' | grep "^PU:" | awk -F ':' '{print $2}' | sort > readgroup.movies.txt
        cat readgroup.movies.txt

        # get the CL (command line) field of primrose @PG lines;
        # tolerate a header with no primrose records at all — the '|| true'
        # keeps set -e from killing the task, so every movie is then reported
        # as missing primrose (the very situation this task exists to detect)
        { grep "^@PG" header.txt | grep -F 'ID:primrose' | tr '\t' '\n' | grep '^CL:' || true; } > primrose.pg.lines.txt
        tr ' ' '\n' < primrose.pg.lines.txt  # echoed purely for the task log

        touch movies_without_primrose.txt
        # a movie is considered covered iff its name appears somewhere in a
        # primrose command line; -F treats the movie name as a fixed string,
        # and '--' guards against names starting with '-'
        while IFS= read -r readgroup; do
            if ! grep -qF -- "${readgroup}" primrose.pg.lines.txt; then echo "${readgroup}" >> movies_without_primrose.txt; fi
        done < readgroup.movies.txt
    >>>

    runtime {
        cpu: 1
        memory: "4 GiB"
        disks: "local-disk 10 HDD"
        preemptible: 2
        maxRetries: 1
        docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

0 comments on commit f6f8475

Please sign in to comment.