diff --git a/.dockstore.yml b/.dockstore.yml
index ff958fe50..828b38b4d 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -1,17 +1,11 @@
 version: 1.2
 workflows:
+
+###################################################
+# deprecated
 - name: ONT10x
   subclass: wdl
   primaryDescriptorPath: /wdl/deprecated/ONT10x.wdl
-- name: ONTWholeGenome
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
-- name: ONTFlowcell
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTFlowcell.wdl
-- name: PBFlowcell
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl
 - name: PBCCS
   subclass: wdl
   primaryDescriptorPath: /wdl/deprecated/PBCCS.wdl
@@ -24,21 +18,24 @@ workflows:
 - name: PBCCSDemultiplexWholeGenome
   subclass: wdl
   primaryDescriptorPath: /wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl
-- name: PBCCSIsoSeq
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl
-- name: PBCCSWholeGenome
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl
 - name: PBCLRDemultiplexWholeGenome
   subclass: wdl
   primaryDescriptorPath: /wdl/deprecated/PBCLRDemultiplexWholeGenome.wdl
 - name: PBCLRWholeGenome
   subclass: wdl
   primaryDescriptorPath: /wdl/deprecated/PBCLRWholeGenome.wdl
-- name: LRCNVs
+- name: DownloadFromHudsonAlpha
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/LRCNVs.wdl
+  primaryDescriptorPath: /wdl/deprecated/DownloadFromHudsonAlpha.wdl
+
+###################################################
+# ONT
+- name: ONTWholeGenome
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
+- name: ONTFlowcell
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTFlowcell.wdl
 - name: ONTBasecall
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTBasecall.wdl
@@ -48,15 +45,6 @@ workflows:
 - name: ONTAssembleWithFlye
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/ONT/Assembly/ONTAssembleWithFlye.wdl
-- name: VerifyFingerprint
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/VerifyFingerprint.wdl
-- name: DownloadFromHudsonAlpha
-  subclass: wdl
-  primaryDescriptorPath: /wdl/deprecated/DownloadFromHudsonAlpha.wdl
-- name: PBAssembleWithHifiasm
-  subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl
 - name: ONTMethylation
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/ONT/Epigenomics/ONTMethylation.wdl
@@ -66,12 +54,42 @@ workflows:
 - name: ONTPfTypeDrugResistanceMarkers
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/ONT/MultiAnalysis/ONTPfTypeDrugResistanceMarkers.wdl
+- name: ONTProcessBasecall
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTProcessBasecall.wdl
+- name: ONTFlowcellFromMultipleBasecalls
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl
+
+###################################################
+# PacBio
+- name: PBFlowcell
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/PacBio/Alignment/PBFlowcell.wdl
+- name: PBCCSIsoSeq
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBCCSIsoSeq.wdl
+- name: PBCCSWholeGenome
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl
+- name: PBAssembleWithHifiasm
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl
 - name: PBMASIsoSeqQuantify
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqQuantify.wdl
 - name: PBMASIsoSeqDemultiplex
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
+
+###################################################
+# TechAgnostic - *mics data processing
+- name: CallVariantsReadBased
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/CallVariantsReadBased.wdl
+- name: LRCNVs
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/LRCNVs.wdl
 - name: LRJointCallGVCFs
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/LRJointCallGVCFs.wdl
@@ -81,18 +99,57 @@ workflows:
 - name: LRConvertBCF
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/LRConvertBCF.wdl
-- name: DownloadFromSRA
+
+###################################################
+# TechAgnostic - *mics data QC & metrics
+- name: ShardWholeGenome
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl
-- name: DownloadFromWeb
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/ShardWholeGenome.wdl
+- name: MergeSampleBamsAndCollectMetrics
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl
-- name: ONTProcessBasecall
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl
+- name: AlignedBamQCandMetrics
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTProcessBasecall.wdl
-- name: ONTFlowcellFromMultipleBasecalls
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/AlignedBamQCandMetrics.wdl
+- name: CollectBamFlagStats
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CollectBamFlagStats.wdl
+- name: CountTheBeans
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CountTheBeans.wdl
+- name: DystPeaker
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/DystPeaker.wdl
+- name: FASTQstats
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/FASTQstats.wdl
+- name: FilterBamByLength
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/FilterBamByLength.wdl
+- name: LongReadsContaminationEstimation
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/LongReadsContaminationEstimation.wdl
+- name: SexCheckNaive
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/SexCheckNaive.wdl
+- name: VerifyFingerprint
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/VerifyFingerprint.wdl
+- name: VerifyBamFingerprint
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/VerifyBamFingerprint.wdl
+
+###################################################
+# TechAgnostic - utility
 - name: CleanupIntermediate
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/CleanupIntermediate.wdl
+- name: DownloadFromSRA
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/DownloadFromSRA.wdl
+- name: DownloadFromWeb
+  subclass: wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/DownloadFromWeb.wdl
+- name: SaveFilesToDestination
+  subclass: wdl
+  primaryDescriptorPath: 
/wdl/pipelines/TechAgnostic/Utility/SaveFilesToDestination.wdl diff --git a/docker/lr-bam-dedup/Dockerfile b/docker/lr-bam-dedup/Dockerfile new file mode 100644 index 000000000..094bd3535 --- /dev/null +++ b/docker/lr-bam-dedup/Dockerfile @@ -0,0 +1,73 @@ +FROM python:3.9.16-slim-bullseye + +COPY remove_duplicate_ont_aln.py /opt/ +COPY remove_duplicate_ont_namesorted_unaligned.py /opt/ + +RUN pip install pysam==0.21.0 + +# install gcloud and gsutil cli +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + zlib1g-dev \ + curl \ + wget \ + tree \ + tabix && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* + +# install latest samtools +ARG DEBIAN_FRONTEND=noninteractive +ARG SAMTOOLS_VERSION=1.18 +ARG BCFTOOLS_VERSION=1.18 +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + ca-certificates \ + libbz2-dev \ + libcurl4-openssl-dev \ + liblzma-dev \ + libncurses5-dev \ + autoconf \ + automake \ + bzip2 \ + gcc \ + make \ + wget \ + zlib1g-dev && \ + wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + tar xjf samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + cd samtools-${SAMTOOLS_VERSION} && ./configure --without-curses --enable-libcurl && make -s all all-htslib && make install install-htslib && cd - && \ + rm -rf samtools-${SAMTOOLS_VERSION}* && \ + wget https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + tar xjf bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + cd bcftools-${BCFTOOLS_VERSION} && ./configure --without-curses && make -s && make install && cd - && \ + rm -rf bcftools-${BCFTOOLS_VERSION}* && \ + apt-get -qqy purge autoconf automake bzip2 gcc make wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* && \ + samtools --help && \ + bcftools --help diff --git a/docker/lr-bam-dedup/Makefile b/docker/lr-bam-dedup/Makefile new file mode 100644 index 000000000..af8ea122b --- /dev/null +++ b/docker/lr-bam-dedup/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.1.2 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-bam-dedup/remove_duplicate_ont_aln.py b/docker/lr-bam-dedup/remove_duplicate_ont_aln.py new file mode 100644 index 000000000..7484a696f --- /dev/null +++ b/docker/lr-bam-dedup/remove_duplicate_ont_aln.py @@ -0,0 +1,76 @@ +import argparse +import pysam + + +def main(): + parser = argparse.ArgumentParser(description='Remove redundant alignment records from ONT BAM file', + prog='remove_redundant_reads') + parser.add_argument('-p', '--prefix', type=str, default="shard", help="Output prefix") + parser.add_argument('-a', '--annotations', type=str, help="Annotations on (potential) duplicate reads") + + parser.add_argument('bam', type=str, help="BAM") + args = parser.parse_args() + + # create a dict of set's, a trick to avoid Hash collisions + guilty_dict_per_chr = dict() + with open(args.annotations) as f: + for line in f: + arr = line.strip().split('\t') + name = arr[0] + chrom = arr[2] + guilty_dict_per_chr.setdefault(chrom, set()) + guilty_dict_per_chr[chrom].add(name) + + print("chromosomes on which there are duplicate records:") + print(f" {guilty_dict_per_chr}") + + # Silence message about the .bai file not being found. + pysam.set_verbosity(0) + + num_alignments, num_dropped_alignments = 0, 0 + duplicate_signatures = [] + bf = pysam.Samfile(args.bam, 'rb', check_sq=False) + with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out: + # we rely on the observation that for coordinate sorted BAM, + # duplicate records will appear in blocks, hence once we step off a position with duplicates, we start afresh + current_position = -1 + current_signatures = set() + for read in bf: + num_alignments += 1 + + chrom = read.reference_name + n = read.query_name + if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]: + + mq = read.mapping_quality + sam_flag = read.flag + pos = read.reference_start + cigar = read.cigarstring + signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}" + + if current_position != pos: # new position, let's write and reset + out.write(read) + current_position = pos + current_signatures = set() + current_signatures.add(signature) + elif signature in current_signatures: # You're a duplicate record, and not appearing for the 1st time! 
+ num_dropped_alignments += 1 + duplicate_signatures.append(signature) # same signature may appear more than twice, hence list and append + pass + else: # you are in a new group of duplicates that map to this location + out.write(read) + current_signatures.add(signature) + else: + out.write(read) + + print(f'num_alignments: {num_alignments}') + print(f'num_dropped_alignments: {num_dropped_alignments}') + print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}') + + with open(f'{args.prefix}.duplicate.signatures.txt', 'w') as out: + for sig in duplicate_signatures: + out.write(f"{sig}\n") + + +if __name__ == "__main__": + main() diff --git a/docker/lr-bam-dedup/remove_duplicate_ont_namesorted_unaligned.py b/docker/lr-bam-dedup/remove_duplicate_ont_namesorted_unaligned.py new file mode 100644 index 000000000..aafe88108 --- /dev/null +++ b/docker/lr-bam-dedup/remove_duplicate_ont_namesorted_unaligned.py @@ -0,0 +1,48 @@ +import argparse +import pysam + + +def main(): + parser = argparse.ArgumentParser(description='Remove redundant reads from renamed-sorted ONT BAM file', + prog='remove_redundant_reads') + parser.add_argument('-p', '--prefix', type=str, default="shard", help="Output prefix") + parser.add_argument('-q', '--qnames', type=str, help="Read names of duplicate records") + + parser.add_argument('bam', type=str, help="BAM") + args = parser.parse_args() + + # Silence message about the .bai file not being found. + pysam.set_verbosity(0) + bf = pysam.Samfile(args.bam, 'rb', check_sq=False) + + num_records, num_dropped_records = 0, 0 + duplicate_record_names = list() + + with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out: + + # we rely on the observation that for queryname sorted, unaligned BAM, + # if two neighboring records have the same query name, then they must be duplicate of each other + current_qm = '' + + for read in bf: + num_records += 1 + + n = read.query_name + if n == current_qm: + duplicate_record_names.append(n) + num_dropped_records += 1 + else: + current_qm = n + out.write(read) + + print(f'num_records: {num_records}') + print(f'num_dropped_records: {num_dropped_records}') + print(f'num_kept_alignments: {num_records - num_dropped_records}') + + with open(args.qnames, 'w') as outf: + for qn in duplicate_record_names: + outf.write(f'{qn}\n') + + +if __name__ == "__main__": + main() diff --git a/docker/lr-bam-pileup/Dockerfile b/docker/lr-bam-pileup/Dockerfile new file mode 100644 index 000000000..071a99c08 --- /dev/null +++ b/docker/lr-bam-pileup/Dockerfile @@ -0,0 +1,3 @@ +FROM us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3 + +COPY convert.2.pileup.sh /opt/ diff --git a/docker/lr-bam-pileup/Makefile b/docker/lr-bam-pileup/Makefile new file mode 100644 index 000000000..f6714a48d --- /dev/null +++ b/docker/lr-bam-pileup/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.1.3 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-bam-pileup:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-bam-pileup:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-bam-pileup/convert.2.pileup.sh b/docker/lr-bam-pileup/convert.2.pileup.sh new file mode 100644 index 000000000..94693e6b5 --- /dev/null +++ b/docker/lr-bam-pileup/convert.2.pileup.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -eu + +input_bam=$1 +ref_fasta=$2 +baq_option=$3 +my_bed=$4 + +prefix=$(echo "${my_bed}" | awk -F '.' 
'{print $1}') + +samtools view -h \ + --region-file "${my_bed}" \ + --write-index \ + -o "${prefix}.bam##idx##${prefix}.bam.bai" \ + "${input_bam}" + +samtools mpileup \ + "${baq_option}" \ + -s \ + -q 1 \ + -f "${ref_fasta}" \ + -o "${prefix}.mpileup" \ + "${prefix}.bam" \ + 2> "${prefix}.mpileup.err" + +rm "${prefix}.bam" "${prefix}.bam.bai" diff --git a/docker/lr-custom-gatk/Dockerfile b/docker/lr-custom-gatk/Dockerfile new file mode 100644 index 000000000..e0809d7a2 --- /dev/null +++ b/docker/lr-custom-gatk/Dockerfile @@ -0,0 +1,52 @@ +# update samtools and bcftools in the GATK docker +# then support both gsutil and gcloud CLIs + +# attempting to install libdeflate-dev with this base will lead to error +# becase it comes with 18.04, which doesn't have the library +FROM us.gcr.io/broad-gatk/gatk:4.4.0.0 + +ARG DEBIAN_FRONTEND=noninteractive +ARG SAMTOOLS_VERSION=1.18 +ARG BCFTOOLS_VERSION=1.18 +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + autoconf \ + automake \ + ca-certificates \ + gnupg \ + libbz2-dev \ + libcurl4-openssl-dev \ + liblzma-dev \ + libncurses5-dev \ + zlib1g-dev \ + bzip2 \ + gcc \ + make \ + wget \ + curl \ + gawk && \ + wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + tar xjf samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + cd samtools-${SAMTOOLS_VERSION} \ + && ./configure --without-curses --enable-libcurl \ + && make -s all all-htslib \ + && make install install-htslib && \ + cd - && rm -rf samtools-${SAMTOOLS_VERSION}* && \ + wget https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + tar xjf bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + cd bcftools-${BCFTOOLS_VERSION} \ + && ./configure --without-curses \ + && make -s \ + && make install && \ + cd - && rm -rf bcftools-${BCFTOOLS_VERSION}* && \ + apt-get -qqy purge autoconf automake bzip2 gcc make wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* && \ + samtools --version diff --git a/docker/lr-custom-gatk/Makefile b/docker/lr-custom-gatk/Makefile new file mode 100644 index 000000000..dd01a21a6 --- /dev/null +++ b/docker/lr-custom-gatk/Makefile @@ -0,0 +1,12 @@ +VERSION = 4.4.0.0-samtools1.18 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-custom-gatk:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-custom-gatk:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-dv/Dockerfile b/docker/lr-dv/Dockerfile index a48bdb2d2..e6773173d 100644 --- a/docker/lr-dv/Dockerfile +++ b/docker/lr-dv/Dockerfile @@ -1,5 +1,6 @@ -FROM google/deepvariant:1.3.0 +FROM gcr.io/deepvariant-docker/deepvariant:1.5.0 COPY vm_local_monitoring_script.sh /opt/ +RUN pip install tree ENTRYPOINT [] diff --git a/docker/lr-dv/Makefile b/docker/lr-dv/Makefile index 859eefdd1..04dc4407d 100644 --- a/docker/lr-dv/Makefile +++ b/docker/lr-dv/Makefile @@ -1,4 +1,4 @@ -VERSION = 1.3.0 +VERSION = 1.5.0 TAG1 = us.gcr.io/broad-dsp-lrma/lr-deepvariant:$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/lr-deepvariant:latest diff --git a/docker/lr-dvgpu/Dockerfile b/docker/lr-dvgpu/Dockerfile new file mode 100644 index 000000000..0868edd06 --- /dev/null +++ b/docker/lr-dvgpu/Dockerfile @@ -0,0 +1,6 @@ +FROM gcr.io/deepvariant-docker/deepvariant_gpu:1.5.0 + +COPY vm_local_monitoring_script.sh /opt/ +RUN pip install tree gpustat + +ENTRYPOINT [] diff --git a/docker/lr-dvgpu/Makefile b/docker/lr-dvgpu/Makefile new file mode 100644 index 000000000..a4550e020 --- /dev/null +++ b/docker/lr-dvgpu/Makefile @@ -0,0 +1,12 @@ +VERSION = 1.5.0-gpu +TAG1 = us.gcr.io/broad-dsp-lrma/lr-deepvariant:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-deepvariant:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-dvgpu/vm_local_monitoring_script.sh b/docker/lr-dvgpu/vm_local_monitoring_script.sh new file mode 100644 index 000000000..b504fca70 --- /dev/null +++ b/docker/lr-dvgpu/vm_local_monitoring_script.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# ADDED NOTE: this script is intended to be localized to your google cloud +# vm and run in the following fashion to get resources usage +# ``` +# export MONITOR_MOUNT_POINT=${your_home_dir_or_cromwell_root} +# bash vm_local_monitoring_script.sh &> resources.log & +# job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') +# ${you_resource_intensive_jobs} +# # we recommend only run the following if manually launched +# kill $job_id +# # if running on a cromwell providisioned VM +# # remember to delocalize the "resources.log" +# ``` + + +# NOTE: this script is intended to be placed in google cloud storage +# and invoked by adding the following line to your cromwell workflow +# options: +# "monitoring_script": "gs://bucket/path/to/cromwell_monitoring_script.sh" +# Upon task completion "monitoring.log" will be added to the appropriate +# cloud storage folder. +set -Eeuo pipefail + +MONITOR_MOUNT_POINT=${MONITOR_MOUNT_POINT:-"/"} +SLEEP_TIME=${SLEEP_TIME:-"10"} + +function getCpuUsage() { + # get the summary cpu statistics (i.e. 
for all cpus) since boot + # get the numeric values in an array, dropping the first field (the + # string, "cpu") + CPU_TIMES=(`sed -n 's/^cpu\s//p' /proc/stat`) + # idle time (in system units) is the 3rd numeric field + IDLE_TIME=${CPU_TIMES[3]} + # total cpu time is sum of all fields + TOTAL_TIME=0 + for T in ${CPU_TIMES[@]}; do + ((TOTAL_TIME += T)) + done + + # get the previous times from temp file + read PREVIOUS_IDLE PREVIOUS_TOTAL < $TEMP_CPU + + # write current times to temp file + echo "$IDLE_TIME $TOTAL_TIME" > $TEMP_CPU + + # get the difference in idle and total times since the previous + # update, and report the usage as: non-idle time as a percentage + # of total time + awk -v IDLE=$((IDLE_TIME-PREVIOUS_IDLE)) \ + -v TOTAL=$((TOTAL_TIME-PREVIOUS_TOTAL)) \ + 'BEGIN { printf "%.1f%%", 100 * (1 - IDLE / TOTAL) }' +} + +function getMem() { + # get desired memory value from /proc/meminfo, in GiB, and also + # as a percentage of total + # argument is the label of the desired memory value + cat /proc/meminfo \ + | awk -v MEM_FIELD="$1" '{ + f[substr($1, 1, length($1)-1)] = $2 + } END { + printf "%.2f GiB", f[MEM_FIELD] / 1048576 + }' +} + +function getMemUnavailable() { + # get unavailable memory from /proc/meminfo, in GiB + cat /proc/meminfo \ + | awk '{ + f[substr($1, 1, length($1)-1)] = $2 + } END { + + if("MemAvailable" in f) { + mem_available = f["MemAvailable"] + } else { + mem_available = f["MemFree"] + f["Buffers"] + f["Cached"] + } + mem_in_use = f["MemTotal"] - mem_available + printf "%.2f GiB %.1f%%", mem_in_use / 1048576, 100 * mem_in_use / f["MemTotal"] + }' +} + +# old version using "free -m" are kept in case a container somehow has +# weird values in /proc/meminfo +function getMem_with_free() { + # get memory info from "free" command. Convert to float in GB. + # First argument is desired row of output table. + # Second argument is desired column. + MEM_ROW=$(echo "$1" | awk '{print tolower($1)}') + MEM_COLUMN=$(echo "$2" | awk '{print tolower($1)}') + free -m | awk -v MEM_ROW=$MEM_ROW -v MEM_COLUMN=$MEM_COLUMN \ + 'NR=1 { + for(i=1; i<=NF; i++) { f[tolower($i)]=NF+1-i } + } + { + regex="^"MEM_ROW + if(tolower($1) ~ regex) { + print $(NF+1-f[MEM_COLUMN])/1024 " GiB" + } + }' +} + +# old version using "free -m" are kept in case a container somehow has +# weird values in /proc/meminfo +function getMemUnavailable_using_free() { + # get memory that is in active use (not just cached) from "free" + # command. Convert to float in GiB, followed by percent of total. + # NOTE: weird computation with awk due to variety of output from + # free on different systems. Rows and columns differ, and on some + # systems the desired quantity is used "used" memory, on most it's + # "used" - "buffers" - "cached". If "buffers" and "cached" don't + # exist, then awk will subtract 0 so the correct result is returned. + free -m \ + | awk '\ + NR=1 { + for(i=1; i<=NF; i++) { f[tolower($i)]=NF+1-i } + } + { + if(tolower($1) ~ "^mem") { + IN_USE=($(NF+1-f["used"]) - $(NF+1-f["buffers"]) - $(NF+1-f["cached"])) + printf "%.3f GiB %.1f%%", IN_USE/1024, 100*IN_USE/$(NF+1-f["total"]) + } + }' +} + + +function getDisk() { + # get information about disk usage from "df" command. 
+ DISK_COLUMN=$(echo "$1" | awk '{print tolower($1)}') + MOUNT_POINT=$2 + # extract desired value + VALUE=$(\ + df -h "$MOUNT_POINT" \ + | sed 's/Mounted on/Mounted-on/' \ + | awk -v DISK_COLUMN=$DISK_COLUMN ' + FNR==1 { + NF_HEADER=NF + for(i=1; i<=NF; i++) { f[tolower($i)]=NF-i } + } + FNR>1 { + FIELD_NUM=NF-f[DISK_COLUMN] + if(FIELD_NUM > 0) { + VALUE=$(FIELD_NUM) + print VALUE + } else if(f[DISK_COLUMN] == NF_HEADER-1 && NF == 1) { + VALUE=$(1) + print VALUE + } + }' \ + ) + # If value is a number follwed by letters, it is a value with units + # and needs to be converted. Otherwise just print value + if [[ "$VALUE" =~ [0-9.]+[A-z]+ ]]; then + echo "$VALUE"\ + | sed -E 's/([0-9.]*)([^0-9.]*)/\1 \2/' \ + | awk '{ + UNIT=substr($2, 1, 1) + if(UNIT == "T") { + SCALE=2^10 + } else if(UNIT == "G") { + SCALE=1 + } else if(UNIT == "M") { + SCALE=2^-10 + } else if(UNIT == "K") { + SCALE=2^-20 + } else if(UNIT == "B") { + SCALE=2^-30 + } else { + SCALE=1 + } + printf "%.3f GiB", $1 * SCALE + }' + else + echo "$VALUE" + fi +} + +function findBlockDevice() { + MOUNT_POINT=$1 + FILESYSTEM=$(grep -E "$MOUNT_POINT\s" /proc/self/mounts \ + | awk '{print $1}') + DEVICE_NAME=$(basename "$FILESYSTEM") + FS_IN_BLOCK=$(find -L /sys/block/ -mindepth 2 -maxdepth 2 -type d \ + -name "$DEVICE_NAME") + if [ -n "$FS_IN_BLOCK" ]; then + # found path to the filesystem in the block devices. get the + # block device as the parent dir + dirname "$FS_IN_BLOCK" + elif [ -d "/sys/block/$DEVICE_NAME" ]; then + # the device is itself a block device + echo "/sys/block/$DEVICE_NAME" + else + # couldn't find, possibly mounted by mapper. + # look for block device that is just the name of the symlinked + # original file. if not found, echo empty string (no device found) + BLOCK_DEVICE=$(ls -l "$FILESYSTEM" 2>/dev/null \ + | cut -d'>' -f2 \ + | xargs basename 2>/dev/null \ + || echo) + if [[ -z "$BLOCK_DEVICE" ]]; then + 1>&2 echo "Unable to find block device for filesystem $FILESYSTEM." + if [[ -d /sys/block/sdb ]] && ! grep -qE "^/dev/sdb" /etc/mtab; then + 1>&2 echo "Guessing present but unused sdb is the correct block device." + echo "/sys/block/sdb" + else + 1>&2 echo "Disk IO will not be monitored." 
+ fi + fi + fi +} + +function handle_integer_wrap() { + if [ $1 -ge 0 ]; then + echo $1 + else + WRAPPED=$1 + echo "$((WRAPPED + 2**30))" + fi +} + + + +function getBlockDeviceIO() { + # get read and write IO rate by looking at appropriate block device + STAT_FILE="$1" + if [[ -f "$STAT_FILE" ]]; then + # get IO stats as comma-separated list to extract 3rd and 7th fields + STATS=$(sed -E 's/[[:space:]]+/,/g' $STAT_FILE | sed -E 's/^,//'\ + | cut -d, -f3,7 | sed -E 's/,/ /g') + # get results of previous poll + read OLD_READ OLD_WRITE < $TEMP_IO + # save new poll results + read READ_SECTORS WRITE_SECTORS <<<$STATS + echo "$READ_SECTORS $WRITE_SECTORS" > $TEMP_IO + # update read and write sectors as difference since previous poll + READ_SECTORS=$(handle_integer_wrap $((READ_SECTORS - OLD_READ))) + WRITE_SECTORS=$(handle_integer_wrap $((WRITE_SECTORS - OLD_WRITE))) + + # output change in read/write sectors in kiB/s + echo "$READ_SECTORS $WRITE_SECTORS" \ + | awk -v T=$SLEEP_TIME -v B=$SECTOR_BYTES \ + '{ printf "%.3f MiB/s %.3f MiB/s", $1*B/T/1048576, $2*B/T/1048576 }' + else + echo "N/A MiB/s N/A MiB/s" + fi +} + + +function runtimeInfo() { + echo " [$(date)]" + echo \* CPU usage: $(getCpuUsage) + echo \* Memory usage: $(getMemUnavailable) + echo \* Disk usage: $(getDisk Used $MONITOR_MOUNT_POINT) $(getDisk Use% $MONITOR_MOUNT_POINT) + echo \* Read/Write IO: $(getBlockDeviceIO "$BLOCK_DEVICE_STAT_FILE") +} + +# print out header info +echo ================================== +echo =========== MONITORING =========== +echo ================================== +echo --- General Information --- +echo \#CPU: $(nproc) +echo Total Memory: $(getMem MemTotal) +echo Total Disk space: $(getDisk Size "$MONITOR_MOUNT_POINT") +echo +echo --- Runtime Information --- + + +# make a temp file to store io information, remove it on exit +TEMP_IO=$(mktemp "${TMPDIR:-/tmp/}$(basename $0).XXXXXXXXXXXX") +# make a temp file to store cpu information, remove it on exit +# remove temp files on exit +TEMP_CPU=$(mktemp "${TMPDIR:-/tmp/}$(basename $0).XXXXXXXXXXXX") +trap "rm -f $TEMP_IO $TEMP_CPU" EXIT + + +# find the block device +BLOCK_DEVICE=$(findBlockDevice "$MONITOR_MOUNT_POINT") +if [[ -z "$BLOCK_DEVICE" ]] \ + || [[ ! 
-f "$BLOCK_DEVICE/queue/hw_sector_size" ]]; then + # no block device found, can't get IO info + SECTOR_BYTES=0 + BLOCK_DEVICE_STAT_FILE="" +else + SECTOR_BYTES=$(cat "$BLOCK_DEVICE/queue/hw_sector_size") + BLOCK_DEVICE_STAT_FILE="$BLOCK_DEVICE/stat" +fi + + +# since getBlockDeviceIO looks at differences in stat file, run the +# update so the first reported update has a sensible previous result to +# compare to +echo "0 0" > $TEMP_IO +getBlockDeviceIO "$BLOCK_DEVICE_STAT_FILE" > /dev/null + +# same thing for getCpuUsage +echo "0 0" > $TEMP_CPU +getCpuUsage > /dev/null + + +while true; do + runtimeInfo + sleep $SLEEP_TIME +done diff --git a/docker/lr-dyst-peaker/Dockerfile b/docker/lr-dyst-peaker/Dockerfile new file mode 100644 index 000000000..d5bd38b34 --- /dev/null +++ b/docker/lr-dyst-peaker/Dockerfile @@ -0,0 +1,24 @@ +FROM ubuntu:20.04 + +# possibly install vim for debugging +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + software-properties-common \ + python3 \ + python3-pip && \ + pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir dyst pandas==1.4.3 scipy==1.10.1 + +# some custom work to make dyst working in docker +COPY modified.dyst /usr/local/bin/dyst +COPY find_peaks.py /opt/ +COPY reverse_yield.py /opt/ +COPY measure_g1_skew.py /opt/ + +# # test installation +# COPY lengths.txt /tmp/ +# RUN dyst -n -b 100 -i /tmp/lengths.txt > /tmp/test.hist && \ +# grep -v "^#" /tmp/test.hist | awk -F ':' '{print $1}' > /tmp/test.plain.hist && \ +# python3 /opt/find_peaks.py -i /tmp/test.plain.hist -o /tmp/test.peaks.txt diff --git a/docker/lr-dyst-peaker/Makefile b/docker/lr-dyst-peaker/Makefile new file mode 100644 index 000000000..9dd87b0ef --- /dev/null +++ b/docker/lr-dyst-peaker/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.0.2 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-dyst-peaker/find_peaks.py b/docker/lr-dyst-peaker/find_peaks.py new file mode 100644 index 000000000..30b2eb427 --- /dev/null +++ b/docker/lr-dyst-peaker/find_peaks.py @@ -0,0 +1,153 @@ +import argparse +from typing import List +import pandas as pd +import numpy as np + + +def bin_and_freq(dyst_list:str) -> pd.Series: + try: + arr = dyst_list.split('-') + beg = arr[0].strip() + brr = arr[1].split('[') + end = brr[0].strip() + freq = brr[1].replace(']', '').strip() + return pd.Series([float(beg), float(end), int(freq)], index = ['beg', 'end', 'freq']) + except: + print(dyst_list) + raise + + +def construct_dataframe(prepped_dyst_output:str) -> pd.DataFrame: + """ + Converts a prepped dyst output format into a dataframe with three columns, + [left boundary of a bin, right boundary of a bin, frequency of the bin] + :param prepped_dyst_output: + :return: + """ + with open(prepped_dyst_output, 'r') as inf: + lines = [l.strip('\n') for l in inf.readlines()] + df = pd.DataFrame([bin_and_freq(l) for l in lines if l]) + df['freq'] = df['freq'].astype(int) + return df + + +def find_blocks_in_sequence(indices: List[int]) -> List[List[int]]: + """ + Given a list of indices into a dataframe, produce blocks of contiguous indices. 
+ :param indices: + :return: + """ + assert all([i>=0 for i in indices]), "input value must be all non-negative" + blocks = list() + indices_shift_1 = indices[1:] + indices_shift_1.append(indices[-1]+1) + i = 0 + d = 1 + for a, b in zip(indices, indices_shift_1): + if b-a!=1: + blocks.append(indices[i:(i+d)]) + i+=d + d=0 + d+=1 + blocks.append(indices[i:(i+d)]) + return blocks + + +def heuristic_trough_to_peak_ratio(formatted_df:pd.DataFrame, ratio:float) -> List[int]: + """ + The heuristic part of this algorithm. + + It works by finding the bins whose frequency is higher than _ratio_ * the globally highest frequency. + This ratio is a heuristic number. + :param formatted_df: + :param ratio: + :return: + """ + sequence = list(formatted_df.loc[formatted_df['freq'] > round(max(formatted_df['freq'])*ratio), :].index) + return sequence + + +def find_peaks_indices(formatted_df:pd.DataFrame, blocks_of_contiguous_indices:List[List[int]]) -> list: + """ + Given the blocks of indices, where each block is contiguous within themselves, + locate the index into the dataframe where the peak sits. + :param formatted_df: + :param blocks_of_contiguous_indices: + :return: + """ + indices_of_peaks = list() + for bl in blocks_of_contiguous_indices: + tdf = formatted_df.iloc[bl, :] + idx = tdf['freq'].idxmax() + indices_of_peaks.append(idx) + return indices_of_peaks + + +def bootstrap_peak_indices(formatted_df:pd.DataFrame, heuristic_ratio_lower_bound:float, heuristic_ratio_upper_bound:float): + + assert 0 < heuristic_ratio_lower_bound < 1, \ + f"lower bound ({heuristic_ratio_lower_bound}) must be between 0 and 1." + assert 0 < heuristic_ratio_upper_bound < 1, \ + f"lower bound ({heuristic_ratio_upper_bound}) must be between 0 and 1." + assert heuristic_ratio_lower_bound < heuristic_ratio_upper_bound, \ + f"lower bound ({heuristic_ratio_lower_bound}) must be lower than upper bound ({heuristic_ratio_upper_bound})." 
+ + result = list() + # bootstrap + for f in np.arange(heuristic_ratio_lower_bound, heuristic_ratio_upper_bound, 0.1): + sequence = heuristic_trough_to_peak_ratio(formatted_df, f) + blocks = find_blocks_in_sequence(sequence) + peaks = find_peaks_indices(formatted_df, blocks) + print(peaks) + result.extend(peaks) + return sorted(list(set(result))) + + +def qc_check_peak_indices(formatted_df:pd.DataFrame, bootstrapped_peaks: list): + assert 0 < len(bootstrapped_peaks), "Please call me only when you've found peaks" + + if 1 == len(bootstrapped_peaks): + return bootstrapped_peaks + + bootstrapped_peaks_shifted_right = bootstrapped_peaks[1:] + bootstrapped_peaks_drop_last = bootstrapped_peaks[:-1] + + qc_pass_list = list() + for a, b in zip(bootstrapped_peaks_drop_last, bootstrapped_peaks_shifted_right): + x = formatted_df.iloc[a, :]['freq'] + y = formatted_df.iloc[b, :]['freq'] + m = min(x, y) + if any([f < m for f in formatted_df.iloc[a+1:b, :]['freq']]): + qc_pass_list.extend([a, b]) + return sorted(list(set(qc_pass_list))) + + +def locate_peak_bin_value(formatted_df:pd.DataFrame, peak_indices:List[int]) -> list: + peaks = list() + for idx in peak_indices: + peaks.append(round((formatted_df.iloc[idx, :]['beg'] + formatted_df.iloc[idx, :]['end'])/2)) + return peaks + + +###################################################################### +def main(): + parser = argparse.ArgumentParser(description='Find the first few peaks in a (prepped) dyst histogram output', + prog='find_peaks') + parser.add_argument('-i', '--input', type=str, help="prepped dyst histogram output (without the bars)") + parser.add_argument('-o', '--output', type=str, help="path to output peak values (flat file)") + args = parser.parse_args() + + prepped_dyst_output = args.input + peaks_output = args.output + df = construct_dataframe(prepped_dyst_output) + peak_indices = bootstrap_peak_indices(df, 0.2, 0.8) + pass_qc_peak_indices = qc_check_peak_indices(df, peak_indices) + peak_values = locate_peak_bin_value(df, pass_qc_peak_indices) + with open(peaks_output, 'w') as outf: + [outf.write(f"{p}\n") for p in peak_values] + + +###################################################################### +if __name__ == "__main__": + main() + diff --git a/docker/lr-dyst-peaker/measure_g1_skew.py b/docker/lr-dyst-peaker/measure_g1_skew.py new file mode 100644 index 000000000..3225c3fc4 --- /dev/null +++ b/docker/lr-dyst-peaker/measure_g1_skew.py @@ -0,0 +1,22 @@ +import numpy as np +import argparse +from scipy.stats import skew + +###################################################################### +def main(): + parser = argparse.ArgumentParser(description='Given a text file holding read lengths of all sequences, print G1 skewness.', + prog='measure_g1_skew') + parser.add_argument('-i', '--input', type=str, help="a text file holding read lengths of all sequences") + parser.add_argument('-o', '--output', type=str, help="path to output file") + args = parser.parse_args() + + with open(args.input) as inf: + read_lengths = [int(line.strip()) for line in inf] + + with open(args.output, 'w') as outf: + outf.write(f'{skew(read_lengths):.2f}\n') + + +###################################################################### +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker/lr-dyst-peaker/modified.dyst b/docker/lr-dyst-peaker/modified.dyst new file mode 100755 index 000000000..7177ccae2 --- /dev/null +++ b/docker/lr-dyst-peaker/modified.dyst @@ -0,0 +1,458 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +from 
collections import OrderedDict +from itertools import * +from math import sqrt +import argparse +import array +import os, stat +from select import select +import stat +import sys +import termios +import time +import tty +import atexit + +unicodes = [ + "▏", "▎", "▍", "▌", "▋", "▊", "▉" +] + +if sys.version_info[0] > 2: + raw_input = input +else: + unicodes=[i.decode('utf-8') for i in unicodes] + + +def restoretty(): + termios.tcsetattr(KEYBOARD, termios.TCSADRAIN, old_settings) + + +ch = os.fstat(1).st_mode & stat.S_IFCHR +reg = os.fstat(1).st_mode & stat.S_IFREG + +PRINTED = 0 + + +def goUp(n): + sys.stderr.write(u"\u001b[" + str(n) + "A") + sys.stderr.flush() + + +def cleanLine(n): + sys.stderr.write(u"\u001b[" + str(n) + "A") + for i in range(n): + sys.stderr.write(u"\u001b[2K\r\n") + sys.stderr.flush() + + +class histogram: + def __init__(self, + inibins=10, + pct=False, + width=80, + noprogress=False, + lowlim=None, + maxlim=None): + + self.bins = [] + self.nbins = inibins + while inibins: + self.bins.append(array.array('d', [])) + inibins -= 1 + + self.width = width + + self.LAST = time.time() + + self.others = array.array('d', []) + self.outlimits = array.array('d', []) + + self.min = -999999999 + self.max = 999999999 + self.calcSD = False + self.pct = pct + + self.lowlimit = lowlim + self.highlimit = maxlim + + if self.lowlimit != None: self.lowlimit = float(self.lowlimit) + if self.highlimit != None: self.highlimit = float(self.highlimit) + + self.bs = ((self.max - self.min) / self.nbins) + + self.sum = 0 + + self.ln = 0 + + self.progress = not noprogress + + def __append(self, v): + if self.lowlimit != None and v < self.lowlimit: + self.outlimits.append(v) + return + elif self.highlimit != None and v > self.highlimit: + self.outlimits.append(v) + return + + self.sum += v + if v < self.min or v > self.max: + self.others.append(v) + if len(self.others) > 1000: + self.restruct1() + return + + bn = int((v - self.min) / self.bs) + 1 + if v == self.max: bn -= 1 + + self.bins[bn - 1].append(v) + + self.ln += 1 + + def append(self, v): + d, _, _ = select([KEYBOARD], [], [], 0.0) + if d: + ch = os.read(KEYBOARD, 1) + if hasattr(ch,'decode'): + ch=ch.decode() + if ch.lower() == 'q': + return True + elif ch.lower() == 'p': + self.pct = not self.pct + elif ch.lower() == 'a': + self.nbins = max(5, self.nbins - 2) + self.restruct1() + elif ch.lower() == 's': + self.nbins = self.nbins + 2 + self.restruct1() + elif ch.lower() == 'x': + if self.lowlimit == None: self.lowlimit = self.min + self.lowlimit += (self.max - self.min) / 10 + self.restruct1() + elif ch.lower() == 'z': + if self.lowlimit == None: self.lowlimit = self.min + self.lowlimit -= (self.max - self.min) / 10 + self.restruct1() + elif ch.lower() == 'm': + if self.highlimit == None: self.highlimit = self.max + self.highlimit += (self.max - self.min) / 10 + self.restruct1() + elif ch.lower() == 'n': + if self.highlimit == None: self.highlimit = self.max + self.highlimit -= (self.max - self.min) / 10 + self.restruct1() + elif ch.lower() == 'v': + self.highlimit = self.lowlimit = None + self.restruct1() + elif ch.lower() == 'i': + self.width -= 10 + elif ch.lower() == 'o': + self.width += 10 + + self.__append(v) + + if self.ln == 100: + self.restruct1() + + n = time.time() + + if self.progress and (not self.ln % 10000 or n - self.LAST > .1): + self.LAST = n + self.printer() + + def restruct1(self): + vs = [] + + self.bins.append(self.others) + self.bins.append(self.outlimits) + + for i in self.bins: + try: + vs.append(min(i)) + except: 
+ pass + try: + vs.append(max(i)) + except: + pass + + self.min = min(vs) + if self.lowlimit: self.min = self.lowlimit + self.max = max(vs) + if self.highlimit: self.max = self.highlimit + self.bs = ((self.max - self.min) / self.nbins) + + oldbins = self.bins + + self.bins = [] + nbins = self.nbins + while nbins: + self.bins.append(array.array('d', [])) + nbins -= 1 + self.others = array.array('d', []) + self.outlimits = array.array('d', []) + self.sum = 0 + + self.ln = 0 + + for v in chain(*oldbins): + self.__append(v) + + def stringify(self, writable): + avg = float(self.sum / self.ln) + res = "# TOTAL: {} - MEAN: {:.3f}\r\n# MIN: {} - MAX: {}\r\n".format( + self.ln, avg, self.min, self.max) + if writable: + CHAR = '#' + else: + CHAR = u'\u2589' + + if self.calcSD: + variance = 0 + for i in chain(self.others, *self.bins): + i -= avg + variance += i * i + variance = variance / self.ln + + res += "# SD: {} - Variance {}\r\n".format( + sqrt(variance), variance) + + len_max = len(str(int(self.max))) + 5 + len_min = len(str(int(self.min))) + 5 + len_n = str(max(len_max, len_min)) + len_counts = str(max(map(lambda x: len(str(len(x))), self.bins))) + + len_bar = self.width - (int(len_n) * 2 + 8 + int(len_counts)) + + longestbar = max([len(i) for i in self.bins]) + if not longestbar: return "" + factor = float(len_bar) / longestbar + + start = self.min + for i in range(self.nbins): + bar = CHAR * int(factor * len(self.bins[i])) + if not writable: + extra = factor * len(self.bins[i]) + extra = int(((extra - int(extra)) * 100) / 16.67) + bar += unicodes[extra] + if self.pct: + bar += ' ({:.2f}%)'.format( + len(self.bins[i]) / float(self.ln) * 100) + res += ( + "{:" + len_n + ".4f} - {:" + len_n + ".4f} [{:" + + len_counts + "}]: ").format(start, start + self.bs, + len(self.bins[i])) + bar + '\r\n' + start += self.bs + + return res + + def printer(self): + global PRINTED + s = self.stringify(False) + if PRINTED: + cleanLine(PRINTED) + goUp(PRINTED) + PRINTED = s.count('\r\n') + sys.stderr.write(s) + sys.stderr.flush() + + +class wordHist: + def __init__(self, pct=False, width=80, noprogress=False): + self.words = OrderedDict() + self.ln = 0 + self.pct = pct + self.LAST = time.time() + self.width = width + self.progress = not noprogress + + def append(self, w): + d, _, _ = select([KEYBOARD], [], [], 0.0) + if d: + ch = os.read(KEYBOARD, 1) + if hasattr(ch,'decode'): + ch=ch.decode() + if ch.lower() == 'q': + return True + elif ch.lower() == 'p': + self.pct = not self.pct + elif ch.lower() == 'i': + self.width -= 10 + elif ch.lower() == 'o': + self.width += 10 + + if w not in self.words: + self.words[w] = 1 + else: + self.words[w] += 1 + self.ln += 1 + + n = time.time() + + if self.progress and (not self.ln % 10000 or n - self.LAST > .1): + self.LAST = n + self.printer() + + def printer(self): + global PRINTED + s = self.stringify(False) + if PRINTED: + cleanLine(PRINTED ) + goUp(PRINTED ) + PRINTED = s.count('\r\n') + sys.stderr.write(s) + sys.stderr.flush() + + def stringify(self, writable): + res = "# TOTAL: {}\r\n".format(self.ln) + if writable: + CHAR = '#' + else: + CHAR = u'\u2589' + + len_n = str(max([len(i) for i in self.words])) + len_counts = str(max(map(lambda x: len(str(x)), self.words.values()))) + + len_bar = self.width - (int(len_n) + 8 + int(len_counts)) + + longestbar = max(self.words.values()) + if not longestbar: return "" + factor = float(len_bar) / longestbar + + for wd, cnt in self.words.items(): + bar = CHAR * int(factor * cnt) + if not writable: + extra = factor * cnt + extra 
= int(((extra - int(extra)) * 100) / 16.67) + bar += unicodes[extra] + if self.pct: + bar += ' ({:.2f}%)'.format(cnt / float(self.ln) * 100) + res += ("{:" + len_n + "s} [{:" + len_counts + "}]: ").format( + wd, cnt) + bar + '\r\n' + + return res + + def restruct1(self): + pass + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Text mode Histogram software') + + parser.add_argument( + '-w', + dest='words', + action='store_true', + help='process words instead of numbers') + + parser.add_argument( + '-b', + dest='bins', + type=int, + default=10, + help='Bins shows in the histogram (numeric mode only) [10]') + + parser.add_argument( + '-c', + dest='columns', + type=int, + default=80, + help='Histogram width in columns [80]') + + parser.add_argument( + '-p', + dest='percentage', + action='store_true', + help='Show percentage for each bar') + + parser.add_argument( + '-n', + dest='noprogress', + action='store_true', + help='Don\'t show progress only final result') + + parser.add_argument( + '-m', dest='min', help='Minimum value to show in the histogram ') + + parser.add_argument( + '-x', dest='max', help='Maximum value to show in the histogram') + + parser.add_argument( + '-i', dest='inputfile', help='Input file containing data') + # parser.add_argument( + # 'inputfile', help="Input file containing data", nargs='?') + + args = parser.parse_args() + + mode = os.fstat(0).st_mode + # usually this should be stat.S_ISFIFO(mode), but in docker containers that always evalues to True + # so we decide to forgo piping + hack_is_fifo = False + if args.inputfile == None and not hack_is_fifo and not stat.S_ISREG( + mode): + parser.print_help() + sys.exit(1) + elif args.inputfile and hack_is_fifo or stat.S_ISREG(mode): + print("You must specify a file or input via stdin!, NOT BOTH") + sys.exit(1) + + if args.inputfile: + KEYBOARD = sys.stdin.fileno() + sys.stdin = open(args.inputfile) + + if not args.noprogress: + if hack_is_fifo or stat.S_ISREG(mode): + KEYBOARD = os.open('/dev/tty', os.O_RDONLY) + + # old_settings = termios.tcgetattr(KEYBOARD) + # tty.setraw(KEYBOARD) + # atexit.register(restoretty) + sys.stderr.write( + "# [q]uit | +/- bins [a,s] | percentages [p] | +/- width [i,o]\r\n# +/- minimum [z,x] | reset [v] | +/- maximum [n,m]\r\n\r\n" + ) + else: + if hack_is_fifo or stat.S_ISREG(mode): + KEYBOARD = os.open('/dev/tty', os.O_RDONLY) + + try: + if args.words: + h = wordHist(args.percentage, args.columns, args.noprogress) + while True: + try: + i = raw_input().strip() + except: + break + if h.append(i): break + else: + h = histogram( + args.bins, + args.percentage, + args.columns, + args.noprogress, + lowlim=args.min, + maxlim=args.max) + while True: + try: + i = raw_input() + except: + break + + try: + if h.append(float(i)): break + except: + pass + except KeyboardInterrupt: + pass + + h.calcSD = True + h.restruct1() + + if PRINTED: + cleanLine(PRINTED) + goUp(PRINTED) + if (not ch and not reg) or (not ch and reg): + print(h.stringify(True)) + else: + print(h.stringify(False)) diff --git a/docker/lr-dyst-peaker/reverse_yield.py b/docker/lr-dyst-peaker/reverse_yield.py new file mode 100644 index 000000000..90f624fd6 --- /dev/null +++ b/docker/lr-dyst-peaker/reverse_yield.py @@ -0,0 +1,25 @@ +import numpy as np +import argparse + + +###################################################################### +def main(): + parser = argparse.ArgumentParser(description='Given a text file holding read lengths of all sequences, print lengths at which a certain fraction of 
reads are shorter than. The fraction bins are 10% to 90% with 10% increments.', + prog='reverse_yield') + parser.add_argument('-i', '--input', type=str, help="a text file holding read lengths of all sequences") + parser.add_argument('-o', '--output', type=str, help="path to file holding length-9 reverse yield array (flat file)") + args = parser.parse_args() + + with open(args.input) as inf: + read_lengths = [int(line.strip()) for line in inf] + + sorted_read_lengths = sorted(read_lengths) + + with open(args.output, 'w') as outf: + for frac in np.arange(0.1, 1, 0.1): + outf.write( f"{sorted_read_lengths[round(frac*len(sorted_read_lengths))]}\n" ) + + +###################################################################### +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker/lr-dyst-peaker/test.py b/docker/lr-dyst-peaker/test.py new file mode 100644 index 000000000..084e2c715 --- /dev/null +++ b/docker/lr-dyst-peaker/test.py @@ -0,0 +1,8 @@ +import stat +import os + +# if on the host, both will print False +# but if in the container, the first will print True +mode = os.fstat(0).st_mode +print(stat.S_ISFIFO(mode)) +print(stat.S_ISREG(mode)) diff --git a/docker/lr-gcloud-samtools/Dockerfile b/docker/lr-gcloud-samtools/Dockerfile new file mode 100644 index 000000000..1451d24a3 --- /dev/null +++ b/docker/lr-gcloud-samtools/Dockerfile @@ -0,0 +1,101 @@ +# incorporate samtools and bcftools, plus some additional but most common genomics tools +# then support both gsutil and gcloud CLIs + +############### stage 0: build samtools and bcftools from source +FROM ubuntu:20.04 + +ARG DEBIAN_FRONTEND=noninteractive +ARG SAMTOOLS_VERSION=1.18 +ARG BCFTOOLS_VERSION=1.18 +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + ca-certificates \ + libbz2-dev \ + libcurl4-openssl-dev \ + libdeflate-dev \ + liblzma-dev \ + libncurses5-dev \ + autoconf \ + automake \ + bzip2 \ + gcc \ + make \ + wget \ + zlib1g-dev && \ + wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + tar xjf samtools-${SAMTOOLS_VERSION}.tar.bz2 && \ + cd samtools-${SAMTOOLS_VERSION} \ + && ./configure --without-curses --enable-libcurl --with-libdeflate \ + && make -s all all-htslib \ + && make install install-htslib && \ + cd - && rm -rf samtools-${SAMTOOLS_VERSION}* && \ + wget https://github.com/samtools/bcftools/releases/download/${BCFTOOLS_VERSION}/bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + tar xjf bcftools-${BCFTOOLS_VERSION}.tar.bz2 && \ + cd bcftools-${BCFTOOLS_VERSION} \ + && ./configure --without-curses \ + && make -s \ + && make install && \ + cd - && rm -rf bcftools-${BCFTOOLS_VERSION}* && \ + apt-get -qqy purge autoconf automake bzip2 gcc make wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* && \ + samtools --help && \ + bcftools --help + +############### stage 1: other commonly used bioinformatics utilities +FROM ubuntu:20.04 + +ENV HOME=/root + +# Define default command. 
+CMD ["bash"] + +# copy from previous stage the binaries from samtools build +COPY --from=0 /usr/local/bin/* /usr/local/bin/ + +#### Basic utilities +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + curl \ + wget \ + bc \ + bedtools \ + datamash \ + gawk \ + less \ + pigz \ + tabix \ + tree \ + vcftools \ + zlib1g-dev && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* && \ + samtools --version && \ + bcftools --version + + +COPY re-auth.sh /opt/ diff --git a/docker/lr-gcloud-samtools/Makefile b/docker/lr-gcloud-samtools/Makefile new file mode 100644 index 000000000..695f871f4 --- /dev/null +++ b/docker/lr-gcloud-samtools/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.1.3 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-gcloud-samtools/re-auth.sh b/docker/lr-gcloud-samtools/re-auth.sh new file mode 100644 index 000000000..a35df618d --- /dev/null +++ b/docker/lr-gcloud-samtools/re-auth.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -eu + +export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + diff --git a/docker/lr-hifiasm/Dockerfile b/docker/lr-hifiasm/Dockerfile index 2c354e380..0b0f38933 100644 --- a/docker/lr-hifiasm/Dockerfile +++ b/docker/lr-hifiasm/Dockerfile @@ -1,12 +1,12 @@ ############### stage 0: build samtools and bcftools from source -FROM ubuntu:18.04 +FROM ubuntu:20.04 -ARG SAMTOOLS_VERSION=1.15.1 -ARG BCFTOOLS_VERSION=1.15.1 +ARG SAMTOOLS_VERSION=1.17 +ARG BCFTOOLS_VERSION=1.17 ARG YAK_VERSION=0.1 -ARG HIFIASM_VERSION=0.16.1 ARG SEQTK_VERSION=1.3 +ARG HIFIASM_VERSION=0.19.5 ARG DEBIAN_FRONTEND=noninteractive RUN apt-get -qqy update --fix-missing && \ @@ -63,7 +63,7 @@ RUN apt-get -qqy purge autoconf automake bzip2 gcc g++ make wget && \ ############### stage 1: copy over bin, and build necessary lib -FROM ubuntu:18.04 +FROM ubuntu:20.04 ENV HOME=/root @@ -94,3 +94,5 @@ RUN apt-get -qqy update --fix-missing && \ /usr/share/man/?? 
\ /usr/share/man/??_* && \ samtools --help + +COPY vm_local_monitoring_script.sh /opt/ diff --git a/docker/lr-hifiasm/Makefile b/docker/lr-hifiasm/Makefile index ec6140f39..9f86fabee 100644 --- a/docker/lr-hifiasm/Makefile +++ b/docker/lr-hifiasm/Makefile @@ -1,5 +1,5 @@ IMAGE_NAME = lr-hifiasm -VERSION = 0.16.1 +VERSION = 0.19.5 TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest @@ -7,11 +7,11 @@ TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest all: | build push build: - docker build -t $(TAG1) -t $(TAG2) . + docker build -t $(TAG1) -t $(TAG2) . build_no_cache: - docker build --no-cache -t $(TAG1) -t $(TAG2) . + docker build --no-cache -t $(TAG1) -t $(TAG2) . push: - docker push $(TAG1) - docker push $(TAG2) + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-hifiasm/vm_local_monitoring_script.sh b/docker/lr-hifiasm/vm_local_monitoring_script.sh new file mode 100644 index 000000000..b504fca70 --- /dev/null +++ b/docker/lr-hifiasm/vm_local_monitoring_script.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# ADDED NOTE: this script is intended to be localized to your google cloud +# vm and run in the following fashion to get resources usage +# ``` +# export MONITOR_MOUNT_POINT=${your_home_dir_or_cromwell_root} +# bash vm_local_monitoring_script.sh &> resources.log & +# job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') +# ${you_resource_intensive_jobs} +# # we recommend only run the following if manually launched +# kill $job_id +# # if running on a cromwell providisioned VM +# # remember to delocalize the "resources.log" +# ``` + + +# NOTE: this script is intended to be placed in google cloud storage +# and invoked by adding the following line to your cromwell workflow +# options: +# "monitoring_script": "gs://bucket/path/to/cromwell_monitoring_script.sh" +# Upon task completion "monitoring.log" will be added to the appropriate +# cloud storage folder. +set -Eeuo pipefail + +MONITOR_MOUNT_POINT=${MONITOR_MOUNT_POINT:-"/"} +SLEEP_TIME=${SLEEP_TIME:-"10"} + +function getCpuUsage() { + # get the summary cpu statistics (i.e. 
for all cpus) since boot + # get the numeric values in an array, dropping the first field (the + # string, "cpu") + CPU_TIMES=(`sed -n 's/^cpu\s//p' /proc/stat`) + # idle time (in system units) is the 3rd numeric field + IDLE_TIME=${CPU_TIMES[3]} + # total cpu time is sum of all fields + TOTAL_TIME=0 + for T in ${CPU_TIMES[@]}; do + ((TOTAL_TIME += T)) + done + + # get the previous times from temp file + read PREVIOUS_IDLE PREVIOUS_TOTAL < $TEMP_CPU + + # write current times to temp file + echo "$IDLE_TIME $TOTAL_TIME" > $TEMP_CPU + + # get the difference in idle and total times since the previous + # update, and report the usage as: non-idle time as a percentage + # of total time + awk -v IDLE=$((IDLE_TIME-PREVIOUS_IDLE)) \ + -v TOTAL=$((TOTAL_TIME-PREVIOUS_TOTAL)) \ + 'BEGIN { printf "%.1f%%", 100 * (1 - IDLE / TOTAL) }' +} + +function getMem() { + # get desired memory value from /proc/meminfo, in GiB, and also + # as a percentage of total + # argument is the label of the desired memory value + cat /proc/meminfo \ + | awk -v MEM_FIELD="$1" '{ + f[substr($1, 1, length($1)-1)] = $2 + } END { + printf "%.2f GiB", f[MEM_FIELD] / 1048576 + }' +} + +function getMemUnavailable() { + # get unavailable memory from /proc/meminfo, in GiB + cat /proc/meminfo \ + | awk '{ + f[substr($1, 1, length($1)-1)] = $2 + } END { + + if("MemAvailable" in f) { + mem_available = f["MemAvailable"] + } else { + mem_available = f["MemFree"] + f["Buffers"] + f["Cached"] + } + mem_in_use = f["MemTotal"] - mem_available + printf "%.2f GiB %.1f%%", mem_in_use / 1048576, 100 * mem_in_use / f["MemTotal"] + }' +} + +# old version using "free -m" are kept in case a container somehow has +# weird values in /proc/meminfo +function getMem_with_free() { + # get memory info from "free" command. Convert to float in GB. + # First argument is desired row of output table. + # Second argument is desired column. + MEM_ROW=$(echo "$1" | awk '{print tolower($1)}') + MEM_COLUMN=$(echo "$2" | awk '{print tolower($1)}') + free -m | awk -v MEM_ROW=$MEM_ROW -v MEM_COLUMN=$MEM_COLUMN \ + 'NR=1 { + for(i=1; i<=NF; i++) { f[tolower($i)]=NF+1-i } + } + { + regex="^"MEM_ROW + if(tolower($1) ~ regex) { + print $(NF+1-f[MEM_COLUMN])/1024 " GiB" + } + }' +} + +# old version using "free -m" are kept in case a container somehow has +# weird values in /proc/meminfo +function getMemUnavailable_using_free() { + # get memory that is in active use (not just cached) from "free" + # command. Convert to float in GiB, followed by percent of total. + # NOTE: weird computation with awk due to variety of output from + # free on different systems. Rows and columns differ, and on some + # systems the desired quantity is used "used" memory, on most it's + # "used" - "buffers" - "cached". If "buffers" and "cached" don't + # exist, then awk will subtract 0 so the correct result is returned. + free -m \ + | awk '\ + NR=1 { + for(i=1; i<=NF; i++) { f[tolower($i)]=NF+1-i } + } + { + if(tolower($1) ~ "^mem") { + IN_USE=($(NF+1-f["used"]) - $(NF+1-f["buffers"]) - $(NF+1-f["cached"])) + printf "%.3f GiB %.1f%%", IN_USE/1024, 100*IN_USE/$(NF+1-f["total"]) + } + }' +} + + +function getDisk() { + # get information about disk usage from "df" command. 
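+    # (added note) first argument: the df column header to report, e.g. "Used",
+    # "Use%" or "Size"; second argument: the mount point to inspect.
+    # Values carrying a unit suffix (K/M/G/T) are converted to GiB further below.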
+ DISK_COLUMN=$(echo "$1" | awk '{print tolower($1)}') + MOUNT_POINT=$2 + # extract desired value + VALUE=$(\ + df -h "$MOUNT_POINT" \ + | sed 's/Mounted on/Mounted-on/' \ + | awk -v DISK_COLUMN=$DISK_COLUMN ' + FNR==1 { + NF_HEADER=NF + for(i=1; i<=NF; i++) { f[tolower($i)]=NF-i } + } + FNR>1 { + FIELD_NUM=NF-f[DISK_COLUMN] + if(FIELD_NUM > 0) { + VALUE=$(FIELD_NUM) + print VALUE + } else if(f[DISK_COLUMN] == NF_HEADER-1 && NF == 1) { + VALUE=$(1) + print VALUE + } + }' \ + ) + # If value is a number follwed by letters, it is a value with units + # and needs to be converted. Otherwise just print value + if [[ "$VALUE" =~ [0-9.]+[A-z]+ ]]; then + echo "$VALUE"\ + | sed -E 's/([0-9.]*)([^0-9.]*)/\1 \2/' \ + | awk '{ + UNIT=substr($2, 1, 1) + if(UNIT == "T") { + SCALE=2^10 + } else if(UNIT == "G") { + SCALE=1 + } else if(UNIT == "M") { + SCALE=2^-10 + } else if(UNIT == "K") { + SCALE=2^-20 + } else if(UNIT == "B") { + SCALE=2^-30 + } else { + SCALE=1 + } + printf "%.3f GiB", $1 * SCALE + }' + else + echo "$VALUE" + fi +} + +function findBlockDevice() { + MOUNT_POINT=$1 + FILESYSTEM=$(grep -E "$MOUNT_POINT\s" /proc/self/mounts \ + | awk '{print $1}') + DEVICE_NAME=$(basename "$FILESYSTEM") + FS_IN_BLOCK=$(find -L /sys/block/ -mindepth 2 -maxdepth 2 -type d \ + -name "$DEVICE_NAME") + if [ -n "$FS_IN_BLOCK" ]; then + # found path to the filesystem in the block devices. get the + # block device as the parent dir + dirname "$FS_IN_BLOCK" + elif [ -d "/sys/block/$DEVICE_NAME" ]; then + # the device is itself a block device + echo "/sys/block/$DEVICE_NAME" + else + # couldn't find, possibly mounted by mapper. + # look for block device that is just the name of the symlinked + # original file. if not found, echo empty string (no device found) + BLOCK_DEVICE=$(ls -l "$FILESYSTEM" 2>/dev/null \ + | cut -d'>' -f2 \ + | xargs basename 2>/dev/null \ + || echo) + if [[ -z "$BLOCK_DEVICE" ]]; then + 1>&2 echo "Unable to find block device for filesystem $FILESYSTEM." + if [[ -d /sys/block/sdb ]] && ! grep -qE "^/dev/sdb" /etc/mtab; then + 1>&2 echo "Guessing present but unused sdb is the correct block device." + echo "/sys/block/sdb" + else + 1>&2 echo "Disk IO will not be monitored." 
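+            # (added note) nothing is echoed to stdout in this branch, so the
+            # caller receives an empty string and disables IO monitoring below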
+ fi + fi + fi +} + +function handle_integer_wrap() { + if [ $1 -ge 0 ]; then + echo $1 + else + WRAPPED=$1 + echo "$((WRAPPED + 2**30))" + fi +} + + + +function getBlockDeviceIO() { + # get read and write IO rate by looking at appropriate block device + STAT_FILE="$1" + if [[ -f "$STAT_FILE" ]]; then + # get IO stats as comma-separated list to extract 3rd and 7th fields + STATS=$(sed -E 's/[[:space:]]+/,/g' $STAT_FILE | sed -E 's/^,//'\ + | cut -d, -f3,7 | sed -E 's/,/ /g') + # get results of previous poll + read OLD_READ OLD_WRITE < $TEMP_IO + # save new poll results + read READ_SECTORS WRITE_SECTORS <<<$STATS + echo "$READ_SECTORS $WRITE_SECTORS" > $TEMP_IO + # update read and write sectors as difference since previous poll + READ_SECTORS=$(handle_integer_wrap $((READ_SECTORS - OLD_READ))) + WRITE_SECTORS=$(handle_integer_wrap $((WRITE_SECTORS - OLD_WRITE))) + + # output change in read/write sectors in kiB/s + echo "$READ_SECTORS $WRITE_SECTORS" \ + | awk -v T=$SLEEP_TIME -v B=$SECTOR_BYTES \ + '{ printf "%.3f MiB/s %.3f MiB/s", $1*B/T/1048576, $2*B/T/1048576 }' + else + echo "N/A MiB/s N/A MiB/s" + fi +} + + +function runtimeInfo() { + echo " [$(date)]" + echo \* CPU usage: $(getCpuUsage) + echo \* Memory usage: $(getMemUnavailable) + echo \* Disk usage: $(getDisk Used $MONITOR_MOUNT_POINT) $(getDisk Use% $MONITOR_MOUNT_POINT) + echo \* Read/Write IO: $(getBlockDeviceIO "$BLOCK_DEVICE_STAT_FILE") +} + +# print out header info +echo ================================== +echo =========== MONITORING =========== +echo ================================== +echo --- General Information --- +echo \#CPU: $(nproc) +echo Total Memory: $(getMem MemTotal) +echo Total Disk space: $(getDisk Size "$MONITOR_MOUNT_POINT") +echo +echo --- Runtime Information --- + + +# make a temp file to store io information, remove it on exit +TEMP_IO=$(mktemp "${TMPDIR:-/tmp/}$(basename $0).XXXXXXXXXXXX") +# make a temp file to store cpu information, remove it on exit +# remove temp files on exit +TEMP_CPU=$(mktemp "${TMPDIR:-/tmp/}$(basename $0).XXXXXXXXXXXX") +trap "rm -f $TEMP_IO $TEMP_CPU" EXIT + + +# find the block device +BLOCK_DEVICE=$(findBlockDevice "$MONITOR_MOUNT_POINT") +if [[ -z "$BLOCK_DEVICE" ]] \ + || [[ ! 
-f "$BLOCK_DEVICE/queue/hw_sector_size" ]]; then + # no block device found, can't get IO info + SECTOR_BYTES=0 + BLOCK_DEVICE_STAT_FILE="" +else + SECTOR_BYTES=$(cat "$BLOCK_DEVICE/queue/hw_sector_size") + BLOCK_DEVICE_STAT_FILE="$BLOCK_DEVICE/stat" +fi + + +# since getBlockDeviceIO looks at differences in stat file, run the +# update so the first reported update has a sensible previous result to +# compare to +echo "0 0" > $TEMP_IO +getBlockDeviceIO "$BLOCK_DEVICE_STAT_FILE" > /dev/null + +# same thing for getCpuUsage +echo "0 0" > $TEMP_CPU +getCpuUsage > /dev/null + + +while true; do + runtimeInfo + sleep $SLEEP_TIME +done diff --git a/docker/lr-margin/Dockerfile b/docker/lr-margin/Dockerfile new file mode 100644 index 000000000..52472f9f5 --- /dev/null +++ b/docker/lr-margin/Dockerfile @@ -0,0 +1,28 @@ +FROM kishwars/pepper_deepvariant:r0.8 + +# just add gcloud for faster localization +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + curl \ + wget \ + zlib1g-dev && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* diff --git a/docker/lr-margin/Makefile b/docker/lr-margin/Makefile new file mode 100644 index 000000000..58eee6d63 --- /dev/null +++ b/docker/lr-margin/Makefile @@ -0,0 +1,16 @@ +IMAGE_NAME = lr-margin +VERSION = 2.2.dev-69f6fff # This should match the margin version number from the base image +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-nanoplot/Dockerfile b/docker/lr-nanoplot/Dockerfile index 6c5414637..e60b1730d 100644 --- a/docker/lr-nanoplot/Dockerfile +++ b/docker/lr-nanoplot/Dockerfile @@ -1,5 +1,32 @@ FROM python:3.9-buster +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + bedtools \ + ca-certificates \ + curl \ + gnupg \ + wget \ + zlib1g-dev && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* + ARG sha=e0028d85ec9e61f8c96bea240ffca65b713e3385 RUN cd /opt/ && \ git clone https://github.com/wdecoster/NanoPlot.git && \ diff --git a/docker/lr-resource-visual/Dockerfile b/docker/lr-resource-visual/Dockerfile new file mode 100644 index 000000000..f46ee778e --- /dev/null +++ b/docker/lr-resource-visual/Dockerfile @@ -0,0 +1,10 @@ +FROM r-base:4.3.0 + +RUN Rscript -e "update.packages(contriburl=contrib.url(repos=c('http://lib.stat.cmu.edu/R/CRAN/', 'https://cran.rstudio.com')), clean=TRUE, quiet=TRUE, ask=FALSE)" +COPY install_R_packages.R /opt/ +ARG R_PKGS="stringr ggplot2 reshape2" +RUN chmod +x /opt/install_R_packages.R && \ + /opt/install_R_packages.R $R_PKGS + +COPY plot.resources.R /opt/ +RUN chmod +x /opt/plot.resources.R diff --git a/docker/lr-resource-visual/Makefile b/docker/lr-resource-visual/Makefile new file mode 100644 index 000000000..903407193 --- /dev/null +++ b/docker/lr-resource-visual/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.1.1 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-resource-visual:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-resource-visual:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . 
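+# note: the image bundles /opt/plot.resources.R, which turns a resources.log
+# written by the VM monitoring script into a PDF; a typical (hypothetical)
+# in-container invocation would be:
+#   Rscript /opt/plot.resources.R resources.log resources.pdf "my task"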
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-resource-visual/install_R_packages.R b/docker/lr-resource-visual/install_R_packages.R new file mode 100644 index 000000000..340d64ba6 --- /dev/null +++ b/docker/lr-resource-visual/install_R_packages.R @@ -0,0 +1,26 @@ +#!/usr/bin/env Rscript +# adapted from https://github.com/broadinstitute/gatk-sv/blob/main/dockerfiles/sv-base-virtual-env/install_R_packages.R +# override for debugging purposes +quiet <- as.logical(Sys.getenv("QUIET", unset="TRUE")) +library(parallel) +Ncpus <- detectCores() + +# treat warnings as errors, otherwise script can fail silently if a package fails to install +options(warn = 2) + +# packages to be installed +to.be.installed <- commandArgs(trailingOnly=TRUE) +if ( 0 == length(to.be.installed) ) { + stop("At least one argument must be supplied when you call me.n", call.=FALSE) +} + +# multiple repos, multiple retries when a package is not found +repos <- c("http://lib.stat.cmu.edu/R/CRAN/", "https://cran.rstudio.com") + +# install to default place, quietly, then leave +install.packages(pkgs = to.be.installed, + repos = repos, + clean = TRUE, + quiet = quiet, + Ncpus = Ncpus) +q(save = "no") diff --git a/docker/lr-resource-visual/plot.resources.R b/docker/lr-resource-visual/plot.resources.R new file mode 100755 index 000000000..057e2ba3e --- /dev/null +++ b/docker/lr-resource-visual/plot.resources.R @@ -0,0 +1,90 @@ +#!/usr/bin/env Rscript + +################################################################################ +## usage example: +# source("plot.resources.R"); +# resources = get_time_series_df("resources.log"); +# plot_time_series_df(resources, "test.pdf", "resources usage by task") +################################################################################ + +get_time_series_df <- function(resources.log.file) { + library("stringr") + + logs = readLines(resources.log.file) + + b = grep("Runtime Information", logs) + 1 + L = length(logs) + i = grep("Read/Write IO:", logs[(L-5):L], fixed = T) + if (2 == length(i)) {# last block is complete + logs = logs[b:L] + } else {# ditch last block which is incomplete + logs = logs[b:(L-(6-i))] + } + rm(i, b, L) + + ########## + loc = Sys.getlocale("LC_TIME") + Sys.setlocale("LC_TIME", "C") + # covert to dataframe of time series + t = gsub("(\\[|\\])", "", sub("UTC ", "", sub("^ ", "", logs[1]))) + start.time = strptime(t, format = "%a %b %d %H:%M:%S %Y", tz = "UTC") + + b.arrays = grep("^\\s*?\\[", logs, perl = T) + res = vector(mode = "list", length = length(b.arrays)) + j = 1 + for (i in b.arrays) { + block = logs[i: (i+4)] + + t = gsub("(\\[|\\])", "", sub("UTC ", "", sub("^ ", "", block[1]))) + time = strptime(t, format = "%a %b %d %H:%M:%S %Y", tz = "UTC") + cpu = as.numeric(sub("%", "", str_match(block[2], "[0-9\\.]+%")[1,1])) + mem_gb = as.numeric(sub(" GiB", "", str_match(block[3], "[0-9\\.]+ GiB")[1,1])) + mem_percent = as.numeric(sub("%", "", str_match(block[3], "[0-9\\.]+%")[1,1])) + disk_gb = as.numeric(sub(" GiB", "", str_match(block[4], "[0-9\\.]+ GiB")[1,1])) + disk_percent = as.numeric(sub("%", "", str_match(block[4], "[0-9\\.]+%")[1,1])) + # io = str_match(block[5], "[0-9\\.]+%")[1,1] + res[[j]] = list(time = time, + cpu = cpu, + mem_gb = mem_gb, + mem_percent = mem_percent, + disk_gb = disk_gb, + disk_percent = disk_percent) + j = j+1 + } + Sys.setlocale("LC_TIME", loc) + + resources = do.call(rbind, lapply(res, data.frame)) + resources$"cpu" = as.numeric(resources$"cpu") + resources$"mem_gb" = 
as.numeric(resources$"mem_gb") + resources$"mem_percent" = as.numeric(resources$"mem_percent") + resources$"disk_gb" = as.numeric(resources$"disk_gb") + resources$"disk_percent" = as.numeric(resources$"disk_percent") + + rm(res, i,j) + resources +} + +plot_time_series_df <- function(resources.df, out.pdf, title) { + library("ggplot2") + library("reshape2") + re = melt(resources.df, id = "time") + p = ggplot(data = re, aes(x=time, y=value)) + geom_line() + + facet_grid(variable ~ ., scales = "free") + + ggtitle(title) + + theme(plot.title = element_text(hjust = 0.5)) + ggsave(p, filename = out.pdf) +} + +################################################################################ +options = commandArgs(trailingOnly = TRUE) +if (3!=length(options)) { + stop("I need input.log output.pdf title_string") +} +input_log = options[1] +output_pdf = options[2] +plot_title = options[3] + + +ts_df = get_time_series_df(input_log) + +plot_time_series_df(ts_df, output_pdf, plot_title) diff --git a/docker/lr-seqkit/Dockerfile b/docker/lr-seqkit/Dockerfile new file mode 100644 index 000000000..39dc1cc8d --- /dev/null +++ b/docker/lr-seqkit/Dockerfile @@ -0,0 +1,26 @@ +FROM ubuntu:20.04 + +ARG SEQKIT_VERSION=v2.4.0 +ARG DL_LINK="https://github.com/shenwei356/seqkit/releases/download/${SEQKIT_VERSION}/seqkit_linux_amd64.tar.gz" + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + datamash \ + wget \ + zlib1g-dev && \ + cd /tmp && \ + wget ${DL_LINK} && tar -xf seqkit_linux_amd64.tar.gz && cp seqkit /usr/local/bin && \ + cd - && \ + apt-get -qqy purge wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* + diff --git a/docker/lr-seqkit/Makefile b/docker/lr-seqkit/Makefile new file mode 100644 index 000000000..e09fd06df --- /dev/null +++ b/docker/lr-seqkit/Makefile @@ -0,0 +1,12 @@ +VERSION = 2.4.0 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-seqkit:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-seqkit:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-seqtk/Dockerfile b/docker/lr-seqtk/Dockerfile new file mode 100644 index 000000000..1e47d0492 --- /dev/null +++ b/docker/lr-seqtk/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:20.04 + +ARG SEQTK_VERSION=1.3-1 + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + datamash \ + seqtk=${SEQTK_VERSION} \ + zlib1g-dev && \ + apt-get -qqy purge wget make && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* + + diff --git a/docker/lr-seqtk/Makefile b/docker/lr-seqtk/Makefile new file mode 100644 index 000000000..14e55f741 --- /dev/null +++ b/docker/lr-seqtk/Makefile @@ -0,0 +1,12 @@ +VERSION = 1.3 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-seqtk:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-seqtk:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . 
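+# note: seqtk is installed from the Ubuntu 20.04 apt archive (SEQTK_VERSION=1.3-1
+# in the Dockerfile); the image tag tracks the upstream 1.3 release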
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-sniffles2/Dockerfile b/docker/lr-sniffles2/Dockerfile index da637b1b6..99de57891 100644 --- a/docker/lr-sniffles2/Dockerfile +++ b/docker/lr-sniffles2/Dockerfile @@ -1,21 +1,60 @@ -FROM continuumio/miniconda3 - -MAINTAINER Evie Wan +############### stage 0 +FROM continuumio/miniconda3 AS build # copy other resources COPY ./environment.yml / -# install conda packages -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH=/opt/conda/envs/lr-sniffles2/bin/:/root/google-cloud-sdk/bin/:${PATH} -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/lr-sniffles2/lib/ +# Install the package as normal: +COPY environment.yml . +RUN conda env create -f environment.yml + +# Install conda-pack: +RUN conda install -c conda-forge conda-pack && \ + conda clean -a + +# Use conda-pack to create a standalone enviornment +# in /venv: +RUN conda-pack \ + -n lr-sniffles2 \ + -o /tmp/env.tar && \ + mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ + rm /tmp/env.tar + +# We've put venv in same path it'll be in final image, +# so now fix up paths: +RUN /venv/bin/conda-unpack -RUN apt-get -y update \ - && apt-get -y install git make cmake protobuf-compiler gcc g++ zlib1g-dev libcurl4-openssl-dev libbz2-dev tree python3-pip liblzma-dev wget curl \ - && apt-get clean +############### stage 1 +FROM ubuntu:20.04 AS runtime -# install gsutil -RUN curl https://sdk.cloud.google.com | bash +# Copy /venv from the previous stage: +COPY --from=build /venv /venv +ENV VIRTUAL_ENV=/venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" -RUN echo "source activate lr-sniffles2" > ~/.bashrc \ No newline at end of file +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + curl \ + wget \ + zlib1g-dev && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? 
\ + /usr/share/man/??_* diff --git a/docker/lr-sniffles2/Makefile b/docker/lr-sniffles2/Makefile index fdc10acca..6f298b66d 100644 --- a/docker/lr-sniffles2/Makefile +++ b/docker/lr-sniffles2/Makefile @@ -1,4 +1,4 @@ -VERSION = 2.0.6 +VERSION = 2.2 TAG1 = us.gcr.io/broad-dsp-lrma/lr-sniffles2:$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/lr-sniffles2:latest diff --git a/docker/lr-sniffles2/environment.yml b/docker/lr-sniffles2/environment.yml index a12597d6c..05dc25e7a 100644 --- a/docker/lr-sniffles2/environment.yml +++ b/docker/lr-sniffles2/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge - defaults dependencies: - - sniffles=2.0.6 - - pysam=0.19.1 + - sniffles=2.2 + - pysam=0.21.0 -prefix: /opt/conda/envs/lr-sniffles2 \ No newline at end of file +prefix: /opt/conda/envs/lr-sniffles2 diff --git a/docker/lr-utils/remove_duplicate_ont_aln.py b/docker/lr-utils/remove_duplicate_ont_aln.py index 51eb13172..82bff3149 100644 --- a/docker/lr-utils/remove_duplicate_ont_aln.py +++ b/docker/lr-utils/remove_duplicate_ont_aln.py @@ -21,6 +21,9 @@ def main(): guilty_dict_per_chr.setdefault(chrom, set()) guilty_dict_per_chr[chrom].add(name) + print("chromosomes on which there are duplicate records:") + print(f" {guilty_dict_per_chr}") + # Silence message about the .bai file not being found. pysam.set_verbosity(0) @@ -36,12 +39,13 @@ def main(): chrom = read.reference_name n = read.query_name - if n in guilty_dict_per_chr[chrom]: + if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]: mq = read.mapping_quality sam_flag = read.flag pos = read.reference_start - signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-" + cigar = read.cigarstring + signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}" if current_position != pos: # new position, let's write and reset out.write(read) diff --git a/docker/lr-wdl-email/.gitattributes b/docker/lr-wdl-email/.gitattributes new file mode 100644 index 000000000..b77d7e23c --- /dev/null +++ b/docker/lr-wdl-email/.gitattributes @@ -0,0 +1 @@ +jq-linux-amd64 filter=lfs diff=lfs merge=lfs -text diff --git a/docker/lr-wdl-email/Dockerfile b/docker/lr-wdl-email/Dockerfile new file mode 100644 index 000000000..1b633a394 --- /dev/null +++ b/docker/lr-wdl-email/Dockerfile @@ -0,0 +1,50 @@ +FROM python:3.10.13-slim-bullseye + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + curl \ + wget \ + pigz \ + zlib1g-dev \ + bc \ + datamash \ + gawk \ + less \ + tree && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? 
\ + /usr/share/man/??_* + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir pandas==2.2.1 \ + numpy==1.26.4 \ + sendgrid==6.11.0 \ + ansi2html==1.9.1 \ + natsort==8.4.0 \ + pytz==2024.1 \ + termcolor==2.4.0 \ + xmltodict==0.13.0 \ + python-dateutil==2.9.0.post0 + +COPY --chmod=765 jq-linux-amd64 /usr/local/bin/jq + +COPY --chmod=765 send_email.py /opt/ + +COPY --chmod=765 localize_files.sh /opt/ diff --git a/docker/lr-wdl-email/Makefile b/docker/lr-wdl-email/Makefile new file mode 100644 index 000000000..1e4a0e08a --- /dev/null +++ b/docker/lr-wdl-email/Makefile @@ -0,0 +1,12 @@ +VERSION = 0.0.1 +TAG1 = us.gcr.io/broad-dsp-lrma/lr-wdl-email:$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/lr-wdl-email:latest + +all: build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-wdl-email/jq-linux-amd64 b/docker/lr-wdl-email/jq-linux-amd64 new file mode 100644 index 000000000..2531ace25 --- /dev/null +++ b/docker/lr-wdl-email/jq-linux-amd64 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5942c9b0934e510ee61eb3e30273f1b3fe2590df93933a93d7c58b81d19c8ff5 +size 2319424 diff --git a/docker/lr-wdl-email/localize_files.sh b/docker/lr-wdl-email/localize_files.sh new file mode 100644 index 000000000..a3bc535b4 --- /dev/null +++ b/docker/lr-wdl-email/localize_files.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -euxo pipefail + +# synopsis: +# bash /opt/localize_files.sh \ +# object_json \ +# $(pwd) \ +# output_tsv_name + +# for localizing files specified in a JSON file, +# formated as follows: +# { +# "contents": +# [ +# { +# "left": "test.pdf", +# "right": "gs://my_bucket/my_prefix/my_file.txt" +# } +# ] +# } + +# outputs a 2-col tsv, +# where 1st col is the desired file name and +# 2nd col is the local file path +input_json=$1 +localize_dir=$2 +export localize_dir=${localize_dir%/} +export output_tsv_name=$3 + +function localize() { + gcloud storage cp "$1" "${localize_dir}/$2" + echo -e "$2\t${localize_dir}/$2" >> "${output_tsv_name}" +} + + +jq --raw-output '.contents[] | .left' "${input_json}" > tmp.left.txt +jq --raw-output '.contents[] | .right' "${input_json}" > tmp.right.txt +paste tmp.left.txt tmp.right.txt > tmp.2-col.tsv + +while IFS=$'\t' read -r -a line +do + localize "${line[1]}" "${line[0]}" +done < tmp.2-col.tsv + +rm tmp.left.txt tmp.right.txt tmp.2-col.tsv diff --git a/docker/lr-wdl-email/send_email.py b/docker/lr-wdl-email/send_email.py new file mode 100644 index 000000000..086071a4e --- /dev/null +++ b/docker/lr-wdl-email/send_email.py @@ -0,0 +1,268 @@ +import argparse +import base64 +import os +from typing import List +import pandas as pd + +from sendgrid import SendGridAPIClient +from sendgrid.helpers.mail import (Attachment, FileContent, FileName, FileType, Disposition, ContentId) +from sendgrid.helpers.mail import From as SGFrom, To as SGTo, Subject as SGSubject +from sendgrid.helpers.mail import Mail, PlainTextContent, HtmlContent + + +def send_notification(notification_sender_name: str, + notification_receiver_names: List[str], + notification_receiver_emails: List[str], + email_subject: str, + email_body: str, + html_body: str = None) -> None: + """ + Sending notification email to (potentially) multiple recipients. + + Note that this assumes two environment variables are set appropriately: "SENDGRID_API_KEY" & "SENDER_EMAIL". + + Provide html_body at your own risk. 
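+    (The html_body string is passed to SendGrid's HtmlContent as-is, without any sanitization.)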
+ + Shameless copy from + https://github.com/sendgrid/sendgrid-python/blob/main/examples/helpers/mail_example.py#L9 + :return: + """ + if len(notification_receiver_emails) != len(notification_receiver_names): + raise ValueError("Different number of recipients and recipients' emails") + + sg = SendGridAPIClient(api_key=os.environ.get('SENDGRID_API_KEY')) + + email_core = _construct_sendgrid_mail_core(notification_sender_name, email_subject, email_body, html_body) + + # send + failed_responses = list() + from copy import deepcopy + for i in range(len(notification_receiver_emails)): + message = deepcopy(email_core) + message.add_to(SGTo(notification_receiver_emails[i], notification_receiver_names[i])) + response = sg.client.mail.send.post(request_body=message.get()) + if 202 != response.status_code: + failed_responses.append(i) + if 0 < len(failed_responses): + failures = ' \n'.join([notification_receiver_names[i]+':'+notification_receiver_emails[i] + for i in failed_responses]) + logger.warning(f"Failed to send message to some receivers: \n {failures}") + + +def send_notification_with_attachments(notification_sender_name: str, + notification_receiver_names: List[str], + notification_receiver_emails: List[str], + email_subject: str, + email_body: str, + html_body: str = None, + txt_names_and_files: list = None, + tsv_names_and_files: list = None, + pdf_names_and_files: list = None + ) -> None: + """ + Sending notification email to (potentially) multiple recipients. + Note that this assumes two environment variables are set appropriately, "SENDGRID_API_KEY" & "SENDER_EMAIL". + + Provide html_body at your own risk. + + Shameless copy from + https://github.com/sendgrid/sendgrid-python/blob/main/examples/helpers/mail_example.py#L9 + :return: + """ + if len(notification_receiver_emails) != len(notification_receiver_names): + raise ValueError("Different number of recipients and recipients' emails") + + sg = SendGridAPIClient(api_key=os.environ.get('SENDGRID_API_KEY')) + + email_core = _construct_sendgrid_mail_core(notification_sender_name, email_subject, email_body, html_body) + + # attach + attachments = _attach_files_to_mail(txt_names_and_files, tsv_names_and_files, pdf_names_and_files) + for a in attachments: + email_core.add_attachment(a) + + # send + failed_responses = list() + from copy import deepcopy + for i in range(len(notification_receiver_emails)): + message = deepcopy(email_core) + message.add_to(SGTo(notification_receiver_emails[i], notification_receiver_names[i])) + response = sg.client.mail.send.post(request_body=message.get()) + if 202 != response.status_code: + failed_responses.append(i) + if 0 < len(failed_responses): + failures = ' \n'.join([notification_receiver_names[i]+':'+notification_receiver_emails[i] + for i in failed_responses]) + logger.warning(f"Failed to send message to some receivers: \n {failures}") + + +def _construct_sendgrid_mail_core(notification_sender_name: str, + email_subject: str, + email_body: str, + html_body: str = None) -> Mail: + + """ + Construct core content of notification email to (potentially) multiple recipients. + + Note that this assumes two environment variables are set appropriately, "SENDGRID_API_KEY" & "SENDER_EMAIL". + + The returned Mail object DOES NOT specify recipients, caller should customize that. + + Provide html_body at your own risk. 
+ + Shameless copy from + https://github.com/sendgrid/sendgrid-python/blob/main/examples/helpers/mail_example.py#L9 + :return: + """ + + # arg validation + assert "SENDGRID_API_KEY" in os.environ, \ + 'environment variable SENDGRID_API_KEY is needed.' + assert "SENDER_EMAIL" in os.environ, \ + 'environment variable SENDER_EMAIL is needed.' + + # construct core + notification_sender_email = os.environ.get('SENDER_EMAIL') + email_core = Mail(from_email=SGFrom(notification_sender_email, notification_sender_name), + subject=SGSubject(email_subject), + plain_text_content=PlainTextContent(email_body), + html_content=HtmlContent(html_body) if html_body else None, + is_multiple=True) # recipients won't see each other + return email_core + + +def _attach_files_to_mail(txt_names_and_files: list = None, + tsv_names_and_files: list = None, + pdf_names_and_files: list = None) -> list: + """ + Return a list of SendGrid Attachments for attaching to the email core. + + :param txt_names_and_files: list of tuple2 (file name, file path) + :param tsv_names_and_files: list of tuple2 (file name, file path) + :param pdf_names_and_files: list of tuple2 (file name, file path) + :return: a list of SendGrid Attachments for attaching to the email core. + """ + + has_something_to_attach = False + for a in [txt_names_and_files, tsv_names_and_files, pdf_names_and_files]: + if a is not None and 0 != len(a): + has_something_to_attach = True + break + + assert has_something_to_attach, "No valid inputs for building attachments" + + attachments = list() + + # txt + if txt_names_and_files is not None: + for attachment_txt_name, attachment_txt_path in txt_names_and_files: + with open(attachment_txt_path) as inf: + contents = [line.strip() for line in inf.readlines()] + base64_txt = \ + base64.b64encode(('\n'.join(contents)).encode('utf-8')).decode('utf-8') + txt_attachment = Attachment( + FileContent(base64_txt), + FileName(attachment_txt_name), + FileType('text/plain'), + Disposition('attachment'), + ContentId(attachment_txt_name) + ) + attachments.append(txt_attachment) + # tsv + if tsv_names_and_files is not None: + for attachment_tsv_name, attachment_tsv_path in tsv_names_and_files: + temp_dataframe = pd.read_csv(attachment_tsv_path, header='infer', sep='\t') + base64_csv = \ + base64.b64encode(temp_dataframe.to_csv(header=True, index=False, sep='\t').encode()).decode() + tsv_attachment = Attachment( + FileContent(base64_csv), + FileName(attachment_tsv_name), + FileType('text/csv'), + Disposition('attachment'), + ContentId('dataframe') + ) + attachments.append(tsv_attachment) + # pdf + if pdf_names_and_files is not None: + for attachment_pdf_name, attachment_pdf_path in pdf_names_and_files: + with open(attachment_pdf_path, 'rb') as f: + data = f.read() + pdf_content = base64.b64encode(data).decode() + + pdf_attachment = Attachment( + FileContent(pdf_content), + FileName(attachment_pdf_name), + FileType('application/pdf'), + Disposition('attachment') + ) + attachments.append(pdf_attachment) + + return attachments + + +def main(): + parser = argparse.ArgumentParser(description='Send email via SendGrid', + prog='send_email') + + parser.add_argument('--sendgrid_api_key', type=str, + help="JSON file holding the secret API key registered at SendGrid") + parser.add_argument('--sender_name', type=str, + help="Name of the sender, i.e. 
identify yourself") + parser.add_argument('--sender_email', type=str, + help="Email address registered at SendGrid for sending out the email") + + parser.add_argument('--notification_receiver_names', + type=str, + help="Read names of duplicate records") + parser.add_argument('--notification_receiver_emails', + type=str, + help="Read names of duplicate records") + + parser.add_argument('--email_subject', type=str, + help="The subject/title/topic of the email") + parser.add_argument('--email_body', type=str, + help="The plain-text contents of the email") + + # we hide this option in cli because that's for later + # parser.add_argument('--html_body', type=str, help="if you") + + parser.add_argument('--txt_names_and_files', type=str, + help="2-col TSV holding the (desired name, path) of the txt file to be attached") + parser.add_argument('--tsv_names_and_files', type=str, + help="2-col TSV holding the (desired name, path) of the tsv file to be attached") + parser.add_argument('--pdf_names_and_files', type=str, + help="2-col TSV holding the (desired name, path) of the pdf file to be attached") + + args = parser.parse_args() + + with open(args.sendgrid_api_key, 'r') as inf: + os.environ['SENDGRID_API_KEY'] = inf.readlines()[0].strip() + os.environ['SENDER_EMAIL'] = args.sender_email + + with open(args.notification_receiver_names) as inf: + receivers = [n.strip() for n in inf.readlines()] + with open(args.notification_receiver_emails) as inf: + receiver_emails = [n.strip() for n in inf.readlines()] + + def load_from_headerless_tsv(ff: str) -> list or None: + if ff is None: + return None + else: + a = pd.read_csv(ff, header=None, sep='\t') + return list(zip(a.iloc[:, 0], a.iloc[:, 1])) + + send_notification_with_attachments(notification_sender_name=args.sender_name, + notification_receiver_names=receivers, + notification_receiver_emails=receiver_emails, + + email_subject=args.email_subject, + email_body=args.email_body, + html_body=None, + + txt_names_and_files=load_from_headerless_tsv(args.txt_names_and_files), + tsv_names_and_files=load_from_headerless_tsv(args.tsv_names_and_files), + pdf_names_and_files=load_from_headerless_tsv(args.pdf_names_and_files)) + + +if __name__ == "__main__": + main() diff --git a/docker/lr-whatshap/Dockerfile b/docker/lr-whatshap/Dockerfile index 08b93cbb2..060d73c22 100644 --- a/docker/lr-whatshap/Dockerfile +++ b/docker/lr-whatshap/Dockerfile @@ -1,20 +1,57 @@ -FROM continuumio/miniconda3 +############### stage 0 +FROM continuumio/miniconda3 AS build -MAINTAINER Kiran V Garimella +# Install the package as normal: +COPY environment.yml . 
+RUN conda env create -f environment.yml -# copy other resources -COPY ./environment.yml / +# Install conda-pack: +RUN conda install -c conda-forge conda-pack && \ + conda clean -a -RUN apt-get -y update && \ - apt-get -y install curl zlib1g-dev libcurl4-openssl-dev libbz2-dev liblzma-dev gcc make && \ - apt-get clean +# Use conda-pack to create a standalone enviornment +# in /venv: +RUN conda-pack \ + -n lr-whatshap \ + -o /tmp/env.tar && \ + mkdir /venv && cd /venv && tar xf /tmp/env.tar && \ + rm /tmp/env.tar -# install conda packages -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH=/opt/conda/envs/lr-whatshap/bin/:/root/google-cloud-sdk/bin/:${PATH} +# We've put venv in same path it'll be in final image, +# so now fix up paths: +RUN /venv/bin/conda-unpack -# install gsutil -RUN curl https://sdk.cloud.google.com | bash +############### stage 1 +FROM ubuntu:20.04 AS runtime -# activate conda environment -RUN echo "source activate lr-whatshap" > ~/.bashrc +# Copy /venv from the previous stage: +COPY --from=build /venv /venv + +ENV VIRTUAL_ENV=/venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy dist-upgrade && \ + apt-get -qqy install --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + curl \ + wget \ + zlib1g-dev && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ + apt-get -qqy update && \ + apt-get -qqy install --no-install-recommends google-cloud-cli && \ + gcloud config set core/disable_usage_reporting true && \ + gcloud config set component_manager/disable_update_check true && \ + gcloud config set metrics/environment github_docker_image && \ + apt-get -qqy purge gnupg wget && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? \ + /usr/share/man/??_* diff --git a/docker/lr-whatshap/Makefile b/docker/lr-whatshap/Makefile index 224f2b0cd..8f7f86e04 100644 --- a/docker/lr-whatshap/Makefile +++ b/docker/lr-whatshap/Makefile @@ -1,17 +1,16 @@ IMAGE_NAME = lr-whatshap -VERSION = 1.1 # This should match the WhatsHap version number - +VERSION = 2.0 # This should match the WhatsHap version number TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest all: | build push build: - docker build -t $(TAG1) -t $(TAG2) . + docker build -t $(TAG1) -t $(TAG2) . build_no_cache: - docker build --no-cache -t $(TAG1) -t $(TAG2) . + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
push: - docker push $(TAG1) - docker push $(TAG2) + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-whatshap/environment.yml b/docker/lr-whatshap/environment.yml index ff2f76879..e0eb58343 100644 --- a/docker/lr-whatshap/environment.yml +++ b/docker/lr-whatshap/environment.yml @@ -1,61 +1,13 @@ name: lr-whatshap + channels: - bioconda - conda-forge - defaults + dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu - - bcftools=1.8=h4da6232_3 - - blas=1.0=openblas - - bz2file=0.98=py36h06a4308_1 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.17.1=h7f98852_1 - - ca-certificates=2021.5.25=h06a4308_1 - - certifi=2021.5.30=py36h06a4308_0 - - curl=7.71.1=hbc83047_1 - - htslib=1.11=hd3b49d5_2 - - krb5=1.18.2=h173b8e3_0 - - ld_impl_linux-64=2.33.1=h53a641e_7 - - libcurl=7.71.1=h20c2e04_1 - - libdeflate=1.7=h7f98852_5 - - libedit=3.1.20191231=he28a2e2_2 - - libev=4.33=h516909a_1 - - libffi=3.3=he6710b0_2 - - libgcc=7.2.0=h69d50b8_2 - - libgcc-ng=9.3.0=h2828fa1_19 - - libgfortran-ng=7.3.0=hdf63c60_0 - - libgomp=9.3.0=h2828fa1_19 - - libnghttp2=1.43.0=h812cca2_0 - - libopenblas=0.3.10=h5a2b251_0 - - libssh2=1.9.0=ha56f1ee_6 - - libstdcxx-ng=9.3.0=h6de172a_19 - - ncurses=6.2=h58526e2_4 - - nomkl=3.0=0 - - openblas=0.3.10=0 - - openblas-devel=0.3.10=0 - - openssl=1.1.1k=h27cfd23_0 - - pip=21.0.1=py36h06a4308_0 - - pyfaidx=0.5.9.5=pyh3252c3a_0 - - python=3.6.13=hdb3f193_0 - - pyvcf=0.6.8=py36_0 - - readline=8.1=h27cfd23_0 - - samtools=1.11=h6270b1f_0 - - setuptools=52.0.0=py36h06a4308_0 - - six=1.15.0=py36h06a4308_0 - - sqlite=3.35.4=hdfb4753_0 - - tk=8.6.10=h21135ba_1 - - wheel=0.36.2=pyhd3eb1b0_0 - - xopen=0.7.3=py_0 - - xz=5.2.5=h516909a_1 - - zlib=1.2.11=h516909a_1010 - - pip: - - biopython==1.79 - - dataclasses==0.8 - - decorator==4.4.2 - - networkx==2.5.1 - - numpy==1.19.5 - - pysam==0.16.0.1 - - scipy==1.5.4 - - whatshap==1.1 + - whatshap==2.0 + - samtools==1.17 + - bcftools==1.17 + prefix: /opt/conda/envs/lr-whatshap diff --git a/scripts/wdl/validate.wdls.sh b/scripts/wdl/validate.wdls.sh index 652551308..3856a1074 100755 --- a/scripts/wdl/validate.wdls.sh +++ b/scripts/wdl/validate.wdls.sh @@ -9,7 +9,8 @@ fi echo "VALIDATING WDL FILES IN $PWD" (womtool --version || echo "I need womtool to run" ) && echo -for wdl in *.wdl; do - echo -e "==============================\n${wdl}"; - womtool validate "${wdl}"; +for wdl in $(find . -type f -name "*.wdl"); do + womtool validate "${wdl}" 2>/dev/null + test $? 
-eq 0 || \ + (echo -e "==============================\n${wdl}"; womtool validate "${wdl}") done diff --git a/wdl/tasks/Alignment/AlignAndCheckFingerprintCCS.wdl b/wdl/deprecated/AlignAndCheckFingerprintCCS.wdl similarity index 76% rename from wdl/tasks/Alignment/AlignAndCheckFingerprintCCS.wdl rename to wdl/deprecated/AlignAndCheckFingerprintCCS.wdl index 9b0fb6027..a2ca9aeed 100644 --- a/wdl/tasks/Alignment/AlignAndCheckFingerprintCCS.wdl +++ b/wdl/deprecated/AlignAndCheckFingerprintCCS.wdl @@ -1,10 +1,13 @@ version 1.0 -import "../QC/FPCheckAoU.wdl" as FPCheck -import "../QC/CollectPacBioAlignedMetrics.wdl" as AlnMetrics -import "../Utility/PBUtils.wdl" as PB -import "../Utility/Utils.wdl" -import "../Utility/GeneralUtils.wdl" +import "../tasks/Utility/PBUtils.wdl" as PB +import "../tasks/Utility/Utils.wdl" +import "../tasks/Utility/GeneralUtils.wdl" +import "../tasks/Utility/BAMutils.wdl" as BU + +import "CollectPacBioAlignedMetrics.wdl" as AlnMetrics +import "../tasks/QC/AlignedMetrics.wdl" as GeAlnMetrics +import "../tasks/QC/FPCheckAoU.wdl" as FPCheck workflow AlignAndCheckFingerprintCCS { meta { @@ -13,10 +16,10 @@ workflow AlignAndCheckFingerprintCCS { } input { - File uBAM - File uPBI + File uBAM + File? uPBI String bam_sample_name - String library + String? library Boolean turn_off_fingperprint_check String fp_store @@ -40,6 +43,10 @@ workflow AlignAndCheckFingerprintCCS { Map[String, String] ref_map = read_map(ref_map_file) + call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['SM', 'LB', 'PU']} + String LB = select_first([library, RG.read_group_info['LB']]) + String movie_name = RG.read_group_info['PU'] + ################################################################################### if (ceil(size(uBAM, "GB")) > 50) {# shard & align, but practically never true @@ -52,10 +59,13 @@ workflow AlignAndCheckFingerprintCCS { call Utils.ComputeAllowedLocalSSD as Guess {input: intended_gb = 3*ceil(size(uBAM, "GB") + size(uPBI, "GB"))} call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + if (! 
defined(uPBI) ) { + call PB.PBIndex as PBIndex {input: bam = uBAM} + } call PB.ShardLongReads { input: - unaligned_bam = uBAM, unaligned_pbi = uPBI, + unaligned_bam = uBAM, unaligned_pbi = select_first([uPBI, PBIndex.pbi]), num_shards = 50, num_ssds = Guess.numb_of_local_ssd, zones = arbitrary.zones } @@ -109,24 +119,37 @@ workflow AlignAndCheckFingerprintCCS { name = "alignment.metrics" } + call BU.SamtoolsFlagStats { input: bam = aBAM, output_format = 'JSON' } + call BU.ParseFlagStatsJson { input: sam_flag_stats_json = SamtoolsFlagStats.flag_stats } + if (!turn_off_fingperprint_check){ call FPCheck.FPCheckAoU { input: aligned_bam = aBAM, aligned_bai = aBAI, - fp_store = fp_store, - sample_id_at_store = sample_id_at_store, + tech = 'Sequel', # making assumption that data to process here are all Sequel data (no critial impact though) + fp_vcf_store = fp_store, + fp_sample_id = sample_id_at_store, ref_specific_haplotype_map = ref_map['haplotype_map'] } call GeneralUtils.TarGZFiles as saveFPRes {input: files = [FPCheckAoU.fingerprint_summary, FPCheckAoU.fingerprint_details], name = 'fingerprint_check.summary_and_details'} } + call GeAlnMetrics.MosDepthWGS { input: bam = aBAM, bai = aBAI } + output { File aligned_bam = aBAM File aligned_bai = aBAI File aligned_pbi = IndexAlignedReads.pbi + String movie = movie_name + + Float wgs_cov = MosDepthWGS.wgs_cov + File coverage_per_chr = MosDepthWGS.summary_txt + File alignment_metrics_tar_gz = saveAlnMetrics.you_got_it + Map[String, Float] alignment_metrics = CollectPacBioAlignedMetrics.alignment_metrics + Map[String, Float] sam_flag_stats = ParseFlagStatsJson.qc_pass_reads_SAM_flag_stats Float? fp_lod_expected_sample = FPCheckAoU.lod_expected_sample String? fp_status = FPCheckAoU.FP_status diff --git a/wdl/tasks/QC/CollectPacBioAlignedMetrics.wdl b/wdl/deprecated/CollectPacBioAlignedMetrics.wdl similarity index 95% rename from wdl/tasks/QC/CollectPacBioAlignedMetrics.wdl rename to wdl/deprecated/CollectPacBioAlignedMetrics.wdl index b9d25d6f7..1f510802c 100644 --- a/wdl/tasks/QC/CollectPacBioAlignedMetrics.wdl +++ b/wdl/deprecated/CollectPacBioAlignedMetrics.wdl @@ -1,7 +1,7 @@ version 1.0 -import "../Utility/PBUtils.wdl" as PB -import "../Visualization/NanoPlot.wdl" as NP +import "../tasks/Utility/PBUtils.wdl" as PB +import "../tasks/Visualization/NanoPlot.wdl" as NP workflow CollectPacBioAlignedMetrics { @@ -46,6 +46,7 @@ workflow CollectPacBioAlignedMetrics { output { File custom_aln_metrics_summary = CustomMetricsSummaryToFile.custom_aln_metrics_summary File nanoplot_stats = NanoPlotFromBam.stats + Map[String, Float] alignment_metrics = NanoPlotFromBam.stats_map Array[File] nanoplot_pngs = NanoPlotFromBam.plots } } diff --git a/wdl/tasks/QC/CollectSMRTCellUnalignedMetrics.wdl b/wdl/deprecated/CollectSMRTCellUnalignedMetrics.wdl similarity index 94% rename from wdl/tasks/QC/CollectSMRTCellUnalignedMetrics.wdl rename to wdl/deprecated/CollectSMRTCellUnalignedMetrics.wdl index 551972276..44b6abd4f 100644 --- a/wdl/tasks/QC/CollectSMRTCellUnalignedMetrics.wdl +++ b/wdl/deprecated/CollectSMRTCellUnalignedMetrics.wdl @@ -1,7 +1,7 @@ version 1.0 -import "../Utility/PBUtils.wdl" as PB -import "../Utility/Utils.wdl" +import "../tasks/Utility/PBUtils.wdl" as PB +import "../tasks/Utility/Utils.wdl" workflow CollectSMRTCellUnalignedMetrics { diff --git a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl index 05a41636e..591a881d6 100644 --- a/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl +++ 
b/wdl/deprecated/PBCCSDemultiplexWholeGenome.wdl @@ -12,8 +12,8 @@ import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils import "../tasks/Alignment/AlignReads.wdl" as AR import "../tasks/QC/AlignedMetrics.wdl" as AM -import "../tasks/VariantCalling/CallVariantsPBCCS.wdl" as VAR import "../tasks/Utility/Finalize.wdl" as FF +import "../pipelines/TechAgnostic/VariantCalling/CallVariantsReadBased.wdl" as VAR workflow PBCCSDemultiplexWholeGenome { input { @@ -126,10 +126,16 @@ workflow PBCCSDemultiplexWholeGenome { bam = ccs_bam, bai = ccs_bai, - ref_fasta = ref_map['fasta'], - ref_fasta_fai = ref_map['fai'], - ref_dict = ref_map['dict'], - tandem_repeat_bed = ref_map['tandem_repeat_bed'], + prefix = participant_name, + is_ont = false, + is_r10_4_pore_or_later = false, + model_for_dv_andor_pepper = 'PACBIO', + ref_map_file = ref_map_file, + + call_svs = true, + pbsv_discover_per_chr = true, + call_small_variants = true, + run_clair3 = false } ########## diff --git a/wdl/deprecated/PBCLRWholeGenome.wdl b/wdl/deprecated/PBCLRWholeGenome.wdl index 4d0c1dabe..e946bda61 100644 --- a/wdl/deprecated/PBCLRWholeGenome.wdl +++ b/wdl/deprecated/PBCLRWholeGenome.wdl @@ -3,10 +3,12 @@ version 1.0 import "../tasks/Utility/PBUtils.wdl" as PB import "../tasks/Utility/Utils.wdl" as Utils import "../tasks/Utility/Finalize.wdl" as FF -import "../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV +import "SampleLevelAlignedMetrics.wdl" as COV import "tasks/CallVariantsPBCLR.wdl" as VAR +import "../pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl" as MERGE + workflow PBCLRWholeGenome { meta { @@ -32,6 +34,7 @@ workflow PBCLRWholeGenome { Array[File] aligned_bais File? bed_to_compute_coverage + String? bed_descriptor File ref_map_file @@ -70,8 +73,8 @@ workflow PBCLRWholeGenome { input: aligned_bam = bam, aligned_bai = bai, - ref_fasta = ref_map['fasta'], - bed_to_compute_coverage = bed_to_compute_coverage + bed_to_compute_coverage = bed_to_compute_coverage, + bed_descriptor = bed_descriptor } String dir = outdir + "/alignments" @@ -125,18 +128,20 @@ workflow PBCLRWholeGenome { File aligned_bai = FinalizeBai.gcs_path File aligned_pbi = FinalizePbi.gcs_path - Float aligned_num_reads = coverage.aligned_num_reads - Float aligned_num_bases = coverage.aligned_num_bases - Float aligned_frac_bases = coverage.aligned_frac_bases - Float aligned_est_fold_cov = coverage.aligned_est_fold_cov + # Float aligned_num_reads = coverage.aligned_num_reads + # Float aligned_num_bases = coverage.aligned_num_bases + # Float aligned_frac_bases = coverage.aligned_frac_bases + # Float aligned_est_fold_cov = coverage.aligned_est_fold_cov + + # Float aligned_read_length_mean = coverage.aligned_read_length_mean + # Float aligned_read_length_median = coverage.aligned_read_length_median + # Float aligned_read_length_stdev = coverage.aligned_read_length_stdev + # Float aligned_read_length_N50 = coverage.aligned_read_length_N50 - Float aligned_read_length_mean = coverage.aligned_read_length_mean - Float aligned_read_length_median = coverage.aligned_read_length_median - Float aligned_read_length_stdev = coverage.aligned_read_length_stdev - Float aligned_read_length_N50 = coverage.aligned_read_length_N50 + # Float average_identity = coverage.average_identity + # Float median_identity = coverage.median_identity - Float average_identity = coverage.average_identity - Float median_identity = coverage.median_identity + Map[String, Float] alignment_metrics = coverage.reads_stats File? 
bed_cov_summary = FinalizeRegionalCoverage.gcs_path ######################################## diff --git a/wdl/deprecated/PostprocessCCSedDemultiplexedSMRTCell.wdl b/wdl/deprecated/PostprocessCCSedDemultiplexedSMRTCell.wdl index 307167d1b..5fbf1bbd5 100644 --- a/wdl/deprecated/PostprocessCCSedDemultiplexedSMRTCell.wdl +++ b/wdl/deprecated/PostprocessCCSedDemultiplexedSMRTCell.wdl @@ -1,6 +1,6 @@ version 1.0 -import "../tasks/Alignment/AlignAndCheckFingerprintCCS.wdl" as major +import "AlignAndCheckFingerprintCCS.wdl" as major import "../tasks/Utility/BAMutils.wdl" import "../tasks/Utility/Utils.wdl" import "../tasks/Utility/GeneralUtils.wdl" as GU @@ -93,7 +93,7 @@ workflow PostprocessCCSedDemultiplexedSMRTCell { call GetDemxedFilePaths {input: demux_dir = bc_n_dir.right} - call BAMutils.GetReadGroupInfo as RG {input: uBAM = GetDemxedFilePaths.bam_path, keys = ['SM', 'LB']} + call BAMutils.GetReadGroupInfo as RG {input: bam = GetDemxedFilePaths.bam_path, keys = ['SM', 'LB']} call major.AlignAndCheckFingerprintCCS { input: diff --git a/wdl/deprecated/PreprocessBarcodedCCSedSMRTCell.wdl b/wdl/deprecated/PreprocessBarcodedCCSedSMRTCell.wdl index d6f66206a..bb65a087f 100644 --- a/wdl/deprecated/PreprocessBarcodedCCSedSMRTCell.wdl +++ b/wdl/deprecated/PreprocessBarcodedCCSedSMRTCell.wdl @@ -2,9 +2,10 @@ version 1.0 import "tasks/CCSLima.wdl" import "tasks/SMRTtools.wdl" -import "../tasks/QC/CollectSMRTCellUnalignedMetrics.wdl" as uBAMCustomMetrics -import "../tasks/Utility//PBUtils.wdl" as PB -import "../tasks/Utility//Finalize.wdl" as FF +import "CollectSMRTCellUnalignedMetrics.wdl" as uBAMCustomMetrics + +import "../tasks/Utility/PBUtils.wdl" as PB +import "../tasks/Utility/Finalize.wdl" as FF import "../tasks/Utility/Utils.wdl" import "../tasks/Utility/GeneralUtils.wdl" as GU diff --git a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl b/wdl/deprecated/SampleLevelAlignedMetrics.wdl similarity index 66% rename from wdl/tasks/QC/SampleLevelAlignedMetrics.wdl rename to wdl/deprecated/SampleLevelAlignedMetrics.wdl index 009eb747f..35b6a4706 100644 --- a/wdl/tasks/QC/SampleLevelAlignedMetrics.wdl +++ b/wdl/deprecated/SampleLevelAlignedMetrics.wdl @@ -1,8 +1,9 @@ version 1.0 -import "../Utility/Utils.wdl" -import "../Visualization/NanoPlot.wdl" as NP -import "../QC/AlignedMetrics.wdl" as AM +import "../tasks/Utility/Utils.wdl" +import "../tasks/Utility/BAMutils.wdl" as BU +import "../tasks/Visualization/NanoPlot.wdl" as NP +import "../tasks/QC/AlignedMetrics.wdl" as AM workflow SampleLevelAlignedMetrics { @@ -15,52 +16,47 @@ workflow SampleLevelAlignedMetrics { aligned_bai: "Index for the aligned BAM file" ref_fasta: "Reference FASTA file" bed_to_compute_coverage: "Optional BED file to compute coverage over" + bed_descriptor: "Description of the BED file, will be used in the file name so be careful naming things" } input { File aligned_bam File aligned_bai - File ref_fasta - File? bed_to_compute_coverage + String? 
bed_descriptor } - call Utils.ComputeGenomeLength { input: fasta = ref_fasta } - call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } - if (defined(bed_to_compute_coverage)) { - call AM.MosDepthOverBed { - input: - bam = aligned_bam, - bai = aligned_bai, - bed = select_first([bed_to_compute_coverage]) + if (!defined(bed_descriptor)) { + call Utils.StopWorkflow { input: reason = "Must provied descriptive name of the BED file if the file is provided."} } + } + + call NP.NanoPlotFromBam { input: bam = aligned_bam, bai = aligned_bai } + + call AM.MosDepthWGS { input: bam = aligned_bam, bai = aligned_bai, bed = bed_to_compute_coverage, bed_descriptor = bed_descriptor } + call BU.SamtoolsFlagStats { input: bam = aligned_bam, output_format = 'JSON' } + call BU.ParseFlagStatsJson { input: sam_flag_stats_json = SamtoolsFlagStats.flag_stats } + + if (defined(bed_to_compute_coverage)) { call SummarizeDepthOverWholeBed as cov_over_region { input: - mosdepth_output = MosDepthOverBed.regions + mosdepth_output = select_first([MosDepthWGS.regions]) } } output { + Float coverage = MosDepthWGS.wgs_cov + File coverage_per_chr = MosDepthWGS.summary_txt - File? bed_cov_summary = cov_over_region.cov_summary - - Float aligned_num_reads = NanoPlotFromBam.stats_map['number_of_reads'] - Float aligned_num_bases = NanoPlotFromBam.stats_map['number_of_bases_aligned'] - Float aligned_frac_bases = NanoPlotFromBam.stats_map['fraction_bases_aligned'] - Float aligned_est_fold_cov = NanoPlotFromBam.stats_map['number_of_bases_aligned']/ComputeGenomeLength.length - - Float aligned_read_length_mean = NanoPlotFromBam.stats_map['mean_read_length'] - Float aligned_read_length_median = NanoPlotFromBam.stats_map['median_read_length'] - Float aligned_read_length_stdev = NanoPlotFromBam.stats_map['read_length_stdev'] - Float aligned_read_length_N50 = NanoPlotFromBam.stats_map['n50'] + Map[String, Float] reads_stats = NanoPlotFromBam.stats_map + Array[File] nano_plots = NanoPlotFromBam.plots - Float average_identity = NanoPlotFromBam.stats_map['average_identity'] - Float median_identity = NanoPlotFromBam.stats_map['median_identity'] + Map[String, Float] sam_flag_stats = ParseFlagStatsJson.qc_pass_reads_SAM_flag_stats - Map[String, Float] reads_stats = NanoPlotFromBam.stats_map + File? bed_cov_summary = cov_over_region.cov_summary } } @@ -86,7 +82,7 @@ task SummarizeDepthOverWholeBed { command <<< set -euxo pipefail - echo 'chr start stop gene cov_mean' | awk 'BEGIN {OFS="\t"} {print}' > ~{prefix}.summary.txt + echo -e 'chr\tstart\tstop\tgene\tcov_mean' > ~{prefix}.summary.txt zcat ~{mosdepth_output} >> ~{prefix}.summary.txt >>> diff --git a/wdl/deprecated/tasks/CCSPepper.wdl b/wdl/deprecated/tasks/CCSPepper.wdl new file mode 100644 index 000000000..54f3d7ac8 --- /dev/null +++ b/wdl/deprecated/tasks/CCSPepper.wdl @@ -0,0 +1,96 @@ +version 1.0 + +####################################################### +# This pipeline calls small variants using DeepVariant. +####################################################### + +import "../../structs/Structs.wdl" + +task Pepper { + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + Int threads + Int memory + String zones + Boolean use_gpu = false + + RuntimeAttr? 
runtime_attr_override + } + + Int bam_sz = ceil(size(bam, "GB")) + Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 + + String output_root = "/cromwell_root/pepper_output" + + String prefix = basename(bam, ".bam") + ".pepper" + + command <<< + # avoid the infamous pipefail 141 https://stackoverflow.com/questions/19120263 + set -eux + SM=$(samtools view -H ~{bam} | grep -m1 '^@RG' | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g') + + set -euxo pipefail + + touch ~{bai} + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + # no gVCF as it Pepper simply doesn't produce gVCF on CCS data + run_pepper_margin_deepvariant \ + call_variant \ + -b ~{bam} \ + -f ~{ref_fasta} \ + -t "${num_core}" \ + -s "${SM}" \ + -o "~{output_root}" \ + -p "~{prefix}" \ + --phased_output \ + --ccs + + find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + + if [[ -f "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" ]]; then + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" \ + "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + fi + >>> + + output { + File hap_tagged_bam = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" + File hap_tagged_bai = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" + + # maybe less useful + File output_dir_structure = "~{output_root}/dir_structure.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 50, + preemptible_tries: 1, + max_retries: 1, + docker: "kishwars/pepper_deepvariant:r0.8" + if use_gpu then "-gpu" else "" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/deprecated/tasks/CallVariantsPBCLR.wdl b/wdl/deprecated/tasks/CallVariantsPBCLR.wdl index f0bc9e011..c4dbc9754 100644 --- a/wdl/deprecated/tasks/CallVariantsPBCLR.wdl +++ b/wdl/deprecated/tasks/CallVariantsPBCLR.wdl @@ -83,15 +83,16 @@ workflow CallVariants { locus = contig_for_sv } - call PBSV.RunPBSV { + call PBSV.Discover as pbsv_discover_chr { input: bam = SubsetBam.subset_bam, bai = SubsetBam.subset_bai, + is_hifi = true, ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, - prefix = prefix, tandem_repeat_bed = tandem_repeat_bed, - is_ccs = false, + chr = contig_for_sv, + prefix = prefix, zones = arbitrary.zones } @@ -114,12 +115,14 @@ workflow CallVariants { sample_name = InferSampleName.sample_name } } - - call VariantUtils.MergePerChrCalls as MergePBSVVCFs { + call PBSV.Call as pbsv_wg_call { input: - vcfs = RunPBSV.vcf, - ref_dict = ref_dict, - prefix = prefix + 
".pbsv" + svsigs = pbsv_discover_chr.svsig, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + is_hifi = true, + prefix = prefix + ".pbsv", + zones = arbitrary.zones } call VariantUtils.CollectDefinitions as UnionHeadersSnifflesVCFs { @@ -145,10 +148,9 @@ workflow CallVariants { ref_fasta_fai = ref_fasta_fai, prefix = prefix, tandem_repeat_bed = tandem_repeat_bed, - is_ccs = false, + is_hifi = true, zones = arbitrary.zones } - call VariantUtils.ZipAndIndexVCF as ZipAndIndexPBSV {input: vcf = PBSVslow.vcf } call Sniffles.Sniffles as SnifflesSlow { input: @@ -165,7 +167,7 @@ workflow CallVariants { File? sniffles_vcf = select_first([MergeSnifflesVCFs.vcf, ZipAndIndexSniffles.sortedVCF]) File? sniffles_tbi = select_first([MergeSnifflesVCFs.tbi, ZipAndIndexSniffles.tbi]) - File? pbsv_vcf = select_first([MergePBSVVCFs.vcf, ZipAndIndexPBSV.vcfgz]) - File? pbsv_tbi = select_first([MergePBSVVCFs.tbi, ZipAndIndexPBSV.tbi]) + File? pbsv_vcf = select_first([pbsv_wg_call.vcf, PBSVslow.vcf]) + File? pbsv_tbi = select_first([pbsv_wg_call.tbi, PBSVslow.tbi]) } } diff --git a/wdl/tasks/VariantCalling/ONTPepper.wdl b/wdl/deprecated/tasks/ONTPepper.wdl similarity index 95% rename from wdl/tasks/VariantCalling/ONTPepper.wdl rename to wdl/deprecated/tasks/ONTPepper.wdl index 7bf3e7adb..f3ab23b30 100644 --- a/wdl/tasks/VariantCalling/ONTPepper.wdl +++ b/wdl/deprecated/tasks/ONTPepper.wdl @@ -17,6 +17,7 @@ task Pepper { bai: "The input bam index file." ref_fasta: "The reference fasta file." ref_fasta_fai: "The reference fasta index file." + model: "Model to uses" threads: "The number of threads to use." memory: "The amount of memory to use." # when running large scale workflows, we sometimes see errors like the following @@ -33,9 +34,12 @@ task Pepper { File ref_fasta File ref_fasta_fai + String model + Int threads Int memory + Boolean use_gpu = false String zones = "us-central1-b us-central1-c" RuntimeAttr? runtime_attr_override @@ -73,7 +77,7 @@ task Pepper { -p "~{prefix}" \ --gvcf \ --phased_output \ - --ont + ~{model} df -h . find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ @@ -111,10 +115,10 @@ task Pepper { cpu_cores: threads, mem_gb: memory, disk_gb: disk_size, - boot_disk_gb: 100, + boot_disk_gb: 50, preemptible_tries: 1, max_retries: 1, - docker: "kishwars/pepper_deepvariant:r0.4.1" + docker: "kishwars/pepper_deepvariant:r0.8" + if use_gpu then "-gpu" else "" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { diff --git a/wdl/deprecated/tasks/PEPPER-MARGIN-DeepVariant.wdl b/wdl/deprecated/tasks/PEPPER-MARGIN-DeepVariant.wdl new file mode 100644 index 000000000..ac5aec1ef --- /dev/null +++ b/wdl/deprecated/tasks/PEPPER-MARGIN-DeepVariant.wdl @@ -0,0 +1,83 @@ +version 1.0 + +import "ONTPepper.wdl" + +import "../../tasks/Utility/VariantUtils.wdl" +import "../../tasks/Utility/Utils.wdl" +import "../../tasks/Alignment/WhatsHap.wdl" + +workflow Run { + meta { + desciption: + "Runs Clair3 on the input (sharded) BAM." 
+ } + parameter_meta { + how_to_shard_wg_for_calling: "An array of the BAM's shard; each element is assumed to be a tuple of (ID for the shard, (BAM of the shard, BAI of the shard))" + prefix: "Prefix for output files" + model_for_pepper_margin_dv: "refer to https://github.com/kishwarshafin/pepper for appropriate values" + } + + input { + Array[Pair[String, Pair[File, File]]] how_to_shard_wg_for_calling + String prefix + String model_for_pepper_margin_dv + + Map[String, String] ref_map + + # optimization + Int dv_threads + Int dv_memory + String zones = "us-central1-a us-central1-b us-central1-c us-central1-f" + } + output { + File legacy_ont_dvp_g_vcf = MergePEPPERGVCFs.vcf + File legacy_ont_dvp_g_tbi = MergePEPPERGVCFs.tbi + File legacy_ont_dvp_phased_vcf = MergePEPPERPhasedVCFs.vcf + File legacy_ont_dvp_phased_tbi = MergePEPPERPhasedVCFs.tbi + File legacy_ont_dvp_haplotagged_bam = MergePEPPERHapTaggedBam.merged_bam + File legacy_ont_dvp_haplotagged_bai = MergePEPPERHapTaggedBam.merged_bai + File legacy_ont_dvp_phased_vcf_stats_tsv = ONTPhaseStatsLegacy.stats_tsv + File legacy_ont_dvp_phased_vcf_stats_gtf = ONTPhaseStatsLegacy.stats_gtf + } + + scatter (triplet in how_to_shard_wg_for_calling) { + call ONTPepper.Pepper { + input: + bam = triplet.right.left, + bai = triplet.right.right, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + model = model_for_pepper_margin_dv, + threads = select_first([dv_threads]), + memory = select_first([dv_memory]), + zones = zones + } + } + + String pepper_prefix = prefix + ".PEPPER-Margin-DeepVariant" + + call VariantUtils.MergeAndSortVCFs as MergePEPPERGVCFs { + input: + vcfs = Pepper.gVCF, + prefix = pepper_prefix + ".g", + ref_fasta_fai = ref_map['fai'] + } + + call VariantUtils.MergeAndSortVCFs as MergePEPPERPhasedVCFs { + input: + vcfs = Pepper.phasedVCF, + prefix = pepper_prefix + ".phased", + ref_fasta_fai = ref_map['fai'] + } + + call Utils.MergeBams as MergePEPPERHapTaggedBam { + input: + bams = Pepper.hap_tagged_bam, + prefix = prefix + ".MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged" + } + + call WhatsHap.Stats as ONTPhaseStatsLegacy { + input: + phased_vcf=MergePEPPERPhasedVCFs.vcf, phased_tbi=MergePEPPERPhasedVCFs.tbi + } +} diff --git a/wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl b/wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl index 9ec226628..c39d5dd63 100644 --- a/wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl +++ b/wdl/pipelines/ONT/Preprocessing/ONTFlowcellFromMultipleBasecalls.wdl @@ -2,9 +2,10 @@ version 1.0 import "../../../tasks/Utility/Utils.wdl" as Utils import "../../../tasks/Utility/GeneralUtils.wdl" as GU +import "../../../tasks/Utility/ONTUtils.wdl" import "../../../tasks/Utility/Finalize.wdl" as FF -import "../../../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV +import "../../../deprecated/SampleLevelAlignedMetrics.wdl" as COV workflow ONTFlowcellFromMultipleBasecalls { input { @@ -40,7 +41,7 @@ workflow ONTFlowcellFromMultipleBasecalls { File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]]) File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]]) if (bams_suspected_to_contain_dup_record) { - call Utils.DeduplicateBam as RemoveDuplicates { + call ONTUtils.DeduplicateBam as RemoveDuplicates { input: aligned_bam = bam, aligned_bai = bai, same_name_as_input = true } } @@ -52,7 +53,6 @@ workflow ONTFlowcellFromMultipleBasecalls { input: aligned_bam = usable_bam, aligned_bai = usable_bai, - ref_fasta = ref_map['fasta'], 
bed_to_compute_coverage = bed_to_compute_coverage
     }
diff --git a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
index 7e0470bfb..3cdd63875 100644
--- a/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
+++ b/wdl/pipelines/ONT/VariantCalling/ONTWholeGenome.wdl
@@ -1,11 +1,9 @@
 version 1.0

-import "../../../tasks/Utility/ONTUtils.wdl" as ONT
-import "../../../tasks/Utility/Utils.wdl" as Utils
-import "../../../tasks/VariantCalling/CallVariantsONT.wdl" as VAR
-import "../../../tasks/Utility/Finalize.wdl" as FF
+import "../../../tasks/Utility/GeneralUtils.wdl" as GU

-import "../../../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV
+import "../../TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl" as MERGE
+import "../../TechAgnostic/VariantCalling/CallVariantsReadBased.wdl" as VAR

 workflow ONTWholeGenome {

@@ -13,201 +11,197 @@ workflow ONTWholeGenome {
         description: "A workflow that performs single sample variant calling on Oxford Nanopore reads from one or more flow cells. The workflow merges multiple flowcells into a single BAM prior to variant calling."
     }
     parameter_meta {
+        gcs_out_root_dir:   "GCS bucket to store the reads, variants, and metrics files"
+
         aligned_bams:       "GCS path to aligned BAM files"
         aligned_bais:       "GCS path to aligned BAM file indices"
-        participant_name:   "name of the participant from whom these samples were obtained"
+        sample_name:        "sample name as encoded in the bams"

-        ref_map_file:       "table indicating reference sequence and auxillary file locations"
-        gcs_out_root_dir:   "GCS bucket to store the reads, variants, and metrics files"
+        bams_suspected_to_contain_dup_record: "Some ONT output files from basecall dirs have a strange duplicate issue."
+        is_r10_4_pore_or_later: "Indicates which pore version was used to generate the data; when true, the DV (>=1.5.0) toolchain will be used."
+        model_for_dv_andor_pepper: "model string to be used on DV or the PEPPER-Margin-DeepVariant toolchain. Please refer to their github pages for accepted values."
+
+        ref_map_file:       "table indicating reference sequence and auxiliary file locations"
+        ref_scatter_interval_list_locator: "A file holding paths to interval_list files, used for custom sharding of the input BAM; when not provided, will shard WG by contig (possibly slower)"
+        ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; when not provided, will shard WG by contig (possibly slower)"
+
+        bed_to_compute_coverage: "BED file holding regions-of-interest for computing coverage over."
+        bed_descriptor: "Short description of the BED file; it will be used in output file names, so name it carefully"

         call_svs:           "whether to call SVs"
-        fast_less_sensitive_sv:  "to trade less sensitive SV calling for faster speed"
+        pbsv_discover_per_chr: "Run the discover stage of PBSV per chromosome"

         call_small_variants: "whether to call small variants"
-        call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria; if true, some samples might fail (caller feature) due to lack of signal"
-        sites_vcf:     "for use with Clair"
-        sites_vcf_tbi: "for use with Clair"
-        run_dv_pepper_analysis:  "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
-        ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper"
-        ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper"
-    }
+        run_clair3: "to turn on Clair3 analysis or not (non-trivial increase in cost and runtime)"

-    input {
-        Array[File] aligned_bams
-        Array[File] aligned_bais
-        Boolean bams_suspected_to_contain_dup_record
+        use_margin_for_tagging: "if false, will use margin-phased small-variant VCF for haplotagging the BAM; applicable only when input data isn't ONT data with pore older than R10.4"

-        File? bed_to_compute_coverage
+        gcp_zones: "which Google Cloud Zones to use (this has implications on how many GPUs are available and egress costs, so configure carefully)"

-        File ref_map_file
+        # outputs
+        haplotagged_bam: "BAM haplotagged using a small variant single-sample VCF."
+        haplotagged_bai: "Index for haplotagged_bam."
+        haplotagged_bam_tagger: "VCF used for doing the haplotagging. 'Legacy' if the input is ONT data generated on pores before R10.4."

-        String participant_name
+        legacy_g_vcf: "PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4."
+        legacy_g_tbi: "Index for PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4."
+        legacy_phased_vcf: "Phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4."
+        legacy_phased_tbi: "Index for phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4."
+        legacy_phasing_stats_tsv: "Phasing stats of legacy_phased_vcf in TSV format; available only when input is ONT data generated on pores older than R10.4."
+        legacy_phasing_stats_gtf: "Phasing stats of legacy_phased_vcf in GTF format; available only when input is ONT data generated on pores older than R10.4."

-        String gcs_out_root_dir
+        dv_g_vcf: "DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_g_tbi: "Index for DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_margin_phased_vcf: "Phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_margin_phased_tbi: "Index for phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_margin_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_margin_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_whatshap_phased_vcf: "Phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_whatshap_phased_tbi: "Index for phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_whatshap_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_whatshap_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."

-    Boolean call_svs = true
-    Boolean? fast_less_sensitive_sv = true
+        dv_nongpu_resources_usage_visual: "Resource usage monitoring log visualization for DV (per shard); available for CCS data and ONT data generated with pores >= R10.4."
+    }

-    Boolean call_small_variants = true
-    Boolean? call_small_vars_on_mitochondria = false
-    File? sites_vcf
-    File? sites_vcf_tbi
+    input {
+        String gcs_out_root_dir
+
+        # sample specific
+        String sample_name
+        Array[File] aligned_bams
+        Array[File] aligned_bais
+        Boolean bams_suspected_to_contain_dup_record
+        Boolean is_r10_4_pore_or_later
+        String model_for_dv_andor_pepper

-        Boolean? run_dv_pepper_analysis = true
-        Int? dvp_threads = 32
-        Int? dvp_memory = 128
+        # reference-specific
+        File ref_map_file
         File? ref_scatter_interval_list_locator
         File? ref_scatter_interval_list_ids
-    }
-
-    Map[String, String] ref_map = read_map(ref_map_file)
+        File? bed_to_compute_coverage
+        String? bed_descriptor

-    String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTWholeGenome/~{participant_name}"
+        # user choice
+        Boolean call_svs = true
+        Boolean pbsv_discover_per_chr = true
+        Int minsvlen = 50

-    # gather across (potential multiple) input raw BAMs
-    if (length(aligned_bams) > 1) {
-        scatter (pair in zip(aligned_bams, aligned_bais)) {
-            call Utils.InferSampleName {input: bam = pair.left, bai = pair.right}
-        }
-        call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name}
+        Boolean call_small_variants = true
+        Boolean run_clair3 = false
+        Boolean use_margin_for_tagging = true
+        Int dv_threads = 16
+        Int dv_memory = 64
+        Boolean use_gpu = false

-        call Utils.MergeBams as MergeAllReads { input: bams = aligned_bams, prefix = participant_name }
+        Array[String] gcp_zones = ['us-central1-a', 'us-central1-b', 'us-central1-c', 'us-central1-f']
     }

-    File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]])
-    File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]])
-
-    if (bams_suspected_to_contain_dup_record) {
-        call Utils.DeduplicateBam as RemoveDuplicates {
-            input: aligned_bam = bam, aligned_bai = bai
-        }
-    }
-    File usable_bam = select_first([RemoveDuplicates.corrected_bam, bam])
-    File usable_bai = select_first([RemoveDuplicates.corrected_bai, bai])
+    String workflow_name = "ONTWholeGenome"
+    String outdir = sub(gcs_out_root_dir, "/$", "") + "/~{workflow_name}/~{sample_name}"

-    call COV.SampleLevelAlignedMetrics as coverage {
+    ###########################################################
+    call MERGE.Work as MergeAndMetrics {
         input:
-            aligned_bam = usable_bam,
-            aligned_bai = usable_bai,
-            ref_fasta = ref_map['fasta'],
-            bed_to_compute_coverage = bed_to_compute_coverage
-    }
+            gcs_out_dir = outdir,

-    String dir = outdir + "/alignments"
+            sample_name = sample_name,
+            aligned_bams = aligned_bams,
+            aligned_bais = aligned_bais,

-    call FF.FinalizeToFile as FinalizeBam { input: outdir = dir,
file = usable_bam, name = "~{participant_name}.bam" } - call FF.FinalizeToFile as FinalizeBai { input: outdir = dir, file = usable_bai, name = "~{participant_name}.bam.bai" } + is_ont = true, + bams_suspected_to_contain_dup_record = bams_suspected_to_contain_dup_record, - if (defined(bed_to_compute_coverage)) { call FF.FinalizeToFile as FinalizeRegionalCoverage { input: outdir = dir, file = select_first([coverage.bed_cov_summary]) } } + bed_to_compute_coverage = bed_to_compute_coverage, + bed_descriptor = bed_descriptor + } #################################################################################################### if (call_svs || call_small_variants) { - - # verify arguments are provided - if (call_svs) { - if (! defined(fast_less_sensitive_sv)) {call Utils.StopWorkflow as fast_less_sensitive_sv_not_provided {input: reason = "Calling SVs without specifying arg fast_less_sensitive_sv"}} - } - if (call_small_variants) { - if (! defined(call_small_vars_on_mitochondria)) {call Utils.StopWorkflow as call_small_vars_on_mitochondria_not_provided {input: reason = "Unprovided arg call_small_vars_on_mitochondria"}} - if (! defined(run_dv_pepper_analysis)) {call Utils.StopWorkflow as run_dv_pepper_analysis_not_provided {input: reason = "Unprovided arg run_dv_pepper_analysis"}} - if (! defined(dvp_threads)) {call Utils.StopWorkflow as dvp_threads_not_provided {input: reason = "Unprovided arg dvp_threads"}} - if (! defined(ref_scatter_interval_list_locator)) {call Utils.StopWorkflow as ref_scatter_interval_list_locator_not_provided {input: reason = "Unprovided arg ref_scatter_interval_list_locator"}} - if (! defined(ref_scatter_interval_list_ids)) {call Utils.StopWorkflow as ref_scatter_interval_list_ids_not_provided {input: reason = "Unprovided arg ref_scatter_interval_list_ids"}} - } - call VAR.CallVariants { input: - bam = usable_bam, - bai = usable_bai, - sample_id = participant_name, - ref_fasta = ref_map['fasta'], - ref_fasta_fai = ref_map['fai'], - ref_dict = ref_map['dict'], - tandem_repeat_bed = ref_map['tandem_repeat_bed'], + gcs_out_dir = outdir, - prefix = participant_name, + bam = MergeAndMetrics.aligned_bam, + bai = MergeAndMetrics.aligned_bai, + prefix = sample_name, - call_svs = call_svs, - fast_less_sensitive_sv = select_first([fast_less_sensitive_sv]), - - call_small_variants = call_small_variants, - call_small_vars_on_mitochondria = select_first([call_small_vars_on_mitochondria]), - sites_vcf = sites_vcf, - sites_vcf_tbi = sites_vcf_tbi, - - run_dv_pepper_analysis = select_first([run_dv_pepper_analysis]), - dvp_threads = select_first([dvp_threads]), - dvp_memory = select_first([dvp_memory]), - ref_scatter_interval_list_locator = select_first([ref_scatter_interval_list_locator]), - ref_scatter_interval_list_ids = select_first([ref_scatter_interval_list_ids]) - } + is_ont = true, + is_r10_4_pore_or_later = is_r10_4_pore_or_later, + model_for_dv_andor_pepper = model_for_dv_andor_pepper, - String svdir = outdir + "/variants/sv" - String smalldir = outdir + "/variants/small" + ref_map_file = ref_map_file, + ref_scatter_interval_list_locator = ref_scatter_interval_list_locator, + ref_scatter_interval_list_ids = ref_scatter_interval_list_ids, - if (call_svs) { - call FF.FinalizeToFile as FinalizePBSV { input: outdir = svdir, file = select_first([CallVariants.pbsv_vcf]) } - call FF.FinalizeToFile as FinalizePBSVtbi { input: outdir = svdir, file = select_first([CallVariants.pbsv_tbi]) } + call_svs = call_svs, + pbsv_discover_per_chr = pbsv_discover_per_chr, + minsvlen = minsvlen, - 
call FF.FinalizeToFile as FinalizeSniffles { input: outdir = svdir, file = select_first([CallVariants.sniffles_vcf]) } - call FF.FinalizeToFile as FinalizeSnifflesTbi { input: outdir = svdir, file = select_first([CallVariants.sniffles_tbi]) } - } + call_small_variants = call_small_variants, + run_clair3 = run_clair3, + use_margin_for_tagging = use_margin_for_tagging, + dv_threads = dv_threads, + dv_memory = dv_memory, + use_gpu = use_gpu, - if (call_small_variants) { - call FF.FinalizeToFile as FinalizeClairVcf { input: outdir = smalldir, file = select_first([CallVariants.clair_vcf])} - call FF.FinalizeToFile as FinalizeClairTbi { input: outdir = smalldir, file = select_first([CallVariants.clair_tbi])} - - call FF.FinalizeToFile as FinalizeClairGVcf { input: outdir = smalldir, file = select_first([CallVariants.clair_gvcf])} - call FF.FinalizeToFile as FinalizeClairGTbi { input: outdir = smalldir, file = select_first([CallVariants.clair_gtbi])} - - if (select_first([run_dv_pepper_analysis])) { - call FF.FinalizeToFile as FinalizeDVPepperVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_vcf])} - call FF.FinalizeToFile as FinalizeDVPepperTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_tbi])} - call FF.FinalizeToFile as FinalizeDVPepperGVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_g_vcf])} - call FF.FinalizeToFile as FinalizeDVPepperGTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_g_tbi])} - call FF.FinalizeToFile as FinalizeDVPepperPhasedVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_phased_vcf])} - call FF.FinalizeToFile as FinalizeDVPepperPhasedTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_phased_tbi])} - } + gcp_zones = gcp_zones } } -output { - File merged_bam = FinalizeBam.gcs_path - File merged_bai = FinalizeBai.gcs_path - - Float aligned_num_reads = coverage.aligned_num_reads - Float aligned_num_bases = coverage.aligned_num_bases - Float aligned_frac_bases = coverage.aligned_frac_bases - Float aligned_est_fold_cov = coverage.aligned_est_fold_cov - - Float aligned_read_length_mean = coverage.aligned_read_length_mean - Float aligned_read_length_median = coverage.aligned_read_length_median - Float aligned_read_length_stdev = coverage.aligned_read_length_stdev - Float aligned_read_length_N50 = coverage.aligned_read_length_N50 + #################################################################################################### + call GU.GetTodayDate as today {} - Float average_identity = coverage.average_identity - Float median_identity = coverage.median_identity + ########################################################### + output { + String last_processing_date = today.yyyy_mm_dd - File? bed_cov_summary = FinalizeRegionalCoverage.gcs_path ######################################## - File? pbsv_vcf = FinalizePBSV.gcs_path - File? pbsv_tbi = FinalizePBSVtbi.gcs_path - - File? sniffles_vcf = FinalizeSniffles.gcs_path - File? sniffles_tbi = FinalizeSnifflesTbi.gcs_path + File aligned_bam = MergeAndMetrics.aligned_bam + File aligned_bai = MergeAndMetrics.aligned_bai - File? clair_vcf = FinalizeClairVcf.gcs_path - File? clair_tbi = FinalizeClairTbi.gcs_path + Float coverage = MergeAndMetrics.coverage + File? bed_cov_summary = MergeAndMetrics.bed_cov_summary - File? clair_gvcf = FinalizeClairGVcf.gcs_path - File? clair_gtbi = FinalizeClairGTbi.gcs_path + Map[String, Float] alignment_metrics = MergeAndMetrics.alignment_metrics - File? 
dvp_vcf = FinalizeDVPepperVcf.gcs_path - File? dvp_tbi = FinalizeDVPepperTbi.gcs_path - File? dvp_g_vcf = FinalizeDVPepperGVcf.gcs_path - File? dvp_g_tbi = FinalizeDVPepperGTbi.gcs_path - File? dvp_phased_vcf = FinalizeDVPepperPhasedVcf.gcs_path - File? dvp_phased_tbi = FinalizeDVPepperPhasedTbi.gcs_path + ######################################## + File? pbsv_vcf = CallVariants.pbsv_vcf + File? pbsv_tbi = CallVariants.pbsv_tbi + + File? sniffles_vcf = CallVariants.sniffles_vcf + File? sniffles_tbi = CallVariants.sniffles_tbi + File? sniffles_snf = CallVariants.sniffles_snf + + File? sniffles_phased_vcf = CallVariants.sniffles_phased_vcf + File? sniffles_phased_tbi = CallVariants.sniffles_phased_tbi + File? sniffles_phased_snf = CallVariants.sniffles_phased_snf + + File? clair_vcf = CallVariants.clair_vcf + File? clair_tbi = CallVariants.clair_tbi + File? clair_gvcf = CallVariants.clair_gvcf + File? clair_gtbi = CallVariants.clair_gtbi + + # available for ONT >= R10.4 data, if small variants are requested + File? dv_g_vcf = CallVariants.dv_g_vcf + File? dv_g_tbi = CallVariants.dv_g_tbi + File? dv_margin_phased_vcf = CallVariants.dv_margin_phased_vcf + File? dv_margin_phased_tbi = CallVariants.dv_margin_phased_tbi + File? dv_vcf_margin_phasing_stats_tsv = CallVariants.dv_vcf_margin_phasing_stats_tsv + File? dv_vcf_margin_phasing_stats_gtf = CallVariants.dv_vcf_margin_phasing_stats_gtf + File? dv_whatshap_phased_vcf = CallVariants.dv_whatshap_phased_vcf + File? dv_whatshap_phased_tbi = CallVariants.dv_whatshap_phased_tbi + File? dv_vcf_whatshap_phasing_stats_tsv = CallVariants.dv_vcf_whatshap_phasing_stats_tsv + File? dv_vcf_whatshap_phasing_stats_gtf = CallVariants.dv_vcf_whatshap_phasing_stats_gtf + String? dv_nongpu_resources_usage_visual = CallVariants.dv_nongpu_resources_usage_visual + + # available for ONT < R10.4 data, if small variants are requested + File? legacy_g_vcf = CallVariants.legacy_g_vcf + File? legacy_g_tbi = CallVariants.legacy_g_tbi + File? legacy_phased_vcf = CallVariants.legacy_phased_vcf + File? legacy_phased_tbi = CallVariants.legacy_phased_tbi + File? legacy_phasing_stats_tsv = CallVariants.legacy_phasing_stats_tsv + File? legacy_phasing_stats_gtf = CallVariants.legacy_phasing_stats_gtf } } diff --git a/wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl b/wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl index 6a0ee904d..0822f36ca 100644 --- a/wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl +++ b/wdl/pipelines/PacBio/Assembly/PBAssembleWithHifiasm.wdl @@ -1,34 +1,45 @@ version 1.0 +import "../../../tasks/Utility/GeneralUtils.wdl" as GU import "../../../tasks/Utility/Utils.wdl" as Utils +import "../../../tasks/Utility/Finalize.wdl" as FF + import "../../../tasks/Assembly/Hifiasm.wdl" as HA import "../../../tasks/QC/Quast.wdl" as QuastEval -import "../../../tasks/Utility/Finalize.wdl" as FF workflow PBAssembleWithHifiasm { meta { - description: "A workflow that performs single sample genome assembly on PacBio HiFi reads from one or more SMRT cells. The multiple SMRT cells data are merged prior to assembly." + description: "A workflow that performs single, diploid sample genome assembly on PacBio HiFi reads from one or more SMRT cells. The multiple SMRT cells data are merged prior to assembly." 
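The "merged prior to assembly" behavior mentioned in the description above uses the conditional-merge idiom seen throughout this PR: merge only when more than one input FASTQ is given, then fall back to the lone input via select_first(). The actual merging call sits outside the hunks shown here, so the task name in this sketch is a placeholder; the MergeAllFastqs alias and the fallback line match what appears further down in this file:

    if (length(ccs_fqs) > 1) {
        # placeholder for the repo's actual FASTQ-merging task
        call Utils.MergeFastqs as MergeAllFastqs { input: fastqs = ccs_fqs }
    }
    # if the merge block was skipped (single SMRT cell), fall back to the only FASTQ provided
    File ccs_fq = select_first([MergeAllFastqs.merged_fastq, ccs_fqs[0]])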
} parameter_meta { ccs_fqs: "GCS path to CCS fastq files" - - participant_name: "name of the participant from whom these samples were obtained" prefix: "prefix for output files" ref_fasta_for_eval: "Reference Fasta used for evaluating " gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + + hifiasm_primary_gfa: "primary assembly GFA output (gzipped)" + + hifiasm_alternate_tigs: "alternative assembly FASTA output (block gzipped)" + + hifiasm_haploGFAs: "path to folder hosting GFA files (gzipped) for the haplotype-resolved assemblies" + hifiasm_haplotigs: "path to folder hosting FASTA files (block gzipped) for the haplotype-resolved assemblies" + + quast_report_html: "QUAST report on [primary, H0, H1] assemblies FASTA, in HTML format" + quast_summary_on_all: "QUAST summary on [primary, H0, H1] assemblies FASTA" } input { Array[File] ccs_fqs - String participant_name String prefix File? ref_fasta_for_eval String gcs_out_root_dir + + Array[String] gcp_zones = ['us-central1-a', 'us-central1-b', 'us-central1-c', 'us-central1-f'] } ######################################################################################### @@ -37,10 +48,15 @@ workflow PBAssembleWithHifiasm { } File ccs_fq = select_first([ MergeAllFastqs.merged_fastq, ccs_fqs[0] ]) + ######################################################################################### + call GU.CollapseArrayOfStrings as get_zones {input: input_array = gcp_zones, joiner = " "} + String wdl_parsable_zones = get_zones.collapsed + call HA.Hifiasm { input: reads = ccs_fq, - prefix = prefix + prefix = prefix, + zones = wdl_parsable_zones } # todo: assumes ploidy 2 @@ -49,8 +65,8 @@ workflow PBAssembleWithHifiasm { ref = ref_fasta_for_eval, is_large = true, assemblies = [Hifiasm.primary_tigs, - Hifiasm.phased_tigs[0], - Hifiasm.phased_tigs[1]] + Hifiasm.hap1_tigs, + Hifiasm.hap2_tigs] } call QuastEval.SummarizeQuastReport as primary_h0_h1_quast_summary { @@ -62,62 +78,105 @@ workflow PBAssembleWithHifiasm { String workflow_name = "PBAssembleWithHifiasm" String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name + "/~{prefix}" - String dir = outdir + "/assembly" + ########## # merged FASTQ String dummy = basename(ccs_fq) String dummy_b = sub(dummy, ".gz$", "") if (dummy != dummy_b) { - call FF.FinalizeToFile as FinalizeMergedFQ { input: outdir = dir, file = ccs_fq, name = prefix + ".fq.gz" } + call FF.FinalizeToFile as FinalizeMergedFQ { input: outdir = outdir, file = ccs_fq, name = prefix + ".fq.gz" } } if (dummy == dummy_b) { - call FF.CompressAndFinalize as CompressAndFinalizeMergedFQ { input: outdir = dir, file = ccs_fq, name = prefix + ".fq.gz" } + call FF.CompressAndFinalize as CompressAndFinalizeMergedFQ { input: outdir = outdir, file = ccs_fq, name = prefix + ".fq.gz" } } String finalized_merged_fq_path = select_first([FinalizeMergedFQ.gcs_path, CompressAndFinalizeMergedFQ.gcs_path]) - + ########## # assembly results themselves - call FF.CompressAndFinalize as FinalizeHifiasmPrimaryGFA { input: outdir = dir, file = Hifiasm.primary_gfa } - call FF.CompressAndFinalize as FinalizeHifiasmPrimaryFA { input: outdir = dir, file = Hifiasm.primary_tigs } - - call FF.CompressAndFinalize as FinalizeHifiasmAlternateGFA { input: outdir = dir, file = Hifiasm.alternate_gfa } - call FF.CompressAndFinalize as FinalizeHifiasmAlternateFA { input: outdir = dir, file = Hifiasm.alternate_tigs } - - call FF.FinalizeAndCompress as FinalizeHifiasmHapGFAs { input: outdir = dir, files = Hifiasm.phased_gfas, prefix = prefix + 
".haploGFAs" } - call FF.FinalizeAndCompress as FinalizeHifiasmHapFAs { input: outdir = dir, files = Hifiasm.phased_tigs, prefix = prefix + ".haploTigs" } + String asm_dir = outdir + "/assembly" + + # primary/alt + call FF.CompressAndFinalize as FinalizeHifiasmPrimaryGFA { input: outdir = asm_dir, file = Hifiasm.primary_gfa } + call FF.FinalizeToFile as FinalizeHifiasmPrimaryFA { input: outdir = asm_dir, file = Hifiasm.primary_tigs } + call FF.FinalizeToFile as FinalizeHifiasmPrimaryGzi { input: outdir = asm_dir, file = Hifiasm.primary_tigs_gzi } + + call FF.CompressAndFinalize as FinalizeHifiasmAlternateGFA { input: outdir = asm_dir, file = Hifiasm.alternate_gfa } + call FF.FinalizeToFile as FinalizeHifiasmAlternateFA { input: outdir = asm_dir, file = Hifiasm.alternate_tigs } + call FF.FinalizeToFile as FinalizeHifiasmAlternateGzi { input: outdir = asm_dir, file = Hifiasm.alternate_tigs_gzi } + + # H1/H2 + call FF.FinalizeToFile as FinalizeHifiasmHapOneGFA { input: outdir = asm_dir, file = Hifiasm.hap1_gfa } + call FF.FinalizeToFile as FinalizeHifiasmHapOneFASTA { input: outdir = asm_dir, file = Hifiasm.hap1_tigs } + call FF.FinalizeToFile as FinalizeHifiasmHapOneFaGzi { input: outdir = asm_dir, file = Hifiasm.hap1_tig_gzi } + call FF.FinalizeToFile as FinalizeHifiasmHapTwoGFA { input: outdir = asm_dir, file = Hifiasm.hap2_gfa } + call FF.FinalizeToFile as FinalizeHifiasmHapTwoFASTA { input: outdir = asm_dir, file = Hifiasm.hap2_tigs } + call FF.FinalizeToFile as FinalizeHifiasmHapTwoFaGzi { input: outdir = asm_dir, file = Hifiasm.hap2_tig_gzi } + + call FF.FinalizeToFile as FinalizeHifiasmLog { input: + outdir = asm_dir, file = Hifiasm.log_in_hap_mode, name = "~{prefix}.hifiasm-hapmode.log" + } + call FF.FinalizeToFile as FinalizeHifiasmResourceUsagesVisual { input: + outdir = asm_dir, file = Hifiasm.resource_usage_visual_in_hap_mode + } + ########## + # quast stuff + String quast_dir = outdir + "/quast" call FF.FinalizeToFile as FinalizeQuastReportHtml { - input: outdir = dir, file = primary_h0_h1_quast.report_html - } - call FF.FinalizeAndCompress as FinalizeQuastReports { - input: outdir = dir, files = primary_h0_h1_quast.report_in_various_formats, prefix = prefix + ".quast_reports" + input: outdir = quast_dir, file = primary_h0_h1_quast.report_html, name = '~{prefix}.quast-report.html' } call FF.FinalizeToFile as FinalizeQuastSummaryAll { - input: outdir = dir, file = select_first([primary_h0_h1_quast_summary.quast_metrics_together]) - } - scatter (report in select_first([primary_h0_h1_quast_summary.quast_metrics]) ) { - call FF.FinalizeToFile as FinalizeQuastIndividualSummary { input: outdir = dir, file = report } + input: outdir = quast_dir, file = primary_h0_h1_quast_summary.quast_metrics_together, name = '~{prefix}.quast-summary.txt' } + call FF.FinalizeToDir as FinalizeQuastReports { input: + outdir = quast_dir + "/misc/", - output { - File merged_fq = finalized_merged_fq_path - - File hifiasm_primary_gfa = FinalizeHifiasmPrimaryGFA.gcs_path - File hifiasm_primary_tigs = FinalizeHifiasmPrimaryFA.gcs_path - - File hifiasm_haploGFAs = FinalizeHifiasmHapGFAs.gcs_path - File hifiasm_haplotigs = FinalizeHifiasmHapFAs.gcs_path + files = flatten([primary_h0_h1_quast.plots, + primary_h0_h1_quast.report_in_various_formats, + primary_h0_h1_quast_summary.quast_metrics]) + } + if (defined(primary_h0_h1_quast.contigs_reports)) { + call FF.FinalizeToFile as FinalizeQuastContigsReport { input: outdir = quast_dir, file = select_first([primary_h0_h1_quast.contigs_reports])} + } - File 
hifiasm_alternate_gfa = FinalizeHifiasmAlternateGFA.gcs_path - File hifiasm_alternate_tigs = FinalizeHifiasmAlternateFA.gcs_path + ########################################################### + call GU.GetTodayDate as today {} - File? quast_report_html = FinalizeQuastReportHtml.gcs_path - File? quast_report_in_various_formats = FinalizeQuastReports.gcs_path + ########################################################### + output { + String last_processing_date = today.yyyy_mm_dd - File? quast_summary_on_all = FinalizeQuastSummaryAll.gcs_path + ######################################## + File merged_fq = finalized_merged_fq_path - File? quast_summary_on_primary = FinalizeQuastIndividualSummary.gcs_path[0] - File? quast_summary_on_H0 = FinalizeQuastIndividualSummary.gcs_path[1] - File? quast_summary_on_H1 = FinalizeQuastIndividualSummary.gcs_path[2] + ######################################## + Map[String, File] hifiasm_hap_outputs = { + "HapOne_GFA": FinalizeHifiasmHapOneGFA.gcs_path, + "HapTwo_GFA": FinalizeHifiasmHapTwoGFA.gcs_path, + "HapOne_FASTA": FinalizeHifiasmHapOneFASTA.gcs_path, + "HapTwo_FASTA": FinalizeHifiasmHapTwoFASTA.gcs_path, + "HapOne_FA_GZI": FinalizeHifiasmHapOneFaGzi.gcs_path, + "HapTWO_FA_GZI": FinalizeHifiasmHapTwoFaGzi.gcs_path, + "resource_use_visual": FinalizeHifiasmResourceUsagesVisual.gcs_path, + "runtime_log": FinalizeHifiasmLog.gcs_path, + } + + Map[String, File] hifiasm_primary_alt_outputs = { + "primary_gfa": FinalizeHifiasmPrimaryGFA.gcs_path, + "primary_fasta": FinalizeHifiasmPrimaryFA.gcs_path, + "primary_fa_gzi": FinalizeHifiasmPrimaryGzi.gcs_path, + "alternate_gfa": FinalizeHifiasmAlternateGFA.gcs_path, + "alternate_fasta": FinalizeHifiasmAlternateFA.gcs_path, + "alternate_fa_gzi": FinalizeHifiasmAlternateGzi.gcs_path, + } + + ######################################## + File quast_report_html = FinalizeQuastReportHtml.gcs_path + File quast_summary_on_all = FinalizeQuastSummaryAll.gcs_path + + Map[String, String] quast_2ndary_outputs = { + "quast_2ndary_outputs": FinalizeQuastReports.gcs_dir, + "quast_contigs_report": if (defined(primary_h0_h1_quast.contigs_reports)) then select_first([FinalizeQuastContigsReport.gcs_path]) else "None" + } } } diff --git a/wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl b/wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl index fb7a976c9..2c1007018 100644 --- a/wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl +++ b/wdl/pipelines/PacBio/VariantCalling/PBCCSWholeGenome.wdl @@ -1,209 +1,184 @@ version 1.0 -import "../../../tasks/Utility/PBUtils.wdl" as PB -import "../../../tasks/Utility/Utils.wdl" as Utils -import "../../../tasks/VariantCalling/CallVariantsPBCCS.wdl" as VAR -import "../../../tasks/Utility/Finalize.wdl" as FF +import "../../../tasks/Utility/GeneralUtils.wdl" as GU -import "../../../tasks/QC/SampleLevelAlignedMetrics.wdl" as COV +import "../../TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl" as MERGE +import "../../TechAgnostic/VariantCalling/CallVariantsReadBased.wdl" as VAR workflow PBCCSWholeGenome { meta { - description: "A workflow that performs single sample variant calling on PacBio HiFi reads from one or more flow cells. The workflow merges multiple SMRT cells into a single BAM prior to variant calling." + description: "A workflow that performs single sample variant calling on PacBio HiFi reads from one or more SMRT cells. The workflow merges multiple SMRT cells into a single BAM prior to variant calling." 
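For orientation, the ref_map_file described in the parameter_meta below is a two-column, tab-separated key/path table that the workflow loads with WDL's read_map(). A minimal sketch of its shape, using placeholder GCS paths and only keys that are referenced elsewhere in this diff ('fasta', 'fai', 'dict', 'tandem_repeat_bed', 'haplotype_map'); the exact key set required depends on the sub-workflows being run:

    # ref_map.tsv (tab-separated; paths are placeholders):
    #   fasta              gs://example-bucket/refs/ref.fasta
    #   fai                gs://example-bucket/refs/ref.fasta.fai
    #   dict               gs://example-bucket/refs/ref.dict
    #   tandem_repeat_bed  gs://example-bucket/refs/ref.trf.bed
    #   haplotype_map      gs://example-bucket/refs/haplotype_map.txt

    # consumed in WDL as:
    Map[String, String] ref_map = read_map(ref_map_file)
    File ref_fasta = ref_map['fasta']    # and similarly for the other keys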
    }
    parameter_meta {
+        gcs_out_root_dir:   "GCS bucket to store the reads, variants, and metrics files"
+
         aligned_bams:       "GCS path to aligned BAM files"
         aligned_bais:       "GCS path to aligned BAM file indices"
-        participant_name:   "name of the participant from whom these samples were obtained"
+        sample_name:        "sample name as encoded in the bams"

-        ref_map_file:       "table indicating reference sequence and auxillary file locations"
-        gcs_out_root_dir:   "GCS bucket to store the reads, variants, and metrics files"
+        ref_map_file:       "table indicating reference sequence and auxiliary file locations"
+        ref_scatter_interval_list_locator: "A file holding paths to interval_list files, used for custom sharding of the input BAM; when not provided, will shard WG by contig (possibly slower)"
+        ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; when not provided, will shard WG by contig (possibly slower)"
+
+        bed_to_compute_coverage: "BED file holding regions-of-interest for computing coverage over."
+        bed_descriptor: "Short description of the BED file; it will be used in output file names, so name it carefully"

         call_svs:           "whether to call SVs"
-        fast_less_sensitive_sv:  "to trade less sensitive SV calling for faster speed"
+        pbsv_discover_per_chr: "Run the discover stage of PBSV per chromosome"

         call_small_variants: "whether to call small variants"
-        call_small_vars_on_mitochondria: "if false, will not attempt to call variants on mitochondria; if true, some samples might fail (caller feature) due to lack of signal"
-        sites_vcf:     "for use with Clair"
-        sites_vcf_tbi: "for use with Clair"
-        run_dv_pepper_analysis:  "to turn on DV-Pepper analysis or not (non-trivial increase in cost and runtime)"
-        ref_scatter_interval_list_locator: "A file holding paths to interval_list files; needed only when running DV-Pepper"
-        ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; needed only when running DV-Pepper"
-    }
+        run_clair3: "to turn on Clair3 analysis or not (non-trivial increase in cost and runtime)"

-    input {
-        Array[File] aligned_bams
-        Array[File] aligned_bais
+        use_margin_for_tagging: "if false, will use margin-phased small-variant VCF for haplotagging the BAM; applicable only when input data isn't ONT data with pore older than R10.4"

-        File? bed_to_compute_coverage
+        gcp_zones: "which Google Cloud Zones to use (this has implications on how many GPUs are available and egress costs, so configure carefully)"

-        File ref_map_file
+        # outputs
+        haplotagged_bam: "BAM haplotagged using a small variant single-sample VCF."
+        haplotagged_bai: "Index for haplotagged_bam."
+        haplotagged_bam_tagger: "VCF used for doing the haplotagging. 'Legacy' if the input is ONT data generated on pores before R10.4."

-        String participant_name
+        dv_g_vcf: "DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_g_tbi: "Index for DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_margin_phased_vcf: "Phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_margin_phased_tbi: "Index for phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_margin_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_margin_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF generated with Margin; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_whatshap_phased_vcf: "Phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_whatshap_phased_tbi: "Index for phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_whatshap_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."
+        dv_vcf_whatshap_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF generated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4."

-    String gcs_out_root_dir
+        dv_nongpu_resources_usage_visual: "Resource usage monitoring log visualization for DV (per shard); available for CCS data and ONT data generated with pores >= R10.4."
+    }

-    Boolean call_svs = true
-    Boolean? fast_less_sensitive_sv = true
+    input {
+        String gcs_out_root_dir

-    Boolean call_small_variants = true
-    Boolean? call_small_vars_on_mitochondria = false
-    File? sites_vcf
-    File? sites_vcf_tbi
+        # sample specific
+        String sample_name
+        Array[File] aligned_bams
+        Array[File] aligned_bais

-        Boolean? run_dv_pepper_analysis = true
-        Int? dvp_threads = 32
-        Int? dvp_memory = 128
+        # reference-specific
+        File ref_map_file
         File? ref_scatter_interval_list_locator
         File? ref_scatter_interval_list_ids
-    }
-
-    Map[String, String] ref_map = read_map(ref_map_file)
+        File? bed_to_compute_coverage
+        String? bed_descriptor

-    String outdir = sub(gcs_out_root_dir, "/$", "") + "/PBCCSWholeGenome/~{participant_name}"
+        # user choice
+        Boolean call_svs = true
+        Boolean pbsv_discover_per_chr = true
+        Int minsvlen = 50

-    # gather across (potential multiple) input CCS BAMs
-    if (length(aligned_bams) > 1) {
-        scatter (pair in zip(aligned_bams, aligned_bais)) {
-            call Utils.InferSampleName {input: bam = pair.left, bai = pair.right}
-        }
-        call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name}
+        Boolean call_small_variants = true
+        Boolean run_clair3 = false
+        Boolean use_margin_for_tagging = true
+        Int dv_threads = 16
+        Int dv_memory = 40
+        Boolean use_gpu = false

-        call Utils.MergeBams as MergeAllReads { input: bams = aligned_bams, prefix = participant_name }
+        Array[String] gcp_zones = ['us-central1-a', 'us-central1-b', 'us-central1-c', 'us-central1-f']
     }

-    File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]])
-    File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]])
+    String workflow_name = "PBCCSWholeGenome"
+    String outdir = sub(gcs_out_root_dir, "/$", "") + "/~{workflow_name}/~{sample_name}"

-    call PB.PBIndex as IndexCCSUnalignedReads { input: bam = bam }
-    File pbi = IndexCCSUnalignedReads.pbi
-
-    call COV.SampleLevelAlignedMetrics as coverage {
+    ###########################################################
+    call MERGE.Work as MergeAndMetrics {
         input:
-            aligned_bam = bam,
-            aligned_bai = bai,
-            ref_fasta = ref_map['fasta'],
-            bed_to_compute_coverage = bed_to_compute_coverage
-    }
+            gcs_out_dir = outdir,

-    String dir = outdir + "/alignments"
+            sample_name = sample_name,
+            aligned_bams = aligned_bams,
+            aligned_bais = aligned_bais,

-    call FF.FinalizeToFile as FinalizeBam { input: outdir = dir, file = bam, name = "~{participant_name}.bam" }
-    call FF.FinalizeToFile as FinalizeBai { input: outdir = dir,
file = bai, name = "~{participant_name}.bam.bai" } - call FF.FinalizeToFile as FinalizePbi { input: outdir = dir, file = pbi, name = "~{participant_name}.bam.pbi" } + is_ont = false, + bams_suspected_to_contain_dup_record = false, - if (defined(bed_to_compute_coverage)) { call FF.FinalizeToFile as FinalizeRegionalCoverage { input: outdir = dir, file = select_first([coverage.bed_cov_summary]) } } + bed_to_compute_coverage = bed_to_compute_coverage, + bed_descriptor = bed_descriptor + } - #################################################################################################### + ########################################################### if (call_svs || call_small_variants) { - - # verify arguments are provided - if (call_svs) { - if (! defined(fast_less_sensitive_sv)) {call Utils.StopWorkflow as fast_less_sensitive_sv_not_provided {input: reason = "Calling SVs without specifying arg fast_less_sensitive_sv"}} - } - if (call_small_variants) { - if (! defined(call_small_vars_on_mitochondria)) {call Utils.StopWorkflow as call_small_vars_on_mitochondria_not_provided {input: reason = "Unprovided arg call_small_vars_on_mitochondria"}} - if (! defined(run_dv_pepper_analysis)) {call Utils.StopWorkflow as run_dv_pepper_analysis_not_provided {input: reason = "Unprovided arg run_dv_pepper_analysis"}} - if (! defined(dvp_threads)) {call Utils.StopWorkflow as dvp_threads_not_provided {input: reason = "Unprovided arg dvp_threads"}} - if (! defined(ref_scatter_interval_list_locator)) {call Utils.StopWorkflow as ref_scatter_interval_list_locator_not_provided {input: reason = "Unprovided arg ref_scatter_interval_list_locator"}} - if (! defined(ref_scatter_interval_list_ids)) {call Utils.StopWorkflow as ref_scatter_interval_list_ids_not_provided {input: reason = "Unprovided arg ref_scatter_interval_list_ids"}} - } - call VAR.CallVariants { input: - bam = bam, - bai = bai, - sample_id = participant_name, - ref_fasta = ref_map['fasta'], - ref_fasta_fai = ref_map['fai'], - ref_dict = ref_map['dict'], - tandem_repeat_bed = ref_map['tandem_repeat_bed'], + gcs_out_dir = outdir, + bam = MergeAndMetrics.aligned_bam, + bai = MergeAndMetrics.aligned_bai, + prefix = sample_name, + + is_ont = false, + is_r10_4_pore_or_later = false, + model_for_dv_andor_pepper = 'PACBIO', - prefix = participant_name, + ref_map_file = ref_map_file, + ref_scatter_interval_list_locator = ref_scatter_interval_list_locator, + ref_scatter_interval_list_ids = ref_scatter_interval_list_ids, call_svs = call_svs, - fast_less_sensitive_sv = select_first([fast_less_sensitive_sv]), + pbsv_discover_per_chr = pbsv_discover_per_chr, + minsvlen = minsvlen, call_small_variants = call_small_variants, - call_small_vars_on_mitochondria = select_first([call_small_vars_on_mitochondria]), - sites_vcf = sites_vcf, - sites_vcf_tbi = sites_vcf_tbi, - - run_dv_pepper_analysis = select_first([run_dv_pepper_analysis]), - dvp_threads = select_first([dvp_threads]), - dvp_memory = select_first([dvp_memory]), - ref_scatter_interval_list_locator = select_first([ref_scatter_interval_list_locator]), - ref_scatter_interval_list_ids = select_first([ref_scatter_interval_list_ids]) - } - - String svdir = outdir + "/variants/sv" - String smalldir = outdir + "/variants/small" - - if (call_svs) { - call FF.FinalizeToFile as FinalizePBSV { input: outdir = svdir, file = select_first([CallVariants.pbsv_vcf]) } - call FF.FinalizeToFile as FinalizePBSVtbi { input: outdir = svdir, file = select_first([CallVariants.pbsv_tbi]) } + run_clair3 = run_clair3, + 
use_margin_for_tagging = use_margin_for_tagging, + dv_threads = dv_threads, + dv_memory = dv_memory, + use_gpu = use_gpu, - call FF.FinalizeToFile as FinalizeSniffles { input: outdir = svdir, file = select_first([CallVariants.sniffles_vcf]) } - call FF.FinalizeToFile as FinalizeSnifflesTbi { input: outdir = svdir, file = select_first([CallVariants.sniffles_tbi]) } - } - - if (call_small_variants) { - call FF.FinalizeToFile as FinalizeClairVcf { input: outdir = smalldir, file = select_first([CallVariants.clair_vcf])} - call FF.FinalizeToFile as FinalizeClairTbi { input: outdir = smalldir, file = select_first([CallVariants.clair_tbi])} - - call FF.FinalizeToFile as FinalizeClairGVcf { input: outdir = smalldir, file = select_first([CallVariants.clair_gvcf])} - call FF.FinalizeToFile as FinalizeClairGTbi { input: outdir = smalldir, file = select_first([CallVariants.clair_gtbi])} - - if (select_first([run_dv_pepper_analysis])) { - call FF.FinalizeToFile as FinalizeDVPepperVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_vcf])} - call FF.FinalizeToFile as FinalizeDVPepperTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_tbi])} - call FF.FinalizeToFile as FinalizeDVPepperGVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_g_vcf])} - call FF.FinalizeToFile as FinalizeDVPepperGTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_g_tbi])} - call FF.FinalizeToFile as FinalizeDVPEPPERPhasedVcf { input: outdir = smalldir, file = select_first([CallVariants.dvp_phased_vcf]), name = "~{participant_name}.deepvariant_pepper.phased.vcf.gz" } - call FF.FinalizeToFile as FinalizeDVPEPPERPhasedTbi { input: outdir = smalldir, file = select_first([CallVariants.dvp_phased_tbi]), name = "~{participant_name}.deepvariant_pepper.phased.vcf.gz.tbi" } - } + gcp_zones = gcp_zones } } - output { - File aligned_bam = FinalizeBam.gcs_path - File aligned_bai = FinalizeBai.gcs_path - File aligned_pbi = FinalizePbi.gcs_path + ########################################################### + call GU.GetTodayDate as today {} - Float aligned_num_reads = coverage.aligned_num_reads - Float aligned_num_bases = coverage.aligned_num_bases - Float aligned_frac_bases = coverage.aligned_frac_bases - Float aligned_est_fold_cov = coverage.aligned_est_fold_cov - - Float aligned_read_length_mean = coverage.aligned_read_length_mean - Float aligned_read_length_median = coverage.aligned_read_length_median - Float aligned_read_length_stdev = coverage.aligned_read_length_stdev - Float aligned_read_length_N50 = coverage.aligned_read_length_N50 - - Float average_identity = coverage.average_identity - Float median_identity = coverage.median_identity + ########################################################### + output { + String last_processing_date = today.yyyy_mm_dd - File? bed_cov_summary = FinalizeRegionalCoverage.gcs_path ######################################## - File? pbsv_vcf = FinalizePBSV.gcs_path - File? pbsv_tbi = FinalizePBSVtbi.gcs_path + File aligned_bam = MergeAndMetrics.aligned_bam + File aligned_bai = MergeAndMetrics.aligned_bai + File aligned_pbi = select_first([MergeAndMetrics.aligned_pbi]) - File? sniffles_vcf = FinalizeSniffles.gcs_path - File? sniffles_tbi = FinalizeSnifflesTbi.gcs_path + Float coverage = MergeAndMetrics.coverage + File? bed_cov_summary = MergeAndMetrics.bed_cov_summary - File? clair_vcf = FinalizeClairVcf.gcs_path - File? 
clair_tbi = FinalizeClairTbi.gcs_path + Map[String, Float] alignment_metrics = MergeAndMetrics.alignment_metrics - File? clair_gvcf = FinalizeClairGVcf.gcs_path - File? clair_gtbi = FinalizeClairGTbi.gcs_path - - File? dvp_vcf = FinalizeDVPepperVcf.gcs_path - File? dvp_tbi = FinalizeDVPepperTbi.gcs_path - File? dvp_g_vcf = FinalizeDVPepperGVcf.gcs_path - File? dvp_g_tbi = FinalizeDVPepperGTbi.gcs_path - File? dvp_phased_vcf = FinalizeDVPEPPERPhasedVcf.gcs_path - File? dvp_phased_tbi = FinalizeDVPEPPERPhasedTbi.gcs_path + ######################################## + File? pbsv_vcf = CallVariants.pbsv_vcf + File? pbsv_tbi = CallVariants.pbsv_tbi + + File? sniffles_vcf = CallVariants.sniffles_vcf + File? sniffles_tbi = CallVariants.sniffles_tbi + File? sniffles_snf = CallVariants.sniffles_snf + + File? sniffles_phased_vcf = CallVariants.sniffles_phased_vcf + File? sniffles_phased_tbi = CallVariants.sniffles_phased_tbi + File? sniffles_phased_snf = CallVariants.sniffles_phased_snf + + File? clair_vcf = CallVariants.clair_vcf + File? clair_tbi = CallVariants.clair_tbi + File? clair_gvcf = CallVariants.clair_gvcf + File? clair_gtbi = CallVariants.clair_gtbi + + File? dv_g_vcf = CallVariants.dv_g_vcf + File? dv_g_tbi = CallVariants.dv_g_tbi + File? dv_margin_phased_vcf = CallVariants.dv_margin_phased_vcf + File? dv_margin_phased_tbi = CallVariants.dv_margin_phased_tbi + File? dv_vcf_margin_phasing_stats_tsv = CallVariants.dv_vcf_margin_phasing_stats_tsv + File? dv_vcf_margin_phasing_stats_gtf = CallVariants.dv_vcf_margin_phasing_stats_gtf + File? dv_whatshap_phased_vcf = CallVariants.dv_whatshap_phased_vcf + File? dv_whatshap_phased_tbi = CallVariants.dv_whatshap_phased_tbi + File? dv_vcf_whatshap_phasing_stats_tsv = CallVariants.dv_vcf_whatshap_phasing_stats_tsv + File? dv_vcf_whatshap_phasing_stats_gtf = CallVariants.dv_vcf_whatshap_phasing_stats_gtf + String? dv_nongpu_resources_usage_visual = CallVariants.dv_nongpu_resources_usage_visual } } diff --git a/wdl/pipelines/TechAgnostic/Utility/AlignedBamQCandMetrics.wdl b/wdl/pipelines/TechAgnostic/Utility/AlignedBamQCandMetrics.wdl new file mode 100644 index 000000000..82efcbe0b --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/AlignedBamQCandMetrics.wdl @@ -0,0 +1,268 @@ +version 1.0 + +import "../../../tasks/Utility/Utils.wdl" as Utils +import "../../../tasks/Utility/BAMutils.wdl" as BU + +import "../../../tasks/QC/AlignedMetrics.wdl" as AM +import "../../../tasks/Visualization/NanoPlot.wdl" as NP + +import "../../../tasks/QC/FPCheckAoU.wdl" as QC0 +import "../../TechAgnostic/Utility/LongReadsContaminationEstimation.wdl" as QC1 +import "../../TechAgnostic/Utility/SexCheckNaive.wdl" as QC2 +import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC3 + +import "SaveFilesToDestination.wdl" as SAVE + +workflow Work { + meta { + desciption: + "A workflow that unifies standard human WGS aligned BAM QC checks and metrics collection." + } + + parameter_meta { + + ######### + # inputs + tech: + "The technology used to generate this BAM. Currently, the following values are accepted: [ONT, Sequel, Revio]." + + gcs_out_root_dir: + "output files will be copied over there" + + cov_bed: + "An optional BED file on which coverage will be collected (a mean value for each interval)" + cov_bed_descriptor: + "A short description of the BED provided for targeted coverage estimation; will be used in naming output files." 
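The fingerprint inputs described next follow a simple location convention: the store is a GCS prefix and the sample ID is used as a file-name prefix inside it. A small illustration with hypothetical values (bucket and sample name are made up):

    String fingerprint_vcf_store = "gs://example-bucket/fingerprints"   # hypothetical
    String fingerprint_sample_id = "SAMPLE-001"                         # hypothetical
    # with these values the sub-workflow expects to find a VCF matching
    #   gs://example-bucket/fingerprints/SAMPLE-001*.vcf  (optionally .vcf.gz)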
+ + fingerprint_vcf_store: + "A GCS 'folder' holding fingerprint VCF files" + fingerprint_sample_id: + "The ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_vcf_store}/{fingerprint_sample_id}*.vcf(.gz?)" + + vbid2_config_json: + "A config json to for running the VBID2 contamination estimation sub-workflow; if provided, will trigger the VBID2 sub-workflow for cross-(human)individual contamination estimation." + + expected_sex_type: + "If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]" + + methyl_tag_check_bam_descriptor: + "If provided, triggers workflow that collects information on reads that miss MM/ML SAM tags; this is meant to be a short description of the purpose of the BAM (e.g. input, a single readgroup, per sample, etc; doesn't need to be single-file specific); used for saving the reads that miss MM/ML tags." + + ######### + # outputs + wgs_cov: + "whole genome mean coverage" + + nanoplot_summ: + "Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)" + + sam_flag_stats: + "SAM flag stats" + + contamination_est: + "cross-(human)individual contamination estimation by VerifyBAMID2" + + inferred_sex_info: + "Inferred sex concordance information if expected sex type is provided" + + methyl_tag_simple_stats: + "Simple stats on the reads with & without SAM methylation tags (MM/ML)." + save_methyl_uncalled_reads: + "If to save the reads without MM/ML tags." + + aBAM_metrics_files: + "A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" + } + + input { + File bam + File bai + + String tech + + File? cov_bed + String? cov_bed_descriptor + + String? fingerprint_vcf_store + String? fingerprint_sample_id + + File? vbid2_config_json + String? expected_sex_type + String? methyl_tag_check_bam_descriptor + Boolean? save_methyl_uncalled_reads + + File ref_map_file + String disk_type + + String output_prefix # String output_prefix = bam_sample_name + "." + flowcell + String gcs_out_root_dir + } + + output { + Float wgs_cov = MosDepthWGS.wgs_cov + Map[String, Float] nanoplot_summ = NanoPlotFromBam.stats_map + Map[String, Float] sam_flag_stats = ParseFlagStatsJson.qc_pass_reads_SAM_flag_stats + + # fingerprint + Map[String, String]? fingerprint_check = fp_res + + # contam + Float? contamination_est = VBID2.contamination_est + + # sex concordance + Map[String, String]? inferred_sex_info = SexConcordance.inferred_sex_info + + # methyl + Map[String, String]? 
methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats + + # file-based outputs all packed into a finalization map + Map[String, String] aBAM_metrics_files = FF.result + } + + String workflow_name = "AlignedBamQCandMetrics" + String metrics_output_dir = sub(gcs_out_root_dir, "/+$","") + "/~{workflow_name}/~{output_prefix}" + + ################################################################################### + # arg validation and prep + ################################################################################### + Map[String, String] ref_map = read_map(ref_map_file) + + if (defined(fingerprint_vcf_store) != defined(fingerprint_sample_id)) { + call Utils.StopWorkflow as MisingFingerprintArgs { input: + reason = "fingerprint_vcf_store and fingerprint_sample_id must be specified together or omitted together" + } + } + + if (defined(cov_bed) != defined(cov_bed_descriptor)) { + call Utils.StopWorkflow as MisingCoverageBEDdescriptor { input: + reason = "cov_bed and cov_bed_descriptor must be specified together or omitted together" + } + } + ################################################################################### + # ALWAYS ON QC/METRICS + ################################################################################### + ################################ + # coverage + call AM.MosDepthWGS { input: + bam = bam, bai = bai, disk_type = disk_type, + bed = cov_bed, bed_descriptor = cov_bed_descriptor + } + FinalizationManifestLine a = object + {files_to_save: [MosDepthWGS.summary_txt], + is_singleton_file: true, + destination: metrics_output_dir, + output_attribute_name: "cov_per_chr"} + + ################################ + # SAM flag stats + call BU.SamtoolsFlagStats { input: bam = bam, output_format = 'JSON', disk_type = disk_type } + call BU.ParseFlagStatsJson { input: sam_flag_stats_json = SamtoolsFlagStats.flag_stats } + + ################################ + # nanoplot + call NP.NanoPlotFromBam { input: bam = bam, bai = bai, disk_type = disk_type } + FinalizationManifestLine b = object + {files_to_save: flatten([[NanoPlotFromBam.stats], NanoPlotFromBam.plots]), + is_singleton_file: false, + destination: metrics_output_dir + "/nanoplot", + output_attribute_name: "nanoplot"} + + ################################################################################### + # OPTIONAL QC/METRICS + ################################################################################### + ################################ + # (optional) fingerprint + if (defined(fingerprint_vcf_store)) { + call QC0.FPCheckAoU as fingerprint { + input: + aligned_bam = bam, + aligned_bai = bai, + tech = tech, + fp_vcf_store = select_first([fingerprint_vcf_store]), + fp_sample_id = select_first([fingerprint_sample_id]), + ref_specific_haplotype_map = ref_map['haplotype_map'] + } + Map[String, String] fp_res = {'status': fingerprint.FP_status, + 'LOD': fingerprint.lod_expected_sample} + String dummy = fingerprint.fingerprint_summary # this supposedly File type output may be a null because the input BAM is teeny + if ("None"!=dummy) { + FinalizationManifestLine c = object + {files_to_save: [fingerprint.fingerprint_summary, fingerprint.fingerprint_details], + pack_name: "fingerprint.details.tar.gz", + is_singleton_file: false, + destination: metrics_output_dir, + output_attribute_name: "fingerprint_check"} + } + } + ################################ + # (optional) contamination + if (defined(vbid2_config_json)) { + VBID2_config vb_conf = read_json(select_first([vbid2_config_json])) + call 
QC1.LongReadsContaminationEstimation as VBID2 { input: + bam=bam, + bai=bai, + ref_map_file=ref_map_file, + tech = tech, + gt_sites_bed = vb_conf.genotyping_sites, + is_hgdp_sites = vb_conf.is_hgdp_sites, + is_100k_sites = vb_conf.is_100k_sites, + disable_baq = vb_conf.disable_baq, + max_retries = vb_conf.max_retries, + disk_type = disk_type, + } + # no file to save from contam.est. + } + ################################ + # (optional) sex concordance + if (defined(expected_sex_type)) { + call QC2.SexCheckNaive as SexConcordance { input: + bam=bam, + bai=bai, + expected_sex_type=select_first([expected_sex_type]), + mosdepth_summary_txt=MosDepthWGS.summary_txt + } + # no file to save from sex concordance + } + ################################ + # (optional) verify methylation tags aren't missing + if (defined(methyl_tag_check_bam_descriptor)) { + call QC3.CountTheBeans as NoMissingBeans { input: + bam=bam, + bai=bai, + bam_descriptor=select_first([methyl_tag_check_bam_descriptor]), + gcs_out_root_dir= metrics_output_dir, + save_read_names_only = !select_first([save_methyl_uncalled_reads, true]), + use_local_ssd=disk_type=='LOCAL' + } + Map[String, String] methyl_out = {"missing_methyl_tag_reads": + NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']} + } + + ################################################################################### + # save results + ################################################################################### + call SAVE.SaveFilestoDestination as FF { input: + instructions = select_all([a, b, c]), + already_finalized = select_all([methyl_out]), + key_file = select_first(select_all([MosDepthWGS.summary_txt, + NanoPlotFromBam.stats, + fingerprint.fingerprint_summary,])) + } +} + +# if you want to call me as sub-workflow in a standard run +struct AlignedBamQCnMetricsConfig { + File? cov_bed # An optional BED file on which coverage will be collected (a mean value for each interval) + String? cov_bed_descriptor # A short description of the BED provided for targeted coverage estimation; will be used in naming output files. + + String? fingerprint_vcf_store # A GCS 'folder' holding fingerprint VCF files + + File? vbid2_config_json # A config json to for running the VBID2 contamination estimation sub-workflow; + # if provided, will trigger the VBID2 sub-workflow for cross-(human)individual contamination estimation. + + String? methyl_tag_check_bam_descriptor # A one-word description of the purpose of the BAM + # (e.g. 'rg_post_aln' for "readgroup post alignment", "sm_post_merge" for "sample post merging"); + # used for saving the reads that miss MM/ML tags; + # doesn't need to be single-file specific + Boolean? 
save_methyl_uncalled_reads # if true, will save the actual reads that miss the methylation SAM tags (otherwise we only save the read names) +} diff --git a/wdl/pipelines/TechAgnostic/Utility/CollectBamFlagStats.wdl b/wdl/pipelines/TechAgnostic/Utility/CollectBamFlagStats.wdl new file mode 100644 index 000000000..df7265821 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/CollectBamFlagStats.wdl @@ -0,0 +1,20 @@ +version 1.0 + +import "../../../tasks/Utility/BAMutils.wdl" as BU + +workflow CollectBamFlagStats { + meta { + description: "Collect SAM flag stats of an aligned BAM" + } + input { + File bam + String disk_type = "HDD" + } + + call BU.SamtoolsFlagStats { input: bam = bam, output_format = 'JSON', disk_type = disk_type } + call BU.ParseFlagStatsJson { input: sam_flag_stats_json = SamtoolsFlagStats.flag_stats } + + output { + Map[String, Float] sam_flag_stats = ParseFlagStatsJson.qc_pass_reads_SAM_flag_stats + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/CountTheBeans.wdl b/wdl/pipelines/TechAgnostic/Utility/CountTheBeans.wdl new file mode 100644 index 000000000..c68155640 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/CountTheBeans.wdl @@ -0,0 +1,61 @@ +version 1.0 + +import "../../../tasks/Utility/Finalize.wdl" as FF +import "../../../tasks/Utility/BAMutils.wdl" as BU +import "../../../tasks/Utility/Utils.wdl" + +workflow CountTheBeans { + meta { + desciption: + "For long-read bams (PacBio and ONT), gather information about reads with and without the ML/MM tags for methylation." + } + + parameter_meta { + bam_descriptor: + "a short description (no space) of the purpose of the BAM; used only for saving the results." + save_read_names_only: + "the workflow extracts both the reads and the read names missing the methylation SAM tags; if this is set to true, then only the read names are saved." + } + input { + File bam + File? bai + + Boolean use_local_ssd = false + + String? bam_descriptor + Boolean save_read_names_only = true + String? gcs_out_root_dir + } + + output { + Map[String, String] methyl_tag_simple_stats = { + 'raw_record_cnt': Count.raw_count, + 'raw_record_with-mm-ml_cnt': Count.bean_count, + 'primary_record_cnt': Count.non_2304_count, + 'primary_record_with-mm-ml_cnt': Count.non_2304_bean_count, + 'files_holding_reads_without_tags': select_first([FinalizeToDir.gcs_dir, "None"]) + } + } + + if (defined(gcs_out_root_dir)!=defined(bam_descriptor)) { + call Utils.StopWorkflow { input: reason = "'bam_descriptor' and 'gcs_out_root_dir' must be provided/omitted together." } + } + + call BU.CountMethylCallReads as Count { input: bam = bam, bai = bai, disk_type = if(use_local_ssd) then "LOCAL" else "SSD"} + + call BU.GatherReadsWithoutMethylCalls as GatherBitter { input: bam = bam, bai = bai, disk_type = if(use_local_ssd) then "LOCAL" else "SSD"} + + if (defined(gcs_out_root_dir)) { + String gcs_out = sub(select_first([gcs_out_root_dir]), "/$", "") + "/" + String out_dir = gcs_out + basename(bam, ".bam") + "." 
+ select_first([bam_descriptor]) + "/RecordsWithoutMethylTags/" + + Array[File] reads = [GatherBitter.no_ml_reads, GatherBitter.no_mm_reads] + Array[File] read_names = [GatherBitter.names_missing_only_one_tag, GatherBitter.names_missing_both_tags] + Array[File] files_to_save = if (save_read_names_only) then read_names else flatten([reads, read_names]) + + call FF.FinalizeToDir { input: + files = files_to_save, + outdir = out_dir + } + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/DystPeaker.wdl b/wdl/pipelines/TechAgnostic/Utility/DystPeaker.wdl new file mode 100644 index 000000000..ef19eb7b9 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/DystPeaker.wdl @@ -0,0 +1,63 @@ +version 1.0 + +import "../../../tasks/Utility/Finalize.wdl" as FF + +import "../../../tasks/Utility/ReadLengths.wdl" as ReLU + +workflow DystPeaker { + meta { + description: "Collect read length information from a long reads BAM." + } + input { + File input_file + Boolean input_is_bam + String id + Int short_reads_threshold + + String gcs_out_root_dir + } + parameter_meta { + gcs_out_root_dir: "Cloud storage output directory" + id: "A distinguishing ID that's going to impact how the files are named and where they are placed in the directories." + short_reads_threshold: "A threshold below which the reads will be classified as short" + + read_lengths_hist: "Read length histogram" + peaks: "Estimated peaks in the read length distritbution" + reverse_yield: "A lenth-9 array of lengths at which a certain fraction of reads are shorter than. The fraction bins are 10% to 90% with 10% increments." + read_len_summaries: "A summary on some other metrics related to read length" + } + + String relative_dir = "ReadLengthMetrics" + String output_dir = sub(gcs_out_root_dir, "/$", "") + "/" + relative_dir + "/" + id + + # collect + if (input_is_bam) { + call ReLU.GetLengthsFromBam { input: bam = input_file } + } + if ( !input_is_bam ) { + call ReLU.GetLengthsFromFastq { input: fastq = input_file } + } + File rl_file = select_first([GetLengthsFromBam.read_lengths, GetLengthsFromFastq.read_lengths]) + + # stats + call ReLU.Dyst { input: read_lengths_txt = rl_file } + call ReLU.Peaker { input: dyst_histogram = Dyst.histogram } + call ReLU.ReverseYield { input: read_lengths_txt = rl_file } + call ReLU.Skewness { input: read_lengths_txt = rl_file } + call ReLU.GetNumReadsAndShorts { input: read_lengths_txt = rl_file, short_threshold = short_reads_threshold } + String raw_pct = round(100 * GetNumReadsAndShorts.num_shorts/GetNumReadsAndShorts.num_seqs) + + call FF.FinalizeToFile as SaveRLArray { input: outdir = output_dir, file = GetNumReadsAndShorts.rl_bz2, name = id + ".readLen.txt.bz2" } + call FF.FinalizeToFile as SaveHist { input: outdir = output_dir, file = Dyst.histogram, name = id + ".readLen.hist.txt" } + + output { + File read_len_hist = SaveHist.gcs_path + Array[Int] read_len_peaks = Peaker.peaks + Array[Int] read_len_deciles = ReverseYield.reverse_yield + + Map[String, String] read_len_summaries = {'shortie_pct': raw_pct + "%", + 'shortie_threshold': short_reads_threshold, + 'skew': Skewness.skew, + 'raw_rl_file': SaveRLArray.gcs_path} + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/FASTQstats.wdl b/wdl/pipelines/TechAgnostic/Utility/FASTQstats.wdl new file mode 100644 index 000000000..aa257204d --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/FASTQstats.wdl @@ -0,0 +1,40 @@ +version 1.0 + +import "../../../tasks/Utility/FastqUtils.wdl" as FQU +import "../../../tasks/Utility/BAMutils.wdl" as BU + +workflow 
FASTQstats { + meta { + desription: + "Collect some basic stats from a FASTQ (or BAM) file." + } + parameter_meta { + reads: "file to collect stats on" + file_type: "type of file: accepted values are [FASTQ, BAM] (regardless of gz, bgz)" + seq_type: "argument to the --seq-type paramter of seqkit" + exclude_len_threshold: "Sequeces shorter than this will be dropped from analysis; no effect if not provided" + } + + input { + File reads + String file_type + String seq_type = "dna" + Int? exclude_len_threshold + } + + output { + Map[String, Float] stats = Stats.res + } + + if ('BAM' == file_type) { + call BU.BamToFastq { input: bam = reads, prefix = basename(reads, ".bam"), disk_type = 'SSD' } + } + File formatted_input = select_first([BamToFastq.reads_fq, reads]) + if (defined(exclude_len_threshold)) { + call FQU.FilterByLength { input: fq = formatted_input, threshold = select_first([exclude_len_threshold])} + # call FQU.FilterByLenSeqTk { input: fastq = fastq, exclude_len_threshold = select_first([exclude_len_threshold])} + } + File filtered_input = select_first([FilterByLength.res, formatted_input]) + + call FQU.Stats { input: fastq = filtered_input, seq_type = seq_type } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/FilterBamByLength.wdl b/wdl/pipelines/TechAgnostic/Utility/FilterBamByLength.wdl new file mode 100644 index 000000000..7c182d631 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/FilterBamByLength.wdl @@ -0,0 +1,48 @@ +version 1.0 + +import "../../../tasks/Utility/Utils.wdl" +import "../../../tasks/Utility/BAMutils.wdl" as BU +import "../../../tasks/Utility/Finalize.wdl" as FF + +workflow FilterBamByLength { + meta { + desciption: + "Filter a BAM (mapped or not) by sequence length." + } + parameter_meta { + len_threshold_inclusive: "Reads longer than or equal to this length will be included." + } + input { + File bam + File? bai + Int len_threshold_inclusive + Boolean compute_yield + Boolean conver_2_fq + String disk_type + String gcs_out_root_dir + } + + String workflow_name = "FilterBamByLength" + + call BU.InferSampleName { input: bam = bam, bai = bai } + String outdir = sub(gcs_out_root_dir, "/$", "") + "/~{workflow_name}/~{InferSampleName.sample_name}" + + call BU.FilterBamByLen { input: bam = bam, bai = bai, len_threshold_inclusive = len_threshold_inclusive, compute_yield = compute_yield, disk_type = disk_type} + call FF.FinalizeToFile as FinalizeBam { input: outdir = outdir, file = FilterBamByLen.fBAM } + if (defined(bai)) { + call FF.FinalizeToFile as FinalizeBai { input: outdir = outdir, file = select_first([FilterBamByLen.fBAI]) } + } + + if (conver_2_fq) { + call BU.BamToFastq { input: bam = FilterBamByLen.fBAM, prefix = basename(bam) + ".length-filter-~{len_threshold_inclusive}"} + call FF.FinalizeToFile as FinalizeFastq { input: outdir = outdir, file = BamToFastq.reads_fq } + } + + output { + File filtered_bam = FinalizeBam.gcs_path + File? filtered_bai = FinalizeBai.gcs_path + File? filtered_fastq = FinalizeFastq.gcs_path + Float? total_yield = FilterBamByLen.total_yield + Float? 
filtered_yield = FilterBamByLen.filtered_yield + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/LongReadsContaminationEstimation.wdl b/wdl/pipelines/TechAgnostic/Utility/LongReadsContaminationEstimation.wdl new file mode 100644 index 000000000..bf250863d --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/LongReadsContaminationEstimation.wdl @@ -0,0 +1,90 @@ +version 1.0 + +import "../../../tasks/Utility/Utils.wdl" +import "../../../tasks/Utility/BAMutils.wdl" as BU +import "../../../tasks/QC/Contamination.wdl" + +# this is a model that other sub-workflows can potentially follow, +# i.e. define a custom struct so that super workflows can use pre-defined JSON files +struct VBID2_config { + File genotyping_sites + + Boolean is_hgdp_sites + Boolean is_100k_sites + + Boolean disable_baq + + Int max_retries + + String? tech +} + +workflow LongReadsContaminationEstimation { + meta { + desciption: + "Estimate the cross-individual contamination level of a GRCh38 bam." + } + + input { + File bam + File bai + String tech + File ref_map_file + + File gt_sites_bed + Boolean is_hgdp_sites + Boolean is_100k_sites + + Boolean disable_baq + + String disk_type + + Int max_retries + } + + parameter_meta { + # input: + gt_sites_bed: "Bed file holding the genotyping sites." + is_hgdp_sites: "Provided BED is HGDP genotyping sites." + is_100k_sites: "Provided BED is 100k genotyping sites, not 10k sites." + disable_baq: "If turned on, BAQ computation will be disabled (faster operation)." + + tech: "technology used for generating the data; accepted value: [ONT, Sequel, Revio]" + + max_retries: "Because of the strange samtools failures reading from NAS storage, we should make multiple attempts to get away from the trasient errors. If after the max retries, we still get those failures, this task will fail." 
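To make the "custom struct backed by a pre-defined JSON file" pattern above concrete, here is a minimal WDL sketch of the equivalent literal; the genotyping-sites path is purely hypothetical:

    VBID2_config example_conf = object {
        genotyping_sites: "gs://example-bucket/resources/contamination.10k.sites.bed",  # hypothetical path
        is_hgdp_sites: false,
        is_100k_sites: false,
        disable_baq: true,
        max_retries: 3
    }
    # a parent workflow obtains the same struct from a JSON file via, e.g.:
    #     VBID2_config conf = read_json(vbid2_config_json)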
+ } + + # if the coverage is too low, the tool errors out (and the data won't bring much value anyway) + # here we guard against it by using bam file size, with a emperically-determined threshold + Map[String, Int] bam_threshold_per_tech = {'ONT': 450, 'Revio': 150, 'Sequel': 250} # this value is technology dependent + Int bam_file_threshold = bam_threshold_per_tech[tech] + + if (bam_file_threshold > ceil(size(bam, "MiB"))) { + Float extreme_low_cov_val = 200 # completely arbitrary + } + + if (bam_file_threshold <= ceil(size(bam, "MiB"))) { + # quickly change to pileup + Map[String, String] ref_map = read_map(ref_map_file) + + Int scaleup_factor = 20 + call BU.BamToRelevantPileup as Pileup { + input: + bam = bam, + bai = bai, + bed = gt_sites_bed, + ref_fasta = ref_map['fasta'], + disable_baq = disable_baq, + disk_type = disk_type, + max_retries = max_retries + } + + call Contamination.VerifyBamID { + input: pileup = Pileup.pileups, ref_fasta = ref_map['fasta'], is_hgdp_sites = is_hgdp_sites, is_100k_sites = is_100k_sites + } + } + + output { + Float contamination_est = select_first([VerifyBamID.contamination_est, extreme_low_cov_val]) + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl b/wdl/pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl new file mode 100644 index 000000000..e4ff45fda --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/MergeSampleBamsAndCollectMetrics.wdl @@ -0,0 +1,109 @@ +version 1.0 + +import "../../../tasks/Utility/Utils.wdl" as Utils +import "../../../tasks/Utility/PBUtils.wdl" as PB +import "../../../tasks/Utility/ONTUtils.wdl" as ONT +import "../../../tasks/Utility/BAMutils.wdl" as BU + +import "../../../tasks/Utility/Finalize.wdl" as FF + +import "../../../deprecated/SampleLevelAlignedMetrics.wdl" as COV + +workflow Work { + meta { + description: "Merge a sample's (potential) multiple SMRT-/flow-cells data, and collect alignment metrics." + } + parameter_meta { + bams_suspected_to_contain_dup_record: "Some ONT output files from basecall dirs have a strange duplicate issue." + bed_to_compute_coverage: "BED file holding regions-of-interest for computing coverage over." + bed_descriptor: "Description of the BED file, will be used in the file name so be careful naming things" + } + input { + String gcs_out_dir + + # sample specific + String sample_name + Array[File] aligned_bams + Array[File] aligned_bais + + Boolean is_ont + Boolean bams_suspected_to_contain_dup_record + + File? bed_to_compute_coverage + String? bed_descriptor + } + output { + File aligned_bam = FinalizeBam.gcs_path + File aligned_bai = FinalizeBai.gcs_path + File? aligned_pbi = FinalizePbi.gcs_path + + Float coverage = AlignmentMetrics.coverage + File? 
bed_cov_summary = FinalizeRegionalCoverage.gcs_path + + Map[String, Float] alignment_metrics = AlignmentMetrics.reads_stats + } + if (defined(bed_to_compute_coverage)) { + if (!defined(bed_descriptor)) { + call Utils.StopWorkflow { input: reason = "Must provied descriptive name of the BED file if the file is provided."} + } + } + + String outdir = sub(gcs_out_dir, "/$", "") + "/alignments" + + ########################################################### + # some input validation + scatter (pair in zip(aligned_bams, aligned_bais)) { + call Utils.InferSampleName {input: bam = pair.left, bai = pair.right} + } + call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name} + if (InferSampleName.sample_name[0] != sample_name) { + call Utils.StopWorkflow as SM_mismatch { input: reason = "Provided sample name and those encoded in the BAM(s) don't match."} + } + + ########################################################### + # gather across (potential multiple) input BAMs + if (length(aligned_bams) > 1) { + call BU.MergeBamsWithSamtools as MergeAllReads { input: bams = aligned_bams, out_prefix = sample_name } + } + + File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]]) + File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]]) + + ########################################################### + # ont specific: sometimes there are duplicate reads + if (is_ont && bams_suspected_to_contain_dup_record) { + call ONT.DeduplicateBam as RemoveONTDuplicates { + input: aligned_bam = bam, aligned_bai = bai + } + } + + ########################################################### + # save bam and index + File use_this_bam = select_first([RemoveONTDuplicates.corrected_bam, bam]) + File use_this_bai = select_first([RemoveONTDuplicates.corrected_bai, bai]) + + call FF.FinalizeToFile as FinalizeBam { input: outdir = outdir, file = use_this_bam, name = "~{sample_name}.bam" } + call FF.FinalizeToFile as FinalizeBai { input: outdir = outdir, file = use_this_bai, name = "~{sample_name}.bam.bai" } + + ########################################################### + # pacbio specific index + if (! is_ont) { + call PB.PBIndex as PBIndexSampleReads { input: bam = use_this_bam } + call FF.FinalizeToFile as FinalizePbi { input: outdir = outdir, file = PBIndexSampleReads.pbi, name = "~{sample_name}.bam.pbi" } + } + ########################################################### + call COV.SampleLevelAlignedMetrics as AlignmentMetrics { + input: + aligned_bam = use_this_bam, + aligned_bai = use_this_bai, + bed_to_compute_coverage = bed_to_compute_coverage, + bed_descriptor = bed_descriptor + } + + if (defined(bed_to_compute_coverage)) { + call FF.FinalizeToFile as FinalizeRegionalCoverage { input: + outdir = outdir, file = select_first([AlignmentMetrics.bed_cov_summary]), + name = "~{sample_name}.mosdepth_coverage.coverage_over_bed.~{bed_descriptor}.summary.txt" + } + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/SaveFilesToDestination.wdl b/wdl/pipelines/TechAgnostic/Utility/SaveFilesToDestination.wdl new file mode 100644 index 000000000..bc40a0a4c --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/SaveFilesToDestination.wdl @@ -0,0 +1,106 @@ +version 1.0 + +import "../../../tasks/Utility/GeneralUtils.wdl" as GU +import "../../../tasks/Utility/Finalize.wdl" as FF + +struct FinalizationManifestLine { + Array[File]+ files_to_save + Array[String]? file_names # must be same length as files_to_save, if provided; ignored if pack_name is also provided + String? 
pack_name # if provided, file_names will be ignored: single file will be (block-)gzipped and multiple files will be tar-gzed. + Boolean is_singleton_file # if this is just a single file [when files_to_save is intended to be an array, even though it might be length 1, this should be false] + String destination # where to save the files; + # if saving a single file, or if packing multiple files, then the result will be saved under this destination + # if saving multiple files but not packing them, then the files will be saved under this destination with their default or custom names + Boolean? individual_compress # has effect only for multiple files; if true, will + Boolean? use_bgzip # block-gzip files or not + String output_attribute_name # mostly for terra usage +} + +workflow SaveFilestoDestination { + meta { + desciption: + "If your workflow needs to save outputs from various tasks into slightly different locations, this is the sub-workflow for you." + } + parameter_meta { + instructions: + "A workflow, when calling this sub-workflow, should construct this array by hand" + result: + "The gs path for each line in your instructions" + key_file: + "Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." + } + input { + Array[FinalizationManifestLine] instructions + Array[Map[String, String]]? already_finalized + File? key_file + } + output { + Map[String, String] result = select_first([PackAllSavings.merged, collect.output_map]) + } + + scatter (line in instructions) { + + if (line.is_singleton_file) { + if (!defined(line.pack_name)) { # as-is + call FF.FinalizeToFile as SF { input: + file = line.files_to_save[0], + name = if (defined(line.file_names)) then select_first([line.file_names])[0] else basename(line.files_to_save[0]), + outdir = line.destination + } + } + if (defined(line.pack_name)) { # (block-)gzip single file + call FF.CompressAndFinalize as SF_gz { input: + file = line.files_to_save[0], + name = select_first([line.pack_name]), + block_gzip = if (defined(line.use_bgzip)) then select_first([line.use_bgzip]) else false, + outdir = line.destination + } + } + } + + if (!line.is_singleton_file) { + if (!defined(line.pack_name)) { # as-is + call FF.FinalizeToDir as MF_default { input: + files = line.files_to_save, + file_names = line.file_names, + outdir = line.destination + } + } + + if (defined(line.individual_compress)) { # (b)gz individual files + call FF.FinalizeAndCompress as MF_gz { input: + files = line.files_to_save, + outdir = line.destination, + block_gzip = if (defined(line.use_bgzip)) then select_first([line.use_bgzip]) else false + } + } + + if (defined(line.pack_name)) { # tar.gz to a single file + call FF.TarGZFilesAndSave as MF_pack { input: + files = line.files_to_save, + name = select_first([line.pack_name]), + outdir = line.destination + } + } + } + + String attr = line.output_attribute_name + String final_path = select_first([SF_gz.gcs_path, SF.gcs_path, + MF_default.gcs_dir, MF_gz.gcs_path, MF_pack.gcs_path]) + } + + call GU.CoerceArrayOfPairsToMap as collect { input: keys = attr, values = final_path } + + if (defined(already_finalized)) { + scatter(mm in select_first([already_finalized])) { + call GU.MapToTsv { input: m = mm} + } + call GU.ConcatenateFiles { input: af = MapToTsv.tsv, out_name = "does_not_matter.tsv" } + + Map[String, String] pack_one = 
read_map(ConcatenateFiles.merged) + + call GU.MergeMaps as PackAllSavings { input: + one = pack_one, two = collect.output_map + } + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/SexCheckNaive.wdl b/wdl/pipelines/TechAgnostic/Utility/SexCheckNaive.wdl new file mode 100644 index 000000000..42bfe0281 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/SexCheckNaive.wdl @@ -0,0 +1,31 @@ +version 1.0 + +import "../../../tasks/QC/AlignedMetrics.wdl" as AM +import "../../../tasks/QC/SexConcordance.wdl" as SC + +workflow SexCheckNaive { + input { + File bam + File bai + String expected_sex_type + + File? mosdepth_summary_txt + } + + if (!defined(mosdepth_summary_txt)) { + call AM.MosDepthWGS {input: bam = bam, bai = bai} + } + + call SC.SummarizeCoverages {input: mosdepth_summary_txt = select_first([mosdepth_summary_txt, MosDepthWGS.summary_txt])} + call SC.MakeACall { + input: + cov_chr1 = SummarizeCoverages.cov_chr1, + cov_chrX = SummarizeCoverages.cov_chrX, + cov_chrY = SummarizeCoverages.cov_chrY, + expected_sex_type = expected_sex_type + } + + output { + Map[String, String] inferred_sex_info = MakeACall.inferred_sex_info + } +} diff --git a/wdl/pipelines/TechAgnostic/Utility/ShardWholeGenome.wdl b/wdl/pipelines/TechAgnostic/Utility/ShardWholeGenome.wdl new file mode 100644 index 000000000..35cd0c437 --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/ShardWholeGenome.wdl @@ -0,0 +1,101 @@ +version 1.0 + +import "../../../tasks/Utility/Utils.wdl" + +import "../../../tasks/Utility/BAMutils.wdl" as BU + +workflow Split { + meta { + description: "Split input BAM aligned to a reference genome." + } + parameter_meta { + contig_filter: "List of contigs in the ref genomes to skip. Ignored when ref_scatter_interval_list* are provided (assuming users know how they want to shard)." + ref_scatter_interval_list_locator: "A file holding paths to interval_list files; provide when explicit sharding scheme is desired." + ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; provide when explicit sharding scheme is desired." + + id_bam_bai_of_shards: "Id, bam and bai (in the form of >) of the sharded intput BAM" + } + input { + File ref_dict + File bam + File bai + Array[String] contig_filter = ['random', 'chrUn', 'decoy', 'alt', 'HLA', 'EBV'] + File? ref_scatter_interval_list_locator + File? 
ref_scatter_interval_list_ids + } + + output { + Array[Pair[String, Pair[File, File]]] id_bam_bai_of_shards = zip(ids_of_interval_lists, sharded_bam_bais) + } + + if (defined(ref_scatter_interval_list_locator)) { # custom + File scatter_interval_list_ids = select_first([ref_scatter_interval_list_ids]) + File scatter_interval_list_loc = select_first([ref_scatter_interval_list_locator]) + Array[String] custom_interval_list_ids = read_lines(scatter_interval_list_ids) + Array[String] custom_interval_list_files = read_lines(scatter_interval_list_loc) + Array[Pair[String, String]] ided_interval_list_files = zip(custom_interval_list_ids, custom_interval_list_files) + + scatter (pair in ided_interval_list_files) { + + call Utils.ResilientSubsetBam as user_controled_split { + input: + bam = bam, + bai = bai, + interval_list_file = pair.right, + interval_id = pair.left, + prefix = basename(bam, ".bam") + } + if (user_controled_split.is_samtools_failed) { + # attempt again, first retry streaming, + call Utils.ResilientSubsetBam as retry_streaming { + input: + bam = bam, + bai = bai, + interval_list_file = pair.right, + interval_id = pair.left, + prefix = basename(bam, ".bam") + } + # then localize if that still fails + if (retry_streaming.is_samtools_failed) { + call BU.SubsetBamToLocusLocal { + input: + bam = bam, + bai = bai, + interval_list_file = pair.right, + interval_id = pair.left, + prefix = basename(bam, ".bam") + } + } + # call Utils.StopWorkflow as CustomShardFailedStreaming { input: reason = "Streaming from BAM for subsetting into ~{pair.left} failed."} + } + File custom_shard_bam = select_first([SubsetBamToLocusLocal.subset_bam, retry_streaming.subset_bam, user_controled_split.subset_bam]) + File custom_shard_bai = select_first([SubsetBamToLocusLocal.subset_bai, retry_streaming.subset_bai, user_controled_split.subset_bai]) + } + } + + if (!defined(ref_scatter_interval_list_locator)) { # per contig/chromosome + call Utils.MakeChrIntervalList { + input: + ref_dict = ref_dict, + filter = contig_filter + } + scatter (c in MakeChrIntervalList.chrs) { + String contig = c[0] + call Utils.SubsetBam as default_split { + input: + bam = bam, + bai = bai, + locus = contig + } + if (default_split.is_samtools_failed) { + call Utils.StopWorkflow as StandardShardFailedStreaming { input: reason = "Streaming from BAM for subsetting into ~{contig} failed."} + } + String default_interval_list_id = contig + } + } + + Array[Pair[File, File]] sharded_bam_bais = zip(select_first([custom_shard_bam, default_split.subset_bam]), + select_first([custom_shard_bai, default_split.subset_bai]) + ) + Array[String] ids_of_interval_lists = select_first([default_interval_list_id, custom_interval_list_ids]) +} diff --git a/wdl/pipelines/TechAgnostic/Utility/VerifyBamFingerprint.wdl b/wdl/pipelines/TechAgnostic/Utility/VerifyBamFingerprint.wdl new file mode 100644 index 000000000..cd04cae6c --- /dev/null +++ b/wdl/pipelines/TechAgnostic/Utility/VerifyBamFingerprint.wdl @@ -0,0 +1,80 @@ +version 1.0 + +import "../../../tasks/QC/FPCheckAoU.wdl" as FP + +workflow VerifyBamFingerprint { + meta { + desciption: + "Verify fingerprint of a single BAM where it's assumed the BAM holds data from a single entity" + warn: + "So far, we've verified that this works for CCS/Hifi and ONT data, for legacy CLR data, it's not supported." + } + + parameter_meta { + tech: "The technology used to generate this BAM. Currently, the following values are accepted: [ONT, Sequel, Revio]." 
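Because id_bam_bai_of_shards emitted by the Split workflow above is an Array[Pair[String, Pair[File, File]]], a downstream consumer typically unpacks it inside a scatter; a minimal sketch, assuming the sub-workflow was called under the hypothetical alias SplitByChr:

    scatter (shard in SplitByChr.id_bam_bai_of_shards) {
        String shard_id  = shard.left          # contig name, or a custom interval-list ID
        File   shard_bam = shard.right.left
        File   shard_bai = shard.right.right
        # per-shard work goes here
    }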
+ + force: "If true, will force run the fingerprinting program, and the workflow may fail for various reasons; otherwise, if the BAM is too small, it will automatically fail this QC check." + + fp_vcf_store: "GCS storage bucket and folder where the fingperprint VCF files are stored." + fp_sample_id: "sample id of the data at the storage; CRITICAL: it's assumsed that the fingerprint VCF file follow the naming convention that start with this sample id." + ref_specific_haplotype_map: "Reference-specific haplotype map file to be passed on to Picard's `CheckFingerprint`" + + lod_pass_threshold: "Threshold for LOD above which the BAM will be declared as PASSing this QC check" + lod_fail_threshold: "Threshold for LOD below which the BAM will be declared as FAILing this QC check; LOD between the two thresholds will lead to the BAM's QC status as BORDERLINE" + } + + input { + File aligned_bam + File aligned_bai + String tech + + String fp_vcf_store + String fp_sample_id + + File ref_specific_haplotype_map + + Boolean force = false + Float lod_pass_threshold = 6.0 + Float lod_fail_threshold = -3.0 + + # Input args liftedoverVCFGATK generation: + File chain_file + File target_reference_sequence_fasta_file + File target_reference_sequence_fasta_file_index + File target_reference_sequence_fasta_file_dict + # Runtime args: + Int? mem + Int? preemptible_attempts + Int? disk_space_gb + Int? cpu + Int? boot_disk_size_gb + + } + + output { + Map[String, String] fingerprint_check = {"status": core.FP_status, + "LOD": core.lod_expected_sample} + } + + call FP.FPCheckAoU as core { + input: + aligned_bam = aligned_bam, + aligned_bai = aligned_bai, + tech = tech, + fp_vcf_store = fp_vcf_store, + fp_sample_id = fp_sample_id, + ref_specific_haplotype_map = ref_specific_haplotype_map, + force = force, + lod_pass_threshold = lod_pass_threshold, + lod_fail_threshold = lod_fail_threshold, + chain_file = chain_file, + target_reference_sequence_fasta_file = target_reference_sequence_fasta_file, + target_reference_sequence_fasta_file_index = target_reference_sequence_fasta_file_index, + target_reference_sequence_fasta_file_dict = target_reference_sequence_fasta_file_dict, + mem = mem, + preemptible_attempts = preemptible_attempts, + disk_space_gb = disk_space_gb, + cpu = cpu, + boot_disk_size_gb = boot_disk_size_gb + } +} diff --git a/wdl/pipelines/TechAgnostic/VariantCalling/CallVariantsReadBased.wdl b/wdl/pipelines/TechAgnostic/VariantCalling/CallVariantsReadBased.wdl new file mode 100644 index 000000000..01554f3fd --- /dev/null +++ b/wdl/pipelines/TechAgnostic/VariantCalling/CallVariantsReadBased.wdl @@ -0,0 +1,296 @@ +version 1.0 + +import "../../../tasks/Utility/GeneralUtils.wdl" as GU +import "../../../tasks/Utility/Finalize.wdl" as FF +import "../../../tasks/Utility/Utils.wdl" + +import "../Utility/ShardWholeGenome.wdl" + +import "../../../tasks/VariantCalling/CallStructuralVariants.wdl" +import "../../../tasks/VariantCalling/CallSmallVariants.wdl" + +import "../../../tasks/VariantCalling/Sniffles2.wdl" + + +workflow CallVariants { + + meta { + description: "A workflow for calling small and/or structural variants from an aligned BAM file. Note this calls out to read-based methods, not assembly-based methods. This also does not support CLR data." 
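For orientation, a minimal sketch of how a caller might wire up this workflow; the import alias, sample label, bucket, and DV/PEPPER model string are hypothetical placeholders, and only the required inputs are shown:

    # assuming: import "CallVariantsReadBased.wdl" as CVRB (hypothetical alias)
    call CVRB.CallVariants as CallVariantsForSampleA { input:
        gcs_out_dir = "gs://example-bucket/results/sampleA",   # hypothetical bucket
        bam = aligned_bam, bai = aligned_bai, prefix = "sampleA",
        is_ont = true, is_r10_4_pore_or_later = true,
        model_for_dv_andor_pepper = "ONT_R104",                # placeholder; see the DV/PEPPER docs for accepted values
        ref_map_file = ref_map_file,
        call_svs = true, pbsv_discover_per_chr = true,
        call_small_variants = true, run_clair3 = false, use_margin_for_tagging = true,
        dv_threads = 16, dv_memory = 64
    }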
+ } + + parameter_meta { + bam: "Aligned BAM file" + bai: "Index for the aligned BAM file" + prefix: "Prefix for output files" + + is_ont: "If the input data is generated on the ONT platform" + is_r10_4_pore_or_later: "tell us which pore version was used to generate the data. When true, will use the DV (>=1.5.0) toolchain." + model_for_dv_andor_pepper: "model string to be used on DV or the PEPPER-Margin-DeepVariant toolchain. Please refer to their github pages for accepted values." + + ref_map_file: "table indicating reference sequence and auxillary file locations" + ref_scatter_interval_list_locator: "A file holding paths to interval_list files, used for custom sharding the of the input BAM; when not provided, will shard WG by contig (possibly slower)" + ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; when not provided, will shard WG by contig (possibly slower)" + + call_svs: "Call structural variants or not" + minsvlen: "Minimum SV length in bp (default: 50)" + pbsv_discover_per_chr: "Run the discover stage of PBSV per chromosome" + + call_small_variants: "Call small variants or not" + run_clair3: "to turn on Clair3 analysis or not (non-trivial increase in cost and runtime)" + use_margin_for_tagging: "if false, will use margin-phased small-variant VCF for haplotagging the BAM; applicable only when input data isn't ONT data with pore older than R10.4" + + dv_threads: "number of threads for DeepVariant" + dv_memory: "memory for DeepVariant" + use_gpu: "to use GPU acceleration or not on DeepVariant" + + # outputs + haplotagged_bam: "BAM haplotagged using a small variant single-sample VCF." + haplotagged_bai: "Index for haplotagged_bam." + haplotagged_bam_tagger: "VCF used for doing the haplotagging. 'Legacy' if the input is ONT data generated on pores before R10.4." + + legacy_g_vcf: "PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4." + legacy_g_tbi: "Index for PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phased_vcf: "Phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phased_tbi: "Indes for phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phasing_stats_tsv: "Phasing stats of legacy_phased_vcf in TSV format; available only when input is ONT data generated on pores older than R10.4." + legacy_phasing_stats_gtf: "Phasing stats of legacy_phased_vcf in GTF format; available only when input is ONT data generated on pores older than R10.4." + + dv_g_vcf: "DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4." + dv_g_tbi: "Index for DeepVariant ; available for CCS data and ONT data generated with pores >= R10.4." + dv_margin_phased_vcf: "Phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_margin_phased_tbi: "Index for phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_margin_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_margin_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." 
+ dv_whatshap_phased_vcf: "Phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_whatshap_phased_tbi: "Index for phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_whatshap_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_whatshap_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + + dv_nongpu_resources_usage_visual: "Resource usage monitoring log visualization for DV (per shard); available for CCS data and ONT data generated with pores >= R10.4." + } + + input { + String gcs_out_dir + + # sample info + File bam + File bai + String prefix + + # data type info + Boolean is_ont + Boolean is_r10_4_pore_or_later + String model_for_dv_andor_pepper + + # reference-specific + File ref_map_file + File? ref_scatter_interval_list_locator + File? ref_scatter_interval_list_ids + + # sv-specific args + Boolean call_svs + Boolean pbsv_discover_per_chr + Int minsvlen = 50 + + # smallVar-specific args + Boolean call_small_variants + Boolean run_clair3 + Boolean use_margin_for_tagging + + # optimization, balancing between throughput, wallclock time, and cost + Int dv_threads + Int dv_memory + Boolean use_gpu = false + Array[String] gcp_zones = ["us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"] + } + + if ((!call_svs) && (!call_small_variants)) { + call Utils.StopWorkflow { input: reason = "Why are you calling me if your want neither small variants nor SVs?"} + } + + ###################################################################### + # Block for prepping inputs + ###################################################################### + Map[String, String] ref_map = read_map(ref_map_file) + + call GU.CollapseArrayOfStrings as get_zones {input: input_array = gcp_zones, joiner = " "} + String wdl_parsable_zones = get_zones.collapsed + + # needed for whatshap phasing anyway, so this can be used by SV calling + call ShardWholeGenome.Split as SplitBamByChr { input: ref_dict = ref_map['dict'], bam = bam, bai = bai, } + + ###################################################################### + # Block for small variants handling + ###################################################################### + if (call_small_variants) { + call CallSmallVariants.Work as SmallVarJob { + input: + bam = bam, + bai = bai, + prefix = prefix, + + per_chr_bam_bai_and_id = SplitBamByChr.id_bam_bai_of_shards, + + is_ont = is_ont, + is_r10_4_pore_or_later = is_r10_4_pore_or_later, + model_for_dv_andor_pepper = model_for_dv_andor_pepper, + + ref_map = ref_map, + ref_scatter_interval_list_locator = ref_scatter_interval_list_locator, + ref_scatter_interval_list_ids = ref_scatter_interval_list_ids, + + run_clair3 = run_clair3, + use_margin_for_tagging = use_margin_for_tagging, + + dv_threads = dv_threads, + dv_memory = dv_memory, + use_gpu = use_gpu, + zones = wdl_parsable_zones + } + + ############################# + # save data + String smalldir = sub(gcs_out_dir, "/$", "") + "/variants/small" + String haptagoutdir = sub(gcs_out_dir, "/$", "") + "/alignments" + + call FF.FinalizeToFile as FinalizeHapTaggedBam { input: outdir = haptagoutdir, file = SmallVarJob.haplotagged_bam } + call FF.FinalizeToFile as FinalizeHapTaggedBai { input: outdir = 
haptagoutdir, file = SmallVarJob.haplotagged_bai } + + Boolean is_legacy_ont = is_ont && (!is_r10_4_pore_or_later) + if (is_legacy_ont) { + call FF.FinalizeToFile as FinalizeLegacyGVcf { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_g_vcf]) } + call FF.FinalizeToFile as FinalizeLegacyGTbi { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_g_tbi]) } + call FF.FinalizeToFile as FinalizeLegacyPhasedVcf { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_phased_vcf]) } + call FF.FinalizeToFile as FinalizeLegacyPhasedTbi { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_phased_tbi]) } + call FF.FinalizeToFile as FinalizeLegacyPhaseStatsTSV { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_phasing_stats_tsv]) } + call FF.FinalizeToFile as FinalizeLegacyPhaseStatsGTF { input: outdir = smalldir, file = select_first([SmallVarJob.legacy_phasing_stats_gtf]) } + } + if (!is_legacy_ont) { + call FF.FinalizeToFile as FinalizeDVgVcf { input: outdir = smalldir, file = select_first([SmallVarJob.dv_g_vcf]) } + call FF.FinalizeToFile as FinalizeDVgTbi { input: outdir = smalldir, file = select_first([SmallVarJob.dv_g_tbi]) } + + call FF.FinalizeToFile as FinalizeDVMarginPhasedVcf { input: outdir = smalldir, file = select_first([SmallVarJob.dv_margin_phased_vcf]) } + call FF.FinalizeToFile as FinalizeDVMarginPhasedTbi { input: outdir = smalldir, file = select_first([SmallVarJob.dv_margin_phased_tbi]) } + call FF.FinalizeToFile as FinalizeDVMarginPhasedVcfStatusTSV { input: outdir = smalldir, file = select_first([SmallVarJob.dv_vcf_margin_phasing_stats_tsv]) } + call FF.FinalizeToFile as FinalizeDVMarginPhasedVcfStatusGtf { input: outdir = smalldir, file = select_first([SmallVarJob.dv_vcf_margin_phasing_stats_gtf]) } + + call FF.FinalizeToFile as FinalizeDVWhatsHapPhasedVcf { input: outdir = smalldir, file = select_first([SmallVarJob.dv_whatshap_phased_vcf]) } + call FF.FinalizeToFile as FinalizeDVWhatsHapPhasedTbi { input: outdir = smalldir, file = select_first([SmallVarJob.dv_whatshap_phased_tbi]) } + call FF.FinalizeToFile as FinalizeDVWhatsHapPhasedVcfStatusTSV { input: outdir = smalldir, file = select_first([SmallVarJob.dv_vcf_whatshap_phasing_stats_tsv]) } + call FF.FinalizeToFile as FinalizeDVWhatsHapPhasedVcfStatusGtf { input: outdir = smalldir, file = select_first([SmallVarJob.dv_vcf_whatshap_phasing_stats_gtf]) } + + call FF.FinalizeToDir as FinalizeDVResourceUsagesVisual { + input: files = select_first([SmallVarJob.dv_nongpu_resources_usage_visual]), outdir = smalldir + "/DV_monitoring" + } + } + + if (run_clair3) { + call FF.FinalizeToFile as FinalizeClairVcf { input: outdir = smalldir, file = select_first([SmallVarJob.clair_vcf])} + call FF.FinalizeToFile as FinalizeClairTbi { input: outdir = smalldir, file = select_first([SmallVarJob.clair_tbi])} + + call FF.FinalizeToFile as FinalizeClairGVcf { input: outdir = smalldir, file = select_first([SmallVarJob.clair_gvcf])} + call FF.FinalizeToFile as FinalizeClairGTbi { input: outdir = smalldir, file = select_first([SmallVarJob.clair_gtbi])} + } + } + + ###################################################################### + # Block for SV handling + ###################################################################### + if (call_svs) { + call CallStructuralVariants.Work as SVjob { + input: + is_hifi = !is_ont, + is_ont = is_ont, + + bam = bam, + bai = bai, + prefix = prefix, + + per_chr_bam_bai_and_id = SplitBamByChr.id_bam_bai_of_shards, + + ref_map = ref_map, + 
+ minsvlen = minsvlen, + + pbsv_discover_per_chr = pbsv_discover_per_chr, + + zones = wdl_parsable_zones + } + + ############################# + # save data + String svdir = sub(select_first([gcs_out_dir]), "/$", "") + "/variants/sv" + + call FF.FinalizeToFile as FinalizePBSV { input: outdir = svdir, file = SVjob.pbsv_vcf } + call FF.FinalizeToFile as FinalizePBSVtbi { input: outdir = svdir, file = SVjob.pbsv_tbi } + + call FF.FinalizeToFile as FinalizeSniffles { input: outdir = svdir, file = SVjob.sniffles_vcf } + call FF.FinalizeToFile as FinalizeSnifflesTbi { input: outdir = svdir, file = SVjob.sniffles_tbi } + call FF.FinalizeToFile as FinalizeSnifflesSnf { input: outdir = svdir, file = SVjob.sniffles_snf } + } + + ###################################################################### + # Experiment with Sniffles-2 phased SV calling + ###################################################################### + if (call_svs && call_small_variants) { + File m = select_first([SmallVarJob.haplotagged_bam]) + File i = select_first([SmallVarJob.haplotagged_bai]) + call Utils.InferSampleName { input: bam = m, bai = i } + call Sniffles2.SampleSV as SnifflesPhaseSV { + input: + bam = m, bai = i, sample_id = InferSampleName.sample_name, + prefix = prefix, tandem_repeat_bed = ref_map['tandem_repeat_bed'], + minsvlen = minsvlen, + phase_sv = true + } + if (defined(gcs_out_dir)) { + String svdir_copy = sub(select_first([gcs_out_dir]), "/$", "") + "/variants/sv" + call FF.FinalizeToFile as FinalizePhasedSniffles { input: outdir = svdir_copy, file = SnifflesPhaseSV.vcf } + call FF.FinalizeToFile as FinalizePhasedSnifflesTbi { input: outdir = svdir_copy, file = SnifflesPhaseSV.tbi } + call FF.FinalizeToFile as FinalizePhasedSnifflesSnf { input: outdir = svdir_copy, file = SnifflesPhaseSV.snf } + } + } + + output { + File? sniffles_vcf = FinalizeSniffles.gcs_path + File? sniffles_tbi = FinalizeSnifflesTbi.gcs_path + File? sniffles_snf = FinalizeSnifflesSnf.gcs_path + + File? sniffles_phased_vcf = FinalizePhasedSniffles.gcs_path + File? sniffles_phased_tbi = FinalizePhasedSnifflesTbi.gcs_path + File? sniffles_phased_snf = FinalizePhasedSnifflesSnf.gcs_path + + File? pbsv_vcf = FinalizePBSV.gcs_path + File? pbsv_tbi = FinalizePBSVtbi.gcs_path + + File? clair_vcf = FinalizeClairVcf.gcs_path + File? clair_tbi = FinalizeClairTbi.gcs_path + File? clair_gvcf = FinalizeClairGVcf.gcs_path + File? clair_gtbi = FinalizeClairGTbi.gcs_path + + File? haplotagged_bam = FinalizeHapTaggedBam.gcs_path + File? haplotagged_bai = FinalizeHapTaggedBai.gcs_path + String? haplotagged_bam_tagger = SmallVarJob.haplotagged_bam_tagger + + # available for CCS and ONT >= R10.4 data, if small variants are requested + File? dv_g_vcf = FinalizeDVgVcf.gcs_path + File? dv_g_tbi = FinalizeDVgTbi.gcs_path + File? dv_margin_phased_vcf = FinalizeDVMarginPhasedVcf.gcs_path + File? dv_margin_phased_tbi = FinalizeDVMarginPhasedTbi.gcs_path + File? dv_vcf_margin_phasing_stats_tsv = FinalizeDVMarginPhasedVcfStatusTSV.gcs_path + File? dv_vcf_margin_phasing_stats_gtf = FinalizeDVMarginPhasedVcfStatusGtf.gcs_path + File? dv_whatshap_phased_vcf = FinalizeDVWhatsHapPhasedVcf.gcs_path + File? dv_whatshap_phased_tbi = FinalizeDVWhatsHapPhasedTbi.gcs_path + File? dv_vcf_whatshap_phasing_stats_tsv = FinalizeDVWhatsHapPhasedVcfStatusTSV.gcs_path + File? dv_vcf_whatshap_phasing_stats_gtf = FinalizeDVWhatsHapPhasedVcfStatusGtf.gcs_path + String? 
dv_nongpu_resources_usage_visual = FinalizeDVResourceUsagesVisual.gcs_dir + + # available for ONT < R10.4 data, if small variants are requested + File? legacy_g_vcf = FinalizeLegacyGVcf.gcs_path + File? legacy_g_tbi = FinalizeLegacyGTbi.gcs_path + File? legacy_phased_vcf = FinalizeLegacyPhasedVcf.gcs_path + File? legacy_phased_tbi = FinalizeLegacyPhasedTbi.gcs_path + File? legacy_phasing_stats_tsv = FinalizeLegacyPhaseStatsTSV.gcs_path + File? legacy_phasing_stats_gtf = FinalizeLegacyPhaseStatsGTF.gcs_path + } +} diff --git a/wdl/tasks/Alignment/WhatsHap.wdl b/wdl/tasks/Alignment/WhatsHap.wdl new file mode 100644 index 000000000..e703c67b7 --- /dev/null +++ b/wdl/tasks/Alignment/WhatsHap.wdl @@ -0,0 +1,220 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task HaploTagBam { + meta { + description: "Uses whatshap to haplotag a BAM." + } + + parameter_meta { + to_tag_bam: {description: "BAM to be haplotagged", localization_optional: true} + phased_vcf: "VCF holding phased small variants to be used for the tagging" + } + + input { + File to_tag_bam + File to_tag_bai + File ref_fasta + File ref_fasta_fai + File phased_vcf + File phased_tbi + + RuntimeAttr? runtime_attr_override + } + + output { + File tagged_bam = "~{prefix}.whatshap-haplotagged.bam" + File tagged_bai = "~{prefix}.whatshap-haplotagged.bam.bai" + } + + String prefix = basename(to_tag_bam, ".bam") + String vcf_prefix = basename(phased_vcf, ".vcf.gz") + + Int disk_size = 10 + 2*ceil(size([to_tag_bam, phased_vcf], "GiB")) + + String local_bam = "/cromwell_root/~{prefix}.bam" + String local_bai = "/cromwell_root/~{prefix}.bam.bai" + + command <<< + set -eux + + time gcloud storage cp ~{to_tag_bam} ~{local_bam} + mv ~{to_tag_bai} ~{local_bai} + + mv ~{phased_vcf} ~{vcf_prefix}.vcf.gz + mv ~{phased_tbi} ~{vcf_prefix}.vcf.gz.tbi + ls + + whatshap haplotag \ + -o ~{prefix}.whatshap-haplotagged.bam \ + --reference ~{ref_fasta} \ + --skip-missing-contigs \ + --output-threads=3 \ + ~{vcf_prefix}.vcf.gz \ + ~{local_bam} + + samtools index -@3 ~{prefix}.whatshap-haplotagged.bam + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-whatshap:2.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Phase { + meta { + description: "Uses whatshap phase (small variant) VCF." + } + + parameter_meta { + unphased_vcf: "VCF holding phased small variants to be phased." + bam: {description: "BAM used for genrating the unphased VCF.", localization_optional: true} + } + + input { + File bam + File bai + File ref_fasta + File ref_fasta_fai + File unphased_vcf + File unphased_tbi + + String? chromosome + + RuntimeAttr? 
runtime_attr_override + } + + String bam_prefix = basename(bam, ".bam") + String vcf_prefix = basename(unphased_vcf, ".vcf.gz") + if (defined(chromosome)) then ".~{chromosome}" else "" + + output { + File phased_vcf = "~{vcf_prefix}.whatshap-phased.vcf.gz" + File phased_tbi = "~{vcf_prefix}.whatshap-phased.vcf.gz.tbi" + } + + Int disk_size = 10 + 2*ceil(size([bam, unphased_vcf], "GiB")) + String extra_args = if (defined(chromosome)) then "--chromosome " + select_first([chromosome]) else "" + + String local_bam = "/cromwell_root/~{bam_prefix}.bam" + String local_bai = "~{local_bam}.bai" + + command <<< + set -eux + + time gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} ~{local_bai} + mv ~{unphased_vcf} ~{vcf_prefix}.vcf.gz + mv ~{unphased_tbi} ~{vcf_prefix}.vcf.gz.tbi + ls + + whatshap phase \ + --indels \ + -o ~{vcf_prefix}.whatshap-phased.vcf.gz \ + ~{extra_args} \ + --reference ~{ref_fasta} \ + ~{vcf_prefix}.vcf.gz \ + ~{local_bam} + + if [[ ! -f ~{vcf_prefix}.whatshap-phased.vcf.gz.tbi ]]; then tabix -p vcf ~{vcf_prefix}.whatshap-phased.vcf.gz; fi + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-whatshap:2.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Stats { + meta { + description: "Uses whatshap stats to collect phasing statistics from a phased (small variant) VCF." + } + + parameter_meta { + phased_vcf: "VCF holding phased small variants" + } + + input { + File phased_vcf + File phased_tbi + + RuntimeAttr? 
runtime_attr_override + } + + output { + File stats_tsv = "~{vcf_prefix}.whatshap-stats.tsv" + File stats_gtf = "~{vcf_prefix}.whatshap-stats.gtf" + } + + String vcf_prefix = basename(phased_vcf, ".vcf.gz") + + Int disk_size = 10 + 2*ceil(size(phased_vcf, "GiB")) + + command <<< + set -eux + + mv ~{phased_vcf} ~{vcf_prefix}.vcf.gz + mv ~{phased_tbi} ~{vcf_prefix}.vcf.gz.tbi + + # for visualization + whatshap stats \ + --gtf=~{vcf_prefix}.whatshap-stats.gtf \ + ~{vcf_prefix}.vcf.gz & + + # for use with MultiQC + whatshap stats \ + --tsv=~{vcf_prefix}.whatshap-stats.tsv \ + ~{vcf_prefix}.vcf.gz & + + wait + ls + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 3, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-whatshap:2.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/Assembly/Hifiasm.wdl b/wdl/tasks/Assembly/Hifiasm.wdl index 878daa73b..86e0c981f 100644 --- a/wdl/tasks/Assembly/Hifiasm.wdl +++ b/wdl/tasks/Assembly/Hifiasm.wdl @@ -1,13 +1,12 @@ version 1.0 import "../../structs/Structs.wdl" - -import "../Utility/Utils.wdl" +import "../Visualization/VisualizeResourceUsage.wdl" workflow Hifiasm { meta { - description: "We run two HiFiasm jobs, one for getting alternative contigs and one for getting the haplotigs. And we take the primary assembly from the first job." + description: "We run two HiFiasm jobs, one for getting alternative contigs and one for getting the haplotigs. And we take the primary assembly from the first job. Note that we've only tested on diploid organisms." 
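The three WhatsHap tasks above (Phase, HaploTagBam, Stats) are meant to be chained. The sketch below is illustrative only; the workflow name, caller-level inputs, and import path are assumptions, not part of this change.

    version 1.0

    import "../../tasks/Alignment/WhatsHap.wdl" as WhatsHap   # adjust the relative path to the caller's location

    workflow PhaseTagAndStatsSketch {
        input {
            File aligned_bam
            File aligned_bai
            File ref_fasta
            File ref_fasta_fai
            File unphased_vcf
            File unphased_tbi
        }
        call WhatsHap.Phase { input:
            bam = aligned_bam, bai = aligned_bai,
            ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai,
            unphased_vcf = unphased_vcf, unphased_tbi = unphased_tbi
        }
        call WhatsHap.HaploTagBam { input:
            to_tag_bam = aligned_bam, to_tag_bai = aligned_bai,
            ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai,
            phased_vcf = Phase.phased_vcf, phased_tbi = Phase.phased_tbi
        }
        call WhatsHap.Stats { input: phased_vcf = Phase.phased_vcf, phased_tbi = Phase.phased_tbi }
        output {
            File haplotagged_bam = HaploTagBam.tagged_bam
            File phasing_stats_tsv = Stats.stats_tsv
            File phasing_stats_gtf = Stats.stats_gtf
        }
    }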
} parameter_meta { reads: "reads (in fasta or fastq format, compressed or uncompressed)" @@ -21,34 +20,48 @@ workflow Hifiasm { String zones = "us-central1-a us-central1-b us-central1-c" } - call AssembleForAltContigs { - input: - reads = reads, - prefix = prefix, - zones = zones + call AssembleForAltContigs { input: + reads = reads, + prefix = prefix, + zones = zones + } + + call AssembleForHaplotigs { input: + reads = reads, + prefix = prefix, + zones = zones } - call AssembleForHaplotigs { - input: - reads = reads, - prefix = prefix, - zones = zones + call VisualizeResourceUsage.SimpleRscript as VisualizeHapAsmResoureUsage { input: + resource_log = AssembleForHaplotigs.resouce_monitor_log, + output_pdf_name = "~{prefix}.hifiasm.resources-usage.hap-mode.pdf", + plot_title = "Hifiasm, on input ~{prefix}, in haplotype-resolve mode" } output { File primary_gfa = AssembleForAltContigs.primary_gfa File primary_tigs = AssembleForAltContigs.primary_tigs + File primary_tigs_gzi = AssembleForAltContigs.primary_fa_gzi File alternate_gfa = AssembleForAltContigs.alternate_gfa File alternate_tigs = AssembleForAltContigs.alternate_tigs + File alternate_tigs_gzi = AssembleForAltContigs.alternate_tigs_gzi File log_in_pVSa_mode = AssembleForAltContigs.log + File resource_usage_in_pVSa_mode = AssembleForAltContigs.resouce_monitor_log ########### - Array[File] phased_gfas = AssembleForHaplotigs.phased_gfas - Array[File] phased_tigs = AssembleForHaplotigs.phased_tigs + File hap1_gfa = AssembleForHaplotigs.hap1_gfa + File hap1_tigs = AssembleForHaplotigs.hap1_tigs + File hap1_tig_gzi = AssembleForHaplotigs.hap1_tig_gzi + + File hap2_gfa = AssembleForHaplotigs.hap2_gfa + File hap2_tigs = AssembleForHaplotigs.hap2_tigs + File hap2_tig_gzi = AssembleForHaplotigs.hap2_tig_gzi File log_in_hap_mode = AssembleForHaplotigs.log + File resource_usage_in_hap_mode = AssembleForHaplotigs.resouce_monitor_log + File resource_usage_visual_in_hap_mode = VisualizeHapAsmResoureUsage.plot_pdf # these two are saved, but the one generated in the primary VS alternate mode are preferred File primary_gfa_in_hap_mode = AssembleForHaplotigs.primary_gfa @@ -71,17 +84,25 @@ task AssembleForHaplotigs { Int num_cpus_proposal = if (n/2)*2 == n then n else n+1 # a hack because WDL doesn't have modulus operator Int num_cpus = if num_cpus_proposal > 96 then 96 else num_cpus_proposal - Int disk_size = 10 * ceil(size(reads, "GB")) + Int min_disk = 75 + Int proposed_disk = 5 * ceil(size(reads, "GB")) + Int disk_size = if proposed_disk < min_disk then min_disk else proposed_disk command <<< set -euxo pipefail - time hifiasm \ + export MONITOR_MOUNT_POINT="/cromwell_root/" + bash /opt/vm_local_monitoring_script.sh &> resources.log & + job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') + + time \ + hifiasm \ -o ~{prefix} \ -t~{num_cpus} \ ~{reads} \ 2>&1 | tee hifiasm.log + if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi tree -h . 
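To make the sizing logic in AssembleForHaplotigs (and in AssembleForAltContigs below) concrete, a small worked example with illustrative numbers; the *_example names are not part of the task:

    # an odd thread-count proposal is rounded up to the next even number (and capped at 96)
    Int n_example = 31
    Int num_cpus_example = if (n_example/2)*2 == n_example then n_example else n_example + 1   # 31 is odd, so 32
    # disk is 5x the input size, but never below the 75 GB floor
    Int proposed_disk_example = 5 * 10                                                         # e.g. 10 GB of reads -> 50
    Int disk_size_example = if proposed_disk_example < 75 then 75 else proposed_disk_example   # the floor wins: 75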
# GFA graph to contigs, primary @@ -100,17 +121,28 @@ task AssembleForHaplotigs { "${haplotype_gfa}" \ > "${haplotype}".fa done + + for ff in ./*.p_ctg.fa; do + bgzip -@2 -k --index "${ff}" + done >>> output { - # these are saved, but the one with alt contigs genearted will be preferred for now - File primary_gfa = "~{prefix}.bp.p_ctg.gfa" - File primary_fa = "~{prefix}.bp.p_ctg.fa" + File resouce_monitor_log = "resources.log" + File log = "hifiasm.log" - Array[File] phased_gfas = glob("~{prefix}.bp.hap*.p_ctg.gfa") - Array[File] phased_tigs = glob("~{prefix}.bp.hap*.p_ctg.fa") + File hap1_gfa = "~{prefix}.bp.hap1.p_ctg.gfa" + File hap1_tigs = "~{prefix}.bp.hap1.p_ctg.fa.gz" + File hap1_tig_gzi = "~{prefix}.bp.hap1.p_ctg.fa.gz.gzi" - File log = "hifiasm.log" + File hap2_gfa = "~{prefix}.bp.hap2.p_ctg.gfa" + File hap2_tigs = "~{prefix}.bp.hap2.p_ctg.fa.gz" + File hap2_tig_gzi = "~{prefix}.bp.hap2.p_ctg.fa.gz.gzi" + + # these are saved, but the one with alt contigs genearted will be preferred for now + File primary_gfa = "~{prefix}.bp.p_ctg.gfa" + File primary_fa = "~{prefix}.bp.p_ctg.fa.gz" + File primary_fa_gzi = "~{prefix}.bp.p_ctg.fa.gz.gzi" } ######################### @@ -118,17 +150,15 @@ task AssembleForHaplotigs { cpu_cores: num_cpus, mem_gb: memory, disk_gb: disk_size, - boot_disk_gb: 10, preemptible_tries: 0, - max_retries: 0, - docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.16.1" + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.19.5" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) @@ -150,40 +180,55 @@ task AssembleForAltContigs { Int num_cpus_proposal = if (n/2)*2 == n then n else n+1 # a hack because WDL doesn't have modulus operator Int num_cpus = if num_cpus_proposal > 96 then 96 else num_cpus_proposal - Int disk_size = 10 * ceil(size(reads, "GB")) + Int min_disk = 75 + Int proposed_disk = 5 * ceil(size(reads, "GB")) + Int disk_size = if proposed_disk < min_disk then min_disk else proposed_disk command <<< set -euxo pipefail - time hifiasm \ + export MONITOR_MOUNT_POINT="/cromwell_root/" + bash /opt/vm_local_monitoring_script.sh &> resources.log & + job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') + + time \ + hifiasm \ -o ~{prefix} \ -t~{num_cpus} \ --primary \ ~{reads} \ 2>&1 | tee hifiasm.log + if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi tree -h . 
# tricky, outputs generated this way has no "bp" in their file names # GFA graph to contigs, primary awk '/^S/{print ">"$2; print $3}' \ ~{prefix}.p_ctg.gfa \ - > ~{prefix}.p_ctg.fa + > ~{prefix}.p_ctg.fa # GFA graph to contigs, alternate awk '/^S/{print ">"$2; print $3}' \ ~{prefix}.a_ctg.gfa \ - > ~{prefix}.a_ctg.fa + > ~{prefix}.a_ctg.fa + + for ff in ./*_ctg.fa; do + bgzip -@2 -k --index "${ff}" + done >>> output { + File resouce_monitor_log = "resources.log" + File log = "hifiasm.log" + File primary_gfa = "~{prefix}.p_ctg.gfa" - File primary_tigs = "~{prefix}.p_ctg.fa" + File primary_tigs = "~{prefix}.p_ctg.fa.gz" + File primary_fa_gzi = "~{prefix}.p_ctg.fa.gz.gzi" File alternate_gfa = "~{prefix}.a_ctg.gfa" - File alternate_tigs = "~{prefix}.a_ctg.fa" - - File log = "hifiasm.log" + File alternate_tigs = "~{prefix}.a_ctg.fa.gz" + File alternate_tigs_gzi = "~{prefix}.a_ctg.fa.gz.gzi" } ######################### @@ -191,17 +236,15 @@ task AssembleForAltContigs { cpu_cores: num_cpus, mem_gb: memory, disk_gb: disk_size, - boot_disk_gb: 10, preemptible_tries: 0, - max_retries: 0, - docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.16.1" + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-hifiasm:0.19.5" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) diff --git a/wdl/tasks/QC/AlignedMetrics.wdl b/wdl/tasks/QC/AlignedMetrics.wdl index 7ad2ec933..fb2dabfca 100644 --- a/wdl/tasks/QC/AlignedMetrics.wdl +++ b/wdl/tasks/QC/AlignedMetrics.wdl @@ -244,7 +244,7 @@ task MosDepthOverBed { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "quay.io/biocontainers/mosdepth:0.2.4--he527e40_0" + docker: "us.gcr.io/broad-dsp-lrma/mosdepth:0.3.3--h37c5b7d_2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -258,6 +258,91 @@ task MosDepthOverBed { } } +task MosDepthWGS { + meta { + description: "Collects WGS coverage of the bam. Optionally, collects coverage number for each region in the provided BED (this avoids extra localizations)." + } + parameter_meta { + bam: {localization_optional: true} + bed_descriptor: "A short description on the BED file provided. It will be used in naming the regions output, so be careful what you provide here." + regions: "When bed is provided, this gets generated, which holds the coverage over the regions defined in the bed file." + } + input { + File bam + File bai + File? bed + String bed_descriptor = "unknown" + + String disk_type = "SSD" + RuntimeAttr? runtime_attr_override + } + + output { + Float wgs_cov = read_float("wgs.cov.txt") + File summary_txt = "~{prefix}.mosdepth.summary.txt" + File? 
regions = "~{prefix}.coverage_over_bed.~{bed_descriptor}.regions.bed.gz" + } + + String basename = basename(bam, ".bam") + String prefix = "~{basename}.mosdepth_coverage" + + Boolean collect_over_bed = defined(bed) + + String local_bam = "/cromwell_root/~{basename}.bam" + + command <<< + set -euxo pipefail + + time gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} "~{local_bam}.bai" + + mosdepth \ + -t 2 \ + -x -n -Q1 \ + ~{prefix} \ + ~{local_bam} & + + if ~{collect_over_bed}; then + mosdepth \ + -t 2 \ + -b ~{bed} \ + -x -n -Q 1 \ + "~{prefix}.coverage_over_bed.~{bed_descriptor}" \ + ~{local_bam} & + fi + + wait && ls + + # wg + tail -n1 ~{prefix}.mosdepth.summary.txt | \ + awk -F '\t' '{print $4}' | \ + xargs printf "%0.2f\n" > wgs.cov.txt + >>> + + ######################### + Int pd_disk_size = 10 + ceil(size(bam, "GiB")) + Int local_disk_size = if(size(bam, "GiB")>300) then 750 else 375 + Int disk_size = if('LOCAL'==disk_type) then local_disk_size else pd_disk_size + + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/mosdepth:0.3.4-gcloud" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + task SummarizeDepth { input { File regions @@ -266,18 +351,18 @@ task SummarizeDepth { } Int disk_size = 2*ceil(size(regions, "GB")) - String chrName = sub(basename(regions, ".regions.bed.gz"), "out.coverage.", "") + String regName = sub(basename(regions, ".regions.bed.gz"), "out.coverage.", "") command <<< set -euxo pipefail ((echo 'chr start stop cov_mean cov_sd cov_q1 cov_median cov_q3 cov_iqr') && \ (zcat ~{regions} | datamash first 1 first 2 last 3 mean 4 sstdev 4 q1 4 median 4 q3 4 iqr 4)) | \ - column -t > ~{chrName}.summary.txt + column -t > ~{regName}.summary.txt >>> output { - File cov_summary = "~{chrName}.summary.txt" + File cov_summary = "~{regName}.summary.txt" } ######################### diff --git a/wdl/tasks/QC/Contamination.wdl b/wdl/tasks/QC/Contamination.wdl new file mode 100644 index 000000000..302e3cafb --- /dev/null +++ b/wdl/tasks/QC/Contamination.wdl @@ -0,0 +1,50 @@ +version 1.0 + +task VerifyBamID { + meta { + desciption: "Uses VerifyBamID2 for human cross-individual contamination estimation. Assumes GRCh38." 
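An illustrative call of the new MosDepthWGS task above; the AlignedMetrics import alias and the caller inputs (aligned_bam, aligned_bai, exons_bed) are assumed:

    call AlignedMetrics.MosDepthWGS { input:
        bam = aligned_bam,
        bai = aligned_bai,
        bed = exons_bed,                    # optional; when set, the regions output is produced
        bed_descriptor = "gencode_exons"
    }
    # MosDepthWGS.wgs_cov is then a Float such as 29.41, and the optional regions output is named
    # <bam basename>.mosdepth_coverage.coverage_over_bed.gencode_exons.regions.bed.gz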
+ } + + input { + File pileup + File ref_fasta + Boolean is_hgdp_sites + Boolean is_100k_sites + } + + String a = if is_hgdp_sites then 'hgdp' else '1000g.phase3' + String b = if is_100k_sites then '100k' else '10k' + String resource_prefix = '~{a}.~{b}.b38.vcf.gz.dat' + + command <<< + set -eux + + export VERIFY_BAM_ID_HOME='/VerifyBamID' + + time \ + ${VERIFY_BAM_ID_HOME}/bin/VerifyBamID \ + --SVDPrefix ${VERIFY_BAM_ID_HOME}/resource/~{resource_prefix} \ + --Reference ~{ref_fasta} \ + --PileupFile ~{pileup} \ + --NumThread 4 \ + > vbid2.out \ + 2> vbid2.log + + cat vbid2.out + tail -1 vbid2.out | awk -F ':' '{print $2}' | awk '{$1=$1};1' > "est_contam.txt" + >>> + + output { + File vbid2_log = "vbid2.log" + File vbid2_out = "vbid2.out" + Float contamination_est = read_float("est_contam.txt") + } + + Int disk_size = 10 + ceil(size(pileup, "GiB")) + runtime { + cpu: 4 + memory: "8 GiB" + disks: "local-disk ~{disk_size} SSD" + docker: "us.gcr.io/broad-dsp-lrma/verifybamid2:v2.0.1" + } +} diff --git a/wdl/tasks/QC/FPCheckAoU.wdl b/wdl/tasks/QC/FPCheckAoU.wdl index 3c6d63107..209f4ad9d 100644 --- a/wdl/tasks/QC/FPCheckAoU.wdl +++ b/wdl/tasks/QC/FPCheckAoU.wdl @@ -11,9 +11,11 @@ workflow FPCheckAoU { } parameter_meta { aligned_bam: "GCS path to aligned BAM file, supposed to be of the same sample as from the fingerprinting (FP) VCF" + tech: "The technology used to generate this BAM. Currently, the following values are accepted: [ONT, Sequel, Revio]." + force: "If true, will force run the fingerprinting program, and the workflow may fail for various reasons; otherwise, if the BAM is too small, it will automatically fail this QC check." - fp_store: "Name of the bucket and prefix holding the fingerprint VCFs." - sample_id_at_store: "UUID of the sample at the fingerprint store, used to fetch the fingerprinting VCF" + fp_vcf_store: "Name of the bucket and prefix holding the fingerprint VCFs." + fp_sample_id: "UUID of the sample at the fingerprint store, used to fetch the fingerprinting VCF" ref_specific_haplotype_map: "Happlotype map file for the reference build used. See https://bit.ly/3QyZbwt " @@ -25,66 +27,124 @@ workflow FPCheckAoU { FP_status: "A single word summary on the result of FP check; one of [PASS, FAIL, BORDERLINE]." fingerprint_summary: "A file holding the summaries of LOD (a bit more detail than pass/fail)." fingerprint_details: "A file holding the detailed LOD at each FP site." + chain_file: "Chain file for the GATK LiftoverVcf process, mapping between source and target genome builds." + target_reference_sequence_fasta_file: "Reference sequence fasta file for the target genome build used in the liftover." + target_reference_sequence_fasta_file_index: "Index file for the target reference sequence fasta, required for GATK processes." + target_reference_sequence_fasta_file_dict: "Sequence dictionary for the target reference fasta, required by GATK for reference sequence validation." + + mem: "Memory allocated for the GATK LiftoverVcf process, specified in GB." + preemptible_attempts: "Number of preemptible attempts for the task. Preemptible instances are cheaper but may be terminated unexpectedly." + disk_space_gb: "Amount of disk space allocated for the task, specified in GB." + cpu: "Number of CPU cores allocated for the task." + boot_disk_size_gb: "Size of the boot disk for the virtual machine running the task, specified in GB." 
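For the SVD resource selection in VerifyBamID above, a small worked example mirroring the task's own declarations (the *_example names are illustrative):

    Boolean is_hgdp_sites_example = true
    Boolean is_100k_sites_example = false
    String a_example = if is_hgdp_sites_example then 'hgdp' else '1000g.phase3'      # -> 'hgdp'
    String b_example = if is_100k_sites_example then '100k' else '10k'               # -> '10k'
    String resource_prefix_example = '~{a_example}.~{b_example}.b38.vcf.gz.dat'      # -> 'hgdp.10k.b38.vcf.gz.dat'
    # is_hgdp_sites = false with is_100k_sites = true would instead select 1000g.phase3.100k.b38.vcf.gz.dat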
} input { File aligned_bam File aligned_bai + String tech - String fp_store - String sample_id_at_store + String fp_vcf_store + String fp_sample_id File ref_specific_haplotype_map + # Input args liftedoverVCFGATK generation: + File chain_file + File target_reference_sequence_fasta_file + File target_reference_sequence_fasta_file_index + File target_reference_sequence_fasta_file_dict + # Runtime args: + Int? mem + Int? preemptible_attempts + Int? disk_space_gb + Int? cpu + Int? boot_disk_size_gb + + Boolean force = false + Float lod_pass_threshold = 6.0 Float lod_fail_threshold = -3.0 } + # Generate output file names using fp_sample_id + String lifted_over_vcf = fp_sample_id + ".lifted_over.vcf" + String rejectedVcf = fp_sample_id + ".rejected.vcf" - ##### Prep work - call ResolveFPVCFPath {input: fp_store = fp_store, sample_id_at_store = sample_id_at_store} - call ReheaderFullGRCh38VCFtoNoAlt {input: full_GRCh38_vcf = ResolveFPVCFPath.fp_vcf} + output { + Float lod_expected_sample = fingerprint_check_LOD + String FP_status = fingerprint_check_status - call VariantUtils.GetVCFSampleName { - input: - fingerprint_vcf = ReheaderFullGRCh38VCFtoNoAlt.reheadered_vcf - } - call FPUtils.FilterGenotypesVCF { - input: - fingerprint_vcf = ReheaderFullGRCh38VCFtoNoAlt.reheadered_vcf - } - call FPUtils.ExtractGenotypingSites { - input: - fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf - } - call FPUtils.ExtractRelevantGenotypingReads { - input: - aligned_bam = aligned_bam, - aligned_bai = aligned_bai, - genotyping_sites_bed = ExtractGenotypingSites.sites, - } + File fingerprint_summary = select_first([CheckFingerprint.summary_metrics, "None"]) + File fingerprint_details = select_first([CheckFingerprint.detail_metrics, "None"]) - ##### check - call FPUtils.CheckFingerprint { - input: - aligned_bam = ExtractRelevantGenotypingReads.relevant_reads, - aligned_bai = ExtractRelevantGenotypingReads.relevant_reads_bai, - fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf, - vcf_sample_name = GetVCFSampleName.sample_name, - haplotype_map = ref_specific_haplotype_map } - ##### wrapup - Float lod_expected_sample_t = CheckFingerprint.metrics_map['LOD_EXPECTED_SAMPLE'] - - String status = if(lod_expected_sample_t < lod_fail_threshold) then "FAIL" else if (lod_expected_sample_t > lod_pass_threshold) then "PASS" else "BORDERLINE" - - output { - Float lod_expected_sample = lod_expected_sample_t - String FP_status = status - - File fingerprint_summary = CheckFingerprint.summary_metrics - File fingerprint_details = CheckFingerprint.detail_metrics + # 1X coverage ~= 1.5GiB (Revio); 2.3GiB (Sequel); 2.8GiB (ONT) + # below 0.5X, we define the bam as too small to draw a conclusion on, unless we're forced to run the program + Map[String, Int] teeny_bam_def_sz = {"ONT": 1400, + "Sequel": 1650, + "Revio": 750} + + Int bam_sz_mb = ceil(size(aligned_bam, "MiB")) + if (bam_sz_mb >= teeny_bam_def_sz[tech] || force) { + ##### Prep work + call ResolveFPVCFPath {input: fp_vcf_store = fp_vcf_store, fp_sample_id = fp_sample_id} + call ReheaderFullGRCh38VCFtoNoAlt {input: full_GRCh38_vcf = ResolveFPVCFPath.fp_vcf} + # liftover the vcfs + call LiftoverVcfGATK { + input: + input_vcf_file = ReheaderFullGRCh38VCFtoNoAlt.reheadered_vcf, + chain_file = chain_file, + target_reference_sequence_fasta_file = target_reference_sequence_fasta_file, + target_reference_sequence_fasta_file_index = target_reference_sequence_fasta_file_index, + target_reference_sequence_fasta_file_dict = target_reference_sequence_fasta_file_dict, + 
lifted_over_vcf_name = lifted_over_vcf, + lifted_over_rejects_vcf_name = rejectedVcf, + mem = mem, + preemptible_attempts = preemptible_attempts, + disk_space_gb = disk_space_gb, + cpu = cpu, + boot_disk_size_gb = boot_disk_size_gb + } + + call VariantUtils.GetVCFSampleName { + input: + fingerprint_vcf = LiftoverVcfGATK.lifted_over_vcf + } + call FPUtils.FilterGenotypesVCF { + input: + fingerprint_vcf = LiftoverVcfGATK.lifted_over_vcf + } + call FPUtils.ExtractGenotypingSites { + input: + fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf + } + call FPUtils.ExtractRelevantGenotypingReads { + input: + aligned_bam = aligned_bam, + aligned_bai = aligned_bai, + genotyping_sites_bed = ExtractGenotypingSites.sites, + } + + ##### check + call FPUtils.CheckFingerprint { + input: + aligned_bam = ExtractRelevantGenotypingReads.relevant_reads, + aligned_bai = ExtractRelevantGenotypingReads.relevant_reads_bai, + fingerprint_vcf = FilterGenotypesVCF.ready_to_use_vcf, + vcf_sample_name = GetVCFSampleName.sample_name, + haplotype_map = ref_specific_haplotype_map + } + + ##### wrapup + Float lod_expected_sample_t = CheckFingerprint.metrics_map['LOD_EXPECTED_SAMPLE'] + + String status = if(lod_expected_sample_t < lod_fail_threshold) then "FAIL" else if (lod_expected_sample_t > lod_pass_threshold) then "PASS" else "BORDERLINE" } + # FAIL the bam if coverage is below a certain threshold (not much useful data anyway) + Float teeny_bam_lod = 0.0 + String fingerprint_check_status = select_first([status, "FAIL"]) + Float fingerprint_check_LOD = select_first([lod_expected_sample_t, teeny_bam_lod]) } task ResolveFPVCFPath { @@ -94,19 +154,19 @@ task ResolveFPVCFPath { } input { - String fp_store - String sample_id_at_store + String fp_vcf_store + String fp_sample_id RuntimeAttr? runtime_attr_override } - String fp_store_formatted = sub(fp_store, "/$", "") + String fp_vcf_store_formatted = sub(fp_vcf_store, "/$", "") command <<< set -eux # note the addition of the wildcard character * - FP_SEARCH="~{fp_store_formatted}/~{sample_id_at_store}*.fingerprint.liftedover.vcf" - # this will error if no paths match, i.e. no FP file exists with this sample_id_at_store + FP_SEARCH="~{fp_vcf_store_formatted}/~{fp_sample_id}*.fingerprint.liftedover.vcf" + # this will error if no paths match, i.e. no FP file exists with this fp_sample_id FP_PATH=$(gsutil ls "${FP_SEARCH}" | head -n 1) FP_INDEX_PATH="${FP_PATH}.idx" @@ -169,3 +229,72 @@ task ReheaderFullGRCh38VCFtoNoAlt { docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" } } +task LiftoverVcfGATK { + meta { + description: "Use GATK's LiftoverVcf tool to lift over a VCF from one reference build to another." + } + input { + # Input args: + File input_vcf_file + File chain_file + File target_reference_sequence_fasta_file + File target_reference_sequence_fasta_file_index + File target_reference_sequence_fasta_file_dict + + # Output Names: + String lifted_over_vcf_name + String lifted_over_rejects_vcf_name + + # Runtime args: + Int? mem + Int? preemptible_attempts + Int? disk_space_gb + Int? cpu + Int? 
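The net effect of the BAM-size gate and the LOD thresholds above, with illustrative numbers (the _example names are not part of the workflow):

    # a 600 MiB Revio BAM (below the 750 MiB cutoff) with force=false skips the check entirely,
    # so the select_first() fallbacks yield FP_status = "FAIL" and lod_expected_sample = 0.0;
    # a BAM that passes the gate is scored against the default thresholds (pass > 6.0, fail < -3.0):
    Float lod_example = 1.2
    String status_example = if (lod_example < -3.0) then "FAIL" else if (lod_example > 6.0) then "PASS" else "BORDERLINE"   # -> "BORDERLINE"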
boot_disk_size_gb + + } + # Get machine settings: + #Boolean use_ssd = true + Int base_disk_space_gb = 100 + + Int base_boot_disk_size_gb = 15 + + # Timing output file + #String timing_output_file = "liftover_timing.txt" + + command <<< + set -euxo pipefail + + # startTime=`date +%s.%N` + # echo "StartTime: $startTime" > ${timing_output_file} + + time \ + gatk LiftoverVcf \ + -I ~{input_vcf_file} \ + -O ~{lifted_over_vcf_name} \ + -CHAIN ~{chain_file} \ + -REJECT ~{lifted_over_rejects_vcf_name} \ + -R ~{target_reference_sequence_fasta_file} \ + --RECOVER_SWAPPED_REF_ALT true + # endTime=`date +%s.%N` + # echo "EndTime: $endTime" >> ${timing_output_file} + # elapsedTime=`python -c "print( $endTime - $startTime )"` + # echo "Elapsed Time: $elapsedTime" >> ${timing_output_file} + >>> + runtime { + docker: "us.gcr.io/broad-gatk/gatk:4.4.0.0" + memory: select_first([mem, 8]) + " GB" + cpu: select_first([cpu, 1]) + disks: "local-disk " + select_first([disk_space_gb, base_disk_space_gb]) + " SSD" + bootDiskSizeGb: select_first([boot_disk_size_gb, base_boot_disk_size_gb]) + preemptible: select_first([preemptible_attempts, 3]) + } + + # Outputs: + output { + File lifted_over_vcf = "${lifted_over_vcf_name}" + File lifted_over_rejects_vcf = "${lifted_over_rejects_vcf_name}" + # File timing_info = timing_output_file + } +} + diff --git a/wdl/tasks/QC/Fingerprinting.wdl b/wdl/tasks/QC/Fingerprinting.wdl index ce939d301..71faeb6a0 100644 --- a/wdl/tasks/QC/Fingerprinting.wdl +++ b/wdl/tasks/QC/Fingerprinting.wdl @@ -229,10 +229,12 @@ task ExtractRelevantGenotypingReads { export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` samtools view -h -@ 1 \ + -X \ --write-index \ -o "relevant_reads.bam##idx##relevant_reads.bam.bai" \ -M -L ~{genotyping_sites_bed} \ - ~{aligned_bam} + ~{aligned_bam} \ + ~{aligned_bai} >>> output { @@ -244,18 +246,16 @@ task ExtractRelevantGenotypingReads { RuntimeAttr default_attr = object { cpu_cores: 4, mem_gb: 8, - disk_gb: 375, # will use LOCAL SSD for speeding things up - boot_disk_gb: 10, - preemptible_tries: 0, + disk_gb: 50, + preemptible_tries: 1, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) diff --git a/wdl/tasks/QC/Quast.wdl b/wdl/tasks/QC/Quast.wdl index 49e41188b..65ff113db 100644 --- a/wdl/tasks/QC/Quast.wdl +++ b/wdl/tasks/QC/Quast.wdl @@ -97,18 +97,18 @@ task SummarizeQuastReport { sed 's/__\+/\t/g' | \ sed 's/\s\+$//g' | \ sed 's/>=/gt/g' | \ - tee report_map.txt + tee quast_summary.txt - for i in $(seq 2 $(awk '{print NF}' report_map.txt | sort -nu | tail -n 1)) + for i in $(seq 2 $(awk '{print NF}' quast_summary.txt | sort -nu | tail -n 1)) do j=$(( i - 2 )) # to make sure the primary, assuming it's the 0-th fed in to this task 
and the left-most value column - cut -d$'\t' -f1,${i} < report_map.txt > report_map_${j}.txt + cut -d$'\t' -f1,${i} < quast_summary.txt > quast_summary_${j}.txt done >>> output { - File quast_metrics_together = "report_map.txt" - Array[File] quast_metrics = glob("report_map_*.txt") + File quast_metrics_together = "quast_summary.txt" + Array[File] quast_metrics = glob("quast_summary_*.txt") } runtime { diff --git a/wdl/tasks/QC/SexConcordance.wdl b/wdl/tasks/QC/SexConcordance.wdl new file mode 100644 index 000000000..2bd8274ed --- /dev/null +++ b/wdl/tasks/QC/SexConcordance.wdl @@ -0,0 +1,109 @@ +version 1.0 + +task SummarizeCoverages { + input { + File mosdepth_summary_txt + } + + command <<< + set -eux + + cat ~{mosdepth_summary_txt} + + grep -w "chr1" ~{mosdepth_summary_txt} | \ + awk -F '\t' '{print $4}' | \ + xargs printf "%0.2f\n" > cov.chr1.txt + + grep -w "chrX" ~{mosdepth_summary_txt} | \ + awk -F '\t' '{print $4}' | \ + xargs printf "%0.2f\n" > cov.chrX.txt + grep -w "chrY" ~{mosdepth_summary_txt} | \ + awk -F '\t' '{print $4}' | \ + xargs printf "%0.2f\n" > cov.chrY.txt + >>> + + output { + Float cov_chr1 = read_float("cov.chr1.txt") + Float cov_chrX = read_float("cov.chrX.txt") + Float cov_chrY = read_float("cov.chrY.txt") + } + + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task MakeACall { + input { + Float cov_chr1 + Float cov_chrX + Float cov_chrY + + String expected_sex_type + } + + Map[String, Int] sex_codec = {'F':2,'M':1,'NA':0,'na':0} + Int expected_sex_code = sex_codec[expected_sex_type] + + command <<< + set -eux + + if (( $(echo "~{cov_chr1} < 0.01" | bc -l) )); then + scaled_x_dp_mean="na" + scaled_y_dp_mean="na" + extreme_low_cov=1 + else + scaled_x_dp_mean=$(echo "scale=2; 2*~{cov_chrX}/~{cov_chr1}" | bc) + scaled_y_dp_mean=$(echo "scale=2; 2*~{cov_chrY}/~{cov_chr1}" | bc) + extreme_low_cov=0 + fi + + touch my_call.tsv + echo -e "scaled_x_dp_mean\t${scaled_x_dp_mean}" >> my_call.tsv + echo -e "scaled_y_dp_mean\t${scaled_y_dp_mean}" >> my_call.tsv + + nx=$(echo "${scaled_x_dp_mean}" | awk '{print int($1+0.5)}') + ny=$(echo "${scaled_y_dp_mean}" | awk '{print int($1+0.5)}') + export nx + export ny + if [[ ${nx} -eq 0 ]]; then xchar=''; else xchar=$(perl -e "print 'X' x ${nx}"); fi + if [[ ${ny} -eq 0 ]]; then ychar=''; else ychar=$(perl -e "print 'Y' x ${ny}"); fi + if [[ (${extreme_low_cov} -eq 1) || (${nx} -eq 0 && ${ny} -eq 0) ]]; then + sex_type="na" + else + sex_type="${xchar}${ychar}" + fi + echo -e "sex_call\t${sex_type}" >> my_call.tsv + + # unsure if this is correct + if [[ ~{expected_sex_code} -eq 1 ]]; + then + if [[ ${ny} -ge 1 ]]; + then + echo -e "is_sex_concordant\ttrue" >> my_call.tsv + else + echo -e "is_sex_concordant\tfalse" >> my_call.tsv + fi + elif [[ ~{expected_sex_code} -eq 2 ]]; + then + if [[ ${nx} -ge 2 ]]; + then + echo -e "is_sex_concordant\ttrue" >> my_call.tsv; + else + echo -e "is_sex_concordant\tfalse" >> my_call.tsv + fi + else + echo -e "is_sex_concordant\ttrue" >> my_call.tsv + fi + >>> + + output { + Map[String, String] inferred_sex_info = read_map("my_call.tsv") + } + + runtime { + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/somalier:v0.2.15" # need bc for floating point arith. 
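A worked example for the MakeACall task above; the SexConcordance import alias and the coverage values are illustrative:

    call SexConcordance.MakeACall { input:
        cov_chr1 = 30.0, cov_chrX = 15.2, cov_chrY = 14.8,
        expected_sex_type = 'M'
    }
    # scaled_x_dp_mean = 2*15.2/30.0 ~ 1.01 -> rounds to one 'X'
    # scaled_y_dp_mean = 2*14.8/30.0 ~ 0.98 -> rounds to one 'Y'
    # sex_call = "XY"; expected type 'M' requires at least one Y, so is_sex_concordant = true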
+ } +} diff --git a/wdl/tasks/Utility/BAMutils.wdl b/wdl/tasks/Utility/BAMutils.wdl index deedb7a7f..5aee398dd 100644 --- a/wdl/tasks/Utility/BAMutils.wdl +++ b/wdl/tasks/Utility/BAMutils.wdl @@ -1,45 +1,2066 @@ version 1.0 +import "../../structs/Structs.wdl" + +################################################# +# header-only READ operations +################################################# + task GetReadGroupInfo { meta { desciption: - "Get some read group information Given a single-readgroup BAM. Will fail if the information isn't present." + "Get some read group information given a single-readgroup BAM. If the requested keys are absent, a null value is assigned in the returned entry." + warn: + "If the BAM contains multiple read groups, task will fail." } - parameter_meta { - uBAM: "The input BAM file." + bam: { desciption: "The input BAM file.", localization_optional: true } keys: "A list of requested fields in the RG line, e.g. ID, SM, LB." + null_value_representation: "For keys requested that aren't available in the bam's header, this value will be returned." } input { - String uBAM # not using file as call-caching brings not much benefit + File bam Array[String] keys + String null_value_representation = "None" + } + + output { + Map[String, String] read_group_info = read_map("result.tsv") } command <<< set -eux export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) - samtools view -H ~{uBAM} | grep "^@RG" | tr '\t' '\n' > rh_header.txt + samtools view -H ~{bam} | grep "^@RG" > one_rg_per_line.txt + num_rgs=$(wc -l one_rg_per_line.txt | awk '{print $1}') + if [[ ${num_rgs} -gt 1 ]]; then exit 1; fi + + cat one_rg_per_line.txt | tr '\t' '\n' > rh_header.txt for attribute in ~{sep=' ' keys}; do - value=$(grep "^${attribute}" rh_header.txt | awk -F ':' '{print $2}') - echo -e "${attribute}\t${value}" >> "result.txt" + if grep -q "^${attribute}" rh_header.txt; then + value=$(grep "^${attribute}" rh_header.txt | awk -F ':' '{print $2}') + else + value="~{null_value_representation}" + fi + echo -e "${attribute}\t${value}" >> "result.tsv" done >>> + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 10 HDD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task GetReadGroupLines { + meta { + desciption: "Get the @RG lines in a BAM's header. Will error if there's no read group defined in the header." + } + parameter_meta { + bam: {localization_optional: true} + } + + input { + File bam + } + + output { + Array[String] read_group_ids = read_lines("rgids.txt") + Array[String] read_group_lines = read_lines("read_groups.txt") + } + + command <<< + set -eux + + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + samtools view -H ~{bam} | grep "^@RG" > read_groups.txt + + rm -f rgids.txt + while IFS= read -r line + do + echo "${line}" | tr '\t' '\n' \ + | grep "^ID:" | cut -d ':' -f 2- \ + >> rgids.txt + done < read_groups.txt + >>> + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 10 HDD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task GatherBamMetadata { + meta { + description: "Check several metadata of an input BAM (aliged? sort order? 
etc)" + } + parameter_meta { + bam: { localization_optional: true } + } + + input { + File bam + } + + output { + Boolean is_aligned = read_boolean("is_mapped.txt") + + Boolean is_sorted = read_boolean("is_sorted.txt") + String sort_order = read_string("sort_order.txt") + } + + command <<< + set -eux + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} > header.txt + + grep -F "@HD" header.txt | tr '\t' '\n' > hd.line.txt + if grep -q "SO" hd.line.txt; + then + echo "true" > "is_sorted.txt" + grep "SO" hd.line.txt | cut -d ':' -f 2- \ + > "sort_order.txt" + else + echo "false" > "is_sorted.txt" + echo "NA" > "sort_order.txt" + fi + + # we use two conditions: @SQ lines in header, and at least some mapped reads + mapped_bool='' + if grep -q "@SQ" "header.txt"; + then + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + if [[ 1 -le $(samtools view -F 4 ~{bam} | head | wc -l | awk '{print $1}') ]]; + then + mapped_bool='true' + else # @SQ lines defined but seemingly no mapped reads + mapped_bool='unsure' # this will trigger error in WDL later, but it's intentional because we cannot be sure + fi + else + mapped_bool='false' + fi + echo "${mapped_bool}" > "is_mapped.txt" + >>> + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task CountReadGroups { + meta { + desciption: "Count the number of RG lines in the header of the BAM file." + } + parameter_meta { + bam: { localization_optional: true } + } + + input { + File bam + } + + output { + Int num_rg = read_int("rg_cnt.txt") + } + + command <<< + set -eux + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} | grep -c "^@RG" > "rg_cnt.txt" + >>> + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 10 HDD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task InferSampleName { + meta { + description: "Infer sample name encoded on the @RG line of the header section." + warn: "Fails if multiple values found, or if SM is the specified illegal value." + } + parameter_meta { + bam: { localization_optional: true } + } + + input { + File bam + File? bai + String illegal_value = "unnamedsample" + } + output { - Map[String, String] read_group_info = read_map("result.txt") + String sample_name = read_string("sample.names.txt") } + command <<< + set -euxo pipefail + + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} > header.txt + if ! grep -q '^@RG' header.txt; then echo "No read group line found!" && exit 1; fi + + grep '^@RG' header.txt | tr '\t' '\n' | grep '^SM:' | sed 's/SM://g' | sort | uniq > sample.names.txt + if [[ $(wc -l sample.names.txt) -gt 1 ]]; then echo "Multiple sample names found!" && exit 1; fi + if grep -iq "~{illegal_value}" sample.names.txt; then echo "Sample name found to be illegal!" && exit 1; fi + >>> + runtime { cpu: 1 memory: "4 GiB" disks: "local-disk 100 HDD" - bootDiskSizeGb: 10 preemptible: 2 maxRetries: 1 - docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +################################################# +# statistics +################################################# + +task ValidateSamFile { + meta { + desciption: "Call GATK/Picard ValidateSamFile to validate input BAM: https://bit.ly/3JMutxp." 
+ } + parameter_meta { + validation_mode: "Desired valiation mode; see Picard documentation for the supproted values." + disk_type: "Type of disk to use for the computation; SSD for persistent SSD disks, LOCAL for local SSDs." + bam: { + localization_optional : true + } + } + + input { + File bam + String validation_mode = "SUMMARY" + + Array[String] validation_errs_to_ignore = ["INVALID_TAG_NM", # for the purpose we currently have, NM and CIGAR don't matter, and longreads have no mates + "MISSING_TAG_NM", + "INVALID_CIGAR", + "ADJACENT_INDEL_IN_CIGAR", + "CIGAR_MAPS_OFF_REFERENCE", + "MISMATCH_MATE_CIGAR_STRING", + "MATE_CIGAR_STRING_INVALID_PRESENCE", + "MATE_NOT_FOUND", + "INVALID_MAPPING_QUALITY", + "INVALID_FLAG_MATE_UNMAPPED", + "MISMATCH_FLAG_MATE_UNMAPPED", + "INVALID_FLAG_MATE_NEG_STRAND", + "MISMATCH_FLAG_MATE_NEG_STRAND", + "INVALID_MATE_REF_INDEX", + "MISMATCH_MATE_REF_INDEX", + "MISMATCH_MATE_ALIGNMENT_START", + "MATE_FIELD_MISMATCH", + "PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND" + ] + + String disk_type = "SSD" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = ceil(size(bam, "GiB")) + 50 + String output_basename = basename(basename(bam, ".bam"), ".cram") + String output_name = "${output_basename}_${validation_mode}.txt" + + String base = basename(bam, ".bam") + String local_bam = "/cromwell_root/~{base}.bam" + + command <<< + set -eux + + time gcloud storage cp ~{bam} ~{local_bam} + + gatk ValidateSamFile \ + --INPUT ~{local_bam} \ + --OUTPUT ~{output_name} \ + --MODE ~{validation_mode} \ + ~{true="--IGNORE " false="" 0>> + + output { + File validation_report = "${output_name}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-custom-gatk:4.4.0.0-samtools1.18" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SamtoolsFlagStats { + meta { + description: "Collect SAM flag stats of an aligned BAM" + } + parameter_meta { + bam: {localization_optional: true} + output_format: "argument passed on to '-O' of `samtools flagstats` " + } + + input { + File bam + String output_format = "tsv" + String disk_type = "SSD" + RuntimeAttr? 
runtime_attr_override + } + + output { + File flag_stats = "~{output_name}" + } + + String base = basename(bam, ".bam") + + String local_bam = "/cromwell_root/~{base}" + + Map[String, String] reformat_user_input = {'JSON': 'json', 'json': 'json', + "TSV": 'tsv', 'tsv': 'tsv', + 'DEFAULT': 'defalt', 'default': 'defalt'} + String o_f = reformat_user_input[output_format] + Map[String, String] reformat_output_format = {'default': 'txt', 'json': 'json', 'tsv': 'tsv'} + String o_ext = reformat_output_format[o_f] + + String output_name = "~{base}.flag_stats.~{o_ext}" + + command <<< + set -euxo pipefail + + time \ + gcloud storage cp ~{bam} ~{local_bam} + + time \ + samtools flagstat \ + -O ~{o_f} \ + ~{local_bam} \ + > "~{output_name}" + >>> + + ######################### + Int disk_size = 10 + ceil(size(bam, "GiB")) + + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 8, + disk_gb: disk_size, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ParseFlagStatsJson { + meta { + description: "Parse the output from `samtools flatstats -O json`" + } + + parameter_meta { + sam_flag_stats_json: "JSON output from samtools flatstats" + replace_none_with: "value to use to replace 'None' in the json value fields" + } + input { + File sam_flag_stats_json + Float replace_none_with = 0 + } + output { + Map[String, Float] qc_pass_reads_SAM_flag_stats = read_map("qcPass.stats.tsv") + Map[String, Float] qc_fail_reads_SAM_flag_stats = read_map("qcFail.stats.tsv") + } + + command <<< + set -euxo pipefail + + python <>> + + runtime { + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/python:3.9.18-slim-bullseye" + } +} + +task CountMethylCallReads { + meta { + desciption: "Count the numbers of records in the bam with and without the ML & MM tags" + } + + parameter_meta { + bam: {localization_optional: true} + disk_type: "must be one of [HDD, SSD, LOCAL]. SSD is recommended" + + raw_count: "number of records in input BAM" + bean_count: "number of records in input BAM that has both MM & ML tags" + + non_2304_count: "number of records in input BAM that are neither 256 (secondary) nor 2048 (supplementary)" + non_2304_bean_count: "number of records in input BAM that are neither 256 (secondary) nor 2048 (supplementary), and have both MM & ML tags" + } + input { + File bam + File? 
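SamtoolsFlagStats and ParseFlagStatsJson above are designed to be used together; an illustrative chain, with the BAMutils alias and aligned_bam assumed:

    call BAMutils.SamtoolsFlagStats { input: bam = aligned_bam, output_format = "JSON" }
    call BAMutils.ParseFlagStatsJson { input: sam_flag_stats_json = SamtoolsFlagStats.flag_stats }
    # qc_pass_reads_SAM_flag_stats then holds entries keyed by the samtools field names
    # (e.g. "primary mapped"), with any nulls replaced by replace_none_with (0 by default).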
bai + String disk_type + } + output { + Int raw_count = read_int("raw_count.txt") + Int bean_count = read_int("bean_count.txt") + Int non_2304_count = read_int("non_2304_count.txt") + Int non_2304_bean_count = read_int("non_2304_bean_count.txt") + } + + String base = basename(bam, '.bam') + String local_bam = "/cromwell_root/~{base}.bam" + + command <<< + set -euxo pipefail + time \ + gcloud storage cp ~{bam} ~{local_bam} + + samtools view -@1 -c ~{local_bam} > raw_count.txt & + samtools view -@1 -c -F 2304 ~{local_bam} > non_2304_count.txt & + + samtools view -h --tag "MM" ~{local_bam} | samtools view -c --tag "ML" > bean_count.txt & + samtools view -h -F 2304 --tag "MM" ~{local_bam} | samtools view -c --tag "ML" > non_2304_bean_count.txt & + + wait + + tail ./*_count.txt + >>> + + Int local_ssd_sz = if size(bam, "GiB") > 300 then 750 else 375 + Int pd_sz = 50 + ceil(size(bam, "GiB")) + Int disk_size = if "LOCAL" == disk_type then local_ssd_sz else pd_sz + runtime { + cpu: 10 + memory: "20 GiB" + disks: "local-disk ~{disk_size} ~{disk_type}" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task CountAlignmentRecords { + meta { + desciption: + "Count the number of alignment records with a particular SAM flag." + } + + parameter_meta { + aligned_bam: { + localization_optional: true + } + localize_bam: "If false, the BAM is streamed in from the bucket instead of localized, but the operation is subject to network instabilities and may also timeout" + + decimal_flag: "Only include records having the SAM flag; in decimal (as opposed to hexadecimal); when not provided, all records are included." + inverse_flag: "If true, filter away records that has the requested decimal_flag; no effect when decimal_flag is not provided" + } + + input { + File aligned_bam + File aligned_bai + + Boolean localize_bam = false + + Int? decimal_flag + Boolean inverse_flag = false + } + + output { + File stderr_log = "error.log" + Int count = read_int("count.txt") + } + + command <<< + set -eux + + touch "error.log" + + if ~{defined(decimal_flag)}; then + if ~{inverse_flag}; then + filter_op='-F ' + else + filter_op='-f ' + fi + else + filter_op=' ' + fi + if ~{localize_bam}; then + time \ + gcloud storage cp ~{aligned_bam} input_bam.bam + mv ~{aligned_bai} input_bam.bam.bai + + samtools view -c \ + "${filter_op}" ~{select_first([decimal_flag, " "])} \ + input_bam.bam \ + > "count.txt" + else + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -c \ + "${filter_op}" ~{select_first([decimal_flag, " "])} \ + ~{aligned_bam} \ + > "count.txt" \ + 2>"error.log" + fi + >>> + + Int disk_size = 20 + ceil(size(aligned_bam, "GiB")) + String disk_type = if localize_bam then "SSD" else "HDD" + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk ~{disk_size} ~{disk_type}" + preemptible: 1 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task StreamingBamErrored { + meta { + desciption: + "A helper task that reads the error log from task to report if streaming BAM from GCS bucket failed." + } + parameter_meta { + stderr_log: "The stderr log output from task StreamCountAlignmentRecords" + yes: "if true, then the streaming read operation failed and the output of that task is likely corrupted." + } + input { + File stderr_log + } + command <<< + set -eux + if [[ -s ~{stderr_log} ]]; then + echo "Streaming a BAM triggered warnings or errors." 
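Illustrative uses of CountAlignmentRecords above (alias and inputs assumed):

    # records that are neither secondary (256) nor supplementary (2048), i.e. `samtools view -c -F 2304`
    call BAMutils.CountAlignmentRecords as CountNon2304 { input:
        aligned_bam = aligned_bam, aligned_bai = aligned_bai,
        decimal_flag = 2304, inverse_flag = true
    }
    # unmapped records, i.e. `samtools view -c -f 4`
    call BAMutils.CountAlignmentRecords as CountUnmapped { input:
        aligned_bam = aligned_bam, aligned_bai = aligned_bai,
        decimal_flag = 4
    }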
\ + && echo "true" > result.txt \ + && cat ~{stderr_log} \ + && exit 1 + else + echo "false" > result.txt + fi + >>> + output { + Boolean yes = read_boolean("result.txt") + } + + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task CountAlignmentRecordsByFlag { + meta { + desciption: + "Count the number of alignment records with particular SAM flags; the difference to CountAlignmentRecords is this allows specifying multiple SAM flags" + } + + parameter_meta { + names_and_decimal_flags: "The SAM flags, in decimal (as opposed to hexadecimal.), with their (appropriate) names." + + aligned_bam: { + localization_optional: true + } + } + + input { + File aligned_bam + File aligned_bai + + Map[String, Int] names_and_decimal_flags + + Int num_local_ssds + } + + # Int n = length(names_and_decimal_flags) + + String base = basename(aligned_bam, ".bam") + String local_bam = "/cromwell_root/~{base}.bam" + command <<< + set -eux + + two_col_tsv=~{write_map(names_and_decimal_flags)} + cat "${two_col_tsv}" + # x=$(wc -l "${two_col_tsv}" | awk '{print $1}') + # if [[ 3 -ne "${x}" ]]; then ## 3 here is used in place of n to avoid validation error (yes, validation false positive) + # sed -i -e '$a\' "${two_col_tsv}" + # fi + wc -l "${two_col_tsv}" + # because some Cromwell versions' stdlib function write_map() doesn't have new line at end of file, so we add it explicitly + if [[ $(tail -c1 "${two_col_tsv}" | wc -l) -eq 0 ]]; then + sed -i -e '$a\' "${two_col_tsv}" + fi + # ' + wc -l "${two_col_tsv}" + + time \ + gcloud storage cp ~{aligned_bam} ~{local_bam} + mv ~{aligned_bai} ~{local_bam}.bai + + # iterate through each requested SAM flag + while IFS=$'\t' read -r -a line + do + name="${line[0]}" + flag="${line[1]}" + samtools view -c \ + -f "${flag}" \ + ~{local_bam} \ + > "asdfxyz_${name}.txt" & + done < "${two_col_tsv}" + + # overall + samtools view -c ~{local_bam} \ + > "total_cnt.txt" & + # primary (2308=4+256+2048) + samtools view -c \ + -F 2308 \ + ~{local_bam} \ + > "primary_count.txt" & + wait + total_count=$(cat total_cnt.txt) + primary_count=$(cat primary_count.txt) + + # format results + touch result_raw.txt result_pct.txt # have to do this because iterating a Map isn't possible on WDL 1.0 + echo -e "Primary\t${primary_count}" > result_raw.txt + pct=$(echo "100*${primary_count}/${total_count}" | bc | awk '{ printf("%.0f\n",$1) }') + echo -e "Primary\t${pct}" > result_pct.txt + for ff in asdfxyz*txt; + do + name=$(echo "${ff}" | sed 's#asdfxyz_##' | sed 's#.txt$##') + count=$(cat "${ff}") + echo -e "${name}\t${count}" >> result_raw.txt + pct=$(echo "100*${count}/${total_count}" | bc -l | awk '{ printf("%.0f\n",$1) }') + echo -e "${name}\t${pct}" >> result_pct.txt + done + + awk -F '\t' '{print $2}' result_pct.txt | paste -sd+ | bc > debug_total_pct.txt + >>> + + output { + Int total_cnt = read_int("total_cnt.txt") + Map[String, Int] flag_cnts = read_map("result_pct.txt") + Map[String, Float] flag_pcts = read_map("result_pct.txt") + Int summed_percentages = read_int("debug_total_pct.txt") + } + + Int local_ssd_sz = if size(aligned_bam, "GiB") > 300 then 750 else 375 + runtime { + cpu: 8 + memory: "32 GiB" + disks: "local-disk " + local_ssd_sz + " LOCAL" + preemptible: 1 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task GetDuplicateReadnamesInQnameSortedBam { + meta { + desciption: "Get read names from a queryname-sorted bam, where such reads are duplicate records" + } + parameter_meta { + 
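An illustrative flag map for CountAlignmentRecordsByFlag above (alias and inputs assumed):

    call BAMutils.CountAlignmentRecordsByFlag { input:
        aligned_bam = aligned_bam, aligned_bai = aligned_bai,
        names_and_decimal_flags = {"unmapped": 4, "secondary": 256, "duplicate": 1024, "supplementary": 2048},
        num_local_ssds = 1
    }
    # flag_pcts then maps each requested name, plus "Primary", to its rounded percentage of all alignment records.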
qns_bam: { + localization_optional: true + } + } + input { + File qns_bam + } + + output { + File dup_names_txt = "dup_read_names.txt" + Boolean result_may_be_corrupted = read_boolean("samtools.failed.txt") + } + + command <<< + # the way this works is the following: + # 0) relying on the re-auth.sh script to export the credentials + # 1) perform the remote sam-view subsetting in the background + # 2) listen to the PID of the background process, while re-auth every 1200 seconds + source /opt/re-auth.sh + set -euxo pipefail + + # assumption + sort_order=$(samtools view -H ~{qns_bam} | grep "^@HD" | tr '\t' '\n' | grep "^SO:" | awk -F ':' '{print $2}') + if [[ "queryname" != "${sort_order}" ]]; then echo -e "Sort order ${sort_oder} isn't the expected 'queryname'." && exit 1; fi + + # remote grab read names + echo "false" > samtools.failed.txt + samtools view ~{qns_bam} \ + | awk -F '\t' '{print $1}' \ + | uniq -d \ + > "dup_read_names.txt" \ + || { echo "true" > samtools.failed.txt; exit 77; } & + pid=$! + + set +e + count=1 + while true; do + sleep 1200 && date && source /opt/re-auth.sh + if [[ ${count} -gt 2 ]]; then exit 0; fi + if ! pgrep -x -P $pid; then exit 0; fi + count=$(( count+1 )) + done + >>> + + runtime { + cpu: 1 + memory: "4 GiB" + disks: "local-disk 10 HDD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +################################################# +# light transformations (essentially reheader operations) +################################################# + +task ResetSamplename { + meta { + desciption: + "Reset the SM entry in the input bam's readgroup lines." + } + + parameter_meta { + bam: {localization_optional: true} + } + + input { + File bam + File? bai + String sample_name + } + + output { + File reheadered_bam = "~{out_prefix}.bam" + File? reheadered_bai = "~{out_prefix}.bam.bai" + } + + String prefix = basename(bam, ".bam") + String local_bam = "/cromwell_root/~{prefix}.bam" + String out_prefix = "~{prefix}.ResetSamplename" + command <<< + set -euxo pipefail + + time \ + gcloud storage cp ~{bam} ~{local_bam} + if ~{defined(bai)}; then touch ~{bai}; mv ~{bai} ~{local_bam}.bai; fi + + ###### cleanup the header + samtools view --no-PG -H ~{local_bam} > header.txt + grep -v "^@SQ" header.txt + + # fix SM in the RG lines + grep "^@RG" header.txt > rg_lines.txt + if ! 
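Illustrative use of GetDuplicateReadnamesInQnameSortedBam above; the alias and the queryname_sorted_bam input are assumed:

    call BAMutils.GetDuplicateReadnamesInQnameSortedBam { input: qns_bam = queryname_sorted_bam }
    Boolean has_duplicates = length(read_lines(GetDuplicateReadnamesInQnameSortedBam.dup_names_txt)) > 0
    # has_duplicates can gate a call to DeduplicateQuerynameSortedBam, defined further below in this file.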
grep -qF "SM:" rg_lines.txt; then + sed -i "s/$/SM:tbd/" rg_lines.txt + fi + awk -v sm="~{sample_name}" -F '\t' 'BEGIN {OFS="\t"} { for (i=1; i<=NF; ++i) { if ($i ~ "SM:") $i="SM:"sm } print}' \ + rg_lines.txt \ + > fixed_rg_lines.txt + cat fixed_rg_lines.txt + + # paste things back + grep -v "^@RG" header.txt > otherlines.txt + cat otherlines.txt fixed_rg_lines.txt > fixed_header.txt + + ###### samtools reheader + time \ + samtools reheader fixed_header.txt ~{local_bam} \ + > "~{out_prefix}.bam" + if ~{defined(bai)}; then + time \ + samtools index -@1 "~{out_prefix}.bam" + fi + >>> + + Int disk_size = 50 + 2 * ceil(size(bam, "GiB")) + runtime { + cpu: 2 + memory: "8 GiB" + disks: "local-disk ~{disk_size} SSD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +################################################# +# intensive transformations -- filter +################################################# + +task FilterBamByLen { + meta { + desciption: "Filter a BAM by sequence length, and count the yileld if so requested" + warn: "It's assumed that for aligned BAM, alignment was done without hard clipping turned on. If this assumption isn't met, the resulting BAM may be corrupt." + } + + parameter_meta { + len_threshold_inclusive: "Reads longer than or equal to this length will be included." + bam : { localization_optional: true } + } + + input { + File bam + File? bai + Int len_threshold_inclusive + + Boolean compute_yield = false + + String disk_type = "HDD" + RuntimeAttr? runtime_attr_override + } + + output { + File fBAM = "~{out_prefx}.bam" + File? fBAI = "~{out_prefx}.bam.bai" + + Float? total_yield = read_float("all.yield.txt") + Float? filtered_yield = read_float("filtered.yield.txt") + } + + String base = basename(bam, ".bam") + String out_prefx = "~{base}.RL_ge_~{len_threshold_inclusive}" + + Boolean is_aligned = defined(bai) + + String local_bam = "/cromwell_root/~{base}.bam" + + command <<< + set -euxo pipefail + + time \ + gcloud storage cp ~{bam} ~{local_bam} + if ~{defined(bai)}; then mv ~{bai} "~{local_bam}.bai"; touch "~{local_bam}.bai"; fi + + # get total yield in the background + if ~{compute_yield}; then + # simply get the length of the sequence, excluding 2304 reads + samtools view -@1 \ + ~{true='-F2304' false=' ' is_aligned} \ + ~{local_bam} \ + | awk -F '\t' '{print length($10)}' \ + > all.read.lengths.txt & + fi + + if ~{is_aligned} ; then + # note here that 2048 and 256 reads are not longer than the primary record, + # so if the primary is already shorter than the threshold, they should be excluded too + # on the other hand, it is possible the 2048 records are shorter that the actual query when hardclipping is turned on + # hence we requre the input doesn't have hardclipping turned on + samtools view -@1 -h \ + --write-index \ + -e "length(seq)>=~{len_threshold_inclusive}" \ + -o "~{out_prefx}.bam##idx##~{out_prefx}.bam.bai" \ + ~{local_bam} + else + samtools view -@1 -h \ + -e "length(seq)>=~{len_threshold_inclusive}" \ + -o "~{out_prefx}.bam" \ + ~{local_bam} + fi + + if ~{compute_yield}; then + samtools view -@1 \ + ~{true='-F2304' false=' ' is_aligned} \ + "~{out_prefx}.bam" \ + | awk -F '\t' '{print length($10)}' \ + > filtered.read.lengths.txt + + # see https://stackoverflow.com/questions/450799/shell-command-to-sum-integers-one-per-line + awk '{s+=$1} END {printf "%.0f", s}' filtered.read.lengths.txt \ + > filtered.yield.txt + + wait # make sure total yield gather in the background is done + awk '{s+=$1} END 
{printf "%.0f", s}' all.read.lengths.txt \ + > all.yield.txt + fi + >>> + ################### + Int disk_size = 20 + 2 * ceil(size(bam, "GiB")) + + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GatherReadsWithoutMethylCalls { + meta { + desciption: "Collect records in the bam without the ML & MM tags" + } + parameter_meta { + bam: {localization_optional: true} + disk_type: "must be one of [HDD, SSD, LOCAL]. SSD is recommended" + } + + input { + File bam + File? bai + String disk_type + } + + output { + File no_ml_reads = "~{p}.no_ML.bam" + File no_mm_reads = "~{p}.no_MM.bam" + + File names_missing_only_one_tag = "missing_only_one_tag.read_names.txt" + File names_missing_both_tags = "no_mm_and_ml.read_names.txt" + } + + String p = basename(bam, ".bam") + String local_bam = "/cromwell_root/~{p}.bam" + + command <<< + set -euxo pipefail + time \ + gcloud storage cp ~{bam} ~{local_bam} + + export LC_ALL=C # attempt to make grep faster + samtools view -@1 -h ~{local_bam} \ + | grep -vF "ML:B:C" \ + | samtools view -@1 -bh \ + -o "~{p}.no_ML.bam" & + + samtools view -@1 -h ~{local_bam} \ + | grep -vF "MM:Z:" \ + | samtools view -@1 -bh \ + -o "~{p}.no_MM.bam" & + + wait + + ########## + samtools view -@1 "~{p}.no_ML.bam" | awk -F '\t' '{print $1}' | sort > no_ml.txt & + samtools view -@1 "~{p}.no_MM.bam" | awk -F '\t' '{print $1}' | sort > no_mm.txt & + wait + + comm -3 \ + no_ml.txt \ + no_mm.txt \ + > "missing_only_one_tag.read_names.txt" + comm -12 \ + no_ml.txt \ + no_mm.txt \ + > "no_mm_and_ml.read_names.txt" + >>> + + # here, we are a little "brave", given that the task generates some bams + # however, if the output bams are large enough, then the input definitely has issues (too many without MM/ML) + # hence the run will fail and we'll be warned by an OoD (or PAPI 10) + Int local_ssd_sz = if size(bam, "GiB") > 300 then 750 else 375 + Int pd_sz = 50 + ceil(size(bam, "GiB")) + Int disk_size = if "LOCAL" == disk_type then local_ssd_sz else pd_sz + runtime { + cpu: 10 + memory: "20 GiB" + disks: "local-disk ~{disk_size} ~{disk_type}" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task SubsetBamToLocusLocal { + meta { + description: "For subsetting a BAM stored in GCS, expliciting localizing the BAM" + note: "This is intended as last resort when streaming from buckets fails" + } + + parameter_meta { + interval_list_file: "a Picard-style interval list file to subset reads with" + interval_id: "an ID string for representing the intervals in the interval list file" + prefix: "prefix for output bam and bai file names" + bam: {localization_optional: true} + } + + input { + File bam + File bai + + File interval_list_file + String interval_id + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Array[String] intervals = read_lines(interval_list_file) + + Int disk_size = 4*ceil(size([bam, bai], "GB")) + + String subset_prefix = prefix + "." + interval_id + + String local_bam = "/cromwell_root/~{basename(bam)}" + + command <<< + set -euxo pipefail + + time gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} "~{local_bam}.bai" && touch "~{local_bam}.bai" + + # see man page for what '-M' means + samtools view \ + -bhX \ + -M \ + -@ 1 \ + --write-index \ + -o "~{subset_prefix}.bam##idx##~{subset_prefix}.bam.bai" \ + ~{local_bam} "~{local_bam}.bai" \ + ~{sep=" " intervals} + >>> + + output { + File subset_bam = "~{subset_prefix}.bam" + File subset_bai = "~{subset_prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" # expensive, but much faster + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task DeduplicateQuerynameSortedBam { + meta { + desciption: "De-duplicate a queryname sorted bam. The queryname sort can be done either in natural order, or ascii order." + } + parameter_meta { + qnorder_bam: { + desciption: "queryname sorted BAM", + localization_optional: true + } + } + input { + File qnorder_bam + RuntimeAttr? 
runtime_attr_override + } + output { + File dedup_bam = "~{base}.dedup.bam" + File dup_read_names = "duplicated.readnames.txt" + } + + String base = basename(qnorder_bam, ".bam") + String local_bam = "/cromwell_root/~{base}.bam" + + Int disk_size = 3 * ceil(size(qnorder_bam, "GB")) + + command <<< + set -eux + + time gcloud storage cp ~{qnorder_bam} ~{local_bam} + + # if no duplicate at all, why bother + time samtools view ~{local_bam} | awk -F '\t' '{print $1}' | sort | uniq -d > duplicated.readnames.txt + touch duplicated.readnames.txt + cat duplicated.readnames.txt + cnt=$(wc -l duplicated.readnames.txt | awk '{print $1}') + if [[ ${cnt} -eq 0 ]]; then + echo "No duplicates found in the unmapped reads" + mv ~{local_bam} "~{base}.dedup.bam" + else + time \ + python3 /opt/remove_duplicate_ont_namesorted_unaligned.py \ + -p "~{base}.dedup" \ + -q "duplicated.readnames.bypython.txt" \ + ~{local_bam} + + cat "duplicated.readnames.bypython.txt" + + diff <(sort duplicated.readnames.txt) <(sort duplicated.readnames.bypython.txt) + fi + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-bam-dedup:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +################################################# +# intensive transformations -- map +################################################# + +task BamToFastq { + meta { + description : "Convert a long reads BAM file to a fastq file." + warn: "Please do not include 'RG' in tags_to_preserve, as that's automatically saved" + } + + parameter_meta { + bam: {localization_optional: true} + prefix: "Prefix for the output fastq file." + + save_all_tags: + "if true, saves all SAM tags to the FASTQ output; cannot set this true while also specifying tags_to_preserve " + tags_to_preserve: + "custom list of tags to preserve; please do not include 'RG' in tags_to_preserve, as that's automatically preserved" + + disk_type: "type of disk to use" + } + + input { + File bam + String prefix + + Boolean save_all_tags = false + Array[String] tags_to_preserve = [] + + String disk_type = "SSD" + RuntimeAttr? 
runtime_attr_override
+    }
+
+    output {
+        File reads_fq = "~{prefix}.fq.gz"
+    }
+
+    Boolean custom_tags_to_preserve = 0 < length(tags_to_preserve)
+
+    String base = basename(bam)
+    String local_bam = "/cromwell_root/~{base}"
+
+    command <<<
+        set -euxo pipefail
+
+        if ~{save_all_tags} && ~{custom_tags_to_preserve}; then
+            echo "cannot set save_all_tags while also specifying tags_to_preserve" && exit 1
+        fi
+
+        time \
+        gcloud storage cp ~{bam} ~{local_bam}
+
+        # RG is always carried over; NOTE this conversion step is a reconstructed sketch,
+        # and '-T *' (keep all tags) assumes a samtools build that accepts '*' as the tag list
+        tag_list='RG'
+        if ~{custom_tags_to_preserve}; then tag_list="RG,~{sep=',' tags_to_preserve}"; fi
+        if ~{save_all_tags}; then tag_list='*'; fi
+
+        samtools fastq \
+            -@1 \
+            -T "${tag_list}" \
+            ~{local_bam} \
+        | gzip \
+        > ~{prefix}.fq.gz
+    >>>
+
+    #########################
+    Int disk_size = 10 + 3 * ceil(size(bam, "GiB"))
+
+    RuntimeAttr default_attr = object {
+        cpu_cores:          2,
+        mem_gb:             8,
+        disk_gb:            disk_size,
+        preemptible_tries:  2,
+        max_retries:        1,
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
+        memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " ~{disk_type}"
+        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
+        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
+    }
+}
+
+task GetPileup {
+    meta {
+        description:
+        "Get pileup information with `samtools mpileup`. Current cmdline options are '-a -s -q 1 [-E|-B]'"
+        warn:
+        "Do not run this on a large BAM, i.e. pre-filter the BAM before running this task. Also see if task BamToRelevantPileup is what you need."
+    }
+
+    parameter_meta {
+        bam: {localization_optional: true}
+        disable_baq: "User choice to disable BAQ computation or not (see the samtools mpileup doc for detail)"
+    }
+
+    input {
+        File bam
+        File bai
+        Boolean disable_baq
+        String prefix
+        File ref_fasta
+    }
+
+    output {
+        File pileup = "~{prefix}.mpileup"
+    }
+
+    String baq_option = if disable_baq then '-B' else '-E'
+
+    String base = basename(bam)
+    String local_bam = "/cromwell_root/~{base}"
+
+    command <<<
+        set -euxo pipefail
+
+        time gcloud storage cp ~{bam} ~{local_bam}
+
+        samtools mpileup \
+            ~{baq_option} \
+            -a \
+            -s \
+            -q 1 \
+            -f ~{ref_fasta} \
+            -o ~{prefix}.mpileup \
+            ~{local_bam}
+    >>>
+
+    runtime {
+        cpu: 1
+        memory: "4 GiB"
+        disks: "local-disk 100 HDD"
+        preemptible: 2
+        maxRetries: 1
+        docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
+    }
+}
+
+task BamToRelevantPileup {
+    meta {
+        description:
+        "Chop up a GRCh38 BAM by chromosome and further subset to the requested genotyping sites; then convert to pileup format. See also task GetPileup."
+        note:
+        "This task may fail due to some strange samtools failures reading from NAS storage (including cloud PDs that aren't local SSDs). So we included inputs and outputs to guard against that."
+    }
+
+    parameter_meta {
+        bam: {localization_optional: true}
+        bed: "sites where pileup is needed"
+
+        max_retries: "Because of the strange samtools failures reading from NAS storage, we should make multiple attempts to get past the transient errors. If after the max retries we still get those failures, this task will fail."
+
+        pileup_stderr: "stderr output from the samtools mpileup commands; they should ALL be 1-liners."
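+        # for reference, a healthy per-chromosome *.mpileup.err typically holds just the usual one-line
+        # banner, e.g. "[mpileup] 1 samples in 1 input files"; anything much larger is what the du-based
+        # size check in the command below treats as a failed conversion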
+ } + input { + File bam + File bai + File bed + File ref_fasta + Boolean disable_baq + + String disk_type = "SSD" + Int max_retries = 1 + } + output { + File pileups = "pileup.mpileup" + Array[File] pileup_stderr = glob("*.mpileup.err") + } + + String baq_option = if disable_baq then '-B' else '-E' + + String base = basename(bam) + String local_bam = "/cromwell_root/~{base}" + + command <<< + set -euxo pipefail + + time \ + gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} "~{local_bam}.bai" + + # generate bed for parallel conversion + set +e + for i in `seq 1 22`; + do + grep -w "chr${i}" ~{bed} > "chr${i}.bed"; + done + grep -w "chrX" ~{bed} > "chrX.bed" + grep -w "chrY" ~{bed} > "chrY.bed" + set -e + rm ~{bed} + + # parallel conversion + cnt=0 + for bed in $(ls chr*.bed | sort -V); do + + if [[ ! -s ${bed} ]] ; then rm "${bed}" && continue; fi + + bash /opt/convert.2.pileup.sh \ + ~{local_bam} ~{ref_fasta} ~{baq_option} \ + ${bed} \ + & + + cnt=$((cnt + 1)) + if [[ $cnt -eq ~{cores} ]]; then wait; cnt=0; fi + done + wait + ls -lh + + # here we use a trick, that if any of the stderr file is large, the conversion must have failed + mpileup_stderr_sz=$(du -c *.mpileup.err | tail -n1 | awk '{print $1}') + if [[ "${mpileup_stderr_sz}" -gt 1024 ]]; then + du -c *.mpileup.err + echo "some chromosome failed to be converted to pileup" + exit 1 + fi + + rm -f chr*bam chr*bai + cat *.mpileup > pileup.mpileup + >>> + + Int cores = 12 + Int memory = 4 + cores + Int local_ssd_sz = if size(bam, "GiB") > 150 then 750 else 375 + Int pd_sz = 20 + 2 * ceil(size(bam, "GiB")) + Int disk_size = if "LOCAL" == disk_type then local_ssd_sz else pd_sz + + runtime { + cpu: "~{cores}" + memory: "~{memory} GiB" + disks: "local-disk ~{disk_size} ~{disk_type}" + preemptible: 1 + maxRetries: max_retries + docker: "us.gcr.io/broad-dsp-lrma/lr-bam-pileup:0.1.3" + } +} + +task SamtoolsReset { + meta { + description: "Use samtools reset to drop alignment information from the input bam." + } + + parameter_meta { + bam: { + desciption: "aligned BAM to operate on", + localization_optional: true + } + addtional_tags_to_drop: "tags in each alignment record to be dropped; usually these are tags produced by the mapper/aligner that generated the original alignment" + } + input { + File bam + # these are known mm2 tags and pbmm2 tags + Array[String] addtional_tags_to_drop = ['cg', 'cm', 'cs', + 'de', 'dv', + 'ms', + 'nn', + 'rl', + 's1', 's2', + 'tp', 'ts', + 'mc', 'mg', 'mi', 'rm'] + + Int? num_ssds + RuntimeAttr? 
runtime_attr_override + } + + output { + File res = "~{prefix}.unaligned.bam" + File original_sam_flag_stats = "orignal.SAM-flag.stats.txt" + } + + Array[String] std_tags_to_drop = ['MD', 'NM', 'AS', 'SA', 'XA'] + Array[String] tags_to_drop = flatten([std_tags_to_drop, addtional_tags_to_drop]) + + String prefix = basename(bam, ".bam") + + Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+10*ceil(size([bam], "GB")) + String disk_type = if defined(num_ssds) then " LOCAL" else " SSD" + + String base = basename(bam, ".bam") + String local_bam = "/cromwell_root/~{base}.bam" + + command <<< + set -eux + + time gcloud storage cp ~{bam} ~{local_bam} + + samtools view -@1 ~{local_bam} | grep -v "^@" | awk -F '\t' '{print $2}' | sort | uniq -c > orignal.SAM-flag.stats.txt & + + samtools reset -@3 \ + --remove-tag ~{sep=',' tags_to_drop} \ + -o ~{prefix}.unaligned.bam \ + ~{local_bam} + wait + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " " + disk_type + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task QuerynameSortBamWithSamtools { + meta { + description: "queryname-sort a BAM with samtools. WARNING: see https://github.com/samtools/samtools/issues/1500 if you should use samtools" + } + + parameter_meta { + bam: "input BAM" + qnsort_bam: "output BAM sorted by query name" + num_ssds: "Number of local SSDs to use; if not provided, will use SSD persistent disks" + + multi_record_queries: "File holding names of queries that has multiple records in the output" + } + input { + File bam + Int? num_ssds + RuntimeAttr? 
runtime_attr_override + } + + output { + File qnsort_bam = "~{prefix}.qname-sorted.bam" + File multi_record_queries = "multi_record_queries.txt" + } + + String prefix = basename(bam, ".bam") + + Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+4*ceil(size([bam], "GB")) + String disk_type = if defined(num_ssds) then " LOCAL" else " SSD" + + command <<< + + echo "don't use me yet; see if your version of samtools has this ticket resolved https://github.com/samtools/samtools/issues/1500"; exit 1 + + set -eux + + samtools view -H ~{bam} | grep "^@HD" | tr '\t' '\n' > hd.line.txt + if grep -q 'SO:queryname' hd.line.txt; + then + echo "already sorted" + mv ~{bam} "~{prefix}.qname-sorted.bam" + exit 0 + fi + samtools sort -@3 -m2G \ + -N \ + -O BAM \ + ~{bam} \ + > "~{prefix}.qname-sorted.bam" + + touch multi_record_queries.txt + samtools view "~{prefix}.qname-sorted.bam" | awk '{print $1}' | uniq -d > multi_record_queries.txt + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + disk_type + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task QuerynameSortBamWithPicard { + meta { + desciption: "See https://github.com/samtools/samtools/issues/1500 why we aren't using samtools. Note that this task is disk-space hungry." + } + + parameter_meta { + bam: "input BAM" + qnsort_bam: "output BAM sorted by query name" + num_ssds: "Number of local SSDs to use; if not provided, will use SSD persistent disks (instead of local SSDs)" + } + input { + File bam + Int? num_ssds + RuntimeAttr? 
runtime_attr_override + } + + output { + File qnsort_bam = "~{outbam}" + } + + String outbam = basename(bam, ".bam") + "picard-queryname-sorted.bam" + + String disk_type = if defined(num_ssds) then " LOCAL" else " SSD" + + Float N = size(bam, "GiB") + Int scaleup_factor = if (N > 100) then 6 else 4 + Int persistend_disk_size = 20 + ceil(scaleup_factor * N) + + Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else persistend_disk_size + + command <<< + set -eux + + # higher memory, also lower # of reads in memory given ~100 longer reads (1.5E4 bp vs 1.5E2 bp) + gatk SortSam \ + --java-options "-Xmx28G -Xms24G" \ + -use_jdk_deflater -use_jdk_inflater \ + --MAX_RECORDS_IN_RAM 5000 \ + -I ~{bam} \ + -O ~{outbam} \ + -SO queryname + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 6, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.4.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + disk_type + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +################################################# +# intensive transformations -- split +################################################# + +task SplitNameSortedUbam { + meta { + desciption: "Split a read-name-sorted unaligned BAM into chunks." + } + parameter_meta { + read_cnt: "number of reads in the uBAM; providing this will reduce run time." + n_reads: "desired number of reads per split; mutually exclusive with n_files" + n_files: "desired number of split files; mutually exclusive with n_reads" + uBAM: { localization_optional: true } + } + input { + File uBAM + Int? read_cnt + Int? n_reads + Int? n_files + + RuntimeAttr? 
runtime_attr_override + } + output { + Array[File] split = glob("split_outputs/*.bam") + } + + Boolean fail = defined(n_reads) == defined(n_files) # mutually exclusive + + Int X = select_first([n_reads, n_files]) + String split_arg = if defined(n_reads) then "--SPLIT_TO_N_READS ~{X}" else "--SPLIT_TO_N_FILES ~{X}" + String helper_arg = if (defined(read_cnt)) then "--TOTAL_READS_IN_INPUT ~{read_cnt}" else " " + + String base = basename(uBAM, ".bam") + String local_bam = "/cromwell_root/~{base}.bam" + + command <<< + set -eux + + if ~{fail}; then echo "one and only one of [n_reads, n_files] must be specified" && exit 1; fi + + # prep + time gcloud storage cp ~{uBAM} ~{local_bam} + mkdir -p split_outputs + + # higher memory, also lower # of reads in memory given ~100 longer reads (1.5E4 bp vs 1.5E2 bp) + gatk SplitSamByNumberOfReads \ + --java-options "-Xmx28G -Xms24G" \ + -use_jdk_deflater -use_jdk_inflater \ + --MAX_RECORDS_IN_RAM 5000 \ + -I ~{local_bam} \ + -O split_outputs \ + ~{split_arg} \ + ~{helper_arg} + >>> + ######################### + Int disk_size = 20 + ceil(3 * size(uBAM, "GiB")) + + RuntimeAttr default_attr = object { + cpu_cores: 6, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.4.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SplitByRG { + meta { + description: "Split a BAM file that was aggregated, for the same sample, into pieces by read group." + } + input { + File bam + + String out_prefix + + Int? num_ssds + + Boolean retain_rgless_records = false + Boolean sort_and_index = false + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + bam: "BAM to be split" + out_prefix: "prefix for output bam and bai file names" + sort_and_index: "if the user wants to (pos-)sort and index the resulting BAMs; this indicates the input BAM is mapped" + + split_bam: "the resuling BAMs, each having reads only in a single read group" + split_bai: "the accompanying BAIs, if possible and explicit requested" + } + + Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else 1+3*ceil(size([bam], "GB")) + + Array[String] extra_args = if (retain_rgless_records) then ["-u", "~{out_prefix}_noRG.bam"] else [""] + command <<< + set -eux + + samtools view -H ~{bam} | grep "^@RG" > "read_groups_header.txt" + cat "read_groups_header.txt" | tr '\t' '\n' | grep "^ID:" | awk -F ':' '{print $2}' > "RG_ids.txt" + + samtools split -@3 \ + -f "~{out_prefix}_%#.bam" \ + ~{sep=" " extra_args} \ + ~{bam} + if ~{sort_and_index} ; + then + # cleanup space for the sorting + rm ~{bam} + for split_bam in "~{out_prefix}_"*.bam; + do + mv "${split_bam}" temp.bam + samtools sort \ + --write-index \ + -o "${split_bam}##idx##${split_bam}.bai" \ + temp.bam + done + fi + >>> + + output { + File read_group_header = "read_groups_header.txt" + Array[String] rg_ids = read_lines("RG_ids.txt") + Array[File] split_bam = glob("*.bam") + Array[File?] split_bai = glob("*.bai") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ShardAlignedBam { + meta { + desciption: "Split an WGS BAM based on a provided scatter scheme." + } + parameter_meta { + aligned_bam: { + localization_optional: true, + description: "input BAM file (must be coordinate sorted)." + } + aligned_bai: "input BAM index file" + + scatter_scheme: "A txt file holding how to scatter the WGS bam. Example (this example size-balance among the shards): ...\nchr5,chr19\nchr6,chrY,chrM\n..." + + parallel_subset_jobs: "an optimization; increasing this will lead to renting more powerfull VMs from GCP, though with shorter wall-clock time." + } + input { + File aligned_bam + File? aligned_bai + File scatter_scheme + + Int parallel_subset_jobs = 7 # empirical + + RuntimeAttr? 
runtime_attr_override + } + output { + File unmapped_reads = "~{base}.unmapped-reads.bam" + + Array[File] split_bams = glob("~{base}.shard-*.bam") + } + + Int disk_size = 3 * ceil(size(aligned_bam, "GB")) + + String base = basename(aligned_bam, ".bam") + + String local_bam = "/cromwell_root/~{base}.bam" + String local_bai = "/cromwell_root/~{base}.bam.bai" + + Int vm_cores = parallel_subset_jobs * 2 + 2 + Int vm_memory = vm_cores * 4 + + command <<< + set -eux + + # here we use an optimization, that is, in stead of relying on the slow Cromwell localization, + # we explicity localize the bam in the with gcloud storage cp + time gcloud storage cp ~{aligned_bam} ~{local_bam} + + echo "===========================================================" + echo "verify input bam is sorted by coordinate" + samtools view -H ~{local_bam} | grep "@HD" > hd.line + if ! grep -qF "SO:coordinate" hd.line; + then + echo "BAM must be coordinate sorted!" && echo && cat hd.line && exit 1 + fi + echo "===========================================================" + echo "index if bai not provided" + if ~{defined(aligned_bai)}; then + mv ~{aligned_bai} ~{local_bai} + else + time samtools index -@3 "~{local_bam}" + fi + echo "===========================================================" + echo "######################################" + echo "handle unmapped reads, if any, here" + samtools view -@3\ + -f4 \ + -o "~{base}.unmapped-reads.bam" \ + "~{local_bam}" & + echo "######################################" + echo "first pad the provided sharding scheme with the uncovered contigs in the bam header" + samtools view -H ~{local_bam} | grep "^@SQ" | awk -F '\t' '{print $2}' | awk -F ':' '{print $2}' > contigs.in.header.txt + comm -13 \ + <(tr ',' '\n' < ~{scatter_scheme} | sort) \ + <(sort contigs.in.header.txt) \ + | tr '\n' ',' \ + > uncovered.scatter_scheme.txt + cat uncovered.scatter_scheme.txt + cat uncovered.scatter_scheme.txt >> ~{scatter_scheme} + cat ~{scatter_scheme} + echo "######################################" + echo "now split according to the sharding scheme provided" + job_cnt=0 # assume few unmapped reads, so don't count that + idx=1 + while IFS= read -r one_shard + do + XX=$(echo "${one_shard}" | tr ',' ' ') + read -ra YY <<< "$XX" + samtools view -@1 \ + -o "~{base}.shard-${idx}.bam" \ + "~{local_bam}" \ + "${YY[@]}" & + idx=$(( idx + 1 )) + job_cnt=$(( job_cnt + 1 )) + # let's not shoot ourselves + if [[ ${job_cnt} == ~{parallel_subset_jobs} ]]; then wait; job_cnt=0; fi + done < ~{scatter_scheme} + wait + echo "===========================================================" + echo "DONE!" 
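+        # for example, a scheme line "chr5,chr19" becomes one "<base>.shard-<idx>.bam" holding only
+        # chr5 and chr19 records; contigs missing from the scheme were appended above as one extra
+        # catch-all line, so every contig in the header lands in some shard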
+ ls + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: vm_cores, + mem_gb: vm_memory, + disk_gb: disk_size, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +################################################# +# intensive transformations -- merge +################################################# + +task MergeBamsWithSamtools { + meta { + description : "Merge several input BAMs into a single BAM." + warn: "assumes input BAMs are coordinate sorted" + } + + parameter_meta { + bams: {localization_optional: true} + out_prefix: "result file will be named .bam" + } + + input { + Array[File] bams + String out_prefix = "out" + + String disk_type = "LOCAL" + + RuntimeAttr? runtime_attr_override + } + + output { + File merged_bam = "~{out_prefix}.bam" + File merged_bai = "~{out_prefix}.bam.bai" + } + + command <<< + set -euxo pipefail + + mkdir -p bams_dir + time \ + gcloud storage cp ~{sep=' ' bams} /cromwell_root/bams_dir/ + ls bams_dir + + cd bams_dir && ls ./*.bam > bams.list + time \ + samtools merge \ + -p -c --no-PG \ + -@ 3 \ + --write-index \ + -o "~{out_prefix}.bam##idx##~{out_prefix}.bam.bai" \ + -b bams.list + mv ~{out_prefix}.bam \ + ~{out_prefix}.bam.bai \ + /cromwell_root + >>> + ######################### + Int local_ssd_sz = if size(bams, "GiB") > 150 then 750 else 375 + Int pd_sz = 10 + 3*ceil(size(bams, "GiB")) + Int disk_size = if "LOCAL" == disk_type then local_ssd_sz else pd_sz + + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 8, + disk_gb: disk_size, + preemptible_tries: if "LOCAL" == disk_type then 1 else 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeBamsQuerynameSortedWithPicard { + meta { + desciption: "Merge list of bams that were queryname sorted with Picard" + } + parameter_meta { + qns_bams: { + desciption: "queryname-sorted, preferrably by Picard, bams to be merged", + localization_optional: true + } + base_names: "basenames of all files, INCLUDING the '.bam' extention." + out_prefix: "result file will be named .bam" + num_ssds: "if provided, will use LOCAL SSDs for faster speed at higher cost" + } + input { + Array[File] qns_bams + Array[String] base_names + String out_prefix + + Int? num_ssds + RuntimeAttr? 
runtime_attr_override + } + output { + File res = "~{out_prefix}.bam" + } + + Float N = ceil(size(qns_bams, "GB")) + Int scaleup_factor = if (N > 100) then 6 else 4 + Int persistend_disk_size = 20 + ceil(scaleup_factor * N) + + Int disk_size = if defined(num_ssds) then 375*select_first([num_ssds]) else persistend_disk_size + String disk_type = if defined(num_ssds) then " LOCAL" else " SSD" + + command <<< + set -eux + + mkdir -p bams_dir + gcloud storage cp ~{sep=' ' qns_bams} /cromwell_root/bams_dir/ + ls bams_dir + + # higher memory, also lower # of reads in memory given ~100 longer reads (1.5E4 bp vs 1.5E2 bp) + cd bams_dir + gatk MergeSamFiles \ + --java-options "-Xmx28G -Xms24G" \ + --USE_THREADING \ + -use_jdk_deflater -use_jdk_inflater \ + --SORT_ORDER queryname \ + -I ~{sep=" -I " base_names} \ + -O "/cromwell_root/~{out_prefix}.bam" + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 6, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-custom-gatk:4.4.0.0-samtools1.18" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + disk_type + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) } } diff --git a/wdl/tasks/Utility/FastqUtils.wdl b/wdl/tasks/Utility/FastqUtils.wdl new file mode 100644 index 000000000..818fea8f5 --- /dev/null +++ b/wdl/tasks/Utility/FastqUtils.wdl @@ -0,0 +1,152 @@ +version 1.0 + +import "../../structs/Structs.wdl" + + +task Stats { + meta { + desription: + "seqkit stats command" + } + parameter_meta { + fastq: "file to collect stats on" + seq_type: "argument to the --seq-type paramter" + } + + input { + File fastq + String seq_type + RuntimeAttr? runtime_attr_override + } + + output { + Map[String, Float] res = read_map("2.col.map.tsv") + } + + Int disk_size = 10 + ceil(size(fastq, "GB")) + + command <<< + set -eux + + seqkit stats \ + -aT \ + -t ~{seq_type} \ + -o 2_line.tsv \ + ~{fastq} + + datamash transpose \ + < 2_line.tsv \ + | grep -vw "^file" \ + | grep -vw "^format" \ + | grep -vw "^type" \ + > 2.col.map.tsv + >>> + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 4, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-seqkit:2.4.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FilterByLength { + meta { + desciption: "Filter FASTQ by a length threshold (>=)." + } + parameter_meta { + threshold: "Sequences shorter than this will be dropped." 
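+        # e.g. with threshold = 1000, an input named "sample.fq.gz" yields "sample.length-filter-ge-1000.fq.gz"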
+ res: "result file" + } + input { + File fq + Int threshold + } + output { + File res = "~{prefix}.length-filter-ge-~{threshold}.fq.gz" + } + + String prefix = basename(basename(basename(fq, ".gz"), ".fastq"), ".fq") + Int disk_space = 10 + 2 * ceil(size(fq, "GiB")) + command <<< + set -eux + + seqkit seq -m ~{threshold} ~{fq} \ + | gzip \ + > "~{prefix}.length-filter-ge-~{threshold}.fq.gz" + >>> + runtime { + cpu: 2 + memory: "8 GiB" + disks: "local-disk ~{disk_space} SSD" + preemptible: 2 + maxRetries: 1 + docker: "us.gcr.io/broad-dsp-lrma/lr-seqkit:2.4.0" + } +} + +task FilterByLenSeqTk { + meta { + desciption: + "Alternative implementation to FilterByLength, here using seqtk" + } + parameter_meta { + exclude_len_threshold: "Sequeces shorter than this will be dropped from analysis." + } + + input { + File fastq + Int exclude_len_threshold + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 + 2*ceil(size(fastq, "GB")) + + String base = basename(basename(fastq, ".fastq.gz"), ".fq.gz") + String out_prefx = "~{base}.RL_gt_~{exclude_len_threshold}" + + command <<< + set -eux + + seqtk seq \ + -L ~{exclude_len_threshold} \ + ~{fastq} \ + | gzip \ + > "~{out_prefx}.fastq.gz" + >>> + + output { + File res = "~{out_prefx}.fastq.gz" + } + + ################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-seqtk:1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/Utility/Finalize.wdl b/wdl/tasks/Utility/Finalize.wdl index a95199bde..dfd8956e0 100644 --- a/wdl/tasks/Utility/Finalize.wdl +++ b/wdl/tasks/Utility/Finalize.wdl @@ -28,15 +28,13 @@ task FinalizeToFile { RuntimeAttr? 
runtime_attr_override } - - String gcs_output_dir = sub(outdir, "/+$", "") String gcs_output_file = gcs_output_dir + "/" + select_first([name, basename(file)]) command <<< set -euxo pipefail - gsutil -m cp "~{file}" "~{gcs_output_file}" + gcloud storage cp "~{file}" "~{gcs_output_file}" >>> output { @@ -48,17 +46,15 @@ task FinalizeToFile { cpu_cores: 1, mem_gb: 1, disk_gb: 10, - boot_disk_gb: 10, preemptible_tries: 2, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) @@ -76,12 +72,15 @@ task FinalizeToDir { description: "files to finalize", localization_optional: true } - keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." + file_names: "custom names for files; must be the same length as files if provided" outdir: "directory to which files should be uploaded" + + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." } input { Array[File] files + Array[String]? file_names String outdir File? keyfile @@ -91,10 +90,34 @@ task FinalizeToDir { String gcs_output_dir = sub(outdir, "/+$", "") + Boolean fail = if(defined(file_names)) then length(select_first([file_names])) != length(files) else false + # this variable is defined because of meta-programing: + # Cromwell generates the script to be executed at runtime (duing the run of the workflow), + # but also at "compile time" when looked from the individual task perspective--the task is "compiled" right before it is run. + # so optional variables, if not specified, cannot be used in the command section because at that "compile time", they are undefined + # here we employ a hack: + # if the optional input file_names isn't provided, it's not used anyway, so we don't worry about the literal correctness of + # the variable's values--the variable used in generating the script--but only care that it is defined. + Array[String] names_for_cromwell = select_first([file_names, ["correctness_doesnot_matter_here"]]) command <<< set -euxo pipefail - cat ~{write_lines(files)} | gsutil -m cp -I "~{gcs_output_dir}" + if ~{fail}; then echo "input files and file_names don't have the same length!" 
&& exit 1; fi + + if ~{defined(file_names)}; then + paste \ + ~{write_lines(files)} \ + ~{write_lines(names_for_cromwell)} \ + > file_and_customname.tsv + while IFS=$'\t' read -r ff nn; do + gcloud storage cp \ + "${ff}" \ + "~{gcs_output_dir}"/"${nn}" + done < file_and_customname.tsv + else + cat ~{write_lines(files)} | \ + gcloud storage cp -I "~{gcs_output_dir}" + fi >>> output { @@ -104,19 +127,17 @@ task FinalizeToDir { ######################### RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 1, + mem_gb: 4, disk_gb: 10, - boot_disk_gb: 10, preemptible_tries: 2, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) @@ -267,11 +288,13 @@ task WriteNamedFile { task CompressAndFinalize { meta { - description: "Gzip a file and finalize" + description: "(Block-)Gzip a file and finalize" } parameter_meta { - file : "File to compress and finalize." + file : {desciption: "File to compress and finalize.", localization_optional: true} + block_gzip: "if true, will block-gzip the file (preferrable for certain genomics files)" + outdir : "Google cloud path to the destination folder." name : "[optional] Name of the file to write. If not specified, the name of the input file will be used." runtime_attr_override : "[optional] Additional runtime parameters." @@ -282,20 +305,28 @@ task CompressAndFinalize { String outdir String? name + Boolean block_gzip = false + String disk_type = "SSD" RuntimeAttr? 
runtime_attr_override } String base = basename(file) String out = sub(select_first([name, base]), ".gz$", "") + ".gz" + # THIS IS ABSOLUTELY CRITICAL: DON'T CHANGE TYPE TO FILE, AS CROMWELL WILL TRY TO LOCALIZE THIS NON-EXISTENT FILE String gcs_output_file = sub(outdir, "/+$", "") + "/" + out - Int disk_size = 2 * ceil(size(file, "GB")) - command <<< set -euxo pipefail - gzip -vkc ~{file} > "~{base}.gz" + time \ + gcloud storage cp ~{file} localized + + if ~{block_gzip}; then + bgzip -kc -t ~{cores} localized > "~{base}.gz" + else + pigz -vkc localized > "~{base}.gz" + fi gsutil cp "~{base}.gz" "~{gcs_output_file}" >>> @@ -304,21 +335,21 @@ task CompressAndFinalize { } ######################### + Int disk_size = 2 * ceil(size(file, "GiB")) + Int cores = if (disk_size>4) then 4 else 1 RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, + cpu_cores: cores, + mem_gb: 2*cores, disk_gb: disk_size, - boot_disk_gb: 10, preemptible_tries: 2, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) @@ -331,57 +362,129 @@ task FinalizeAndCompress { } parameter_meta { - files : "Files to compress and finalize." + files : {desciption: "Files to compress and finalize.", localization_optional: true} outdir : "Google cloud path to the destination folder." - prefix : "[optional] Prefix to add to the output files." + folder : "new folder under 'outdir' to hold the compressed files." runtime_attr_override : "[optional] Additional runtime parameters." } input { Array[File] files - String outdir + Boolean block_gzip = false - String prefix + String outdir + String? folder + String disk_type = "HDD" RuntimeAttr? 
runtime_attr_override } - String gcs_output_file = sub(outdir, "/+$", "") + "/" + prefix + "/" - - Int disk_size = 5 * ceil(size(files, "GB")) + String gcs_output_dir = sub(outdir, "/+$", "") + "/" + if (defined(folder)) then "~{folder}/" else "" command <<< set -euxo pipefail - for ff in ~{sep=' ' files}; - do - base="$(basename -- ${ff})" - mv "${ff}" "${base}" && gzip -vk "${base}" - done + mkdir -p copy + time \ + gcloud storage cp ~{sep=' ' files} copy/ - gsutil -m cp /cromwell_root/*.gz "~{gcs_output_file}" + cd copy + for ff in *; do + if ~{block_gzip}; then bgzip -k "${ff}" ; else pigz -vk "${ff}"; fi + done + ls && + cd - && + gcloud storage rsync \ + copy/ \ + "~{gcs_output_dir}" >>> + output { + String gcs_path = gcs_output_dir + } + + ######################### + Int disk_size = 2 * ceil(size(files, "GiB")) + + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task TarGZFilesAndSave { + meta { + desciption: + "" + } + + parameter_meta { + files: {localization_optional: true} + name: "" + } + + input { + Array[File]+ files + String name + String outdir + + String disk_type = "SSD" + RuntimeAttr? 
runtime_attr_override
+    }
+
+    output { String gcs_path = gcs_output_file }
+    String gcs_output_dir = sub(outdir, "/+$", "")
+    String local_out = "~{name}"
+    String gcs_output_file = gcs_output_dir + "/~{local_out}"
+
+    command <<<
+        set -euxo pipefail
+
+        mkdir -p local
+        time \
+        gcloud storage cp ~{sep=' ' files} \
+            local/
+
+        tar cf - local/* | \
+        pigz \
+        > "~{local_out}"
+
+        time \
+        gcloud storage cp "~{local_out}" "~{gcs_output_file}"
+    >>>
+    #########################
+    Int disk_size = 2 * ceil(size(files, "GiB"))
+
+    RuntimeAttr default_attr = object {
+        cpu_cores:          2,
+        mem_gb:             4,
+        disk_gb:            disk_size,
+        preemptible_tries:  2,
+        max_retries:        2,
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu:                    select_first([runtime_attr.cpu_cores,         default_attr.cpu_cores])
+        memory:                 select_first([runtime_attr.mem_gb,            default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " +  select_first([runtime_attr.disk_gb,           default_attr.disk_gb]) + " ~{disk_type}"
+        preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries:             select_first([runtime_attr.max_retries,       default_attr.max_retries])
+        docker:                 select_first([runtime_attr.docker,            default_attr.docker])
+    }
+}
diff --git a/wdl/tasks/Utility/GeneralUtils.wdl b/wdl/tasks/Utility/GeneralUtils.wdl
index ad1aa936e..f35a98f55 100644
--- a/wdl/tasks/Utility/GeneralUtils.wdl
+++ b/wdl/tasks/Utility/GeneralUtils.wdl
@@ -53,3 +53,346 @@ task GetTodayDate {
         docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1"
     }
 }
+
+task CollapseArrayOfStrings {
+    meta {
+        description: "For collapsing an array of strings using a single-character joiner."
+        note: "When the next version (> 1.0) of WDL is supported on Terra, use the official solution."
+    }
+    input {
+        Array[String] input_array
+        String joiner
+    }
+    output {
+        String collapsed = read_string("result.txt")
+    }
+
+    command <<<
+        set -euxo pipefail
+
+        # printf (unlike echo) adds no trailing newline, so a 1-character joiner counts as exactly 1
+        n=$(printf '%s' "~{joiner}" | wc -c | awk '{print $1}')
+        if [[ ! "${n}" -eq 1 ]]; then echo "cannot collapse with multi-char joiner" && exit 1; fi
+
+        tr '\n' "~{joiner}" < ~{write_lines(input_array)} \
+            > result.txt
+    >>>
+    runtime {
+        disks: "local-disk 10 HDD"
+        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
+    }
+}
+
+task CoerceMapToArrayOfPairs {
+    meta {
+        description: "To coerce a WDL Map into Array[Pair], since Cromwell doesn't support it. Mostly used for iterating a map."
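+        # typical use in a calling workflow (hypothetical alias and map name, shown only as illustration):
+        #   call GU.CoerceMapToArrayOfPairs as kvs { input: input_map = my_map }
+        #   scatter (kv in kvs.output_pairs) { ... kv.left ... kv.right ... }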
+ } + + input { + Map[String, String] input_map + } + + command <<< + set -eux + + two_col_tsv=~{write_map(input_map)} + cat "${two_col_tsv}" + wc -l "${two_col_tsv}" + # because some Cromwell versions' stdlib function write_map() doesn't have new line at end of file, so we add it explicitly + if [[ $(tail -c1 "${two_col_tsv}" | wc -l) -eq 0 ]]; then + sed -i -e '$a\' "${two_col_tsv}" + fi + # ' + wc -l "${two_col_tsv}" + awk -F '\t' '{print $1}' ${two_col_tsv} > keys.txt + awk -F '\t' '{print $2}' ${two_col_tsv} > values.txt + >>> + + output { + Array[Pair[String, String]] output_pairs = zip(read_lines("keys.txt"), read_lines("values.txt")) + } + + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task CoerceArrayOfPairsToMap { + meta { + description: + "To coerce an Array of Pair's to a Map; use only when you're sure the 'key' array is unique." + } + input { + Array[String] keys + Array[String] values + } + + command <<< + set -eux + + paste ~{write_lines(keys)} ~{write_lines(values)} > "result.tsv" + >>> + + output { + Map[String, String] output_map = read_map("result.tsv") + } + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task MergeMaps { + meta { + desciption: + "For merging two maps into one." + note: + "User is responsible for ensuring uniqueness of keys." + } + input { + Map[String, String] one + Map[String, String] two + } + output { + Map[String, String] merged = read_map("merged.tsv") + } + + command <<< + set -euxo pipefail + + ####### one + two_col_tsv=~{write_map(one)} + cat "${two_col_tsv}" + wc -l "${two_col_tsv}" + # because some Cromwell versions' stdlib function write_map() doesn't have new line at end of file, so we add it explicitly + if [[ $(tail -c1 "${two_col_tsv}" | wc -l) -eq 0 ]]; then + sed -i -e '$a\' "${two_col_tsv}" + fi + # ' + wc -l "${two_col_tsv}" + mv "${two_col_tsv}" "one.tsv" + ####### two + two_col_tsv=~{write_map(two)} + cat "${two_col_tsv}" + wc -l "${two_col_tsv}" + # because some Cromwell versions' stdlib function write_map() doesn't have new line at end of file, so we add it explicitly + if [[ $(tail -c1 "${two_col_tsv}" | wc -l) -eq 0 ]]; then + sed -i -e '$a\' "${two_col_tsv}" + fi + # ' + wc -l "${two_col_tsv}" + mv "${two_col_tsv}" "two.tsv" + ####### merge + cat "one.tsv" "two.tsv" > "merged.tsv" + cat "merged.tsv" + >>> + + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task MapToTsv { + meta { + desciption: + "For fixing an issue of some Cromwell servers where write_map misses the last line" + } + input { + Map[String, String] m + } + output { + File tsv = "two_col.tsv" + } + command <<< + two_col_tsv=~{write_map(m)} + cat "${two_col_tsv}" + wc -l "${two_col_tsv}" + # because some Cromwell versions' stdlib function write_map() doesn't have new line at end of file, so we add it explicitly + if [[ $(tail -c1 "${two_col_tsv}" | wc -l) -eq 0 ]]; then + sed -i -e '$a\' "${two_col_tsv}" + fi + # ' + wc -l "${two_col_tsv}" + mv "${two_col_tsv}" "two_col.tsv" + >>> + runtime { + disks: "local-disk 10 HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +task ConcatenateFiles { + meta { + desciption: + "For concatinating files" + } + input { + Array[File]+ af + String out_name + } + output { + File merged = "~{out_name}" + } + command <<< + set -euxo pipefail + + cat ~{sep=' ' af} > "~{out_name}" + >>> + runtime { + disks: "local-disk 10 
HDD" + docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest" + } +} + +struct HelperStructForUnzip { + Array[Pair[String, String]] contents +} +task Unzip { + meta { + desciption: + "unzip an array of pairs to a pair of two arrays of string" + note: + "this task also serves as how to do jq operations for in-task unzip" + } + input { + Array[Pair[String, String]] apss + } + output { + Pair[Array[String], Array[String]] res = (read_lines("left.txt"), read_lines("right.txt")) + } + + HelperStructForUnzip x = object {contents: apss} + command <<< + set -euxo pipefail + + mv ~{write_json(x)} tmp.json + + jq --raw-output '.contents[] | .left' tmp.json > left.txt + jq --raw-output '.contents[] | .right' tmp.json > right.txt + + cat left.txt + cat right.txt + >>> + + runtime { + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/jq:1.7.1" + } +} + +task SendEmailNotification { + meta { + desciption: + "Send email, e.g. when certain events happen." + warn: + "This may exhaust your SendGrid API quota, use caution or pay up." + } + parameter_meta { + sendgrid_api_key_file: "A JSON file holding the account key (guard it carefully)" + sender_name: "Name of the sender of the email" + sender_email: "Email address registered on SendGrid used to send email with." + + receiver_names_and_addresses: "intended receivers (don't spam)" + + email_subject: "The subject/title/topic of the email" + email_body: "The plain-text contents of the email" + + html: "a single HTML document to include in the email" + + txt_attachment_names_and_files: "Names and Files of .txt files to attach to the email." + tsv_attachment_names_and_files: "Names and Files of .tsv files to attach to the email." + pdf_attachment_names_and_files: "Names and Files of .pdf files to attach to the email." + } + input { + File sendgrid_api_key_file + String sender_name + String sender_email + + Array[Pair[String, String]] receiver_names_and_addresses + + String email_subject + String email_body + + File? html + + Array[Pair[String, File]]? txt_attachment_names_and_files + Array[Pair[String, File]]? tsv_attachment_names_and_files + Array[Pair[String, File]]? 
pdf_attachment_names_and_files + } + + HelperStructForUnzip receivers = object {contents: receiver_names_and_addresses} + + Boolean has_txt_attach = defined(txt_attachment_names_and_files) + HelperStructForUnzip txt_attach_obj = object {contents: select_first([txt_attachment_names_and_files, [('null', 'null')]])} + + Boolean has_tsv_attach = defined(tsv_attachment_names_and_files) + HelperStructForUnzip tsv_attach_obj = object {contents: select_first([tsv_attachment_names_and_files, [('null', 'null')]])} + + Boolean has_pdf_attach = defined(pdf_attachment_names_and_files) + HelperStructForUnzip pdf_attach_obj = object {contents: select_first([pdf_attachment_names_and_files, [('null', 'null')]])} + + command <<< + set -euxo pipefail + + ########################################################## + # some boiler-plate stuff to just do arg massaging + ########################################################## + ## receivers + mv ~{write_json(receivers)} tmp.json + + jq --raw-output '.contents[] | .left' tmp.json > receiver_names.txt + jq --raw-output '.contents[] | .right' tmp.json > receiver_emails.txt + rm tmp.json + ## txt + if ~{has_txt_attach}; then + mv ~{write_json(txt_attach_obj)} tmp.json + bash /opt/localize_files.sh \ + tmp.json \ + $(pwd) \ + txt.attach.tsv + rm tmp.json + fi + ## tsv + if ~{has_tsv_attach}; then + mv ~{write_json(tsv_attach_obj)} tmp.json + bash /opt/localize_files.sh \ + tmp.json \ + $(pwd) \ + tsv.attach.tsv + rm tmp.json + fi + ## pdf + if ~{has_pdf_attach}; then + mv ~{write_json(pdf_attach_obj)} tmp.json + bash /opt/localize_files.sh \ + tmp.json \ + $(pwd) \ + pdf.attach.tsv + rm tmp.json + fi + + ########################################################## + # kick off + ########################################################## + python3 /opt/send_email.py \ + --sendgrid_api_key ~{sendgrid_api_key_file} \ + --sender_name ~{sender_name} \ + --sender_email ~{sender_email} \ + --notification_receiver_names receiver_names.txt \ + --notification_receiver_emails receiver_emails.txt \ + --email_subject "~{email_subject}" \ + --email_body "~{email_body}" \ + ~{true='--txt_names_and_files' false=' ' has_txt_attach} \ + ~{true='txt.attach.tsv' false=' ' has_txt_attach} \ + ~{true='--tsv_names_and_files' false=' ' has_tsv_attach} \ + ~{true='tsv.attach.tsv' false=' ' has_tsv_attach} \ + ~{true='--pdf_names_and_files' false=' ' has_pdf_attach} \ + ~{true='pdf.attach.tsv' false=' ' has_pdf_attach} + + >>> + runtime { + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-dsp-lrma/lr-wdl-email:0.0.1" + } +} diff --git a/wdl/tasks/Utility/ONTUtils.wdl b/wdl/tasks/Utility/ONTUtils.wdl index a94672eaf..5613baa28 100644 --- a/wdl/tasks/Utility/ONTUtils.wdl +++ b/wdl/tasks/Utility/ONTUtils.wdl @@ -229,3 +229,95 @@ task PartitionManifest { } } +task DeduplicateBam { + + meta { + description: "Utility to drop (occationally happening) literal duplicate records in input BAM" + } + + parameter_meta { + aligned_bam: "input BAM file (must be coordinate sorted)" + aligned_bai: "input BAM index file" + same_name_as_input: "if true, output BAM will have the same name as input BAM, otherwise it will have the input basename with .dedup suffix" + runtime_attr_override: "override default runtime attributes" + } + + input { + File aligned_bam + File aligned_bai + + Boolean same_name_as_input = true + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 3 * ceil(size(aligned_bam, "GB")) + + String base = basename(aligned_bam, ".bam") + String prefix = if (same_name_as_input) then base else (base + ".dedup") + + command <<< + set -eux + + samtools view -H "~{aligned_bam}" | grep "@HD" > hd.line + if ! grep -qF "SO:coordinate" hd.line; + then + echo "BAM must be coordinate sorted!" && echo && cat hd.line && exit 1 + fi + echo "===========================================================" + echo "collecting duplicate information" + time \ + samtools view -@ 1 "~{aligned_bam}" | \ + awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | \ + sort | uniq -d \ + > "~{aligned_bam}".duplicates.txt + + cnt=$(wc -l "~{aligned_bam}".duplicates.txt | awk '{print $1}') + if [[ ${cnt} -eq 0 ]]; + then + echo "No duplicates found" + if ! ~{same_name_as_input} ; + then + mv "~{aligned_bam}" "~{prefix}.bam" + mv "~{aligned_bai}" "~{prefix}.bam.bai" + fi + exit 0 + fi + echo "===========================================================" + echo "de-duplicating" + time python3 /opt/remove_duplicate_ont_aln.py \ + "~{aligned_bam}" \ + --prefix "~{prefix}" \ + --annotations "~{aligned_bam}".duplicates.txt + echo "===========================================================" + echo "DONE" + samtools index "~{prefix}.bam" + >>> + + output { + File corrected_bam = "~{prefix}.bam" + File corrected_bai = "~{prefix}.bam.bai" + File duplicate_record_signatures = "~{prefix}.duplicate.signatures.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-bam-dedup:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/Utility/PBUtils.wdl b/wdl/tasks/Utility/PBUtils.wdl index 4a3354f1b..884c690c6 100644 --- a/wdl/tasks/Utility/PBUtils.wdl +++ b/wdl/tasks/Utility/PBUtils.wdl @@ -990,7 +990,7 @@ task Align { boot_disk_gb: 10, preemptible_tries: 3, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + docker: "us.gcr.io/broad-dsp-lrma/lr-smrttools:12.0.0.176214" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -1007,11 +1007,14 @@ task Align { task PBIndex { meta { - description: "Index a BAM file." + description: "Index a PacBio long reads BAM file to create the pbi." } parameter_meta { - bam: "Input BAM file." + bam: { + desciption: "Input BAM file.", + localization_optional: true + } runtime_attr_override: "Override default runtime attributes." } @@ -1021,35 +1024,39 @@ task PBIndex { RuntimeAttr? 
runtime_attr_override } - Int disk_size = 1 + 2*ceil(size(bam, "GB")) + String base = basename(bam) command <<< set -euxo pipefail - mv ~{bam} ~{basename(bam)} + time \ + gcloud storage cp ~{bam} ~{base} - pbindex ~{basename(bam)} + pbindex ~{base} >>> output { - File pbi = "~{basename(bam)}.pbi" + File pbi = "~{base}.pbi" } ######################### + + Int disk_size = 10 + ceil(size(bam, "GB")) + RuntimeAttr default_attr = object { - cpu_cores: 1, + cpu_cores: 2, mem_gb: 4, disk_gb: disk_size, boot_disk_gb: 10, - preemptible_tries: 0, - max_retries: 0, - docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.29" + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-smrttools:12.0.0.176214" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) diff --git a/wdl/tasks/Utility/ReadLengths.wdl b/wdl/tasks/Utility/ReadLengths.wdl new file mode 100644 index 000000000..c087b7239 --- /dev/null +++ b/wdl/tasks/Utility/ReadLengths.wdl @@ -0,0 +1,238 @@ +version 1.0 + +# utility file for tasks focusing on collecting read length-related metrics + +task GetLengthsFromBam { + meta { + description: + "Get read length of reads in a bam (secondary and supplementary alignments are excluded if the BAM is aligned)" + } + parameter_meta { + bam: {localization_optional: true} + } + input { + File bam + } + + Int disk_sz = 10 + ceil(size(bam, "GiB")) + + String base = basename(bam) + String local_bam = "/cromwell_root/~{base}" + + command <<< + set -euxo pipefail + + time gcloud storage cp ~{bam} ~{local_bam} + samtools view -@1 \ + -F 256 \ + -F 2048 \ + ~{local_bam} \ + | awk -F '\t' '{print length($10)}' \ + > "lengths.txt" + >>> + + output { + File read_lengths = "lengths.txt" + } + + runtime { + cpu: 2 + memory: "8 GiB" + disks: "local-disk ~{disk_sz} HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.1" + } +} + +task GetLengthsFromFastq { + meta { + description: + "Get read length of reads in a FASTQ(.gz) file" + } + parameter_meta { + fastq: {localization_optional: true} + } + input { + File fastq + } + + Int disk_size = 10 + ceil(size(fastq, "GiB")) + + String base = basename(fastq) + String local_fq = "/cromwell_root/~{base}" + + command <<< + set -euxo pipefail + + time gcloud storage cp ~{fastq} ~{local_fq} + + if [[ "~{fastq}" =~ \.gz$ ]];then + zcat ~{local_fq} | awk '{if(NR%4==2) print length}' > "lengths.txt" + else + awk '{if(NR%4==2) print length}' ~{local_fq} > "lengths.txt" + fi + >>> + + output { + File read_lengths = "lengths.txt" + } + + runtime { + cpu: 2 + memory: "4 GiB" + disks: "local-disk ~{disk_size} HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" + } +} + +task GetNumReadsAndShorts { + meta { + desciption: + "Get number of reads and those that are too short. Also compress." 
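    # Hedged note (assumed reconstruction, not verbatim from the patch): the shorts count in the
    # command below is presumably a filter-then-count over the single length column, roughly
    #   awk -v thesh=~{short_threshold} '{if ($1<thesh) print}' ~{read_lengths_txt} | wc -l > "shorts.txt"
    # so that shorts.txt holds the number of reads shorter than short_threshold.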
+ } + input { + File read_lengths_txt + Int short_threshold + } + + String prefix = basename(read_lengths_txt, ".txt") + command <<< + set -eux + + wc -l ~{read_lengths_txt} | awk '{print $1}' > "total.txt" + + awk -v thesh=~{short_threshold} \ + '{if ($1 "shorts.txt" + + mv ~{read_lengths_txt} ~{prefix}.txt + bzip2 -v9 ~{prefix}.txt + >>> + + output { + Float num_seqs = read_float("total.txt") + Float num_shorts = read_float("shorts.txt") + File rl_bz2 = "~{prefix}.txt.bz2" + } + runtime {cpu: 1 memory: "4 GiB" disks: "local-disk 100 HDD" docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"} +} + +task Dyst { + meta { + desciption: "Using a program called dyst to display a txt version of read length histogram" + } + input { + File read_lengths_txt + } + + command <<< + set -eux + + mv ~{read_lengths_txt} "read_lengths.txt" + dyst -h + dyst -n -b 100 -i "read_lengths.txt" \ + > "read_lengths.hist.txt" + cat "read_lengths.hist.txt" + >>> + + output { + File histogram = "read_lengths.hist.txt" + } + + runtime { + cpu: 4 + memory: "20 GiB" + disks: "local-disk 100 HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:0.0.2" + } +} + +task Peaker { + meta { + desciption: "Heuristically detect peaks of the txt histogram generated by dyst." + } + input { + File dyst_histogram + } + + command <<< + set -eux + + grep -v "^#" ~{dyst_histogram} | awk -F ':' '{print $1}' \ + > "prepped_dyst_plain.hist" + + python3 /opt/find_peaks.py \ + -i "prepped_dyst_plain.hist" \ + -o "peaks.txt" + >>> + + output { + Array[Int] peaks = read_lines("peaks.txt") + } + + runtime { + disks: "local-disk 100 HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:0.0.1" + } +} + +task ReverseYield { + meta { + desciption: "Given a read length array, calculate at which lengths does one get [10%, 20%, ..., 90%] of reads, i.e. deciles." + } + input { + File read_lengths_txt + } + + command <<< + python3 /opt/reverse_yield.py \ + -i ~{read_lengths_txt} \ + -o "reverse_yield.txt" + >>> + + output { + Array[Int] reverse_yield = read_lines("reverse_yield.txt") + } + + runtime { + disks: "local-disk 10 HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:0.0.2" + } +} + +task Skewness { + meta { + desciption: "Measure skewness of the readlength distribution" + } + input { + File read_lengths_txt + } + + command <<< + python3 /opt/measure_g1_skew.py \ + -i ~{read_lengths_txt} \ + -o "skew.txt" + >>> + + output { + Float skew = read_float("skew.txt") + } + + runtime { + disks: "local-disk 10 HDD" + preemptible_tries: 3 + max_retries: 2 + docker: "us.gcr.io/broad-dsp-lrma/lr-dyst-peaker:0.0.2" + } +} diff --git a/wdl/tasks/Utility/Utils.wdl b/wdl/tasks/Utility/Utils.wdl index 754eda331..fd2ae2af0 100644 --- a/wdl/tasks/Utility/Utils.wdl +++ b/wdl/tasks/Utility/Utils.wdl @@ -1323,6 +1323,7 @@ task MergeBams { meta { description : "Merge several input BAMs into a single BAM." + deprecated: true } parameter_meta { @@ -1510,7 +1511,7 @@ task SubsetBam { bai: "index for bam file" locus: "genomic locus to select" prefix: "prefix for output bam and bai file names" - runtime_attr_override: "Override the default runtime attributes." + is_samtools_failed: "if true, the streaming of BAM from the bucket didn't succeed, so consider the result BAM corrupted." } input { @@ -1522,8 +1523,6 @@ task SubsetBam { RuntimeAttr? 
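    # Hedged note (descriptive, not from the patch itself): the command below streams the BAM
    # straight from the bucket, so a failure is surfaced two ways - a non-zero exit (77) for
    # Cromwell, and a samtools.failed.txt sentinel flipped to "true" and read back as the
    # Boolean is_samtools_failed output, letting callers treat the subset BAM as suspect.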
runtime_attr_override } - - Int disk_size = 4*ceil(size([bam, bai], "GB")) command <<< @@ -1531,13 +1530,24 @@ task SubsetBam { export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) - samtools view -bhX ~{bam} ~{bai} ~{locus} > ~{prefix}.bam - samtools index ~{prefix}.bam + echo "false" > "samtools.failed.txt" + + samtools view \ + -bhX \ + -M \ + -@ 1 \ + --verbosity=8 \ + --write-index \ + -o "~{prefix}.bam##idx##~{prefix}.bam.bai" \ + ~{bam} ~{bai} \ + ~{locus} \ + || { echo "samtools seem to have failed"; echo "true" > "samtools.failed.txt"; exit 77; } >>> output { File subset_bam = "~{prefix}.bam" File subset_bai = "~{prefix}.bam.bai" + Boolean is_samtools_failed = read_boolean("samtools.failed.txt") } ######################### @@ -1548,13 +1558,13 @@ task SubsetBam { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.9" + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) @@ -1697,6 +1707,7 @@ task ResilientSubsetBam { interval_list_file: "a Picard-style interval list file to subset reads with" interval_id: "an ID string for representing the intervals in the interval list file" prefix: "prefix for output bam and bai file names" + is_samtools_failed: "if true, the streaming of BAM from the bucket didn't succeed, so consider the result BAM corrupted." } input { @@ -1725,6 +1736,8 @@ task ResilientSubsetBam { source /opt/re-auth.sh set -euxo pipefail + echo "false" > "samtools.failed.txt" + # see man page for what '-M' means samtools view \ -bhX \ @@ -1734,7 +1747,7 @@ task ResilientSubsetBam { --write-index \ -o "~{subset_prefix}.bam##idx##~{subset_prefix}.bam.bai" \ ~{bam} ~{bai} \ - ~{sep=" " intervals} && exit 0 || { echo "samtools seem to have failed"; exit 77; } & + ~{sep=" " intervals} && exit 0 || { echo "samtools seem to have failed"; echo "true" > "samtools.failed.txt"; exit 77; } & pid=$! set +e @@ -1742,12 +1755,13 @@ task ResilientSubsetBam { while true; do sleep 1200 && date && source /opt/re-auth.sh count=$(( count+1 )) - if [[ ${count} -gt 6 ]]; then exit 0; fi + if [[ ${count} -gt 6 ]]; then echo "true" > "samtools.failed.txt" && exit 0; fi # way too many attempts, get out if ! 
pgrep -x -P $pid; then exit 0; fi done >>> output { + Boolean is_samtools_failed = read_boolean("samtools.failed.txt") File subset_bam = "~{subset_prefix}.bam" File subset_bai = "~{subset_prefix}.bam.bai" } @@ -1766,7 +1780,7 @@ task ResilientSubsetBam { runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) @@ -1947,78 +1961,6 @@ task FilterBamOnTag { } } -task DeduplicateBam { - - meta { - description: "Utility to drop (occationally happening) duplicate records in input BAM" - } - - parameter_meta { - aligned_bam: "input BAM file" - aligned_bai: "input BAM index file" - same_name_as_input: "if true, output BAM will have the same name as input BAM, otherwise it will have the input basename with .dedup suffix" - runtime_attr_override: "override default runtime attributes" - } - - input { - File aligned_bam - File aligned_bai - - Boolean same_name_as_input = true - - RuntimeAttr? runtime_attr_override - } - - Int disk_size = 3 * ceil(size(aligned_bam, "GB")) - - String base = basename(aligned_bam, ".bam") - String prefix = if (same_name_as_input) then base else (base + ".dedup") - - command <<< - echo "===========================================================" - echo "collecting duplicate information" - time \ - samtools view -@ 1 "~{aligned_bam}" | \ - awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5}' | \ - sort | uniq -d \ - > "~{aligned_bam}".duplicates.txt - echo "===========================================================" - echo "de-duplicating" - time python3 /opt/remove_duplicate_ont_aln.py \ - --prefix "~{prefix}" \ - --annotations "~{aligned_bam}".duplicates.txt \ - "~{aligned_bam}" - echo "===========================================================" - echo "DONE" - samtools index "~{prefix}.bam" - >>> - - output { - File corrected_bam = "~{prefix}.bam" - File corrected_bai = "~{prefix}.bam.bai" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: 4, - mem_gb: 16, - disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 0, - max_retries: 1, - docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.10" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} task Cat { diff --git a/wdl/tasks/Utility/VariantUtils.wdl b/wdl/tasks/Utility/VariantUtils.wdl index eaecfbaf9..c360a6680 100644 --- a/wdl/tasks/Utility/VariantUtils.wdl +++ b/wdl/tasks/Utility/VariantUtils.wdl @@ -33,11 +33,13 @@ task MergePerChrCalls { 
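    # Hedged note (assumption about intent): the "##contig=" print in the hunk below presumably
    # emits full VCF header entries built from the dict's @SQ SN:/LN: fields, e.g.
    #   grep '^@SQ' ~{ref_dict} | awk '{ print "##contig=<ID="$2",length="$3">" }' | sed 's/[SL]N://g'
    # yielding lines such as ##contig=<ID=chr1,length=248956422> in header.txt.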
GREPCMD="zgrep" fi - $GREPCMD '^#' $VCF_WITH_HEADER | grep -v -e '^##contig' -e CHROM > header - grep '^@SQ' ~{ref_dict} | awk '{ print "##contig=" }' | sed 's/[SL]N://g' >> header - $GREPCMD -m1 CHROM $VCF_WITH_HEADER >> header + $GREPCMD '^#' $VCF_WITH_HEADER | grep -v -e '^##contig' -e CHROM > header.txt + grep '^@SQ' ~{ref_dict} | awk '{ print "##contig=" }' | sed 's/[SL]N://g' >> header.txt + $GREPCMD -m1 CHROM $VCF_WITH_HEADER >> header.txt - ((cat header) && ($GREPCMD -h -v '^#' ~{sep=' ' vcfs})) | bcftools sort | bgzip > ~{prefix}.vcf.gz + cat header.txt <($GREPCMD -h -v '^#' ~{sep=' ' vcfs}) | bcftools sort | bgzip > ~{prefix}.vcf.gz + echo $? + ls tabix -p vcf ~{prefix}.vcf.gz >>> diff --git a/wdl/tasks/VariantCalling/CCSPepper.wdl b/wdl/tasks/VariantCalling/CCSPepper.wdl deleted file mode 100644 index b619fa11d..000000000 --- a/wdl/tasks/VariantCalling/CCSPepper.wdl +++ /dev/null @@ -1,340 +0,0 @@ -version 1.0 - -####################################################### -# This pipeline calls small variants using DeepVariant. -####################################################### - -import "../../structs/Structs.wdl" - - -workflow CCSPepper { - - meta { - description: "Workflow for getting haplotagged BAM, VCF and gVCF from DV-pepper. Note VCF is un-phased." - } - - parameter_meta { - bam: "Input BAM file" - bai: "Input BAM index file" - ref_fasta: "Reference fasta file" - ref_fasta_fai: "Reference fasta index file" - pepper_threads: "Number of threads for Pepper" - pepper_memory: "Memory for Pepper" - dv_threads: "Number of threads for DeepVariant" - dv_memory: "Memory for DeepVariant" - # when running large scale workflows, we sometimes see errors like the following - # A resource limit has delayed the operation: generic::resource_exhausted: allocating: selecting resources: selecting region and zone: - # no available zones: 2763 LOCAL_SSD_TOTAL_GB (738/30000 available) usage too high - zones: "select which zone (GCP) to run this task" - } - - input { - File bam - File bai - - File ref_fasta - File ref_fasta_fai - - Int pepper_threads - Int pepper_memory - - Int dv_threads - Int dv_memory - - String zones = "us-central1-b us-central1-c" - } - - call Pepper as get_hap_tagged_bam { - input: - bam = bam, - bai = bai, - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - threads = pepper_threads, - memory = pepper_memory, - zones = zones - } - - call DV as deep_variant { - input: - bam = get_hap_tagged_bam.hap_tagged_bam, - bai = get_hap_tagged_bam.hap_tagged_bai, - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - threads = dv_threads, - memory = dv_memory, - zones = zones - } - - output { - File VCF = deep_variant.VCF - File VCF_tbi = deep_variant.VCF_tbi - - File gVCF = deep_variant.gVCF - File gVCF_tbi = deep_variant.gVCF_tbi - - File hap_tagged_bam = get_hap_tagged_bam.hap_tagged_bam - File hap_tagged_bai = get_hap_tagged_bam.hap_tagged_bai - } -} - -task Pepper { - input { - File bam - File bai - - File ref_fasta - File ref_fasta_fai - - Int threads - Int memory - String zones - - RuntimeAttr? 
runtime_attr_override - } - - Int bam_sz = ceil(size(bam, "GB")) - Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 - - String output_root = "/cromwell_root/pepper_output" - - String prefix = basename(bam, ".bam") + ".pepper" - - command <<< - # avoid the infamous pipefail 141 https://stackoverflow.com/questions/19120263 - set -eux - SM=$(samtools view -H ~{bam} | grep -m1 '^@RG' | sed 's/\t/\n/g' | grep '^SM:' | sed 's/SM://g') - - set -euxo pipefail - - touch ~{bai} - num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) - - mkdir -p "~{output_root}" - - # no gVCF as it Pepper simply doesn't produce gVCF on CCS data - run_pepper_margin_deepvariant \ - call_variant \ - -b ~{bam} \ - -f ~{ref_fasta} \ - -t "${num_core}" \ - -s "${SM}" \ - -o "~{output_root}" \ - -p "~{prefix}" \ - --phased_output \ - --ccs - - find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ - > "~{output_root}/dir_structure.txt" - - if [[ -f "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" ]]; then - mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" \ - "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" - mv "~{output_root}/intermediate_files/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" \ - "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" - fi - >>> - - output { - File hap_tagged_bam = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam" - File hap_tagged_bai = "~{output_root}/MARGIN_PHASED.PEPPER_SNP_MARGIN.haplotagged.bam.bai" - - # maybe less useful - File output_dir_structure = "~{output_root}/dir_structure.txt" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: threads, - mem_gb: memory, - disk_gb: disk_size, - boot_disk_gb: 100, - preemptible_tries: 1, - max_retries: 1, - docker: "kishwars/pepper_deepvariant:r0.4.1" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - zones: zones - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task DV { - - input { - File bam - File bai - - File ref_fasta - File ref_fasta_fai - - Int threads - Int memory - String zones - - RuntimeAttr? 
runtime_attr_override - } - - String prefix = basename(bam, ".bam") + ".deepvariant" - String output_root = "/cromwell_root/dv_output" - - Int bam_sz = ceil(size(bam, "GB")) - Boolean is_big_bam = bam_sz > 100 - Int inflation_factor = if (is_big_bam) then 10 else 5 - Int minimal_disk = 1000 - Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk - - command <<< - set -euxo pipefail - - num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) - - mkdir -p "~{output_root}" - - export MONITOR_MOUNT_POINT="/cromwell_root/" - bash vm_local_monitoring_script.sh &> resources.log & - job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') - - /opt/deepvariant/bin/run_deepvariant \ - --model_type=PACBIO \ - --ref=~{ref_fasta} \ - --reads=~{bam} \ - --output_vcf="~{output_root}/~{prefix}.vcf.gz" \ - --output_gvcf="~{output_root}/~{prefix}.g.vcf.gz" \ - --num_shards="${num_core}" \ - --use_hp_information || cat resources.log - if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi - - find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ - > "~{output_root}/dir_structure.txt" - >>> - - output { - - File resouce_monitor_log = "resources.log" - - File output_dir_structure = "~{output_root}/dir_structure.txt" - - File VCF = "~{output_root}/~{prefix}.vcf.gz" - File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" - - File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" - File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" - - File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: threads, - mem_gb: memory, - disk_gb: disk_size, - boot_disk_gb: 100, - preemptible_tries: 3, - max_retries: 0, - docker: "us.gcr.io/broad-dsp-lrma/lr-deepvariant:1.3.0" - # docker: "google/deepvariant:1.2.0-gpu" # kept here to remind ourselves, occassionally, to review if it's better with GPU - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - zones: zones - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} - -task MarginPhase { - - meta { - description: "Generates phased VCF. Note this runs fast so no need to parallize." - } - - input { - File bam - File bai - - File unphased_vcf - File? unphased_vcf_tbi - - File ref_fasta - File ref_fasta_fai - - Int memory - String zones - - RuntimeAttr? 
runtime_attr_override - } - - Int bam_sz = ceil(size(bam, "GB")) - Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 - - Int cores = 64 - - String prefix = basename(bam, ".bam") + ".pepper" - String output_root = "/cromwell_root/margin_output" - - command <<< - set -euxo pipefail - - num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) - - mkdir -p "~{output_root}" "~{output_root}/logs" - touch ~{bai} - - # note the -M option was suggested by an author of margin - # it's unclear which phasedBAM one should use: this, or the one generated from the Pepper step - margin phase \ - ~{bam} \ - ~{ref_fasta} \ - ~{unphased_vcf} \ - /opt/margin_dir/params/misc/allParams.phase_vcf.json \ - -t "${num_core}" \ - -M \ - -o "~{output_root}/~{prefix}" \ - 2>&1 | tee "~{output_root}/logs/5_margin_phase_vcf.log" - - bgzip -c "~{output_root}/~{prefix}".phased.vcf > "~{output_root}/~{prefix}".phased.vcf.gz && \ - tabix -p vcf "~{output_root}/~{prefix}".phased.vcf.gz - >>> - - - output { - File phaseset_bed = "~{output_root}/~{prefix}.phaseset.bed" - File phasedVCF = "~{output_root}/~{prefix}.phased.vcf.gz" - File phasedtbi = "~{output_root}/~{prefix}.phased.vcf.gz.tbi" - } - - ######################### - RuntimeAttr default_attr = object { - cpu_cores: cores, - mem_gb: memory, - disk_gb: disk_size, - boot_disk_gb: 100, - preemptible_tries: 3, - max_retries: 0, - docker: "kishwars/pepper_deepvariant:r0.4.1" - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - zones: zones - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - docker: select_first([runtime_attr.docker, default_attr.docker]) - } -} diff --git a/wdl/tasks/VariantCalling/CallSmallVariants.wdl b/wdl/tasks/VariantCalling/CallSmallVariants.wdl new file mode 100644 index 000000000..4c599b65c --- /dev/null +++ b/wdl/tasks/VariantCalling/CallSmallVariants.wdl @@ -0,0 +1,201 @@ +version 1.0 + +import "../../pipelines/TechAgnostic/Utility/ShardWholeGenome.wdl" # this isn't optimal; the choice was made assuming ShardWholeGenome could be useful for other users as well. +import "../../deprecated/tasks/PEPPER-MARGIN-DeepVariant.wdl" as PMDV + +import "DeepVariant.wdl" +import "Clair.wdl" as Clair3 + +import "PhaseSmallVariantsAndTagBam.wdl" as PhaseAndTag + +workflow Work { + meta { + description: "Call small variants using reads-based methods (i.e. not for assembly-contig-based methods)." + } + parameter_meta { + # inputs + prefix: "Prefix for output files" + per_chr_bam_bai_and_id: "WGS bam sharded per chromosome/contig." + is_ont: "If the input data is ONT" + is_r10_4_pore_or_later: "If the ONT input data is generated on R10.4 simples/duplex pores." + model_for_dv_andor_pepper: "Model string to be used on DV or the PEPPER-Margin-DeepVariant toolchain. Please refer to their github pages for accepted values." 
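        # Hedged note: per the DeepVariant task's parameter_meta further down, accepted model
        # strings include PACBIO (HiFi) and ONT_R104 (R10.4+ pores), plus anything newer DV
        # releases support; the same string is what gets handed to DV and, where applicable, PEPPER.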
+ ref_scatter_interval_list_locator: "A file holding paths to interval_list files, used for custom sharding the of the input BAM; when not provided, will shard WG by contig (possibly slower)" + ref_scatter_interval_list_ids: "A file that gives short IDs to the interval_list files; when not provided, will shard WG by contig (possibly slower)" + use_gpu: "Use GPU acceleration for DV (or PEPPER) or not" + use_margin_for_tagging: "if false, will use margin-phased VCF for haplotagging the BAM; applicable only when input data isn't ONT data with pore older than R10.4" + + # outputs + haplotagged_bam: "BAM haplotagged using a small variant single-sample VCF." + haplotagged_bai: "Index for haplotagged_bam." + haplotagged_bam_tagger: "VCF used for doing the haplotagging. 'Legacy' if the input is ONT data generated on pores before R10.4." + + legacy_g_vcf: "PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4." + legacy_g_tbi: "Index for PEPPER-MARGIN-DeepVariant gVCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phased_vcf: "Phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phased_tbi: "Indes for phased PEPPER-MARGIN-DeepVariant VCF; available only when input is ONT data generated on pores older than R10.4." + legacy_phasing_stats_tsv: "Phasing stats of legacy_phased_vcf in TSV format; available only when input is ONT data generated on pores older than R10.4." + legacy_phasing_stats_gtf: "Phasing stats of legacy_phased_vcf in GTF format; available only when input is ONT data generated on pores older than R10.4." + + dv_g_vcf: "DeepVariant gVCF; available for CCS data and ONT data generated with pores >= R10.4." + dv_g_tbi: "Index for DeepVariant ; available for CCS data and ONT data generated with pores >= R10.4." + dv_margin_phased_vcf: "Phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_margin_phased_tbi: "Index for phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_margin_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_margin_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF genrated with Margin; available for CCS data and ONT data generated with pores >= R10.4." + dv_whatshap_phased_vcf: "Phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_whatshap_phased_tbi: "Index for phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_whatshap_phasing_stats_tsv: "Phasing stats (TSV format) of phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + dv_vcf_whatshap_phasing_stats_gtf: "Phasing stats (GTF format) of phased DeepVariant VCF genrated with WhatsHap; available for CCS data and ONT data generated with pores >= R10.4." + + dv_nongpu_resources_usage_log: "Resource usage monitoring log for DV (per shard); available for CCS data and ONT data generated with pores >= R10.4." + dv_nongpu_resources_usage_visual: "Resource usage monitoring log visualization for DV (per shard); available for CCS data and ONT data generated with pores >= R10.4." 
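        # Hedged example: per_chr_bam_bai_and_id is an Array[Pair[String, Pair[File, File]]], i.e.
        # one entry per contig shaped like
        #   ("chr1", ("gs://bucket/sample.chr1.bam", "gs://bucket/sample.chr1.bam.bai"))   # hypothetical paths
        # where the left element is the shard/contig ID used throughout this workflow.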
+ } + input { + # sample info + File bam + File bai + String prefix + + Array[Pair[String, Pair[File, File]]] per_chr_bam_bai_and_id + + Boolean is_ont + Boolean is_r10_4_pore_or_later + String model_for_dv_andor_pepper + + # reference info + Map[String, String] ref_map + + File? ref_scatter_interval_list_locator + File? ref_scatter_interval_list_ids + + # smallVar-specific args + Boolean run_clair3 + Boolean use_margin_for_tagging + + # optimization + Int dv_threads + Int dv_memory + Boolean use_gpu = false + String zones = "us-central1-a us-central1-b us-central1-c us-central1-f" + } + output { + File? clair_vcf = RunClair3.clair_vcf + File? clair_tbi = RunClair3.clair_tbi + File? clair_gvcf = RunClair3.clair_gvcf + File? clair_gtbi = RunClair3.clair_gtbi + + File haplotagged_bam = use_this_haptag_bam + File haplotagged_bai = use_this_haptag_bai + String haplotagged_bam_tagger = use_this_haptagger + + # this block available only for legacy ONT data (those older than R10.4) + File? legacy_g_vcf = WorkOnLegacyONTdata.legacy_ont_dvp_g_vcf + File? legacy_g_tbi = WorkOnLegacyONTdata.legacy_ont_dvp_g_tbi + File? legacy_phased_vcf = WorkOnLegacyONTdata.legacy_ont_dvp_phased_vcf + File? legacy_phased_tbi = WorkOnLegacyONTdata.legacy_ont_dvp_phased_tbi + File? legacy_phasing_stats_tsv = WorkOnLegacyONTdata.legacy_ont_dvp_phased_vcf_stats_tsv + File? legacy_phasing_stats_gtf = WorkOnLegacyONTdata.legacy_ont_dvp_phased_vcf_stats_gtf + + # this block available for CCS and modern ONT data + File? dv_g_vcf = DV.g_vcf + File? dv_g_tbi = DV.g_tbi + + File? dv_margin_phased_vcf = PnT.margin_phased_vcf + File? dv_margin_phased_tbi = PnT.margin_phased_tbi + + File? dv_vcf_margin_phasing_stats_tsv = PnT.margin_phasing_stats_tsv + File? dv_vcf_margin_phasing_stats_gtf = PnT.margin_phasing_stats_gtf + + File? dv_whatshap_phased_vcf = PnT.whatshap_phased_vcf + File? dv_whatshap_phased_tbi = PnT.whatshap_phased_tbi + + File? dv_vcf_whatshap_phasing_stats_tsv = PnT.whatshap_phasing_stats_tsv + File? dv_vcf_whatshap_phasing_stats_gtf = PnT.whatshap_phasing_stats_gtf + + Array[File]? dv_nongpu_resources_usage_log = DV.nongpu_resource_usage_logs + Array[File]? 
dv_nongpu_resources_usage_visual = DV.nongpu_resource_usage_visual + } + + #################################################################################################################################### + # custom-shard WG (for load-balancing) + #################################################################################################################################### + # but if custom sharding isn't requested, then per-chr sharding is already done, so no need to redo + if (defined(ref_scatter_interval_list_locator)) { + call ShardWholeGenome.Split as CustomSplitBamForSmallVar { + input: + ref_dict = ref_map['dict'], + bam = bam, + bai = bai, + ref_scatter_interval_list_locator = ref_scatter_interval_list_locator, + ref_scatter_interval_list_ids = ref_scatter_interval_list_ids + } + } + Array[Pair[String, Pair[File, File]]] how_to_shard_wg_for_calling = select_first([CustomSplitBamForSmallVar.id_bam_bai_of_shards, + per_chr_bam_bai_and_id]) + + #################################################################################################################################### + # DV, major workhorse + #################################################################################################################################### + if ((!is_ont) || is_r10_4_pore_or_later) { # pacbio or recent ONT data + + call DeepVariant.Run as DV { + input: + how_to_shard_wg_for_calling = how_to_shard_wg_for_calling, + prefix = prefix, + model_for_dv_andor_pepper = model_for_dv_andor_pepper, + + ref_map = ref_map, + + dv_threads = dv_threads, + dv_memory = dv_memory, + use_gpu = use_gpu, + zones = zones + } + + call PhaseAndTag.Run as PnT { + input: + use_margin_for_tagging = use_margin_for_tagging, + + bam = bam, + bai = bai, + per_chr_bam_bai_and_id = per_chr_bam_bai_and_id, + is_ont = is_ont, + + unphased_vcf = DV.vcf, + unphased_tbi = DV.tbi, + + ref_map = ref_map, + + zones = zones + } + } + + if (is_ont && (!is_r10_4_pore_or_later)) { # legacy ( 100 + Int inflation_factor = if (is_big_bam) then 10 else 5 + Int minimal_disk = 50 + Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + export MONITOR_MOUNT_POINT="/cromwell_root/" + bash /opt/vm_local_monitoring_script.sh &> resources.log & + job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') + + /opt/deepvariant/bin/run_deepvariant \ + --model_type=~{model_type} \ + --ref=~{ref_fasta} \ + --reads=~{bam} \ + --output_vcf="~{output_root}/~{prefix}.vcf.gz" \ + --output_gvcf="~{output_root}/~{prefix}.g.vcf.gz" \ + --num_shards="${num_core}" || cat resources.log + if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi + + find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + >>> + + output { + + File resouce_monitor_log = "resources.log" + File? 
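        # Hedged note: gpu.usages.log is only written by the GPU variant of this task (via the
        # backgrounded gpustat loop), so in this CPU-only task the optional gpu_monitor_log output
        # is expected to remain undefined.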
gpu_monitor_log = "gpu.usages.log" + File output_dir_structure = "~{output_root}/dir_structure.txt" + + File VCF = "~{output_root}/~{prefix}.vcf.gz" + File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" + + File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" + File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" + + File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-deepvariant:1.5.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task DV_gpu { + parameter_meta { + model_type: "which DV pre-trained model to use. Must be one of [PACBIO, ONT_R104] (or anything later supported after DV's 1.5.0 release)." + } + + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + String model_type + + Int threads + Int memory + String zones + + RuntimeAttr? runtime_attr_override + } + + String prefix = basename(bam, ".bam") + ".deepvariant" + String output_root = "/cromwell_root/dv_output" + + Int bam_sz = ceil(size(bam, "GB")) + Boolean is_big_bam = bam_sz > 100 + Int inflation_factor = if (is_big_bam) then 10 else 5 + Int minimal_disk = 100 + Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk + + Int max_cpu = 12 + Int use_this_cpu = if threads > max_cpu then max_cpu else threads + Int max_memory = 64 + Int use_this_memory = if memory > max_memory then max_memory else memory + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + export MONITOR_MOUNT_POINT="/cromwell_root/" + bash vm_local_monitoring_script.sh &> resources.log & + job_id=$(ps -aux | grep -F 'vm_local_monitoring_script.sh' | head -1 | awk '{print $2}') + gpustat -a -i 1 &> gpu.usages.log & + gpu_tracking_job_id=$(ps -aux | grep -F 'gpustat' | head -1 | awk '{print $2}') + + /opt/deepvariant/bin/run_deepvariant \ + --model_type=~{model_type} \ + --ref=~{ref_fasta} \ + --reads=~{bam} \ + --output_vcf="~{output_root}/~{prefix}.vcf.gz" \ + --output_gvcf="~{output_root}/~{prefix}.g.vcf.gz" \ + --num_shards="${num_core}" || cat resources.log + if ps -p "${job_id}" > /dev/null; then kill "${job_id}"; fi + if ps -p "${gpu_tracking_job_id}" > /dev/null; then kill "${gpu_tracking_job_id}"; fi + + find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + >>> + + output { + + File resouce_monitor_log = "resources.log" + File? 
gpu_monitor_log = "gpu.usages.log" + File output_dir_structure = "~{output_root}/dir_structure.txt" + + File VCF = "~{output_root}/~{prefix}.vcf.gz" + File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" + + File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" + File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" + + File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: use_this_cpu, + mem_gb: use_this_memory, + disk_gb: disk_size, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-deepvariant:1.5.0-gpu" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + + gpuType: "nvidia-tesla-v100" + gpuCount: 1 + } +} diff --git a/wdl/tasks/VariantCalling/MarginPhase.wdl b/wdl/tasks/VariantCalling/MarginPhase.wdl new file mode 100644 index 000000000..47c0aa046 --- /dev/null +++ b/wdl/tasks/VariantCalling/MarginPhase.wdl @@ -0,0 +1,119 @@ +version 1.0 + +import "../../structs/Structs.wdl" + +task MarginPhase { + + meta { + description: "Generates phased VCF. Note this runs fast so no need to parallize." + } + + parameter_meta { + # inputs + data_type: "Must be one of [ONT, PacBio]" + bam: {localization_optional: true} + + # outputs + phaseset_bed: "a BED file describing the phasesets and the reason why phasing was broken with respect to the previous phaseset" + chunk_csv: "a CSV describing the boundaries of each chunk" + # suboptimal_haplotagged_bam: "a BAM file with all reads tagged (HP) as 1, 2, or 0. Note it is named suboptimal because we recommend using the parameters tuned for phasing variants, coupled with the following sentense from the github repo: 'The haplotag parameters are tuned to produce more phased reads and more accurate local read phasing, and were tuned using variants generated by PEPPER.' " + } + + input { + File bam + File bai + + String data_type + + File unphased_vcf + File unphased_tbi + + File ref_fasta + File ref_fasta_fai + + Int memory = 200 + String zones = "us-central1-a us-central1-b us-central1-c us-central1-f" + + RuntimeAttr? runtime_attr_override + } + + Int bam_sz = ceil(size(bam, "GB")) + Int disk_size = if bam_sz > 200 then 2*bam_sz else bam_sz + 200 + + String prefix = basename(unphased_vcf, ".vcf.gz") + String output_root = "/cromwell_root/margin_output" + + # name of pre-made parameter set json file. See margin github page for appropriate values. 
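    # Hedged note (derived from the command below): with data_type == 'ONT' margin is pointed at
    # /opt/margin_dir/params/phase/allParams.phase_vcf.ont.json, otherwise at
    # /opt/margin_dir/params/phase/allParams.phase_vcf.pb-hifi.json.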
+ String parameter_json_file_name = if data_type == 'ONT' then "allParams.phase_vcf.ont.json" else "allParams.phase_vcf.pb-hifi.json" + + String base = basename(bam) + String local_bam = "/cromwell_root/~{base}" + String local_bai = "~{local_bam}.bai" + + command <<< + set -euxo pipefail + + time gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} ~{local_bai} + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" "~{output_root}/logs" + touch ~{bai} + + # note the -M option was suggested by an author of margin + # it's unclear which phasedBAM one should use: this, or the one generated from the Pepper step + margin phase \ + ~{local_bam} \ + ~{ref_fasta} \ + ~{unphased_vcf} \ + "/opt/margin_dir/params/phase/~{parameter_json_file_name}" \ + -t "${num_core}" \ + -M \ + -o "~{output_root}/~{prefix}" \ + 2>&1 | tee "~{output_root}/logs/5_margin_phase_vcf.log" + + ls "~{output_root}/" + cd "~{output_root}/" && \ + bgzip -c "~{prefix}".phased.vcf > "~{prefix}".phased.vcf.gz && \ + tabix -p vcf "~{prefix}".phased.vcf.gz + ls + + mv "~{prefix}".phased.vcf.gz "~{prefix}".margin-phased.vcf.gz + mv "~{prefix}".phased.vcf.gz.tbi "~{prefix}".margin-phased.vcf.gz.tbi + mv "~{prefix}".chunks.csv "~{prefix}".margin-phased.chunks.csv + mv "~{prefix}".phaseset.bed "~{prefix}".margin-phased.phaseset.bed + >>> + + output { + File phased_vcf = "~{output_root}/~{prefix}.margin-phased.vcf.gz" + File phased_tbi = "~{output_root}/~{prefix}.margin-phased.vcf.gz.tbi" + + File chunk_csv = "~{output_root}/~{prefix}.margin-phased.chunks.csv" + File phaseset_bed = "~{output_root}/~{prefix}.margin-phased.phaseset.bed" + + # File suboptimal_haplotagged_bam = "~{output_root}/~{prefix}.margin-phase-haplotagged.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 64, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-margin:2.2.dev-69f6fff" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/VariantCalling/PBSV.wdl b/wdl/tasks/VariantCalling/PBSV.wdl index 19ab34da6..7f520baf5 100644 --- a/wdl/tasks/VariantCalling/PBSV.wdl +++ b/wdl/tasks/VariantCalling/PBSV.wdl @@ -11,7 +11,7 @@ workflow RunPBSV { parameter_meta { bam: "input BAM from which to call SVs" bai: "index accompanying the BAM" - is_ccs: "if input BAM is CCS reads" + is_hifi: "if input BAM is HiFi reads" ref_fasta: "reference to which the BAM was aligned to" ref_fasta_fai: "index accompanying the reference" prefix: "prefix for output" @@ -22,7 +22,7 @@ workflow RunPBSV { input { File bam File bai - Boolean is_ccs + Boolean is_hifi File ref_fasta File ref_fasta_fai @@ -31,17 +31,20 @@ workflow RunPBSV { String zones File? 
tandem_repeat_bed + Boolean is_ont = false } call Discover { input: bam = bam, bai = bai, + is_hifi = is_hifi, ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, tandem_repeat_bed = tandem_repeat_bed, prefix = prefix, - zones = zones + zones = zones, + is_ont = is_ont } call Call { @@ -49,13 +52,15 @@ workflow RunPBSV { svsigs = [ Discover.svsig ], ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, - ccs = is_ccs, + is_hifi = is_hifi, prefix = prefix, - zones = zones + zones = zones, + is_ont = is_ont } output { File vcf = Call.vcf + File tbi = Call.tbi } } @@ -63,18 +68,21 @@ task Discover { input { File bam File bai + Boolean is_hifi File ref_fasta File ref_fasta_fai File? tandem_repeat_bed String? chr String prefix String zones + Boolean is_ont = false RuntimeAttr? runtime_attr_override } parameter_meta { bam: "input BAM from which to call SVs" bai: "index accompanying the BAM" + is_hifi: "if input BAM is HiFi reads" ref_fasta: "reference to which the BAM was aligned to" ref_fasta_fai: "index accompanying the reference" tandem_repeat_bed: "BED file containing TRF finder (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" @@ -82,7 +90,7 @@ task Discover { prefix: "prefix for output" } - Int MINIMAL_DISK = 500 + Int MINIMAL_DISK = 50 Boolean is_big_bam = size(bam, "GB") > 100 Int inflation_factor = if (is_big_bam) then 5 else 2 Int disk_size = inflation_factor * (ceil(size([bam, bai, ref_fasta, ref_fasta_fai], "GB")) + 1) @@ -90,10 +98,50 @@ task Discover { String fileoutput = if defined(chr) then "~{prefix}.~{chr}.svsig.gz" else "~{prefix}.svsig.gz" + String disk_type = if is_ont then "SSD" else "HDD" + command <<< set -euxo pipefail + # pbsv, for ONT inputs, could fail with a really strange error that looks like the following + # >|> 20230828 01:52:02.907 -|- FATAL -|- Run -|- 0x7f704219e4c0|| -|- pbsv discover ERROR: map::at + # upon investigation, it's most likely caused by fields in the RG lines in the header that are not ID, SM, PU + # so we fix it here + # hastag facts-in-life + if ~{is_ont}; then + samtools view --no-PG -H ~{bam} > orig.header.txt + grep -v "^@SQ" orig.header.txt + + # keep the bare minimum info in @RG + grep "^@RG" orig.header.txt > RG.lines + cat RG.lines + while IFS= read -r line; do + id_field=$(echo "${line}" | tr '\t' '\n' | grep "^ID:") + pu_field=$(echo "${line}" | tr '\t' '\n' | grep "^PU:") + sm_field=$(echo "${line}" | tr '\t' '\n' | grep "^SM:") + echo -e "@RG\t${id_field}\t${pu_field}\t${sm_field}" >> fixed.rg.lines + done < RG.lines + cat fixed.rg.lines + + # patch back the header + # this is to follow the order we observe typically in BAM headers: @HD first, @SQ lines, @RG lines, then the rest (usually @PG lines) + grep -v "^@RG" orig.header.txt > non.RG.lines + cat <(head -n1 non.RG.lines) \ + <(grep "^@SQ" non.RG.lines) \ + fixed.rg.lines \ + <(tail +2 non.RG.lines | grep -v "^@HD" | grep -v "^@SQ") \ + > fixed.header.txt + cat fixed.header.txt + + date + samtools reheader --no-PG fixed.header.txt ~{bam} | samtools view -@1 --no-PG -o tmp.bam + date + + samtools view -H tmp.bam | grep "^@RG" + mv tmp.bam ~{bam} + fi pbsv discover \ + ~{true='--hifi' false='' is_hifi} \ ~{if defined(tandem_repeat_bed) then "--tandem-repeats ~{tandem_repeat_bed}" else ""} \ ~{bam} \ ~{fileoutput} @@ -108,18 +156,16 @@ task Discover { cpu_cores: if(defined(chr)) then 8 else 32, mem_gb: if(defined(chr)) then 32 else 128, disk_gb: runtime_disk_size, - boot_disk_gb: 10, - preemptible_tries: 0, - max_retries: 0, - docker: 
"us.gcr.io/broad-dsp-lrma/lr-sv:0.1.8" + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-smrttools:12.0.0.176214" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}" zones: zones - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) @@ -131,9 +177,11 @@ task Call { Array[File] svsigs File ref_fasta File ref_fasta_fai - Boolean ccs + Boolean is_hifi String prefix String zones + Boolean DEBUG = false + Boolean is_ont = false RuntimeAttr? runtime_attr_override } @@ -141,7 +189,7 @@ task Call { svsigs: "per-chromosome *.svsig.gz files" ref_fasta: "reference to which the BAM was aligned to" ref_fasta_fai: "index accompanying the reference" - ccs: "use optimizations for CCS data" + is_hifi: "if input BAM is HiFi reads" prefix: "prefix for output" } @@ -150,40 +198,44 @@ task Call { command <<< set -euxo pipefail - num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) - - pbsv call -j $num_core --log-level INFO ~{true='--ccs' false='' ccs} \ + pbsv call \ + -j 0 \ + --log-level ~{true='INFO' false='WARN' DEBUG} \ + --log-file pbsv.call.log \ + ~{true='--hifi' false='' is_hifi} \ ~{ref_fasta} \ ~{sep=' ' svsigs} \ ~{prefix}.pbsv.pre.vcf + # some trivial postprocessing cat ~{prefix}.pbsv.pre.vcf | grep -v -e '##fileDate' > ~{prefix}.pbsv.vcf + bgzip -c ~{prefix}.pbsv.vcf > ~{prefix}.pbsv.vcf.gz + tabix -p vcf ~{prefix}.pbsv.vcf.gz >>> output { - File vcf = "~{prefix}.pbsv.vcf" + File call_log = "pbsv.call.log" # make sure this is always the top, so that in case something goes wrong, we still get the log de-localized + File vcf = "~{prefix}.pbsv.vcf.gz" + File tbi = "~{prefix}.pbsv.vcf.gz.tbi" } ######################### RuntimeAttr default_attr = object { - cpu_cores: 4, + cpu_cores: 16, mem_gb: 96, disk_gb: disk_size, - boot_disk_gb: 10, - preemptible_tries: 1, - max_retries: 0, - docker: "us.gcr.io/broad-dsp-lrma/lr-sv:0.1.8" + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-smrttools:12.0.0.176214" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) zones: zones preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) docker: select_first([runtime_attr.docker, default_attr.docker]) } } - diff --git a/wdl/tasks/VariantCalling/PhaseSmallVariantsAndTagBam.wdl b/wdl/tasks/VariantCalling/PhaseSmallVariantsAndTagBam.wdl new file mode 100644 index 000000000..4446a06d7 --- /dev/null +++ 
b/wdl/tasks/VariantCalling/PhaseSmallVariantsAndTagBam.wdl @@ -0,0 +1,109 @@ +version 1.0 + +import "../Alignment/WhatsHap.wdl" +import "MarginPhase.wdl" as Margin +import "../Utility/VariantUtils.wdl" + +workflow Run { + meta { + desciption: + "For read-based small variant VCF phasing and haplotagging BAM" + } + parameter_meta { + use_margin_for_tagging: "if false, will use margin-phased VCF for haplotagging the BAM" + } + + input { + Boolean use_margin_for_tagging + + File bam + File bai + Array[Pair[String, Pair[File, File]]] per_chr_bam_bai_and_id + Boolean is_ont + + File unphased_vcf + File unphased_tbi + + Map[String, String] ref_map + + String zones = "us-central1-a us-central1-b us-central1-c us-central1-f" + } + + output { + File margin_phased_vcf = MarginPhase.phased_vcf + File margin_phased_tbi = MarginPhase.phased_tbi + File margin_phasing_stats_tsv = MPhaseStats.stats_tsv + File margin_phasing_stats_gtf = MPhaseStats.stats_gtf + + File whatshap_phased_vcf = MergeWhatsHapPhasedPerChrVCFs.vcf + File whatshap_phased_tbi = MergeWhatsHapPhasedPerChrVCFs.tbi + File whatshap_phasing_stats_tsv = WHPhaseStats.stats_tsv + File whatshap_phasing_stats_gtf = WHPhaseStats.stats_gtf + + File hap_tagged_bam = WhatsHapTag.tagged_bam + File hap_tagged_bai = WhatsHapTag.tagged_bai + String haplotagged_bam_tagger = if use_margin_for_tagging then "MARGIN" else "WhatsHap" + } + + #################################################################################################################################### + # TODO: we need to comeback and make a choice on which phasing tool to use + #################################################################################################################################### + ############# + # phase variants using margin + ############# + call Margin.MarginPhase as MarginPhase { + input: + bam=bam, bai=bai, + data_type= if (is_ont) then "ONT" else "PacBio", + + unphased_vcf=unphased_vcf, + unphased_tbi=unphased_tbi, + + ref_fasta=ref_map['fasta'], ref_fasta_fai=ref_map['fai'], + zones = zones + } + call WhatsHap.Stats as MPhaseStats { input: phased_vcf=MarginPhase.phased_vcf, phased_tbi=MarginPhase.phased_tbi} + + ############# + # phase variant using whatshap phase; but because it is much slower than margin phase, we do scatter-gather + ############# + scatter (triplet in per_chr_bam_bai_and_id) { + call VariantUtils.SubsetVCF as ChopDVVCF { + input: + vcf_gz = unphased_vcf, + vcf_tbi = unphased_tbi, + locus = triplet.left, + prefix = basename(unphased_vcf, ".vcf.gz") + "." + triplet.left + } + call WhatsHap.Phase as WhatsHapPhase { + input : + chromosome=triplet.left, + bam=triplet.right.left, bai=triplet.right.right, + unphased_vcf=ChopDVVCF.subset_vcf, unphased_tbi=ChopDVVCF.subset_tbi, + ref_fasta=ref_map['fasta'], ref_fasta_fai=ref_map['fai'], + } + } + call VariantUtils.MergePerChrCalls as MergeWhatsHapPhasedPerChrVCFs { + input: + vcfs = WhatsHapPhase.phased_vcf, ref_dict = ref_map['dict'], + prefix = basename(unphased_vcf, ".vcf.gz") + ".whatshap-phased" + } + call WhatsHap.Stats as WHPhaseStats { input: phased_vcf=MergeWhatsHapPhasedPerChrVCFs.vcf, phased_tbi=MergeWhatsHapPhasedPerChrVCFs.tbi} + + #################################################################################################################################### + ############# + # !CHOICE! haplotag with WhatsHap, but using which phased VCF?! 
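    # Hedged note: both the Margin-phased and the WhatsHap-phased call sets remain workflow
    # outputs; use_margin_for_tagging only decides which of the two feeds WhatsHap haplotag
    # below, and that choice is echoed in the haplotagged_bam_tagger string ("MARGIN" or
    # "WhatsHap") for downstream provenance.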
+ ############# + File phased_snp_vcf = if (use_margin_for_tagging) then MarginPhase.phased_vcf else MergeWhatsHapPhasedPerChrVCFs.vcf + File phased_snp_tbi = if (use_margin_for_tagging) then MarginPhase.phased_tbi else MergeWhatsHapPhasedPerChrVCFs.tbi + + call WhatsHap.HaploTagBam as WhatsHapTag { + input: + to_tag_bam = bam, + to_tag_bai = bai, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + phased_vcf = phased_snp_vcf, + phased_tbi = phased_snp_tbi + } +} diff --git a/wdl/tasks/VariantCalling/Sniffles2.wdl b/wdl/tasks/VariantCalling/Sniffles2.wdl index eaafe6a05..ae1d85d3e 100644 --- a/wdl/tasks/VariantCalling/Sniffles2.wdl +++ b/wdl/tasks/VariantCalling/Sniffles2.wdl @@ -54,51 +54,63 @@ workflow Sniffles2 { } } - - task SampleSV { meta { description: "This task calls SV candidates from a single sample." } + parameter_meta { + bam: { desciption: "input BAM from which to call SVs", localization_optional: true } + bai: "index accompanying the BAM" + minsvlen: "minimum SV length in bp. Default 50" + sample_id: "Sample ID" + prefix: "prefix for output" + phase_sv: "if you're sure the BAM is phased/haplotagged, turn this on to generate phased SV" + tandem_repeat_bed: "BED file containing TRF finder (e.g. http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.trf.bed.gz)" + } + input { File bam File bai Int minsvlen = 50 String sample_id String prefix + File? tandem_repeat_bed + Boolean phase_sv = false RuntimeAttr? runtime_attr_override } - parameter_meta { - bam: "input BAM from which to call SVs" - bai: "index accompanying the BAM" - minsvlen: "minimum SV length in bp. Default 50" - sample_id: "Sample ID" - prefix: "prefix for output" - } - Int cpus = 8 Int disk_size = 2*ceil(size([bam, bai], "GB")) - String snf_output = "~{prefix}.sniffles.snf" - String vcf_output = "~{prefix}.sniffles.vcf" + String postfix = if phase_sv then "-phased" else "" + String snf_output = "~{prefix}.sniffles~{postfix}.snf" + String vcf_output = "~{prefix}.sniffles~{postfix}.vcf.gz" + String tbi_output = "~{prefix}.sniffles~{postfix}.vcf.gz.tbi" + + String local_bam = "/cromwell_root/~{basename(bam)}" command <<< - set -eux + set -euxo pipefail + + time gcloud storage cp ~{bam} ~{local_bam} + mv ~{bai} "~{local_bam}.bai" + touch ~{bai} # handle the bai-older-than-bam warning sniffles -t ~{cpus} \ - -i ~{bam} \ + -i ~{local_bam} \ --minsvlen ~{minsvlen} \ --sample-id ~{sample_id} \ + ~{if defined(tandem_repeat_bed) then "--tandem-repeats ~{tandem_repeat_bed}" else ""} \ + ~{true="--phase" false="" phase_sv} \ --vcf ~{vcf_output} \ --snf ~{snf_output} - tree >>> output { File snf = "~{snf_output}" File vcf = "~{vcf_output}" + File tbi = "~{tbi_output}" } ######################### @@ -109,7 +121,7 @@ task SampleSV { boot_disk_gb: 10, preemptible_tries: 3, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.0.6" + docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -123,7 +135,6 @@ task SampleSV { } } - task MergeCall { meta { @@ -161,7 +172,7 @@ task MergeCall { boot_disk_gb: 10, preemptible_tries: 3, max_retries: 2, - docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.0.6" + docker: "us.gcr.io/broad-dsp-lrma/lr-sniffles2:2.2" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -174,4 +185,4 @@ task MergeCall { docker: select_first([runtime_attr.docker, default_attr.docker]) } -} \ No newline at end of file +} diff --git 
diff --git a/wdl/tasks/Visualization/NanoPlot.wdl b/wdl/tasks/Visualization/NanoPlot.wdl
index c5d97de06..d3b1cff70 100644
--- a/wdl/tasks/Visualization/NanoPlot.wdl
+++ b/wdl/tasks/Visualization/NanoPlot.wdl
@@ -186,24 +186,33 @@ task NanoPlotFromBam {
     }
 
     parameter_meta {
-        bam: "A bam file to use as input"
-        bai: "The bai file for the bam file"
-        runtime_attr_override: "Override the default runtime attributes"
+        bam: { description: "A bam file to use as input", localization_optional: true }
     }
 
     input {
         File bam
         File bai
+        String disk_type = "SSD"
+
         RuntimeAttr? runtime_attr_override
     }
 
-    Int disk_size = 2*ceil(size(bam, "GB")) + 10
+    Int pd_disk_size = 50 + ceil(size(bam, "GiB"))
+    Int local_disk_size = if(size(bam, "GiB")>300) then 750 else 375
+    Int disk_size = if('LOCAL'==disk_type) then local_disk_size else pd_disk_size
+
+    String base = basename(bam)
+    String local_bam = "/cromwell_root/~{base}"
 
     command <<<
         set -euxo pipefail
 
+        time \
+            gcloud storage cp ~{bam} ~{local_bam}
+        touch ~{bai} # avoid the 'bai is older than bam' warning
+        mv ~{bai} "~{local_bam}.bai"
 
         num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l)
@@ -213,7 +222,7 @@ task NanoPlotFromBam {
             --tsv_stats \
             --no_supplementary \
             --verbose \
-            --bam "~{bam}"
+            --bam "~{local_bam}"
 
         cat NanoStats.txt | \
             grep -v -e '^Metrics' -e '^highest' -e '^longest' | \
@@ -223,55 +232,18 @@ task NanoPlotFromBam {
             tee map.txt
     >>>
 
-    #number_of_reads 143488
-    #number_of_bases 993469297.0
-    #number_of_bases_aligned 402067275.0
-    #fraction_bases_aligned 0.4
-    #median_read_length 5081.0
-    #mean_read_length 6923.7
-    #read_length_stdev 6116.7
-    #n50 9210.0
-    #average_identity 92.8
-    #median_identity 94.5
-    #mean_qual 14.6
-    #median_qual 15.0
-    #Reads_Q5 143488
-    #Reads_Q7 143488
-    #Reads_Q10 140551
-    #Reads_Q12 119386
-    #Reads_Q15 71164
-
     output {
         File stats = "NanoStats.txt"
         Map[String, Float] stats_map = read_map("map.txt")
 
         Array[File] plots = glob("*.png")
-#        File AlignedReadlengthvsSequencedReadLength_dot = "AlignedReadlengthvsSequencedReadLength_dot.png"
-#        File AlignedReadlengthvsSequencedReadLength_kde = "AlignedReadlengthvsSequencedReadLength_kde.png"
-#        File LengthvsQualityScatterPlot_dot = "LengthvsQualityScatterPlot_dot.png"
-#        File LengthvsQualityScatterPlot_kde = "LengthvsQualityScatterPlot_kde.png"
-#        File MappingQualityvsAverageBaseQuality_dot = "MappingQualityvsAverageBaseQuality_dot.png"
-#        File MappingQualityvsAverageBaseQuality_kde = "MappingQualityvsAverageBaseQuality_kde.png"
-#        File MappingQualityvsReadLength_dot = "MappingQualityvsReadLength_dot.png"
-#        File MappingQualityvsReadLength_kde = "MappingQualityvsReadLength_kde.png"
-#        File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png"
-#        File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png"
-#        File PercentIdentityHistogramDynamic_Histogram_percent_identity = "PercentIdentityHistogramDynamic_Histogram_percent_identity.png"
-#        File PercentIdentityvsAlignedReadLength_dot = "PercentIdentityvsAlignedReadLength_dot.png"
-#        File PercentIdentityvsAlignedReadLength_kde = "PercentIdentityvsAlignedReadLength_kde.png"
-#        File PercentIdentityvsAverageBaseQuality_dot = "PercentIdentityvsAverageBaseQuality_dot.png"
-#        File PercentIdentityvsAverageBaseQuality_kde = "PercentIdentityvsAverageBaseQuality_kde.png"
-#        File WeightedHistogramReadlength = "WeightedHistogramReadlength.png"
-#        File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png"
-#        File Yield_By_Length = "Yield_By_Length.png"
     }
 
    #########################
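Editorial aside on the new disk_type input: persistent-disk sizing follows the pd_disk_size formula above, while "LOCAL" requests Google local SSDs, which are only available in 375 GiB increments (hence 375 vs 750). A caller can opt in like this; the call alias and File variables are illustrative, not from this PR.

call NanoPlot.NanoPlotFromBam as BamQC {
    input:
        bam = aligned_bam,   # hypothetical File declared by the calling workflow
        bai = aligned_bai,
        disk_type = "LOCAL"  # 375 GiB local SSD, or 750 GiB once the BAM exceeds 300 GiB
}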
     RuntimeAttr default_attr = object {
         cpu_cores:          8,
-        mem_gb:             24,
+        mem_gb:             12,
         disk_gb:            disk_size,
-        boot_disk_gb:       10,
         preemptible_tries:  0,
         max_retries:        1,
         docker:             "us.gcr.io/broad-dsp-lrma/lr-nanoplot:1.40.0-1"
@@ -280,8 +252,7 @@ task NanoPlotFromBam {
     runtime {
         cpu:                    select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
         memory:                 select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
-        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
-        bootDiskSizeGb:         select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " ~{disk_type}"
         preemptible:            select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
         maxRetries:             select_first([runtime_attr.max_retries, default_attr.max_retries])
         docker:                 select_first([runtime_attr.docker, default_attr.docker])
@@ -295,28 +266,31 @@ task NanoPlotFromUBam {
     }
 
     parameter_meta {
-        bam: "BAM file"
-        runtime_attr_override: "Runtime attributes to override"
+        uBAM: { description: "unaligned BAM file", localization_optional: true }
     }
 
     input {
-        File bam
+        File uBAM
         RuntimeAttr? runtime_attr_override
     }
 
-    Int disk_size = 2*ceil(size(bam, "GB"))
+    Int disk_size = 2*ceil(size(uBAM, "GB"))
+
+    String base = basename(uBAM)
+    String local_bam = "/cromwell_root/~{base}"
 
     command <<<
         set -euxo pipefail
 
+        time gcloud storage cp ~{uBAM} ~{local_bam}
+
         num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l)
 
         NanoPlot -t ${num_core} \
             -c orangered \
             --N50 \
             --tsv_stats \
-            --ubam "~{bam}"
+            --ubam "~{local_bam}"
 
         cat NanoStats.txt | \
             grep -v -e '^Metrics' -e '^highest' -e '^longest' | \
@@ -326,30 +300,11 @@ task NanoPlotFromUBam {
             tee map.txt
     >>>
 
-    #number_of_reads 991
-    #number_of_bases 12949457.0
-    #median_read_length 13705.0
-    #mean_read_length 13067.1
-    #read_length_stdev 9581.3
-    #n50 18618.0
-    #mean_qual 0.0
-    #median_qual 0.0
-    #Reads_Q5 0
-    #Reads_Q7 0
-    #Reads_Q10 0
-    #Reads_Q12 0
-    #Reads_Q15 0
-
     output {
         File stats = "NanoStats.txt"
         Map[String, Float] stats_map = read_map("map.txt")
 
         Array[File] plots = glob("*.png")
-        File Non_weightedHistogramReadlength = "Non_weightedHistogramReadlength.png"
-        File Non_weightedLogTransformed_HistogramReadlength = "Non_weightedLogTransformed_HistogramReadlength.png"
-        File WeightedHistogramReadlength = "WeightedHistogramReadlength.png"
-        File WeightedLogTransformed_HistogramReadlength = "WeightedLogTransformed_HistogramReadlength.png"
-        File Yield_By_Length = "Yield_By_Length.png"
     }
 
     #########################
@@ -360,7 +315,7 @@ task NanoPlotFromUBam {
         boot_disk_gb:       10,
         preemptible_tries:  2,
         max_retries:        1,
-        docker:             "quay.io/biocontainers/nanoplot:1.35.5--pyhdfd78af_0"
+        docker:             "us.gcr.io/broad-dsp-lrma/lr-nanoplot:1.40.0-1"
     }
     RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
     runtime {
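Editorial aside: stats_map is read back with read_map from the two-column map.txt produced by the grep/sed pipeline, so its keys are the NanoStats metric names (compare the reference values deleted above, e.g. number_of_reads, n50). A hypothetical downstream use in a calling workflow, with illustrative variable names:

call NanoPlot.NanoPlotFromUBam as UBamQC { input: uBAM = unaligned_bam }  # unaligned_bam is a caller-side File

Float raw_read_count = UBamQC.stats_map["number_of_reads"]
Float raw_read_n50   = UBamQC.stats_map["n50"]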
diff --git a/wdl/tasks/Visualization/VisualizeResourceUsage.wdl b/wdl/tasks/Visualization/VisualizeResourceUsage.wdl
new file mode 100644
index 000000000..5c821de05
--- /dev/null
+++ b/wdl/tasks/Visualization/VisualizeResourceUsage.wdl
@@ -0,0 +1,30 @@
+version 1.0
+
+task SimpleRscript {
+    meta {
+        description: "Uses a simple R script to visualize resource usage of a resource-hungry task."
+    }
+    parameter_meta {
+        resource_log: "Resource usage log file."
+        output_pdf_name: "Name of the output plot; must end in .pdf"
+        plot_title: "Title of the plot"
+    }
+    input {
+        File resource_log
+        String output_pdf_name
+        String plot_title
+    }
+    output {
+        File plot_pdf = "~{output_pdf_name}"
+    }
+
+    command <<<
+        set -eux
+
+        /opt/plot.resources.R "~{resource_log}" "~{output_pdf_name}" "~{plot_title}"
+    >>>
+    runtime {
+        disks: "local-disk 10 HDD"
+        docker: "us.gcr.io/broad-dsp-lrma/lr-resource-visual:0.1.1"
+    }
+}
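Editorial aside: a hypothetical call site for the new task, assuming an upstream task exposes the resource log written by its monitoring script and that sample_id is in scope in the calling workflow; the alias, output name, and import path are illustrative.

import "../Visualization/VisualizeResourceUsage.wdl" as VIZ

call VIZ.SimpleRscript as VisualizeAlignerResourceUsage {
    input:
        resource_log    = AlignReads.resource_monitor_log,  # hypothetical output of a monitored task
        output_pdf_name = "~{sample_id}.alignment.resources.pdf",
        plot_title      = "Resource usage while aligning ~{sample_id}"
}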