Skip to content

Commit

Permalink
Two new workflows:
Browse files Browse the repository at this point in the history
    handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
  • Loading branch information
SHuang-Broad committed Mar 1, 2024
1 parent d95be34 commit 1e26de7
Show file tree
Hide file tree
Showing 3 changed files with 380 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ workflows:
- name: PBMASIsoSeqDemultiplex
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
- name: ProcessOnInstrumentDemuxedChunkRefFree
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
- name: ProcessOnInstrumentDemuxedChunk
subclass: wdl
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl

###################################################
# TechAgnostic - *mics data processing
Expand Down
182 changes: 182 additions & 0 deletions wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
version 1.0

import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN

import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics

import "../../../tasks/Utility/Finalize.wdl" as FF

workflow ProcessOnInstrumentDemuxedChunk {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
}

parameter_meta {

#########
# inputs
readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"

bam_SM_field:
"value to place in the SM field of the resulting BAM header's @RG line"

platform:
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"

gcs_out_root_dir:
"output files will be copied over there"

qc_metrics_config_json:
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'"

fingerprint_sample_id:
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?"

expected_sex_type:
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"

#########
# outputs
wgs_cov:
"whole genome mean coverage"

nanoplot_summ:
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"

sam_flag_stats:
"SAM flag stats"

fingerprint_check:
"Summary on (human) fingerprint checking results"

contamination_est:
"cross-(human)individual contamination estimation by VerifyBAMID2"

inferred_sex_info:
"Inferred sex concordance information if expected sex type is provided"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

aBAM_metrics_files:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
String gcs_out_root_dir

File uBAM
File? uPBI

String readgroup_id
String bam_SM_field

String platform

# args for optional QC subworkflows
File? qc_metrics_config_json
String? fingerprint_sample_id
String? expected_sex_type

File ref_map_file
String disk_type = "SSD"
}

output {
String last_processing_date = today.yyyy_mm_dd

File aligned_bam = FinalizeAlignedBam.gcs_path
File aligned_bai = FinalizeAlignedBai.gcs_path
File aligned_pbi = FinalizeAlignedPbi.gcs_path

String movie = AlignHiFiUBAM.movie

# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
Float wgs_cov = QCandMetrics.wgs_cov
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats

# fingerprint
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check

# contam
Float? contamination_est = QCandMetrics.contamination_est

# sex concordance
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info

# methyl
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats

# file-based QC/metrics outputs all packed into a finalization map
Map[String, String] aBAM_metrics_files = QCandMetrics.aBAM_metrics_files
}

###################################################################################
# prep work
# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

# String bc_specific_out = outdir + '/' + readgroup_id
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id

###################################################################################
# align
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
uBAM = uBAM,
uPBI = uPBI,
bam_sample_name = bam_SM_field,
ref_map_file = ref_map_file,
application = 'HIFI',
disk_type = disk_type
}

File aBAM = AlignHiFiUBAM.aligned_bam
File aBAI = AlignHiFiUBAM.aligned_bai
File aPBI = AlignHiFiUBAM.aligned_pbi

# save
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }

###################################################################################
# QC
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json]))
call QCMetrics.Work as QCandMetrics { input:
bam = aBAM,
bai = aBAI,

tech = platform,

cov_bed = qcm_config.cov_bed,
cov_bed_descriptor = qcm_config.cov_bed_descriptor,

fingerprint_vcf_store = qcm_config.fingerprint_vcf_store,
fingerprint_sample_id = fingerprint_sample_id,

expected_sex_type = expected_sex_type,

vbid2_config_json = qcm_config.vbid2_config_json,

methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor,
save_methyl_uncalled_reads = qcm_config.save_methyl_uncalled_reads,

ref_map_file = ref_map_file,
disk_type = disk_type,

output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
gcs_out_root_dir = bc_specific_metrics_out,
}

###################################################################################
call GU.GetTodayDate as today {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
version 1.0

import "../../../tasks/Utility/Utils.wdl"
import "../../../tasks/Utility/BAMutils.wdl" as BU
import "../../../tasks/Utility/GeneralUtils.wdl" as GU

import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3

import "../../../tasks/Utility/Finalize.wdl" as FF
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE

workflow ProcessOnInstrumentDemuxedChunkRefFree {

meta {
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
}

parameter_meta {
input_hifi_fq:
"The preexisting Hifi FASTQ; if provided the metrics calculation is faster; if not, you most likely want to set convert_to_fq to true"

readgroup_id:
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"

short_reads_threshold:
"a length threshold below which reads are classified as short"

bam_descriptor:
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"

run_nanoplot:
"if true, will kick off Nanoplot to collect metrics on the uBAM; this isn't necessary if you also run the alignment version to process the uBAM as Nanoplot is run there automatically and produces a superset of metrics"

convert_to_fq:
"if true, input HiFi uBAM will be converted to FASTQ"
hifi_fq:
"available only if convert_to_fq is true."

nanoplot_u_summ:
"A few metrics output by Nanoplot on the uBAM"
seqkit_stats:
"A few metrics output by seqkit stats"

read_len_summaries:
"A few metrics summarizing the read length distribution"
read_len_peaks:
"Peaks of the read length distribution (heruistic)"
read_len_deciles:
"Deciles of the read length distribution"

methyl_tag_simple_stats:
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."

uBAM_metrics_files:
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
}

input {
File uBAM
String readgroup_id
File? input_hifi_fq

String bam_descriptor
Int short_reads_threshold

Boolean run_nanoplot = false
Boolean convert_to_fq = false
Boolean i_donot_want_fq = false

String disk_type = "SSD"

String gcs_out_root_dir
}

output {
String? movie = movie_name

File? converted_hifi_fq = FinalizeFQ.gcs_path

String last_processing_date = today.yyyy_mm_dd

# todo merge these two together
Map[String, Float] seqkit_stats = select_first([SeqKitOnBAM.stats, SeqKitOnFASTQ.stats])
Map[String, Float]? nanoplot_u_summ = NanoPlotFromUBam.stats_map

Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
Array[Int] read_len_deciles = DystPeaker.read_len_deciles

# methylation call rate stats
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats

# file-based outputs all packed into a finalization map
Map[String, String] uBAM_metrics_files = files_out
}

###################################################################################
# prep work
# arg validation
if (defined(input_hifi_fq) == convert_to_fq) {
if (!i_donot_want_fq) {
call Utils.StopWorkflow as MutExProvided { input:
reason = "You most likely want to either convert to FASTQ here, or have a pre-existing FASTQ, but not (miss) both."
}
}
}

# where to store final results
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name

String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
String bc_specific_metrics_out = bc_specific_out + "/metrics"

###################################################################################
# convert to FASTQ if there isn't already one
if (!defined(input_hifi_fq) && !i_donot_want_fq) {
call BU.GetReadGroupInfo as RG { input: bam = uBAM, keys = ['PU']}
String movie_name = RG.read_group_info['PU']

###################################################################################
call BU.BamToFastq { input: bam = uBAM, prefix = "does_not_matter"}
call FF.FinalizeToFile as FinalizeFQ { input:
outdir = bc_specific_out,
file = BamToFastq.reads_fq,
name = readgroup_id + '.hifi.fq.gz'
}
}

###################################################################################
# more QCs and metrics
##########
if (run_nanoplot) {
call QC0.NanoPlotFromUBam { input: uBAM = uBAM }
FinalizationManifestLine a = object
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
is_singleton_file: false,
destination: bc_specific_metrics_out + "/nanoplot",
output_attribute_name: "nanoplot"}
}
##########
call QC1.CountTheBeans as NoMissingBeans { input:
bam=uBAM,
bam_descriptor=bam_descriptor,
gcs_out_root_dir=bc_specific_metrics_out,
use_local_ssd=disk_type=='LOCAL'
}
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
##########
call QC2.DystPeaker { input:
input_file=uBAM, input_is_bam=true,
id=readgroup_id, short_reads_threshold=short_reads_threshold,
gcs_out_root_dir=bc_specific_metrics_out
}
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}

##########
# seqkit stats, detail depends on if FASTQ is available/desired or not
if (i_donot_want_fq) {
call QC3.FASTQstats as SeqKitOnBAM { input: reads = uBAM, file_type = "BAM" }
}
if (!i_donot_want_fq) {
File use_this_hifi_fq = select_first([input_hifi_fq, BamToFastq.reads_fq])
call QC3.FASTQstats as SeqKitOnFASTQ { input: reads=use_this_hifi_fq, file_type='FASTQ' }
}

###################################################################################
if (run_nanoplot) {
call SAVE.SaveFilestoDestination as SaveRest { input:
instructions = select_all([a]),
already_finalized = select_all([methyl_out,
read_len_out])
}
}
if (!run_nanoplot) {
call GU.MergeMaps { input:
one = methyl_out,
two = read_len_out,
}
}
Map[String, String] files_out = select_first([SaveRest.result, MergeMaps.merged])

call GU.GetTodayDate as today {}
}

0 comments on commit 1e26de7

Please sign in to comment.