-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
- Loading branch information
1 parent
d95be34
commit 1e26de7
Showing
3 changed files
with
380 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
182 changes: 182 additions & 0 deletions
182
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
version 1.0 | ||
|
||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN | ||
|
||
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
|
||
workflow ProcessOnInstrumentDemuxedChunk { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics." | ||
} | ||
|
||
parameter_meta { | ||
|
||
######### | ||
# inputs | ||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed" | ||
|
||
bam_SM_field: | ||
"value to place in the SM field of the resulting BAM header's @RG line" | ||
|
||
platform: | ||
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]" | ||
|
||
gcs_out_root_dir: | ||
"output files will be copied over there" | ||
|
||
qc_metrics_config_json: | ||
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'" | ||
|
||
fingerprint_sample_id: | ||
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?" | ||
|
||
expected_sex_type: | ||
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]" | ||
|
||
######### | ||
# outputs | ||
wgs_cov: | ||
"whole genome mean coverage" | ||
|
||
nanoplot_summ: | ||
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)" | ||
|
||
sam_flag_stats: | ||
"SAM flag stats" | ||
|
||
fingerprint_check: | ||
"Summary on (human) fingerprint checking results" | ||
|
||
contamination_est: | ||
"cross-(human)individual contamination estimation by VerifyBAMID2" | ||
|
||
inferred_sex_info: | ||
"Inferred sex concordance information if expected sex type is provided" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
aBAM_metrics_files: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
String gcs_out_root_dir | ||
|
||
File uBAM | ||
File? uPBI | ||
|
||
String readgroup_id | ||
String bam_SM_field | ||
|
||
String platform | ||
|
||
# args for optional QC subworkflows | ||
File? qc_metrics_config_json | ||
String? fingerprint_sample_id | ||
String? expected_sex_type | ||
|
||
File ref_map_file | ||
String disk_type = "SSD" | ||
} | ||
|
||
output { | ||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
File aligned_bam = FinalizeAlignedBam.gcs_path | ||
File aligned_bai = FinalizeAlignedBai.gcs_path | ||
File aligned_pbi = FinalizeAlignedPbi.gcs_path | ||
|
||
String movie = AlignHiFiUBAM.movie | ||
|
||
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs) | ||
Float wgs_cov = QCandMetrics.wgs_cov | ||
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ | ||
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats | ||
|
||
# fingerprint | ||
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check | ||
|
||
# contam | ||
Float? contamination_est = QCandMetrics.contamination_est | ||
|
||
# sex concordance | ||
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info | ||
|
||
# methyl | ||
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats | ||
|
||
# file-based QC/metrics outputs all packed into a finalization map | ||
Map[String, String] aBAM_metrics_files = QCandMetrics.aBAM_metrics_files | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunk" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
# String bc_specific_out = outdir + '/' + readgroup_id | ||
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id | ||
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id | ||
|
||
################################################################################### | ||
# align | ||
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input: | ||
uBAM = uBAM, | ||
uPBI = uPBI, | ||
bam_sample_name = bam_SM_field, | ||
ref_map_file = ref_map_file, | ||
application = 'HIFI', | ||
disk_type = disk_type | ||
} | ||
|
||
File aBAM = AlignHiFiUBAM.aligned_bam | ||
File aBAI = AlignHiFiUBAM.aligned_bai | ||
File aPBI = AlignHiFiUBAM.aligned_pbi | ||
|
||
# save | ||
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' } | ||
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' } | ||
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' } | ||
|
||
################################################################################### | ||
# QC | ||
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json])) | ||
call QCMetrics.Work as QCandMetrics { input: | ||
bam = aBAM, | ||
bai = aBAI, | ||
|
||
tech = platform, | ||
|
||
cov_bed = qcm_config.cov_bed, | ||
cov_bed_descriptor = qcm_config.cov_bed_descriptor, | ||
|
||
fingerprint_vcf_store = qcm_config.fingerprint_vcf_store, | ||
fingerprint_sample_id = fingerprint_sample_id, | ||
|
||
expected_sex_type = expected_sex_type, | ||
|
||
vbid2_config_json = qcm_config.vbid2_config_json, | ||
|
||
methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor, | ||
save_methyl_uncalled_reads = qcm_config.save_methyl_uncalled_reads, | ||
|
||
ref_map_file = ref_map_file, | ||
disk_type = disk_type, | ||
|
||
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell | ||
gcs_out_root_dir = bc_specific_metrics_out, | ||
} | ||
|
||
################################################################################### | ||
call GU.GetTodayDate as today {} | ||
} |
192 changes: 192 additions & 0 deletions
192
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
version 1.0 | ||
|
||
import "../../../tasks/Utility/Utils.wdl" | ||
import "../../../tasks/Utility/BAMutils.wdl" as BU | ||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0 | ||
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1 | ||
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2 | ||
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3 | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE | ||
|
||
workflow ProcessOnInstrumentDemuxedChunkRefFree { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics" | ||
} | ||
|
||
parameter_meta { | ||
input_hifi_fq: | ||
"The preexisting Hifi FASTQ; if provided the metrics calculation is faster; if not, you most likely want to set convert_to_fq to true" | ||
|
||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>" | ||
|
||
short_reads_threshold: | ||
"a length threshold below which reads are classified as short" | ||
|
||
bam_descriptor: | ||
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)" | ||
|
||
run_nanoplot: | ||
"if true, will kick off Nanoplot to collect metrics on the uBAM; this isn't necessary if you also run the alignment version to process the uBAM as Nanoplot is run there automatically and produces a superset of metrics" | ||
|
||
convert_to_fq: | ||
"if true, input HiFi uBAM will be converted to FASTQ" | ||
hifi_fq: | ||
"available only if convert_to_fq is true." | ||
|
||
nanoplot_u_summ: | ||
"A few metrics output by Nanoplot on the uBAM" | ||
seqkit_stats: | ||
"A few metrics output by seqkit stats" | ||
|
||
read_len_summaries: | ||
"A few metrics summarizing the read length distribution" | ||
read_len_peaks: | ||
"Peaks of the read length distribution (heruistic)" | ||
read_len_deciles: | ||
"Deciles of the read length distribution" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
uBAM_metrics_files: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
File uBAM | ||
String readgroup_id | ||
File? input_hifi_fq | ||
|
||
String bam_descriptor | ||
Int short_reads_threshold | ||
|
||
Boolean run_nanoplot = false | ||
Boolean convert_to_fq = false | ||
Boolean i_donot_want_fq = false | ||
|
||
String disk_type = "SSD" | ||
|
||
String gcs_out_root_dir | ||
} | ||
|
||
output { | ||
String? movie = movie_name | ||
|
||
File? converted_hifi_fq = FinalizeFQ.gcs_path | ||
|
||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
# todo merge these two together | ||
Map[String, Float] seqkit_stats = select_first([SeqKitOnBAM.stats, SeqKitOnFASTQ.stats]) | ||
Map[String, Float]? nanoplot_u_summ = NanoPlotFromUBam.stats_map | ||
|
||
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries | ||
Array[Int] read_len_peaks = DystPeaker.read_len_peaks | ||
Array[Int] read_len_deciles = DystPeaker.read_len_deciles | ||
|
||
# methylation call rate stats | ||
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats | ||
|
||
# file-based outputs all packed into a finalization map | ||
Map[String, String] uBAM_metrics_files = files_out | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# arg validation | ||
if (defined(input_hifi_fq) == convert_to_fq) { | ||
if (!i_donot_want_fq) { | ||
call Utils.StopWorkflow as MutExProvided { input: | ||
reason = "You most likely want to either convert to FASTQ here, or have a pre-existing FASTQ, but not (miss) both." | ||
} | ||
} | ||
} | ||
|
||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23) | ||
String bc_specific_out = outdir_ref_free + '/' + readgroup_id | ||
String bc_specific_metrics_out = bc_specific_out + "/metrics" | ||
|
||
################################################################################### | ||
# convert to FASTQ if there isn't already one | ||
if (!defined(input_hifi_fq) && !i_donot_want_fq) { | ||
call BU.GetReadGroupInfo as RG { input: bam = uBAM, keys = ['PU']} | ||
String movie_name = RG.read_group_info['PU'] | ||
|
||
################################################################################### | ||
call BU.BamToFastq { input: bam = uBAM, prefix = "does_not_matter"} | ||
call FF.FinalizeToFile as FinalizeFQ { input: | ||
outdir = bc_specific_out, | ||
file = BamToFastq.reads_fq, | ||
name = readgroup_id + '.hifi.fq.gz' | ||
} | ||
} | ||
|
||
################################################################################### | ||
# more QCs and metrics | ||
########## | ||
if (run_nanoplot) { | ||
call QC0.NanoPlotFromUBam { input: uBAM = uBAM } | ||
FinalizationManifestLine a = object | ||
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]), | ||
is_singleton_file: false, | ||
destination: bc_specific_metrics_out + "/nanoplot", | ||
output_attribute_name: "nanoplot"} | ||
} | ||
########## | ||
call QC1.CountTheBeans as NoMissingBeans { input: | ||
bam=uBAM, | ||
bam_descriptor=bam_descriptor, | ||
gcs_out_root_dir=bc_specific_metrics_out, | ||
use_local_ssd=disk_type=='LOCAL' | ||
} | ||
Map[String, String] methyl_out = {"missing_methyl_tag_reads": | ||
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']} | ||
########## | ||
call QC2.DystPeaker { input: | ||
input_file=uBAM, input_is_bam=true, | ||
id=readgroup_id, short_reads_threshold=short_reads_threshold, | ||
gcs_out_root_dir=bc_specific_metrics_out | ||
} | ||
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist, | ||
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']} | ||
|
||
########## | ||
# seqkit stats, detail depends on if FASTQ is available/desired or not | ||
if (i_donot_want_fq) { | ||
call QC3.FASTQstats as SeqKitOnBAM { input: reads = uBAM, file_type = "BAM" } | ||
} | ||
if (!i_donot_want_fq) { | ||
File use_this_hifi_fq = select_first([input_hifi_fq, BamToFastq.reads_fq]) | ||
call QC3.FASTQstats as SeqKitOnFASTQ { input: reads=use_this_hifi_fq, file_type='FASTQ' } | ||
} | ||
|
||
################################################################################### | ||
if (run_nanoplot) { | ||
call SAVE.SaveFilestoDestination as SaveRest { input: | ||
instructions = select_all([a]), | ||
already_finalized = select_all([methyl_out, | ||
read_len_out]) | ||
} | ||
} | ||
if (!run_nanoplot) { | ||
call GU.MergeMaps { input: | ||
one = methyl_out, | ||
two = read_len_out, | ||
} | ||
} | ||
Map[String, String] files_out = select_first([SaveRest.result, MergeMaps.merged]) | ||
|
||
call GU.GetTodayDate as today {} | ||
} |