-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
- Loading branch information
1 parent
17f712b
commit 295be09
Showing
3 changed files
with
336 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
180 changes: 180 additions & 0 deletions
180
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
version 1.0 | ||
|
||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN | ||
|
||
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
|
||
workflow ProcessOnInstrumentDemuxedChunk { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics." | ||
} | ||
|
||
parameter_meta { | ||
|
||
######### | ||
# inputs | ||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed" | ||
|
||
bam_SM_field: | ||
"value to place in the SM field of the resulting BAM header's @RG line" | ||
|
||
platform: | ||
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]" | ||
|
||
gcs_out_root_dir: | ||
"output files will be copied over there" | ||
|
||
qc_metrics_config_json: | ||
"A config json to for running the QC and metrics-collection sub-workflow 'AlignedBamQCandMetrics'" | ||
|
||
fingerprint_sample_id: | ||
"For fingerprint verification: the ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{fingerprint_sample_id}*.vcf(.gz)?" | ||
|
||
expected_sex_type: | ||
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]" | ||
|
||
######### | ||
# outputs | ||
wgs_cov: | ||
"whole genome mean coverage" | ||
|
||
nanoplot_summ: | ||
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)" | ||
|
||
sam_flag_stats: | ||
"SAM flag stats" | ||
|
||
fingerprint_check: | ||
"Summary on (human) fingerprint checking results" | ||
|
||
contamination_est: | ||
"cross-(human)individual contamination estimation by VerifyBAMID2" | ||
|
||
inferred_sex_info: | ||
"Inferred sex concordance information if expected sex type is provided" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
aBAM_metrics_files_out: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
String gcs_out_root_dir | ||
|
||
File uBAM | ||
File? uPBI | ||
|
||
String readgroup_id | ||
String bam_SM_field | ||
|
||
String platform | ||
|
||
# args for optional QC subworkflows | ||
File? qc_metrics_config_json | ||
String? fingerprint_sample_id | ||
String? expected_sex_type | ||
|
||
File ref_map_file | ||
String disk_type = "SSD" | ||
} | ||
|
||
output { | ||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
File aligned_bam = FinalizeAlignedBam.gcs_path | ||
File aligned_bai = FinalizeAlignedBai.gcs_path | ||
File aligned_pbi = FinalizeAlignedPbi.gcs_path | ||
|
||
String movie = AlignHiFiUBAM.movie | ||
|
||
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs) | ||
Float wgs_cov = QCandMetrics.wgs_cov | ||
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ | ||
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats | ||
|
||
# fingerprint | ||
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check | ||
|
||
# contam | ||
Float? contamination_est = QCandMetrics.contamination_est | ||
|
||
# sex concordance | ||
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info | ||
|
||
# methyl | ||
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats | ||
|
||
# file-based QC/metrics outputs all packed into a finalization map | ||
Map[String, String] aBAM_metrics_files_out = QCandMetrics.aBAM_metrics_files_out | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunk" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
# String bc_specific_out = outdir + '/' + readgroup_id | ||
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id | ||
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id | ||
|
||
################################################################################### | ||
# align | ||
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input: | ||
uBAM = uBAM, | ||
uPBI = uPBI, | ||
bam_sample_name = bam_SM_field, | ||
ref_map_file = ref_map_file, | ||
application = 'HIFI' | ||
} | ||
|
||
File aBAM = AlignHiFiUBAM.aligned_bam | ||
File aBAI = AlignHiFiUBAM.aligned_bai | ||
File aPBI = AlignHiFiUBAM.aligned_pbi | ||
|
||
# save | ||
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' } | ||
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' } | ||
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' } | ||
|
||
################################################################################### | ||
# QC | ||
AlignedBamQCnMetricsConfig qcm_config = read_json(select_first([qc_metrics_config_json])) | ||
call QCMetrics.Work as QCandMetrics { input: | ||
bam = aBAM, | ||
bai = aBAI, | ||
|
||
tech = platform, | ||
|
||
cov_bed = qcm_config.cov_bed, | ||
cov_bed_descriptor = qcm_config.cov_bed_descriptor, | ||
|
||
fingerprint_vcf_store = qcm_config.fingerprint_vcf_store, | ||
fingerprint_sample_id = fingerprint_sample_id, | ||
|
||
expected_sex_type = expected_sex_type, | ||
|
||
vbid2_config_json = qcm_config.vbid2_config_json, | ||
|
||
methyl_tag_check_bam_descriptor = qcm_config.methyl_tag_check_bam_descriptor, | ||
|
||
ref_map_file = ref_map_file, | ||
disk_type = disk_type, | ||
|
||
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell | ||
gcs_out_root_dir = bc_specific_metrics_out, | ||
} | ||
|
||
################################################################################### | ||
call GU.GetTodayDate as today {} | ||
} |
150 changes: 150 additions & 0 deletions
150
wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
version 1.0 | ||
|
||
|
||
import "../../../tasks/Utility/BAMutils.wdl" as BU | ||
import "../../../tasks/Utility/GeneralUtils.wdl" as GU | ||
|
||
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0 | ||
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1 | ||
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2 | ||
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3 | ||
|
||
import "../../../tasks/Utility/Finalize.wdl" as FF | ||
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE | ||
|
||
workflow ProcessOnInstrumentDemuxedChunkRefFree { | ||
|
||
meta { | ||
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics" | ||
} | ||
|
||
parameter_meta { | ||
readgroup_id: | ||
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>" | ||
|
||
short_reads_threshold: | ||
"a length threshold below which reads are classified as short" | ||
|
||
bam_descriptor: | ||
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)" | ||
|
||
hifi_stats: | ||
"A few metrics output by Nanoplot" | ||
hifi_fq_stats: | ||
"A few metrics output by seqkit stats" | ||
|
||
read_len_summaries: | ||
"A few metrics summarizing the read length distribution" | ||
read_len_peaks: | ||
"Peaks of the read length distribution (heruistic)" | ||
read_len_deciles: | ||
"Deciles of the read length distribution" | ||
|
||
methyl_tag_simple_stats: | ||
"Simple stats on the reads with & without SAM methylation tags (MM/ML)." | ||
|
||
uBAM_metrics_files_out: | ||
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks" | ||
} | ||
|
||
input { | ||
File uBAM | ||
String readgroup_id | ||
|
||
String bam_descriptor | ||
Int short_reads_threshold | ||
|
||
String disk_type = "SSD" | ||
|
||
String gcs_out_root_dir | ||
} | ||
|
||
output { | ||
String movie = movie_name | ||
|
||
File hifi_fq = FinalizeFQ.gcs_path | ||
|
||
String last_processing_date = today.yyyy_mm_dd | ||
|
||
# todo merge these two together | ||
Map[String, Float] hifi_stats = NanoPlotFromUBam.stats_map | ||
Map[String, Float] hifi_fq_stats = FASTQstats.stats | ||
|
||
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries | ||
Array[Int] read_len_peaks = DystPeaker.read_len_peaks | ||
Array[Int] read_len_deciles = DystPeaker.read_len_deciles | ||
|
||
# methylation call rate stats | ||
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats | ||
|
||
# file-based outputs all packed into a finalization map | ||
Map[String, String] uBAM_metrics_files_out = FF.result | ||
} | ||
|
||
################################################################################### | ||
# prep work | ||
# where to store final results | ||
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree" | ||
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name | ||
|
||
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23) | ||
String bc_specific_out = outdir_ref_free + '/' + readgroup_id | ||
String bc_specific_metrics_out = bc_specific_out + "/metrics" | ||
|
||
################################################################################### | ||
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']} | ||
String movie_name = RG.read_group_info['PU'] | ||
|
||
################################################################################### | ||
# convert to FASTQ (most for assembly) | ||
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"} | ||
call FF.FinalizeToFile as FinalizeFQ { input: | ||
outdir = bc_specific_out, | ||
file = BamToFastq.reads_fq, | ||
name = readgroup_id + '.hifi.fq.gz' | ||
} | ||
|
||
################################################################################### | ||
# more QCs and metrics | ||
########## | ||
call QC0.NanoPlotFromUBam {input: uBAM = uBAM} | ||
FinalizationManifestLine a = object | ||
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]), | ||
is_singleton_file: false, | ||
destination: bc_specific_metrics_out + "/nanoplot", | ||
output_attribute_name: "nanoplot"} | ||
########## | ||
call QC1.CountTheBeans as NoMissingBeans { input: | ||
bam=uBAM, | ||
bam_descriptor=bam_descriptor, | ||
gcs_out_root_dir=bc_specific_metrics_out, | ||
use_local_ssd=disk_type=='LOCAL' | ||
} | ||
Map[String, String] methyl_out = {"missing_methyl_tag_reads": | ||
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']} | ||
########## | ||
call QC2.DystPeaker { input: | ||
input_file=uBAM, input_is_bam=true, | ||
id=readgroup_id, short_reads_threshold=short_reads_threshold, | ||
gcs_out_root_dir=bc_specific_metrics_out | ||
} | ||
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist, | ||
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']} | ||
|
||
########## | ||
call QC3.FASTQstats { input: | ||
reads=BamToFastq.reads_fq, | ||
file_type='FASTQ' | ||
} | ||
|
||
################################################################################### | ||
call SAVE.SaveFilestoDestination as FF { input: | ||
instructions = select_all([a]), | ||
already_finalized = select_all([methyl_out, | ||
read_len_out]) | ||
} | ||
|
||
call GU.GetTodayDate as today {} | ||
} |