|
| 1 | +version 1.0 |
| 2 | + |
| 3 | +import "DeduplicateAndResetONTAlignedBam.wdl" as FixAndReset |
| 4 | + |
| 5 | +import "../../TechAgnostic/Utility/SplitBamByReadgroup.wdl" as Major |
| 6 | + |
| 7 | +import "../../../tasks/Utility/Utils.wdl" |
| 8 | +import "../../../tasks/Utility/BAMutils.wdl" as BU |
| 9 | +import "../../../tasks/Utility/ONTUtils.wdl" as OU |
| 10 | + |
| 11 | +workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup { |
| 12 | + # Checks that the input BAM is coordinate-sorted, optionally repairs its header, then calls Dedup, WORKHORSE (split by read group) and GetBasecallModel. |
| 13 | + meta { |
| 14 | + description: "Remove duplicate records from an ONT aligned BAM, drop alignment information, and split the BAM by read group." |
| 15 | + } |
| 16 | + parameter_meta { |
| 17 | + fix_bam_header: "Sometimes, the bam given to us contains a specific error mode. We fix it here." |
| 18 | + scatter_scheme: "A txt file holding how to scatter the WGS bam. Example (this example size-balance among the shards): ...\nchr5,chr19\nchr6,chrY,chrM\n..." |
| 19 | + } |
| 20 | + |
| 21 | + input { |
| 22 | + File input_bam # must be coordinate-sorted; StopWorkflow is called otherwise |
| 23 | + File? input_bai # forwarded as-is to Dedup as aligned_bai |
| 24 | + |
| 25 | + Boolean fix_bam_header |
| 26 | + File scatter_scheme |
| 27 | + |
| 28 | + String gcs_out_root_dir # forwarded to the SplitBamByReadgroup sub-workflow |
| 29 | + } |
| 30 | + |
| 31 | + output { |
| 32 | + Map[String, String]? runid_2_ont_basecall_model = GetBasecallModel.runid_2_model |
| 33 | + |
| 34 | + Map[String, String] rgid_2_bam = WORKHORSE.rgid_2_bam |
| 35 | + Map[String, String] rgid_2_PU = WORKHORSE.rgid_2_PU |
| 36 | + Map[String, String]? rgid_2_ubam_emptyness = WORKHORSE.rgid_2_ubam_emptyness |
| 37 | + Boolean rgid_2_bam_are_aligned = WORKHORSE.rgid_2_bam_are_aligned |
| 38 | + |
| 39 | + String last_processing_date = WORKHORSE.last_processing_date |
| 40 | + } |
| 41 | + |
| 42 | + ############################################################################################################################## |
| 43 | + # input file validation and fixing |
| 44 | + call BU.GatherBamMetadata { |
| 45 | + input: bam = input_bam |
| 46 | + } |
| 47 | + if ('coordinate' != GatherBamMetadata.sort_order) { |
| 48 | + call Utils.StopWorkflow { input: reason = "Input bam isn't coordinate-sorted, but rather sorted by ~{GatherBamMetadata.sort_order}" } |
| 49 | + } |
| 50 | + |
| 51 | + # reality of life: submitted files sometimes need fixes in their headers |
| 52 | + if (fix_bam_header) { |
| 53 | + call FixParticularBamHeaderIssue { input: bam = input_bam } |
| 54 | + } |
| 55 | + # NOTE(review): input_bai was built for the original BAM; confirm Dedup can still use it when the header-fixed BAM is substituted below. |
| 56 | + call FixAndReset.DeduplicateAndResetONTAlignedBam as Dedup { input: |
| 57 | + aligned_bam = select_first([FixParticularBamHeaderIssue.fixed, input_bam]), aligned_bai = input_bai, scatter_scheme = scatter_scheme |
| 58 | + } |
| 59 | + |
| 60 | + File ok_input_bam = Dedup.result |
| 61 | + |
| 62 | + ############################################################################################################################## |
| 63 | + # delegate |
| 64 | + call Major.SplitBamByReadgroup as WORKHORSE { |
| 65 | + input: |
| 66 | + input_bam = ok_input_bam, |
| 67 | + |
| 68 | + unmap_bam = false, # already done above |
| 69 | + convert_to_fq = false, # no need for ONT data, usually |
| 70 | + |
| 71 | + validate_output_bams = true, |
| 72 | + |
| 73 | + gcs_out_root_dir = gcs_out_root_dir, |
| 74 | + debug_mode = false |
| 75 | + } |
| 76 | + |
| 77 | + call OU.GetBasecallModel { input: bam = ok_input_bam } |
| 78 | +} |
| 79 | + |
| 80 | +task FixParticularBamHeaderIssue { |
| 81 | + meta { |
| 82 | + description: "Someone submitted to us BAMs with irregular headers. Fix that here." |
| 83 | + } |
| 84 | + parameter_meta { |
| 85 | + bam: "BAM whose header carries a stray '@RG<TAB>SM:<sample>' line; that line is removed and its SM field is appended to every remaining @RG line." |
| 86 | + } |
| 87 | + input { |
| 88 | + File bam |
| 89 | + } |
| 90 | + output { |
| 91 | + File fixed = "~{prefix}.rg.fixed.bam" |
| 92 | + } |
| 93 | + |
| 94 | + Int disk_size = 100 + 2 * ceil(size(bam, 'GiB')) # fixed 100 GiB plus twice the input BAM size |
| 95 | + |
| 96 | + String prefix = basename(bam, '.bam') |
| 97 | + |
| 98 | + command <<< |
| 99 | + set -euxo pipefail |
| 100 | + |
| 101 | + samtools view -H '~{bam}' > original.header.txt |
| 102 | + cat original.header.txt |
| 103 | + |
| 104 | + # strip away the unwanted @RG line |
| 105 | + grep -vE "^@RG[[:space:]]SM:" original.header.txt > to.add.sample.name.header.txt |
| 106 | + diff original.header.txt to.add.sample.name.header.txt || true |
| 107 | + |
| 108 | + # recover the SM field from the stripped line(s); '|| true' defers the failure to the explicit check right below |
| 109 | + formatted_sample_name=$(grep -E "^@RG[[:space:]]SM:" original.header.txt | tr '\t' '\n' | grep "^SM:" | sort -u || true) |
| 110 | + n_names=$(printf '%s\n' "${formatted_sample_name}" | grep -c "^SM:" || true) |
| 111 | + if [[ "${n_names}" -ne 1 ]]; then |
| 112 | + echo "Expected exactly one distinct sample name on the stray @RG line(s), found ${n_names}." >&2 |
| 113 | + exit 1 |
| 114 | + fi |
| 115 | + |
| 116 | + # append the SM field to every remaining @RG line; it is passed through the environment so awk takes it literally |
| 117 | + # (characters such as '/', '&' or a backslash in the sample name would be special inside a sed replacement) |
| 118 | + SM_FIELD="${formatted_sample_name}" awk '/^@RG/ { print $0 "\t" ENVIRON["SM_FIELD"]; next } { print }' to.add.sample.name.header.txt > fixed.header.txt |
| 119 | + cat fixed.header.txt |
| 120 | + |
| 121 | + samtools reheader fixed.header.txt '~{bam}' > '~{prefix}.rg.fixed.bam' |
| 122 | + >>> |
| 123 | + |
| 124 | + runtime { |
| 125 | + cpu: 1 |
| 126 | + memory: "4 GiB" |
| 127 | + disks: "local-disk ~{disk_size} LOCAL" |
| 128 | + preemptible: 2 |
| 129 | + maxRetries: 1 |
| 130 | + docker: "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3" |
| 131 | + } |
| 132 | +} |
0 commit comments