1. Updated path to MANE transcripts

2. Updated Calypso to generate comp het annotations 3. Added script to make comp het tsv
iobio · Nov 8, 2024 · ec6d86a · ec6d86a
1 parent fbafd03
commit ec6d86a
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 8 deletions.
diff --git a/calypso.py b/calypso.py
@@ -500,6 +500,7 @@ def main():
   print('# Generate tsv files to upload annotations to Mosaic', file = bash_file)
   print('GENERATE_TSV=', root_path, '/generate_annotation_tsv.py', sep = '', file = bash_file)
   print('GENERATE_HGVS_TSV=', root_path, '/generate_hgvs_annotation_tsv.py', sep = '', file = bash_file)
+  print('GENERATE_COMP_HET_TSV=', root_path, '/generate_comphet_tsv.py', sep = '', file = bash_file)
   print('GENERATE_VARIANT_QUALITY_TSV=', root_path, '/generate_variant_quality_tsv.py', sep = '', file = bash_file)
   print('MOSAIC_JSON=', args.mosaic_json, sep = '', file = bash_file)
 
@@ -513,6 +514,15 @@ def main():
       generate_hgvs_tsv(bash_file, reference)
       tsv_files.append('hgvs.tsv')
 
+    # Extract compound hets
+    elif resource == 'compound_heterozygotes':
+      for annotation in private_annotations:
+        if private_annotations[annotation]['name'] == 'Comp Het':
+          uid = annotation
+          break
+      generate_comp_het_tsv(bash_file, uid, proband)
+      tsv_files.append('comp_het.tsv')
+
     # Extract the Variant Quality scores after getting the uid of this annotation
     elif resource == 'variant_quality':
       for annotation in private_annotations:
@@ -663,7 +673,7 @@ def main():
 
     # Public annotations are posted to the annotation project, private annotations are posted to the
     # case project
-    if tsv == 'variant_quality.tsv':
+    if tsv == 'variant_quality.tsv' or tsv == 'comp_het.tsv':
       print('-p ', args.project_id, ' ', sep = '', end = '', file = upload_file)
     else:
       print('-p ', annotation_project_id, ' ', sep = '', end = '', file = upload_file)
@@ -1067,9 +1077,11 @@ def bash_resources(use_queue, resource_info, working_directory, bash_file, vcf,
   # Generate the names of the intermediate and final vcf files
   vcf_base = os.path.abspath(vcf).split('/')[-1].rstrip('vcf.gz')
   filtered_vcf = str(vcf_base) + '_calypso_filtered.vcf.gz'
+  comphet_vcf = str(vcf_base) + '_calypso_comphet.vcf.gz'
   print('FILEPATH=', working_directory, sep = '', file = bash_file)
   print('ANNOTATEDVCF=$FILEPATH/' + str(vcf_base) + '_annotated.vcf.gz', sep = '', file = bash_file)
   print('FILTEREDVCF=$FILEPATH/' + str(filtered_vcf), sep = '', file = bash_file)
+  print('COMPHET_VCF=$FILEPATH/' + str(comphet_vcf), sep = '', file = bash_file)
   print('STDOUT=calypso_annotation_pipeline.stdout', file = bash_file)
   print('STDERR=calypso_annotation_pipeline.stderr', file = bash_file)
 
@@ -1263,9 +1275,10 @@ def filter_vcf(bash_file, samples, proband, resource_info, threads):
   # ...or that the CADD_Phred score is over 15...
   print('  ("CADD_Phred" in INFO && INFO.CADD_Phred > 15) ||', file = bash_file)
 
-  # ...or that the REVEL or MutScores are over 0.1...
+  # ...or that the REVEL, MutScores, or AlphaMissense are over 0.1...
   print('  ("REVEL" in INFO && INFO.REVEL > 0.1) ||', file = bash_file)
   print('  ("MutScore" in INFO && INFO.MutScore > 0.1) ||', file = bash_file)
+  print('  ("AM_SC" in INFO && INFO.AM_SC > 0.1) ||', file = bash_file)
 
   # ...or that the variant has a spliceAI score > 0.1 for any of the acceptor / donor sites.
   print('  ("SpliceAI_DS_AG" in INFO && INFO.SpliceAI_DS_AG > 0.1) ||', file = bash_file)
@@ -1305,16 +1318,26 @@ def filter_vcf(bash_file, samples, proband, resource_info, threads):
   print('    ((sample.GQ >= 10 && sample.GQ < 20 && sample.AB >= 0.7) || ', file = bash_file)
   print('    (sample.GQ >= 20 && sample.AB >= 0.7 && sample.AB < 0.8))\" \\', file = bash_file)
   print('  --sample-expr \"hom_hi_qual: sample.hom_alt && sample.GQ >= 20 && sample.AB >= 0.8\" \\', file = bash_file)
+  print('  --trio \"comphet_side:comphet_side(kid, mom, dad)" \\', file = bash_file)
 
   # And final compression
   print('  --pass-only \\', file = bash_file)
   print('  2>> $STDERR \\', file = bash_file)
   print('  | $BCFTOOLS view -O z -o $FILTEREDVCF --threads ', threads, ' - \\', sep = '', file = bash_file)
   print('  >> $STDOUT 2>> $STDERR', file = bash_file)
-  print('echo "complete"', file = bash_file)
   print('$BCFTOOLS index -t $FILTEREDVCF', file = bash_file)
   print(file = bash_file)
 
+  # Extract all comp hets
+  print('# Extract compound hets', file = bash_file)
+  print('echo -n "Extracting compound hets..."', file = bash_file)
+  print('$SLIVAR compound-hets -v $FILTEREDVCF \\', file = bash_file)
+  print('  --sample-field comphet_side \\', file = bash_file)
+  print('  --sample-field denovo -p $PED \\', file = bash_file)
+  print('  | $BCFTOOLS view -O z -o $COMPHET_VCF', file = bash_file)
+  print('$BCFTOOLS index -t $COMPHET_VCF', file = bash_file)
+  print('echo "complete"', file = bash_file)
+  print(file = bash_file)
 
 
 
@@ -1338,6 +1361,18 @@ def generate_hgvs_tsv(bash_file, reference):
   print('-s $TOOLPATH ', file = bash_file)
   print('echo "complete"', file = bash_file)
 
+# Call the command line to extract compound hets
+def generate_comp_het_tsv(bash_file, uid, proband):
+  print(file = bash_file)
+  print('# Resource: Comp Hets', sep = '', file = bash_file)
+  print('echo -n "Creating tsv file for Comp Hets..."', sep = '', file = bash_file)
+  print('python3 $GENERATE_COMP_HET_TSV ', end = '', file = bash_file)
+  print('-i $COMPHET_VCF ', end = '', file = bash_file)
+  print('-s $TOOLPATH ', end = '', file = bash_file)
+  print('-o "comp_het.tsv" ', end = '', file = bash_file)
+  print('-u "', uid, '"', sep = '', file = bash_file)
+  print('echo "complete"', file = bash_file)
+
 # Call the command to extract the variant quality values
 def generate_variant_quality_tsv(bash_file, uid, proband):
   print(file = bash_file)

diff --git a/generate_comphet_tsv.py b/generate_comphet_tsv.py
@@ -0,0 +1,86 @@
+from datetime import date
+from os.path import exists
+from sys import path
+
+import argparse
+import os
+import sys
+
+import read_resource_jsons as read_resources
+
+def main():
+
+  # Parse the command line
+  args = parse_command_line()
+
+  # Define the executable bcftools command
+  if args.tools_directory:
+    if not args.tools_directory.endswith('/'):
+      args.tools_directory = str(args.tools_directory) + '/'
+    bcftools = args.tools_directory + 'bcftools/bcftools'
+  else:
+    bcftools = 'bcftools'
+
+  # Open an output tsv file to write annotations to
+  output_file = open(args.output_tsv, 'w')
+
+  # Write the header line to the tsv file
+  print('CHROM\tSTART\tEND\tREF\tALT\t', str(args.uid), sep = '', file = output_file)
+
+  # Parse the vcf file, check the annotations and generate a tsv file for upload to Mosaic
+  process_vcf(bcftools, args.input_vcf, output_file)
+
+  # Close the output tsv file
+  output_file.close()
+
+# Input options
+def parse_command_line():
+  global version
+  parser = argparse.ArgumentParser(description='Process the command line')
+
+  # Required arguments
+  parser.add_argument('--input_vcf', '-i', required = True, metavar = 'string', help = 'The input vcf file to annotate')
+  parser.add_argument('--output_tsv', '-o', required = True, metavar = 'string', help = 'The output tsv file')
+  parser.add_argument('--tools_directory', '-s', required = True, metavar = 'string', help = 'The path to the directory where the tools live')
+  parser.add_argument('--uid', '-u', required = True, metavar = 'string', help = 'The uid for the private comp het annotation')
+
+  return parser.parse_args()
+
+# The majority of annotations can be processed in a standard way. The type is checked and numerical
+# annotations are checked to ensure they fall within the required bounds
+def process_vcf(bcftools, vcf, output_file):
+
+  # Loop over all records in the vcf file
+  command = bcftools + ' query -f \'%CHROM\\t%POS\\t%END\\t%REF\\t%ALT\\t1\\n\' ' + str(vcf)
+  for record in os.popen(command).readlines():
+
+    # Split the record on tabs
+    fields = record.rstrip().split('\t')
+
+    # Update the chromosome and position
+    fields[0], fields[2] = updateCoords(fields[0], fields[2])
+
+    # Build the output record from the updated fields
+    print('\t'.join(fields), file = output_file)
+
+# Update the chromosome and position in the tsv file
+def updateCoords(chrom, pos):
+
+  # Check that the chromosome does not include "chr" prefix
+  if chrom.startswith('chr'): chrom = chrom.strip('chr')
+
+  # Add one to the end position
+  pos = str(int(pos) + 1)
+
+  # Return the updated values
+  return chrom, pos
+
+# If the script fails, provide an error message and exit
+def fail(message):
+  print(message, sep = "")
+  exit(1)
+
+# Initialise global variables
+
+if __name__ == "__main__":
+  main()
diff --git a/generate_hgvs_annotation_tsv.py b/generate_hgvs_annotation_tsv.py
@@ -12,7 +12,7 @@
 def main():
 
   # Parse the command line
-  args = parseCommandLine()
+  args = parse_command_line()
 
   # Define the executable bcftools command
   if args.tools_directory:
@@ -47,9 +47,9 @@ def main():
     fail('The delimiter field is not provided for resource ' + str(resource) + ' and so its annotation cannot be processed.')
 
   # Get all the MANE transcript ids
-  maneFile = '/scratch/ucgd/lustre-work/marth/marth-projects/calypso/reannotation/data/GRCh38/reference/MANE.GRCh38.v1.3.ensembl.transcript_ids.txt'
-  mane     = open(maneFile, 'r')
-  maneIds  = []
+  mane_file = '/scratch/ucgd/lustre-labs/marth/scratch/calypso/data/GRCh38/reference/MANE.GRCh38.v1.3.ensembl.transcript_ids.txt'
+  mane = open(mane_file, 'r')
+  maneIds = []
   for record in mane.readlines():
     maneIds.append(record.rstrip())
   mane.close()
@@ -174,7 +174,7 @@ def main():
   outputFile.close()
 
 # Input options
-def parseCommandLine():
+def parse_command_line():
   global version
   parser = argparse.ArgumentParser(description='Process the command line')