Skip to content

Commit

Permalink
1. Updated path to MANE transcripts
Browse files Browse the repository at this point in the history
2. Updated Calypso to generate comp het annotations
3. Added script to make comp het tsv
  • Loading branch information
Alistair Ward committed Nov 8, 2024
1 parent fbafd03 commit ec6d86a
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 8 deletions.
41 changes: 38 additions & 3 deletions calypso.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ def main():
print('# Generate tsv files to upload annotations to Mosaic', file = bash_file)
print('GENERATE_TSV=', root_path, '/generate_annotation_tsv.py', sep = '', file = bash_file)
print('GENERATE_HGVS_TSV=', root_path, '/generate_hgvs_annotation_tsv.py', sep = '', file = bash_file)
print('GENERATE_COMP_HET_TSV=', root_path, '/generate_comphet_tsv.py', sep = '', file = bash_file)
print('GENERATE_VARIANT_QUALITY_TSV=', root_path, '/generate_variant_quality_tsv.py', sep = '', file = bash_file)
print('MOSAIC_JSON=', args.mosaic_json, sep = '', file = bash_file)

Expand All @@ -513,6 +514,15 @@ def main():
generate_hgvs_tsv(bash_file, reference)
tsv_files.append('hgvs.tsv')

# Extract compound hets
elif resource == 'compound_heterozygotes':
for annotation in private_annotations:
if private_annotations[annotation]['name'] == 'Comp Het':
uid = annotation
break
generate_comp_het_tsv(bash_file, uid, proband)
tsv_files.append('comp_het.tsv')

# Extract the Variant Quality scores after getting the uid of this annotation
elif resource == 'variant_quality':
for annotation in private_annotations:
Expand Down Expand Up @@ -663,7 +673,7 @@ def main():

# Public annotations are posted to the annotation project, private annotations are posted to the
# case project
if tsv == 'variant_quality.tsv':
if tsv == 'variant_quality.tsv' or tsv == 'comp_het.tsv':
print('-p ', args.project_id, ' ', sep = '', end = '', file = upload_file)
else:
print('-p ', annotation_project_id, ' ', sep = '', end = '', file = upload_file)
Expand Down Expand Up @@ -1067,9 +1077,11 @@ def bash_resources(use_queue, resource_info, working_directory, bash_file, vcf,
# Generate the names of the intermediate and final vcf files
vcf_base = os.path.abspath(vcf).split('/')[-1].rstrip('vcf.gz')
filtered_vcf = str(vcf_base) + '_calypso_filtered.vcf.gz'
comphet_vcf = str(vcf_base) + '_calypso_comphet.vcf.gz'
print('FILEPATH=', working_directory, sep = '', file = bash_file)
print('ANNOTATEDVCF=$FILEPATH/' + str(vcf_base) + '_annotated.vcf.gz', sep = '', file = bash_file)
print('FILTEREDVCF=$FILEPATH/' + str(filtered_vcf), sep = '', file = bash_file)
print('COMPHET_VCF=$FILEPATH/' + str(comphet_vcf), sep = '', file = bash_file)
print('STDOUT=calypso_annotation_pipeline.stdout', file = bash_file)
print('STDERR=calypso_annotation_pipeline.stderr', file = bash_file)

Expand Down Expand Up @@ -1263,9 +1275,10 @@ def filter_vcf(bash_file, samples, proband, resource_info, threads):
# ...or that the CADD_Phred score is over 15...
print(' ("CADD_Phred" in INFO && INFO.CADD_Phred > 15) ||', file = bash_file)

# ...or that the REVEL or MutScores are over 0.1...
# ...or that the REVEL, MutScores, or AlphaMissense are over 0.1...
print(' ("REVEL" in INFO && INFO.REVEL > 0.1) ||', file = bash_file)
print(' ("MutScore" in INFO && INFO.MutScore > 0.1) ||', file = bash_file)
print(' ("AM_SC" in INFO && INFO.AM_SC > 0.1) ||', file = bash_file)

# ...or that the variant has a spliceAI score > 0.1 for any of the acceptor / donor sites.
print(' ("SpliceAI_DS_AG" in INFO && INFO.SpliceAI_DS_AG > 0.1) ||', file = bash_file)
Expand Down Expand Up @@ -1305,16 +1318,26 @@ def filter_vcf(bash_file, samples, proband, resource_info, threads):
print(' ((sample.GQ >= 10 && sample.GQ < 20 && sample.AB >= 0.7) || ', file = bash_file)
print(' (sample.GQ >= 20 && sample.AB >= 0.7 && sample.AB < 0.8))\" \\', file = bash_file)
print(' --sample-expr \"hom_hi_qual: sample.hom_alt && sample.GQ >= 20 && sample.AB >= 0.8\" \\', file = bash_file)
print(' --trio \"comphet_side:comphet_side(kid, mom, dad)" \\', file = bash_file)

# And final compression
print(' --pass-only \\', file = bash_file)
print(' 2>> $STDERR \\', file = bash_file)
print(' | $BCFTOOLS view -O z -o $FILTEREDVCF --threads ', threads, ' - \\', sep = '', file = bash_file)
print(' >> $STDOUT 2>> $STDERR', file = bash_file)
print('echo "complete"', file = bash_file)
print('$BCFTOOLS index -t $FILTEREDVCF', file = bash_file)
print(file = bash_file)

# Extract all comp hets
print('# Extract compound hets', file = bash_file)
print('echo -n "Extracting compound hets..."', file = bash_file)
print('$SLIVAR compound-hets -v $FILTEREDVCF \\', file = bash_file)
print(' --sample-field comphet_side \\', file = bash_file)
print(' --sample-field denovo -p $PED \\', file = bash_file)
print(' | $BCFTOOLS view -O z -o $COMPHET_VCF', file = bash_file)
print('$BCFTOOLS index -t $COMPHET_VCF', file = bash_file)
print('echo "complete"', file = bash_file)
print(file = bash_file)



Expand All @@ -1338,6 +1361,18 @@ def generate_hgvs_tsv(bash_file, reference):
print('-s $TOOLPATH ', file = bash_file)
print('echo "complete"', file = bash_file)

# Call the command line to extract compound hets
def generate_comp_het_tsv(bash_file, uid, proband):
print(file = bash_file)
print('# Resource: Comp Hets', sep = '', file = bash_file)
print('echo -n "Creating tsv file for Comp Hets..."', sep = '', file = bash_file)
print('python3 $GENERATE_COMP_HET_TSV ', end = '', file = bash_file)
print('-i $COMPHET_VCF ', end = '', file = bash_file)
print('-s $TOOLPATH ', end = '', file = bash_file)
print('-o "comp_het.tsv" ', end = '', file = bash_file)
print('-u "', uid, '"', sep = '', file = bash_file)
print('echo "complete"', file = bash_file)

# Call the command to extract the variant quality values
def generate_variant_quality_tsv(bash_file, uid, proband):
print(file = bash_file)
Expand Down
86 changes: 86 additions & 0 deletions generate_comphet_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from datetime import date
from os.path import exists
from sys import path

import argparse
import os
import sys

import read_resource_jsons as read_resources

def main():

# Parse the command line
args = parse_command_line()

# Define the executable bcftools command
if args.tools_directory:
if not args.tools_directory.endswith('/'):
args.tools_directory = str(args.tools_directory) + '/'
bcftools = args.tools_directory + 'bcftools/bcftools'
else:
bcftools = 'bcftools'

# Open an output tsv file to write annotations to
output_file = open(args.output_tsv, 'w')

# Write the header line to the tsv file
print('CHROM\tSTART\tEND\tREF\tALT\t', str(args.uid), sep = '', file = output_file)

# Parse the vcf file, check the annotations and generate a tsv file for upload to Mosaic
process_vcf(bcftools, args.input_vcf, output_file)

# Close the output tsv file
output_file.close()

# Input options
def parse_command_line():
global version
parser = argparse.ArgumentParser(description='Process the command line')

# Required arguments
parser.add_argument('--input_vcf', '-i', required = True, metavar = 'string', help = 'The input vcf file to annotate')
parser.add_argument('--output_tsv', '-o', required = True, metavar = 'string', help = 'The output tsv file')
parser.add_argument('--tools_directory', '-s', required = True, metavar = 'string', help = 'The path to the directory where the tools live')
parser.add_argument('--uid', '-u', required = True, metavar = 'string', help = 'The uid for the private comp het annotation')

return parser.parse_args()

# The majority of annotations can be processed in a standard way. The type is checked and numerical
# annotations are checked to ensure they fall within the required bounds
def process_vcf(bcftools, vcf, output_file):

# Loop over all records in the vcf file
command = bcftools + ' query -f \'%CHROM\\t%POS\\t%END\\t%REF\\t%ALT\\t1\\n\' ' + str(vcf)
for record in os.popen(command).readlines():

# Split the record on tabs
fields = record.rstrip().split('\t')

# Update the chromosome and position
fields[0], fields[2] = updateCoords(fields[0], fields[2])

# Build the output record from the updated fields
print('\t'.join(fields), file = output_file)

# Update the chromosome and position in the tsv file
def updateCoords(chrom, pos):

# Check that the chromosome does not include "chr" prefix
if chrom.startswith('chr'): chrom = chrom.strip('chr')

# Add one to the end position
pos = str(int(pos) + 1)

# Return the updated values
return chrom, pos

# If the script fails, provide an error message and exit
def fail(message):
print(message, sep = "")
exit(1)

# Initialise global variables

if __name__ == "__main__":
main()
10 changes: 5 additions & 5 deletions generate_hgvs_annotation_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def main():

# Parse the command line
args = parseCommandLine()
args = parse_command_line()

# Define the executable bcftools command
if args.tools_directory:
Expand Down Expand Up @@ -47,9 +47,9 @@ def main():
fail('The delimiter field is not provided for resource ' + str(resource) + ' and so its annotation cannot be processed.')

# Get all the MANE transcript ids
maneFile = '/scratch/ucgd/lustre-work/marth/marth-projects/calypso/reannotation/data/GRCh38/reference/MANE.GRCh38.v1.3.ensembl.transcript_ids.txt'
mane = open(maneFile, 'r')
maneIds = []
mane_file = '/scratch/ucgd/lustre-labs/marth/scratch/calypso/data/GRCh38/reference/MANE.GRCh38.v1.3.ensembl.transcript_ids.txt'
mane = open(mane_file, 'r')
maneIds = []
for record in mane.readlines():
maneIds.append(record.rstrip())
mane.close()
Expand Down Expand Up @@ -174,7 +174,7 @@ def main():
outputFile.close()

# Input options
def parseCommandLine():
def parse_command_line():
global version
parser = argparse.ArgumentParser(description='Process the command line')

Expand Down

0 comments on commit ec6d86a

Please sign in to comment.