-
Notifications
You must be signed in to change notification settings - Fork 0
/
calypso.py
1778 lines (1463 loc) · 82.9 KB
/
calypso.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import json
import os
import subprocess
import sys
from datetime import date
from os.path import exists
from pprint import pprint
from subprocess import Popen
from sys import path
import read_resource_jsons as read_resources
def main():
print()
print('Starting the Calypso pipeline')
# Parse the command line
args = parse_command_line()
# Check the supplied parameters are as expected, then expand the path to enable scripts and api commands
# to be accessed for Calypso
root_path = os.path.dirname(__file__)
checkArguments(args, root_path)
# Import the api client
path.append(args.api_client)
from mosaic import Mosaic, Project, Store
api_store = Store(config_file = args.client_config)
api_mosaic = Mosaic(config_file = args.client_config)
# Open an api client project object for the defined project
project = api_mosaic.get_project(args.project_id)
# Get the project settings and define the reference genome
project_settings = project.get_project_settings()
reference = project_settings['reference']
if reference not in allowed_references:
fail('The specified reference (' + str(reference) + ') is not recognised. Allowed values are: ' + str(', '.join(allowed_references)))
print('Using the reference: ', reference, sep = '')
# Read the resources json file to identify all the resources that will be used in the annotation pipeline
if args.resource_json:
resource_info = check_resources(reference, args.data_directory, args.tools_directory, args.resource_json)
else:
args.resource_json = str(args.data_directory) + 'resources_' + str(reference) + '.json'
resource_info = check_resources(reference, args.data_directory, args.tools_directory, args.resource_json)
print('Using resources file: ', args.resource_json, sep = '')
resource_info = read_resources.read_resources(reference, root_path, resource_info, args.no_vep)
# If threads were not set, default to 4
args.threads = 4 if not args.threads else args.threads
# Define the tools to be used by Calypso
resource_info = calypso_tools(resource_info)
# Define paths to be used by Calypso
set_working_directory(resource_info['version'])
# Get the samples in the Mosaic projects and determine the id of the proband, and the family structure
mosaic_samples = {}
mosaic_sample_ids = {}
proband = False
for sample in project.get_samples():
relation = False
sex = False
affected_status = False
for attribute in project.get_attributes_for_sample(sample['id']):
if attribute['name'] == 'Relation':
for value in attribute['values']:
if value['sample_id'] == sample['id']:
relation = value['value']
break
# Get the sample sex
elif attribute['uid'] == 'sex':
for value in attribute['values']:
if value['sample_id'] == sample['id']:
sex = value['value']
break
# Get the samples affectation status
elif attribute['uid'] == 'affected_status':
for value in attribute['values']:
if value['sample_id'] == sample['id']:
affected_status = value['value']
break
# Fail if this sample has no Relation set, and store the sample name if this is the proband
if not relation:
fail('The Relation attribute is not set for sample ' + sample['name'])
elif relation == 'Proband':
proband = sample['name']
if not sex:
fail('The Sex attribute is not set for sample ' + sample['name'])
if not affected_status:
fail('The Affected Status attribute is not set for sample ' + sample['name'])
# Add this sample to the list of samples
mosaic_samples[sample['name']] = {'id': sample['id'], 'relation': relation, 'sex': sex, 'affected_status': affected_status}
mosaic_sample_ids[sample['id']] = sample['name']
# Fail if no proband is defined
if not proband:
fail('Could not find a proband for this project')
# Get the vcf file for each sample. Currently, Calypso only works for projects with a single multi-sample vcf.
# Determine the name of this vcf file, then determine if this file has chromosomes listed as e.g. '1' or 'chr1'.
# If a vcf was supplied on the command line, use this. This is for occasions where the vcf file has not been
# linked to samples in Mosaic
if args.input_vcf:
vcf = os.path.abspath(args.input_vcf)
# Check the file exists
if not exists(vcf):
fail('Input vcf file does not exist')
# Check that the index file also exists
index_file = vcf + '.tbi'
if not exists(index_file):
fail('The vcf index file does not exist')
# Get the vcf header
header = os.popen(str(resource_info['tools']['bcftools']) + ' view -h ' + str(vcf)).read()
for line in header.split('\n'):
if line.startswith('#CHROM'):
vcf_samples = line.rstrip().split('\t')[9:]
break
# Loop over the samples in the vcf file and find the ones in the mosaic_samples list
for vcf_sample in vcf_samples:
# This is a hack for UDN where the sample name has the experiment id appended
vcf_sample_name = vcf_sample
if args.udn:
if '-' in vcf_sample:
vcf_sample = vcf_sample.split('-')[0]
mosaic_samples[vcf_sample]['vcf_file'] = vcf
mosaic_samples[vcf_sample]['vcf_sample_name'] = vcf_sample_name
else:
if vcf_sample in mosaic_samples:
mosaic_samples[vcf_sample]['vcf_file'] = vcf
mosaic_samples[vcf_sample]['vcf_sample_name'] = vcf_sample
print(' Sample ', vcf_sample, ' appears as "', vcf_sample_name, '" in the header of vcf file: ', vcf, sep = '')
# Check that all samples have been associated with a vcf file
samples_with_no_vcf = []
for sample in mosaic_samples:
if 'vcf_file' not in mosaic_samples[sample]:
samples_with_no_vcf.append(sample)
if samples_with_no_vcf:
fail('\nERROR: The following samples do not appear in the vcf header:\n ' + '\n '.join(samples_with_no_vcf))
# If the vcf wasn't specified on the command line, get it from the Mosaic project
else:
all_vcfs = []
# Loop over all the samples
for sample in mosaic_samples:
vcf_files = {}
sample_id = mosaic_samples[sample]['id']
for sample_file in project.get_sample_files(sample_id):
if sample_file['type'] == 'vcf':
vcf_files[sample_file['id']] = {'name': sample_file['name'], 'uri': sample_file['uri'], 'vcf_sample_name': sample_file['vcf_sample_name']}
# If there are no vcf files for this sample output a warning, but continue, unless this is the proband
if len(vcf_files) == 0:
if mosaic_samples[sample]['relation'] == 'Proband':
fail('calypso_vcf_files: The proband (' + sample + ') is not associated with any vcf files. The proband must appear in a vcf file (a vcf file can be specified on the command line using the --input_vcf (-i) argument)')
else:
print(' WARNING: Sample ' + sample + ', listed as having the relationship ' + mosaic_samples[sample]['relation'] + ' is not associated with a vcf and so will contribute no variants')
mosaic_samples[sample]['vcf_file'] = False
mosaic_samples[sample]['vcf_sample_name'] = False
# If there is more than one vcf file for the sample, fail
elif len(vcf_files) > 1:
vcf_file_names = []
for vcf_file_id in vcf_files:
vcf_file_names.append(vcf_files[vcf_file_id]['name'])
print()
fail('ERROR: Multiple vcf files for the same sample (' + sample + ', id: ' + str(sample_id) + '). Calypso requires a single multi sample vcf. The following vcf files were found:\n ' + '\n '.join(vcf_file_names))
else:
for vcf_file in vcf_files:
uri = vcf_files[vcf_file]['uri']
# If the uri begins with 'file://', this needs to be removed
final_uri = uri[6:] if uri.startswith('file://') else uri
vcf_name = vcf_files[vcf_file]['vcf_sample_name']
if final_uri not in all_vcfs:
all_vcfs.append(final_uri)
mosaic_samples[sample]['vcf_file'] = final_uri
mosaic_samples[sample]['vcf_sample_name'] = vcf_name
print('Sample ', sample, ' appears as "', vcf_name, '" in the header of vcf file: ', final_uri, sep = '')
# If there are multiple VCF files, not all samples are in a single joint-called vcf. This case
# is not yet handled:
if len(all_vcfs) > 1:
fail('ERROR: Multiple vcf files for the samples. Calypso requires a single multi sample vcf. The following vcf files were found:\n ' + '\n '.join(all_vcfs))
# Store the name of the vcf file
vcf = mosaic_samples[list(mosaic_samples.keys())[0]]['vcf_file']
# If the ped file has not been defined, generate a bed file from pedigree information in Mosaic
if not args.ped:
# Open a ped file in the working directory
ped_filename = str(working_directory) + str(proband) + '.ped'
ped_file = open(ped_filename, 'w')
print('#Family_id', 'individual_id', 'paternal_id', 'maternal_id', 'sex', 'affected_status', sep = '\t', file = ped_file)
# If this is a singleton, there may be no pedigree information
if len(mosaic_samples) == 1:
sample_vcf_id = mosaic_samples[proband]['vcf_sample_name']
sex = mosaic_samples[proband]['sex']
if sex == 'Male':
sex_id = 1
elif sex == 'Female':
sex_id = 2
else:
sex_id = 0
print('WARNING: Unknown sex for sample: ', proband, sep = '')
affected_status = mosaic_samples[proband]['affected_status']
if affected_status == 'Affected':
affected_id = 2
elif affected_status == 'Unaffected':
affected_id = 1
else:
affected_id = 0
print('WARNING: Unknown affected status for sample: ', proband, sep = '')
print(sample_vcf_id, sample_vcf_id, '0', '0', sex_id, affected_id, sep = '\t', file = ped_file)
# If there are multiple samples, get the pedigree information
else:
# Parse the pedigree associated with the proband
for pedigree_line in project.get_pedigree(mosaic_samples[proband]['id']):
kindred_id = pedigree_line['pedigree']['kindred_id']
sample_id = mosaic_sample_ids[pedigree_line['pedigree']['sample_id']]
paternal_id = mosaic_sample_ids[pedigree_line['pedigree']['paternal_id']] if pedigree_line['pedigree']['paternal_id'] else 0
maternal_id = mosaic_sample_ids[pedigree_line['pedigree']['maternal_id']] if pedigree_line['pedigree']['maternal_id'] else 0
sex = pedigree_line['pedigree']['sex']
affection_status = pedigree_line['pedigree']['affection_status']
# The ped file must use the names as they appear in the vcf file, so use the sample_id to get the vcf_sample_name
sample_vcf_id = mosaic_samples[sample_id]['vcf_sample_name']
paternal_vcf_id = mosaic_samples[paternal_id]['vcf_sample_name'] if paternal_id != 0 else 0
maternal_vcf_id = mosaic_samples[maternal_id]['vcf_sample_name'] if maternal_id != 0 else 0
print(kindred_id, sample_vcf_id, paternal_vcf_id, maternal_vcf_id, sex, affection_status, sep = '\t', file = ped_file)
# Close the ped file
ped_file.close()
# Set args.ped to the created ped file
args.ped = ped_filename
# Check that the reference genome associated with the vcf file matches the project reference
header = os.popen(str(resource_info['tools']['bcftools']) + ' view -h ' + str(vcf)).read()
chr1_length = False
for line in header.split('\n'):
if line.startswith('##contig=<ID=chr1,'):
for contig_field in line.rstrip().replace('##contig=<ID=chr1,', '').rstrip('>').split(','):
if contig_field.startswith('length='):
chr1_length = contig_field.split('=')[1]
break
break
elif line.startswith('##contig=<ID=1,'):
for contig_field in line.rstrip().replace('##contig=<ID=1,', '').rstrip('>').split(','):
if contig_field.startswith('length='):
chr1_length = contig_field.split('=')[1]
break
break
# Check the length of chr1
is_correct = False
if args.ignore_vcf_ref_check:
print('Check on VCF reference genome NOT performed')
else:
if not chr1_length:
print('Could not verify the reference genome for the vcf file')
else:
if reference == 'GRCh38' and str(chr1_length).startswith('248956422'):
is_correct = True
elif reference == 'GRCh37' and str(chr1_length).startswith('249250621'):
is_correct = True
if is_correct:
print('The vcf file matches the reference genome of the project (', str(reference), ')', sep = '')
else:
print('The vcf file DOES NOT match the reference genome of the project (', str(reference), ')', sep = '')
print('Please verify the reference genome in Mosaic and ensure these match')
exit(1)
# Check that we have read access to the vcf file
if not os.access(vcf, os.R_OK):
fail('\nFAILED: Calypso does not have access to the vcf file which is required to complete the pipeline')
##########
##########
########## DELETE WHEN HANDLED S3 FILES
##########
##########
if vcf.startswith('s3'):
vcf = vcf.split('/')[-1]
# Determine the format of the chromosomes in the vcf file
header = os.popen(str(resource_info['tools']['bcftools']) + ' view -h ' + str(vcf)).read()
for line in header.split('\n'):
if line.startswith('##contig'):
chr_id = (line.split('=')[2]).split(',')[0]
chr_format = True if chr_id.startswith('chr') else False
break
# Loop over all the samples. For each sample, get the header final header line from its associated vcf file (which
# contains the list of all samples in the vcf) and determine at which position this sample appears
for sample in mosaic_samples:
vcf = mosaic_samples[sample]['vcf_file']
vcf_sample_name = mosaic_samples[sample]['vcf_sample_name']
# Only process samples associated with vcf files
if vcf:
####################
####################
#################### HACK FOR UDN S3 files
#################### DELETE
####################
####################
if vcf.startswith('s3://udn-joint-analysis/wgs/joint-by-family/'):
vcf = vcf.replace('s3://udn-joint-analysis/wgs/joint-by-family/', './')
# Get the final header line
data = os.popen(str(resource_info['tools']['bcftools']) + ' view -h ' + str(vcf)).readlines()[-1].rstrip().split('\t')
# Read through the samples and define the sample order
for index, vcf_sample in enumerate(data[9:]):
if vcf_sample == vcf_sample_name:
mosaic_samples[sample]['vcf_position'] = index
# Check that every sample in samples has vcf_position set
for sample in mosaic_samples:
if vcf and 'vcf_position' not in mosaic_samples[sample]:
fail('Sample ' + str(sample) + ' is listed in the ped file, but does not appear in the vcf header')
# If the HPO terms are not specified on the command line, get them from the project. If they are specified on the command
# line, use these
if not args.hpo:
hpo_terms = []
for hpo_term in project.get_sample_hpo_terms(mosaic_samples[proband]['id']):
if hpo_term['hpo_id'] not in hpo_terms:
hpo_terms.append(hpo_term['hpo_id'])
args.hpo = ','.join(hpo_terms)
if args.hpo:
print('Using the HPO terms: ', args.hpo, sep = '')
else:
print('Using the HPO terms: No terrms available')
# Get all the project attributes in the target Mosaic project
# Get all the project attributes from Mosaic for the project
project_attributes = {}
for attribute in project.get_project_attributes():
for attribute_project in attribute['values']:
value = False
if int(attribute_project['project_id']) == int(args.project_id):
value = attribute_project['value']
project_attributes[attribute['uid']] = {'name': attribute['name'], 'id': attribute['id'], 'value': value}
# Get all of the public attributes that are available for import
public_attributes = {}
for attribute in api_mosaic.get_public_project_attributes():
public_attributes[attribute['uid']] = {'name': attribute['name'], 'id': attribute['id']}
# Read the Mosaic json and validate its contents
if not args.mosaic_json:
args.mosaic_json = args.data_directory + 'resources_mosaic_' + reference + '.json'
print('Using Mosaic resources file: ', args.mosaic_json, sep = '')
mosaic_info = read_resources.read_mosaic_json(args.mosaic_json, reference)
# Check that the Mosaic resource file does not include resources that are not defined in the resources json
check_mosaic_resources(mosaic_info, resource_info)
# Check that a variant filters file has been supplied. The validity of the file will be checked when it is parsed.
# For now, it is sufficient that a file exists. If no file is supplied, use the one that should exist in the
# Calypso data directory
if not args.variant_filters:
if mosaic_info['variantFilters']:
args.variant_filters = str(args.data_directory) + str(mosaic_info['variantFilters'])
else:
fail('No json file describing the variant filter is specified. This can be specified in the mosaic resources file or on the command line')
if not os.path.exists(args.variant_filters):
fail('ERROR: The json file describing the preset variant filters does not exist (' + str(args.variant_filters) + ')')
# Get all the project annotations already in the Mosaic project
project_annotations = {}
for annotation in project.get_variant_annotations():
project_annotations[annotation['uid']] = {'id': annotation['id'], 'name': annotation['name'], 'type': annotation['value_type']}
# Get all available public annotations
public_annotations = {}
for annotation in project.get_variant_annotations_to_import():
public_annotations[annotation['uid']] = {'name': annotation['name'], 'id': annotation['id'], 'type': annotation['value_type']}
# Create all the required private annotations
private_annotations = {}
# Get the names (not the uids) of all the annotations in the project
existing_annotations = []
for annotation in project_annotations:
if annotation == 'variant_quality_grch37':
continue
elif annotation == 'variant_quality_grch38':
continue
existing_annotations.append(project_annotations[annotation]['name'])
# Loop over each resource in the Mosaic resources file
for resource in mosaic_info['resources']:
annotations = {}
# Loop over all annotations for this resource
if mosaic_info['resources'][resource]['annotation_type'] == 'private':
# The project id for these annotations is the target project as these annotations will be created there. Update
# mosaic_info to reflect this.
mosaic_info['resources'][resource]['project_id'] = args.project_id
# Loop over all annotations for this resource
for annotation in mosaic_info['resources'][resource]['annotations']:
value_type = mosaic_info['resources'][resource]['annotations'][annotation]['type']
# If the annotation has "isSample" set, there needs to be an annotation for every sample in the vcf file for the
# project (some projects include samples that were not sequenced and so these will not have annotations). Append the
# the samples relationship to the proband to the annotation name.
if mosaic_info['resources'][resource]['annotations'][annotation]['isSample']:
for sample in mosaic_samples:
# Include the sample name in the annotation, otherwise non-unique names could result. For example, if the proband
# has 2 brothers, the generated annotation name for the 2 brothers will be identical
if mosaic_samples[sample]['vcf_sample_name']:
annotations[str(annotation) + ' ' + str(mosaic_samples[sample]['relation']) + ' ' + str(sample)] = value_type
else:
annotations[annotation] = value_type
# Reset this annotation in the resources dictionary. It will be replaced below with the new annotations with the
# corresponding uids
category = mosaic_info['resources'][resource]['annotations'][annotation]['category']
severity = mosaic_info['resources'][resource]['annotations'][annotation]['severity']
display_type = mosaic_info['resources'][resource]['annotations'][annotation]['display_type']
mosaic_info['resources'][resource]['annotations'][annotation] = {'uid': False, 'type': False, 'id': False, 'category': category, 'severity': severity, 'display_type': display_type}
# Loop over the annotations to be created, check that there doesn't already exist an annotation of that name in the
# project, and if not, create a new, private annotation
for annotation in annotations:
value_type = annotations[annotation]
# If the annotation already exists, get the uid of the annotation, otherwise create the annotation.
if annotation in existing_annotations:
for existing_annotation in project_annotations:
if str(project_annotations[existing_annotation]['name']) == annotation:
private_annotations[existing_annotation] = {'name': annotation, 'id': project_annotations[existing_annotation]['id']}
mosaic_info['resources'][resource]['annotations'][annotation] = {'uid': existing_annotation, 'type': value_type, 'id': project_annotations[existing_annotation]['id']}
break
else:
severity = mosaic_info['resources'][resource]['annotations'][annotation]['severity']
display_type = mosaic_info['resources'][resource]['annotations'][annotation]['display_type']
category = mosaic_info['resources'][resource]['annotations'][annotation]['category']
data = project.post_variant_annotation(name = annotation, value_type = value_type, privacy_level = 'private', category = category, severity = severity, display_type = display_type)
private_annotations[data['uid']] = {'name': annotation, 'id': data['id']}
mosaic_info['resources'][resource]['annotations'][annotation] = {'uid': data['uid'], 'type': value_type, 'id': data['id']}
# Add the created private annotation to the project_annotations dictionary
project_annotations[data['uid']] = {'id': data['id'], 'name': annotation, 'type': value_type}
# Loop over all of the public annotations defined in the Mosaic resources json file and check that they exist
# in Mosaic and can be imported
for resource in mosaic_info['resources']:
if mosaic_info['resources'][resource]['annotation_type'] == 'public':
# Loop over all the annotations required for this resource
for annotation in mosaic_info['resources'][resource]['annotations']:
uid = mosaic_info['resources'][resource]['annotations'][annotation]['uid']
# If the annotation uid is not in the publicAnnotations dictionary, then this annotation does not exist for
# import. Fail and indicate that the resource file may contain errors
if uid not in public_annotations:
fail('Annotation with uid ' + str(uid) + ', for resource ' + str(resource) + ', is not available for import. Check that the uid in the Mosaic resources file is correct')
# Build the toml file for vcfanno. This defines all of the annotations that vcfanno needs to use
# with the path to the required files. The build the lua file that vcfanno uses to grab lua functions
# for postannotation
toml_filename = buildToml(working_directory, resource_info)
lua_filename = generate_lua_file(working_directory)
# Check if parents exist for this sample
has_mother = False
has_father = False
for sample in mosaic_samples:
if mosaic_samples[sample]['relation'] == 'Mother':
has_mother = True
elif mosaic_samples[sample]['relation'] == 'Fother':
has_father = True
has_parents = True if has_mother and has_father else False
# Generate the bash script to run the annotation pipeline
bash_filename, bash_file = open_bash_script(working_directory)
filtered_vcf = bash_resources(args.queue, resource_info, working_directory, bash_file, vcf, chr_format, args.ped, lua_filename, toml_filename)
annotate_vcf(resource_info, bash_file, chr_format, args.threads, mosaic_samples)
filter_vcf(bash_file, mosaic_samples, proband, resource_info, args.threads, has_parents)
# Process the filtered vcf file to extract the annotations to be uploaded to Mosaic
print('# Generate tsv files to upload annotations to Mosaic', file = bash_file)
print('GENERATE_TSV=', root_path, '/generate_annotation_tsv.py', sep = '', file = bash_file)
print('GENERATE_HGVS_TSV=', root_path, '/generate_hgvs_annotation_tsv.py', sep = '', file = bash_file)
print('GENERATE_COMP_HET_TSV=', root_path, '/generate_comphet_tsv.py', sep = '', file = bash_file)
print('GENERATE_VARIANT_QUALITY_TSV=', root_path, '/generate_variant_quality_tsv.py', sep = '', file = bash_file)
print('MOSAIC_JSON=', args.mosaic_json, sep = '', file = bash_file)
# Loop over all the resources to be uploaded to Mosaic
tsv_files = []
for resource in mosaic_info['resources']:
# The HGVS annotations require additional processing to identify the codes corresponding to the MANE transcript
# and strip the transcript information from the annotation
if resource == 'HGVS':
generate_hgvs_tsv(bash_file, reference)
tsv_files.append('hgvs.tsv')
# Extract compound hets
elif resource == 'compound_heterozygotes':
if has_parents:
uid = False
for annotation in private_annotations:
if private_annotations[annotation]['name'] == 'Compound Heterozygotes':
uid = annotation
break
if not uid:
fail('No private annotation with the name Compound Heterozygotes defined in the resources')
output_tsv = "comp_het.tsv"
generate_comp_het_tsv(bash_file, "COMPHET_VCF", output_tsv, uid, proband)
tsv_files.append(output_tsv)
elif resource == 'compound_heterozygotes_rare':
if has_parents:
uid = False
for annotation in private_annotations:
if private_annotations[annotation]['name'] == 'Rare Compound Heterozygotes':
uid = annotation
break
if not uid:
fail('No private annotation with the name Rare Compound Heterozygotes defined in the resources')
output_tsv = "comp_het_rare.tsv"
generate_comp_het_tsv(bash_file, "COMPHET_RARE_VCF", output_tsv, uid, proband)
tsv_files.append(output_tsv)
# Extract the Variant Quality scores after getting the uid of this annotation
elif resource == 'variant_quality':
for annotation in private_annotations:
if private_annotations[annotation]['name'] == 'Variant Quality':
uid = annotation
break
generate_variant_quality_tsv(bash_file, uid, proband)
tsv_files.append('variant_quality.tsv')
# Annotations to ignore
elif resource == 'GQ':
pass
# Remaining resources
else:
tsv_files.append(generate_annotation_tsv(bash_file, resource, reference))
# Close the bash script and make it executable
print('echo "Calypso pipeline version ', version, ' completed successfully"', sep = '', file = bash_file)
bash_file.close()
make_executable = os.popen('chmod +x ' + bash_filename).read()
# Prepare the target project. This includes:
# 1. Remove any unnecessary annotations from the project
# 2. Import public annotations into the project
# 3. Set the default annotations for the project
# 4. Set up required variant filters
# 5. Add and set project attributes
if 'remove' in mosaic_info:
# Loop over the annotations to remove, get the annotation id, and remove them from the project
for uid in mosaic_info['remove']:
if uid in project_annotations:
project.delete_variant_annotation(project_annotations[uid]['id'])
# Loop over all of the resources in the Mosaic resources file and get all of the annotation uids to import.
# Only public attributes can be imported
for resource in mosaic_info['resources']:
if mosaic_info['resources'][resource]['annotation_type'] == 'public':
# Loop over all the annotations required for this resource
for annotation in mosaic_info['resources'][resource]['annotations']:
uid = mosaic_info['resources'][resource]['annotations'][annotation]['uid']
# Check if the annotation is already in the target project. If not, import the annotation and update the
# projectAnnotations dictionary
if uid not in project_annotations:
project.post_import_annotation(public_annotations[uid]['id'])
project_annotations[uid] = {'id': public_annotations[uid]['id'], 'name': public_annotations[uid]['name'], 'type': public_annotations[uid]['type']}
# Import any annotations specified in the resource file
if 'import_annotations' in mosaic_info:
for uid in mosaic_info['import_annotations']:
if uid not in project_annotations:
if uid not in public_annotations:
print('WARNING: resource file requests annotation with uid ', uid, ' be imported, but this is not a public attribut and so is not imported', sep = '')
else:
project.post_import_annotation(public_annotations[uid]['id'])
project_annotations[uid] = {'id': public_annotations[uid]['id'], 'name': public_annotations[uid]['name'], 'type': public_annotations[uid]['type']}
# Set the projects default annotations. The default annotations are what a new user will see in the table by default
# and all users will have the option to reset the annotations table to show only the default annotations
default_annotation_ids = []
# Get the ids of all the default annotations
for annotation in mosaic_info['defaultAnnotations']:
if annotation in project_annotations:
default_annotation_ids.append(project_annotations[annotation]['id'])
elif annotation in public_annotations:
project.post_import_annotation(public_annotations[annotation]['id'])
project_annotations[uid] = {'id': public_annotations[annotation]['id'], 'name': public_annotations[annotation]['name'], 'type': public_annotations[annotation]['type']}
default_annotation_ids.append(public_annotations[annotation]['id'])
# Check if the annotation is a private annotation. In this case, the supplied annotation will not be the uid as
# this is not known, but will be the annotation name
else:
private_annotation_id = False
for private_annotation in private_annotations:
if str(private_annotations[private_annotation]['name']) == str(annotation):
private_annotation_id = private_annotations[private_annotation]['id']
if private_annotation_id:
default_annotation_ids.append(private_annotation_id)
else:
fail('Default annotation ' + str(annotation) + ' has not been imported or created in the project')
# Set the default annotations in the Mosaic project. This requires the annotation version ids, not the
# annotation ids. Use the latest version. If latest isn't set, use default
annotation_version_ids = []
for annotation in project.get_variant_annotations():
if annotation['id'] in default_annotation_ids:
latest_annotation_version_id = annotation['latest_annotation_version_id']
if not latest_annotation_version_id:
for annotation_version in annotation['annotation_versions']:
if annotation_version['version'] == 'default':
annotation_version_ids.append(annotation_version['id'])
else:
annotation_version_ids.append(latest_annotation_version_id)
project.put_project_settings(selected_variant_annotation_version_ids = annotation_version_ids)
# Generate scripts to upload filtered variants to Mosaic
upload_filename = working_directory + '02_calypso_upload_variants.sh'
try:
upload_file = open(upload_filename, 'w')
except:
fail('Could not open ' + str(upload_filename) + ' to write to')
# Write the command to file to upload the filtered variants
print('# Upload variants to Mosaic', file = upload_file)
print('API_CLIENT=', args.api_client, sep = '', file = upload_file)
print('CONFIG=', args.client_config, sep = '', file = upload_file)
print('VCF=', filtered_vcf, sep = '', file = upload_file)
print(file = upload_file)
print('python3 $API_CLIENT/variants/upload_variants.py ', end = '', file = upload_file)
print('-a $API_CLIENT ', end = '', file = upload_file)
print('-c $CONFIG ', end = '', file = upload_file)
print('-p ', str(args.project_id) + ' ', sep = '', end = '', file = upload_file)
print('-m "no-validation" ', sep = '', end = '', file = upload_file)
print('-v $VCF ', file = upload_file)
print(file = upload_file)
# Close the file
upload_file.close()
# Make the annotation script executable
make_executable = os.popen('chmod +x ' + str(upload_filename)).read()
# Open a file with the commands to upload all the annotations
upload_filename = working_directory + '03_calypso_upload_annotations.sh'
try:
upload_file = open(upload_filename, 'w')
except:
fail('Could not open ' + str(upload_filename) + ' to write to')
if reference == 'GRCh37':
annotation_project_id = api_store.get('Project ids', 'annotations_grch37')
elif reference == 'GRCh38':
annotation_project_id = api_store.get('Project ids', 'annotations_grch38')
print('API_CLIENT=', args.api_client, sep = '', file = upload_file)
print('CONFIG=', args.client_config, sep = '', file = upload_file)
print('UPLOAD_SCRIPT=$API_CLIENT/variant_annotations/upload_annotations.py', sep = '', file = upload_file)
for tsv in tsv_files:
print(file = upload_file)
print('# Upload ', tsv, ' annotations', sep = '', file = upload_file)
print('TSV=', working_directory, tsv, sep = '', file = upload_file)
print('python3 $UPLOAD_SCRIPT ', end = '', file = upload_file)
print('-a $API_CLIENT ', end = '', file = upload_file)
print('-c $CONFIG ', end = '', file = upload_file)
# Public annotations are posted to the annotation project, private annotations are posted to the
# case project
if tsv == 'variant_quality.tsv' or tsv == 'comp_het.tsv' or tsv == 'comp_het_rare.tsv':
print('-p ', args.project_id, ' ', sep = '', end = '', file = upload_file)
else:
print('-p ', annotation_project_id, ' ', sep = '', end = '', file = upload_file)
print('-t $TSV', sep = '', file = upload_file)
# Close the upload file
upload_file.close()
# Make the annotation script executable
make_executable = os.popen('chmod +x ' + str(upload_filename)).read()
# Determine all of the variant filters (from the calypso_mosaic_filters.json) that are to be added; remove any filters that already
# exist with the same name; fill out variant filter details not in the json (e.g. the uids of private annotations created by
# Calypso); create the filters; and finally update the project settings to put the filters in the correct category and sort order.
# Note that the filters to be applied depend on the family structure. E.g. de novo filters won't be added to projects without parents
if not args.variant_filters:
args.variant_filters = mosaic_info['variant_filters']
if args.api_client.endswith('/'):
variant_filter_command = 'python3 ' + args.api_client + 'project_setup/set_variant_filters.py'
else:
variant_filter_command = 'python3 ' + args.api_client + '/project_setup/set_variant_filters.py'
variant_filter_command += ' -a ' + args.api_client
variant_filter_command += ' -c ' + args.client_config
variant_filter_command += ' -p ' + args.project_id
variant_filter_command += ' -f ' + args.variant_filters
os.system(variant_filter_command)
# Output a summary file listing the actions undertaken by Calypso with all version histories
#res.calypsoSummary(working_directory, version, resource_info, reference)
#print('Calypso pipeline version ', version, ' completed successfully', sep = '')
# Run Exomiser on the original vcf file and process the exomiser outputs as private annotations. Only
# run if hpo terms are supplied
if args.hpo:
print('Generating exomiser scripts...', end = '')
application_properties(working_directory, args.tools_directory, reference)
yaml = generate_yml(working_directory, proband, reference, str(working_directory) + str(filtered_vcf), args.ped, args.hpo)
exomiser_script_name, exomiser_script = generate_exomiser_script(working_directory, args.tools_directory, yaml)
print('complete')
# Check the variant filters json is set
if not args.exomiser_filters_json:
if 'exomiser_variant_filters' in mosaic_info:
args.exomiser_filters_json = str(args.data_directory) + str(mosaic_info['exomiser_variant_filters'])
else:
fail('No json file describing the exomiser variant filters is specified. This can be specified in the mosaic resources file or on the command line')
if not os.path.exists(args.exomiser_filters_json):
fail('ERROR: The json file describing the preset exomiser variant filters does not exist (' + str(args.exomiser_filters_json) + ')')
# The exomiser script currently uses the filtered vcf, so no new variants will need to be uploaded
#upload_exomiser_variants(working_directory, args.api_client, args.client_config, args.project_id, proband)
print('Applying exomiser filters...', end = '')
exomiser_annotations(root_path, working_directory, args.api_client, args.client_config, args.project_id, proband, args.exomiser_filters_json)
print('complete')
print('Calypso pipeline completed successfully')
# Input options
def parse_command_line():
  """Parse the Calypso command line and return the populated argparse namespace.

  Exits with argparse's usage error if a required option is missing, and prints
  the pipeline version (module-level 'version') for --version.
  """
  parser = argparse.ArgumentParser(description='Process the command line')
  # Define the location of the api_client and the ini config file
  parser.add_argument('--api_client', '-a', required = True, metavar = 'string', help = 'The api_client directory')
  parser.add_argument('--client_config', '-c', required = True, metavar = 'string', help = 'The ini config file for Mosaic')
  # Required arguments
  parser.add_argument('--data_directory', '-d', required = False, metavar = 'string', help = 'The path to the directory where the resources live')
  parser.add_argument('--tools_directory', '-s', required = False, metavar = 'string', help = 'The path to the directory where the tools to use live')
  parser.add_argument('--input_vcf', '-i', required = False, metavar = 'string', help = 'The input vcf file to annotate')
  parser.add_argument('--ped', '-e', required = False, metavar = 'string', help = 'The pedigree file for the family. Not required for singletons')
  parser.add_argument('--project_id', '-p', required = True, metavar = 'string', help = 'The project id that variants will be uploaded to')
  parser.add_argument('--variant_filters', '-f', required = False, metavar = 'string', help = 'The json file describing the variant filters to apply to each project')
  # Exomiser arguments
  parser.add_argument('--exomiser_filters_json', '-x', required = False, metavar = 'string', help = 'The json describing the exomiser filters')
  # Optional pipeline arguments
  parser.add_argument('--reference', '-r', required = False, metavar = 'string', help = 'The reference genome to use. Allowed values: ' + ', '.join(allowed_references))
  parser.add_argument('--resource_json', '-j', required = False, metavar = 'string', help = 'The json file describing the annotation resources')
  # NOTE(review): store_false means args.no_vep defaults to True and becomes False when
  # --no_vep is passed, so the attribute name reads inverted ("no_vep is True" means VEP
  # runs). Left unchanged because callers depend on this polarity — confirm before renaming
  parser.add_argument('--no_vep', '-n', required = False, action = "store_false", help = "If set, do not run VEP annotation")
  parser.add_argument('--threads', '-t', required = False, metavar = 'integer', help = 'The number of threads to use')
  # Optional argument to handle HPO terms
  parser.add_argument('--hpo', '-o', required = False, metavar = "string", help = "A comma separated list of hpo ids for the proband")
  # Optional mosaic arguments
  parser.add_argument('--mosaic_json', '-m', required = False, metavar = 'string', help = 'The json file describing the Mosaic parameters')
  # Flag for UDN projects as they have a slightly different naming convention
  parser.add_argument('--udn', '-u', required = False, action = 'store_true', help = 'Set for UDN projects to handle the request id in the sample name')
  # Do not check the vcf header to verify the reference genome based on chromosome length
  parser.add_argument('--ignore_vcf_ref_check', '-g', required = False, action = 'store_true', help = 'Ignore the check on the VCF reference genome')
  # Use the queue in Utah
  parser.add_argument('--queue', '-q', required = False, action = 'store_true', help = 'Add queue information to the 01 script')
  # Version
  parser.add_argument('--version', '-v', action='version', version='Calypso annotation pipeline version: ' + str(version))
  return parser.parse_args()
# Check the supplied arguments
def checkArguments(args, root_path):
  """Validate and normalize the directory arguments in place.

  Fills in default data/tools directories derived from root_path when they were
  not supplied, guarantees each directory path ends with '/', and fails (via the
  module-level fail()) if any of the directories does not exist.
  """
  # If Calypso is being run from the standard directory, the location of the data directory is
  # predictable. If the directory has not been specified on the command line, check for the
  # existence of this known directory
  if not args.data_directory:
    args.data_directory = '/'.join(root_path.split('/')[0:-1]) + '/data/'
  if not args.tools_directory:
    args.tools_directory = '/'.join(root_path.split('/')[0:-1]) + '/tools/'
  # Ensure the data path ends with a "/", then check that the directory exists
  if not args.data_directory.endswith('/'):
    args.data_directory += '/'
  if not os.path.isdir(args.data_directory):
    fail('ERROR: The data directory does not exist (' + str(args.data_directory) + ')')
  # Check that the tools directory exists and add to the path so scripts from here can be used.
  # Use isdir (not exists) so a regular file with this name is rejected
  if not args.tools_directory.endswith('/'):
    args.tools_directory += '/'
  if not os.path.isdir(args.tools_directory):
    fail('ERROR: The tools directory does not exist (' + str(args.tools_directory) + ')')
  # Check that the api_client directory exists (again rejecting a plain file)
  if not args.api_client.endswith('/'):
    args.api_client += '/'
  if not os.path.isdir(args.api_client):
    fail('ERROR: The api client directory does not exist (' + str(args.api_client) + ')')
# Create a directory where all Calypso associated files will be stored
def set_working_directory(version):
  """Append the pipeline version to the global working directory and ensure it exists."""
  global working_directory
  working_directory = working_directory + version + "/"
  # Create the directory (and any missing parents) if it is not already present
  os.makedirs(working_directory, exist_ok = True)
######
###### Following are routines for reading the resources json
######
# Collate the locations of the annotation resources: the reference-specific data
# path, the tools directory (if one was supplied), and the resources json file
def check_resources(reference, data_directory, tools_directory, filename):
  resource_info = {'path': data_directory + str(reference) + '/'}
  # Store the directory where tools reside (with a trailing '/'), or False if none was given
  if tools_directory:
    tools_path = str(tools_directory)
    if not tools_path.endswith('/'):
      tools_path += '/'
    resource_info['toolsPath'] = tools_path
  else:
    resource_info['toolsPath'] = False
  # The resources json must exist
  if not exists(filename):
    fail('The resource file "' + filename + '" does not exist')
  resource_info['json'] = filename
  # Return the info on resources
  return resource_info
# Define all of the tools to be used in Calypso, check they exist and output their versions
def calypso_tools(resource_info):
  """Register the paths to the external tools in resource_info['tools'] and print each
  tool's version by running it.

  Returns the updated resource_info dictionary. Raises (IndexError / OSError) if a tool
  is missing or its output does not contain the expected lines.
  """
  # Define the tools. If no path to the tools is provided, assume the tools are available in the system and don't need a path.
  # The preference is to use the path to the Calypso tools directory
  resource_info['tools'] = {}
  if resource_info['toolsPath']:
    resource_info['tools']['bcftools'] = str(resource_info['toolsPath']) + 'bcftools/bcftools'
    resource_info['tools']['vcfanno'] = str(resource_info['toolsPath']) + 'vcfanno'
    resource_info['tools']['slivar'] = str(resource_info['toolsPath']) + 'slivar'
    resource_info['tools']['vep'] = str(resource_info['toolsPath']) + 'ensembl-vep/vep'
  else:
    resource_info['tools']['bcftools'] = 'bcftools'
    resource_info['tools']['vcfanno'] = 'vcfanno'
    resource_info['tools']['slivar'] = 'slivar'
    resource_info['tools']['vep'] = 'vep'
  print('Using the following tools:')
  # Bcftools: take the first line of 'bcftools -v' output as the version string
  bcftoolsVersion = os.popen(resource_info['tools']['bcftools'] + ' -v').readlines()[0].rstrip()
  print(' ', bcftoolsVersion, sep = '')
  # vcfanno: running with no arguments writes usage to stderr; the third stderr line is
  # taken as the version — NOTE(review): confirm this index is stable across vcfanno releases
  proc = Popen(resource_info['tools']['vcfanno'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  print(' ', proc.stderr.readlines()[2].decode().rstrip(), sep = '')
  # slivar: the first stderr line is used, with its first two characters dropped
  # (presumably a log-level prefix — TODO confirm)
  proc = Popen(resource_info['tools']['slivar'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  print(' ', proc.stderr.readlines()[0].decode().rstrip()[2:], sep = '')
  # vep: running with no arguments prints a banner; lines 5-8 are assumed to hold the
  # version details — NOTE(review): verify these indices against the installed vep release
  vepInfo = os.popen(resource_info['tools']['vep']).readlines()
  print(' vep versions:')
  for i in range(5, 9, 1): print(' ', vepInfo[i].rstrip(), sep = '')
  # Return the updated resource_info
  return resource_info
######
###### Following are routines for parsing the mosaic json
######
# Check that all Mosaic resources are defined in the resources json
def check_mosaic_resources(mosaic_info, resource_info):
  """Return the Mosaic resources with no corresponding entry in the resources json."""
  known_resources = resource_info['resources'].keys()
  failed_resources = [resource for resource in mosaic_info['resources'] if resource not in known_resources]
  # Return the list of failed resources
  return failed_resources
######
###### Following are routines for building files to be used in the data processing steps
######
# Build the toml file defining all the annotations that vcfanno should use
def buildToml(working_directory, resource_info):
# Create a toml file
toml_filename = 'calypso_annotations.toml'
toml = str(working_directory) + str(toml_filename)
try:
toml_file = open(toml, 'w')
except:
fail('There was a problem opening a file (calypso_annotations.toml) to write to')
# Get the path to the data
datapath = resource_info['path']
# Add each required resource to the toml file
for resource in resource_info['resources']:
if resource_info['resources'][resource]['toml']:
print('[[annotation]]', file = toml_file)
print('file="', str(datapath), str(resource_info['resources'][resource]['file']), '"', sep = '', file = toml_file)
# Some of the annotations include "fields", "columns", and "names". Include these in the toml if they exist
if 'fields' in resource_info['resources'][resource]:
text, no_values = toml_info(resource_info, resource, 'fields')
print(text, file = toml_file)
if 'columns' in resource_info['resources'][resource]:
text, no_values = toml_info(resource_info, resource, 'columns')
print(text, file = toml_file)
if 'names' in resource_info['resources'][resource]:
text, no_values = toml_info(resource_info, resource, 'names')
print(text, file = toml_file)
# Write out the ops
ops = 'ops=["'
no_ops = len(resource_info['resources'][resource]['ops'])
for i in range(0, no_ops - 1):
ops += str(resource_info['resources'][resource]['ops'][i]) + '", "'
ops += str(resource_info['resources'][resource]['ops'][-1]) + '"]'
print(ops, file = toml_file)
print(file = toml_file)
# If there is post annotation information to include in the toml, include it
if 'post_annotation' in resource_info['resources'][resource]:
post = resource_info['resources'][resource]['post_annotation']
print('[[postannotation]]', file = toml_file)
if 'name' in post:
print('name="', post['name'], '"', sep = '', file = toml_file)
if 'fields' in post:
fieldString = 'fields=["'
for i in range(0, len(post['fields']) - 1):
fieldString += str(post['fields'][i]) + '", "'
fieldString += str(post['fields'][-1]) + '"]'
print(fieldString, file = toml_file)