#setting up the logger first and foremost
#importing 'logging', prompting the user to install it if not available.
try:
import logging
except ModuleNotFoundError:
print(f"Hello, the logging module was not found, please make sure it is installed, and run the script again.")
raise SystemExit(1)
#setting up two loggers - one to print serious errors in the terminal, the other to generate the log file.
file_logger = logging.getLogger('file_logger') #getting the logger. the name is passed because multiple loggers are being used.
file_logger.setLevel(logging.INFO) #setting the level to info as we want to write information into the log file.
fhandler = logging.FileHandler('log.log') #setting up filehandler to produce an output log file called 'log.log'.
fhandler.setFormatter(logging.Formatter('%(levelname)s - %(asctime)s - {%(pathname)s:%(lineno)d} - %(message)s')) #formatting the logger messages
file_logger.addHandler(fhandler) #adding the handler to the logger
#repeating the same for a terminal logger
stream_logger = logging.getLogger('stream_logger')
stream_logger.setLevel(logging.ERROR)
shandler = logging.StreamHandler()
shandler.setFormatter(logging.Formatter('%(levelname)s - %(asctime)s - {%(pathname)s:%(lineno)d} - %(message)s'))
stream_logger.addHandler(shandler)
#importing all the necessary modules, showing an error and exiting the program if they're not installed.
try:
import matplotlib.pyplot as plt
import os
import gffutils
import seaborn as sns
from operator import attrgetter
from statistics import mean
import pandas as pd
import argparse
import numpy as np
import copy
except ModuleNotFoundError as e:
stream_logger.error(f"Hello, the following module was not found, please make sure it is installed and run the script again.\n{e}")
raise SystemExit(1)
#the overall plan - first, create a large data structure holding all introns (should be easy);
#second, calculate the positions of introns in every gene;
#third, match introns against the annotations.
#creating a command line
#parser is created, containing the program name, description, epilog, and formatter.
parser = argparse.ArgumentParser(prog='summer project',description='this program takes junctions data files, a genome sequence file, and a genome annotations file, and assesses the accuracy of gene models.',epilog='thanks and regards',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
#arguments are added
#here we create flags to take files/folder at the command line
parser.add_argument('--junctions', required=True, help='use this argument to pass the folder containing junctions data file(s). these may be organized in any way, it will recursively extract all files.')
parser.add_argument('--gff_whole', required=True, help='use this argument to pass the GFF file containing all the feature annotations for the whole genome (including intron features).')
parser.add_argument('--gff_introns_only', required=True, help=f"use this argument to pass the GFF file containing only 'intron' feature annotations.")
parser.add_argument('--genome_sequence', required=True, help='use this argument to pass the whole genome sequence fasta file.')
#adding the optional arguments with defaults
parser.add_argument('--AED_threshold', required=False, type=float, default=0.5, help='use this argument to pass the desired AED (Annotation Edit Distance) cutoff. this is optional.')
parser.add_argument('--intron_cutoff', required=False, type=float, default=0.2, help='use this argument to pass a cutoff for filtering introns. this is optional.')
#a list containing the arguments passed in the command line is created.
args = parser.parse_args()
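#a hypothetical example invocation (all file and folder names below are placeholders, not shipped with the tool):
#   python tool.py --junctions junctions_dir --gff_whole genome.gff --gff_introns_only introns_only.gff --genome_sequence genome.fasta --AED_threshold 0.5 --intron_cutoff 0.2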
#defining variables
folder = str(args.junctions)
gff_whole = str(args.gff_whole)
gff_introns_only = str(args.gff_introns_only)
genome_fasta = str(args.genome_sequence)
AED_threshold = float(args.AED_threshold)
cutoff = float(args.intron_cutoff)
#error handling to check that the paths supplied exist
if not (os.path.exists(folder) and os.path.exists(gff_whole) and os.path.exists(gff_introns_only) and os.path.exists(genome_fasta)):
    stream_logger.error("Please check that all the files/folders supplied exist. Not all were found. Exiting program.")
    raise SystemExit(1)
#now the program is starting.
##### class for introns ######
#this class was partly written by Dr. Kathryn Crouch based on a list structure in an earlier draft of the script written by me
# Note: we don't actually care about non-unique counts - chosen not to capture this information
class Intron:
_gene_list = False
_in_annotation = False
# constructor validates the inputs and stores the intron's attributes
def __init__(self, contig, start, end, strand, unique, NU):
self._key = f'{contig}:{start}-{end}:{strand}'
self._contig = contig
# error handling to check start and end are ints
try:
assert type(start) == int
assert type(end) == int
except AssertionError:
stream_logger.error("During initialization of this intron object, start and end positions were not found to be integers. Please check and try again. Raising SystemExit.")
raise SystemExit(1)
self._start = start
self._end = end
# error handling to check + or -
try:
assert str(strand) == "+" or str(strand) == "-"
except AssertionError:
stream_logger.error("During initialization of this intron object, strand was not '+' or '-'. Please check and try again. Raising SystemExit.")
raise SystemExit(1)
self._strand = strand
# error handling to check this is an int
try:
assert type(unique) == int
except AssertionError:
stream_logger.error("During initialization of this intron object, unique read depth was not found to be an integer. Please check and try again. Raising SystemExit.")
raise SystemExit(1)
self._unique = unique
# getters
@property
def key(self):
return self._key
@property
def contig(self):
return self._contig
@property
def start(self):
return self._start
@property
def end(self):
return self._end
@property
def strand(self):
return self._strand
@property
def unique(self):
return self._unique
@property
def in_annotation(self):
return self._in_annotation
@property
#if we have any records of this intron existing in any gene(s), this will be listed. else, it shows a result of the intron being in zero genes.
def gene_list(self):
if self._gene_list:
return self._gene_list
else:
return 0
# setters - only for attributes that we want to update
# unique setter should have some checking for ints
@unique.setter
def unique(self, count):
try:
self._unique = int(count)
except ValueError:
stream_logger.error(f"The unique could not be set since it is not an integer. Please check, raising SystemExit.")
raise SystemExit(1)
@in_annotation.setter
def in_annotation(self, value): #the parameter is named 'value' to avoid shadowing the builtin 'bool'
    self._in_annotation = value
@gene_list.setter
def gene_list(self, genes):
self._gene_list = genes
# method to increment unique counts
def add_unique(self, count):
# check that count is an int
try:
self.unique += int(count)
except ValueError:
stream_logger.error(f"The unique count could not be incremented since the supplied count is not an integer. Please check, raising SystemExit.")
raise SystemExit(1)
# make a list of genes that our intron is in
def add_genes(self, *genes):
if self.gene_list:
for gene in genes:
self.gene_list.append(gene)
else:
self.gene_list = list(genes)
# what to show when we use print
def __repr__(self):
gene_ids = ', '.join([gene.id for gene in self.gene_list]) if self.gene_list else 'None'
return f'{self.contig}:{self.start}-{self.end}:{self.strand}\nUnique counts:\t{self.unique}\nGenes:\t{gene_ids}\n'
#defining function to check clashes
def clash_checker(intronA, intronB):
# KC: would not intronB.start <= intronA.end be a simpler test here? #MG: no, it would not cover clash case II (consult clash_cases diagram in appendix folder)
if int(intronB.start) - int(intronA.end) <= 1: #i.e. B's start pos - A's end pos; a difference of 1 or less means the introns overlap or abut with no exonic base between them
    return True
else:
    return False
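#a minimal illustration of clash_checker with hypothetical coordinates, where intronA spans 100-200:
#   intronB starts at 150 -> 150 - 200 = -50 <= 1 -> True  (overlap)
#   intronB starts at 201 -> 201 - 200 =   1 <= 1 -> True  (introns abut with no exonic base between them, clash case II)
#   intronB starts at 202 -> 202 - 200 =   2  > 1 -> False (at least one exonic base separates them, no clash)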
#comparing all our predicted gene models (post filtering) to annotation models using AED
def AED_calculator(ann_model, predicted_model):
#ann model will be the query, and predicted models will be the reference (created from experimental evidence)
#both should be ordered by the start pos.
sorted_query = sorted(ann_model, key=lambda x: x.start)
sorted_ref = sorted(predicted_model, key=lambda x: x.start)
#mathematical concept taken from maker2 paper
#citation for the same: Holt, C., & Yandell, M. (2011). MAKER2: an annotation pipeline and genome-database management tool for second-generation genome projects. BMC bioinformatics, 12, 491. https://doi.org/10.1186/1471-2105-12-491
#introns will be compared in pairs. for each pair of unknown position and length (with the only assumption being directionality and order)
#we'll go intron-wise in the query
#there are 13 possible cases (check diagram). writing statements for these, and creating a variable to store intersection (base pair wise)
intersection = 0
for query_intron in sorted_query:
#print("\nQuery Intron")
#print(query_intron)
for ref_intron in sorted_ref:
#print("\nRef Intron")
#print(ref_intron)
query_end = int(query_intron.end)
query_start = int(query_intron.start)
ref_end = int(ref_intron.end)
ref_start = int(ref_intron.start)
#ann is query, junc is ref
#case I - perfect match
if query_end - ref_end == 0 and query_start - ref_start == 0:
common_bp = (query_end - query_start) + 1
intersection = intersection + common_bp
#case II - ref_intron within query_intron
elif query_end - ref_end > 0 and query_start - ref_start < 0:
common_bp = (ref_end - ref_start) + 1
intersection = intersection + common_bp
#case III - query_intron within ref_intron
elif query_end - ref_end < 0 and query_start - ref_start > 0:
common_bp = (query_end - query_start) + 1
intersection = intersection + common_bp
#case IV - ref_intron ahead, query lagging
elif query_end - ref_end > 0 and query_start - ref_start > 0 and query_start - ref_end < 0:
common_bp = (ref_end - query_start) + 1
intersection = intersection + common_bp
#case V - query ahead, ref lagging
elif query_end - ref_end < 0 and query_start - ref_start < 0 and query_end - ref_start > 0:
common_bp = (query_end - ref_start) + 1
intersection = intersection + common_bp
#case VI - ref outside and after query
elif query_end - ref_end < 0 and query_start - ref_start < 0 and query_end - ref_start < 0:
#there are no more introns that have any common sequence with the query intron, breaking the loop
break
#case VII - same start, query is longer
elif query_end - ref_end > 0 and query_start - ref_start == 0:
common_bp = (ref_end - ref_start) + 1
intersection = intersection + common_bp
#case VIII - same start, reference is longer
elif query_end - ref_end < 0 and query_start - ref_start == 0:
common_bp = (query_end - query_start) + 1
intersection = intersection + common_bp
#case IX - same end, query is longer
elif query_end - ref_end == 0 and query_start - ref_start < 0:
common_bp = (ref_end - ref_start) + 1
intersection = intersection + common_bp
#case X - same end, ref is longer
elif query_end - ref_end == 0 and query_start - ref_start > 0:
common_bp = (query_end - query_start) + 1
intersection = intersection + common_bp
#case XI - exact trail, query first, then ref
elif query_end - ref_end < 0 and query_start - ref_start < 0 and query_end - ref_start == 0:
common_bp = 1
intersection = intersection + common_bp
#case XII - exact trail, ref first, then query
elif query_end - ref_end > 0 and query_start - ref_start > 0 and query_start - ref_end == 0:
common_bp = 1
intersection = intersection + common_bp
#case XIII - ref and query apart - ref first, then query
elif query_end - ref_end > 0 and query_start - ref_start > 0 and query_start - ref_end > 0:
pass #there is no commonality, hence no intersection to be counted
#now calculating the number of bp in introns in the query
query_bp = 0
for query_intron in sorted_query:
bp = (query_intron.end) - (query_intron.start) + 1
query_bp = query_bp + bp
#similarly, calculating for ref
ref_bp = 0
for ref_intron in sorted_ref:
bp = (ref_intron.end) - (ref_intron.start) + 1
ref_bp = ref_bp + bp
#calculating sensitivity (SN)
try:
    SN = intersection / ref_bp
except ZeroDivisionError: #ref_bp may be zero if the predicted model contains no introns. substituting a very small value avoids the error; the intersection is also zero in that case, so SN evaluates to 0
    SN = intersection / 0.000001
#calculating specificity (SP)
try:
SP = intersection / query_bp
except ZeroDivisionError: #there may be a case where query_bp is zero - the ann model containing zero introns. here, we can tweak it to be a very small value to avoid a Zero Division Error
SP = intersection / 0.000001
#calculating congruency (C)
C = (SN + SP) / 2
#calculating incongruency (D), also known as AED
AED = 1 - C
return AED
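#a worked example with hypothetical values: if the annotation (query) has one intron at 100-199 (100 bp)
#and the prediction (ref) has one intron at 150-249 (100 bp), the intersection is 150-199 (50 bp, case V above),
#so SN = 50/100 = 0.5, SP = 50/100 = 0.5, C = (0.5 + 0.5)/2 = 0.5, and AED = 1 - 0.5 = 0.5.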
#creating a function to predict gene models from a list of Intron class objects
#this function contains some modifications done by Dr. Kathryn Crouch with the purpose of removing bugs, each of these is highlighted with a comment starting with #KC.
def gene_models_predictor(introns):
#sorting directionally
introns = sorted(introns, key=lambda x: x.start)
gene_models_list = []
list1 = [] #first gene model
for intron in introns:
#we start with model number 1
if len(list1) == 0: #this means we're adding the first intron in, this can be added as is, without any testing
list1.append(intron)
gene_models_list.append(list1.copy()) #KC: added .copy()
else: #we have an existing set of model(s)
#we need to check if there is a clash between current 'intron' and last intron in every gene model present in gene_models_list
#how does this work? - if all are clashing, new model(s) needs to be prepared and added in, otherwise we can just add the intron into whichever gene model it is not clashing with
#testing the former
# KC changed this to any - you end up with more models to assess later, but you were missing some with all
if any(clash_checker(model[-1], intron) for model in gene_models_list):
#what now? - in that case, we create a new list containing all models with the last intron removed.
#duplicates are then removed (since there might be models that are the same except for the last intron).
#this is repeated until either a model is found which doesn't clash with the intron-of-interest,
#or until all models are emptied out, in which case a new model is created containing only the intron-of-interest.
new_models = copy.deepcopy(gene_models_list) #KC: changed copy to deepcopy
while len(new_models) != 0 and all(clash_checker(model[-1], intron) for model in new_models): #note - an empty model will throw an error with clash_checker, and the model[-1] bit. this won't be an issue since if the first condition in an 'and' statement evaluates false, the second isn't evaluated at all!
#we need to pop last item from each list. if after popping the model is empty, the entire model is removed.
models_to_remove = []
for model in new_models:
try:
model.pop()
except IndexError: #popping from empty list
pass
if len(model) == 0:
models_to_remove.append(model.copy()) #KC: added copy
for model in models_to_remove:
if model in new_models:
new_models.remove(model)
del models_to_remove
#removing duplicates
new_models_without_duplicates = []
for model in new_models:
if model not in new_models_without_duplicates:
new_models_without_duplicates.append(model)
new_models = new_models_without_duplicates
#once the while loop has stopped running, either the list is empty (there are no more models left), or there is at least one model which is not clashing with the intron of interest.
#checking former
if len(new_models) == 0:
new_gene_model = [intron]
gene_models_list.append(copy.deepcopy(new_gene_model)) #KC: added deepcopy
#checking latter
elif not all(clash_checker(model[-1], intron) for model in new_models):
#manually going through each potential new model. if there is still a clash, it can be skipped, if there is no clash, the new intron can be appended and the new model can be added to the master list
for model in new_models:
if clash_checker(model[-1], intron):
pass
else:
model.append(intron)
gene_models_list.append(copy.deepcopy(model)) #KC: added deepcopy
#testing the latter
else: #if this is true, intron may be safely added to at least one of the existing models
for model in gene_models_list:
if clash_checker(model[-1], intron):
pass
else:
model.append(intron)
return gene_models_list
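#a minimal sketch of the behaviour with hypothetical introns: given A at 100-200, B at 150-250 and C at 300-400,
#A and B clash with each other but neither clashes with C, so two models come out - [A, C] and [B, C] -
#each of which is internally clash-free.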
#defining a function to translate DNA sequence
#function adapted from - https://www.geeksforgeeks.org/dna-protein-python-3/
def translate(DNA_seq):
codon_table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
}
protein_seq =""
for i in range(0, len(DNA_seq), 3):
codon = DNA_seq[i:i + 3]
protein_seq += codon_table[codon]
return protein_seq
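#a quick sanity check against the codon table above: translate("ATGGCCTAA") returns "MA*" -
#a start codon, an alanine codon, and a terminal stop.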
#extracting all junctions files found in given folder (including all sub-folders)
files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(folder)) for f in fn]
dict_all = dict() #this is a dictionary that will contain all introns found in the junctions files.
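#the junctions files are assumed to be tab-separated with one header line; a hypothetical record:
#   TGME49_chrI:1000-1100	+	42	3
#i.e. location, strand, unique read count, and non-unique read count, which is parsed below into contig
#'TGME49_chrI', start 1000, end 1100, strand '+', and stored under the key 'TGME49_chrI:1000-1100:+'.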
#iterating through junction files
for file in files:
with open(file) as in_file:
#skipping header
next(in_file)
#iterating
for line in in_file:
# unpacking everything in one go
location, strand, unique, NU = line.rstrip().split('\t')
#creating key for dictionary
key = str(location) + ':' + str(strand)
#creating value for the key created
contig = location.split(':')[0]
start_pos = location.split(':')[1].split('-')[0]
end_pos = location.split(':')[1].split('-')[1]
# creating your value here isn't necessary - only create it below where you need it
# we don't care about introns with no unique reads
# by excluding these, we can cut down processing time for the GFF part
if int(unique) >= 1:
if key in dict_all:
# I made the intron data into a class - we can use the class method to increment these counts
# this is safer and more efficient than partially overwriting the list like you were doing
# note that although I wrote a lot of boilerplate above, the code here is now reduced from 10 lines to 1 line, and it's safer!
dict_all[key].add_unique(int(unique))
else:
# This is now a dict where each value is an Intron object, with its identifier key
dict_all[key] = Intron(contig, int(start_pos), int(end_pos), strand, int(unique), int(NU))
file_logger.info(f"these are all the introns in the all read junctions files - {len(dict_all)}")
#writing a gff file listing all introns in the junction files.
with open('all_introns.gff','w') as out_file:
for intron in dict_all.values():
out_file.write(f"{intron.contig}\tVEuPathDB\tintron\t{intron.start}\t{intron.end}\t.\t{intron.strand}\t.\tID={intron.key}\n")
#works as expected
file_logger.info(f"the file 'all_introns.gff' was successfully written. this contains gff records of all junctions supplied.")
dict_ann_only = dict() #this is a dictionary that contains introns found only in annotations, that were not found in the junctions files.
#now opening gff file - introns only
with open(gff_introns_only) as in_file:
for line in in_file:
line = line.rstrip().rsplit('\t')
#reproducing the key
key = str(line[0]) + ':' + str(line[3]) + '-' + str(line[4]) + ':' + str(line[6])
if key in dict_all:
dict_all[key].in_annotation= True
else:
dict_ann_only[key] = line
file_logger.info(f"these are all the introns found in annotations only, unsupported by reads - {len(dict_ann_only)}")
dict_read_only = dict() #this is a dictionary containing introns that we found only in reads, not annotated. we expect these to be high in number, the majority in fact, as there will be many instances of noise 'introns' with very low unique read depth
dict_both = dict() #this is a dictionary that contains introns found in both the reads (junctions files) and annotations
for key, intron in dict_all.items():
if not intron.in_annotation:
dict_read_only[key] = intron
else:
dict_both[key] = intron
file_logger.info(f"these are all the introns found in the reads only, i.e. not annotated - {len(dict_read_only)}")
file_logger.info(f"these are all the introns found in reads that were annotated - {len(dict_both)}") #for SRR6493537 (used to test a bit of this script), this is 30402 (total junctions in reads) - 3250 (junctions in reads not annotated) #30402 - 3250 = 27152
#creating histograms to show the distribution of unique read depth in reads_only_introns and reads_that_were_annotated.
read_only_unique = []
for intron in dict_read_only.values():
read_only_unique.append(intron.unique)
both_unique = []
for intron in dict_both.values():
both_unique.append(intron.unique)
#creating distribution plots
plt.hist(read_only_unique, color='lightgreen', ec='black', bins=15)
#the plot is titled
plt.title('Read Depth (Unique) in Introns Found in Reads Only')
#the plot is saved
plt.savefig('readsonly_depth.png')
#the plot object is erased.
plt.clf()
plt.hist(both_unique, color='lightgreen', ec='black', bins=15)
#the plot is titled
plt.title('Read Depth (Unique) in Introns Found in Reads That Were Annotated')
#the plot is saved
plt.savefig('readsandanns_depth.png')
#the plot object is erased.
plt.clf()
file_logger.info(f"two histograms were successfully created - named 'readsonly_depth.png' and 'readsandanns_depth.png'. \nboth saved in the working directory.")
#removing extra lists from memory
del both_unique, read_only_unique
#creating the genome annotation database from the gff file
db = gff_whole.replace('.gff','.db')
#if the database already exists, this is loaded. else, a new database is created.
if not os.path.isfile(db):
db = gffutils.create_db(gff_whole,dbfn=db,force=True,keep_order=True,merge_strategy='merge',id_spec=['ID', 'Name'],sort_attribute_values=True)
else:
db = gffutils.FeatureDB(db, keep_order=True)
#this section of the code (lines 567 - 622) was written by Dr. Kathryn Crouch as an improvement upon previous code written by me that had similar functionality but less-than-ideal efficiency.
# it is more efficient to run one query to get all the genes, then iterate through it gene-wise
# also, since introns cannot cross chromosomes, I've split the intron dict into chromosomes so you only consider the introns that are on the correct chromosome each time
# this reduces the computational burden further
# make a new dict where the chromosomes are the keys
# the value is a list of Intron objects in that chromosome
dict_by_chr = {}
for key, intron in dict_all.items():
if intron.contig in dict_by_chr:
dict_by_chr[intron.contig].append(intron)
else:
dict_by_chr[intron.contig] = [intron]
# now we consider only one chromosome at a time
#while doing this, also creating a master list of ALL genes
genes_master = []
for chr, intronList in dict_by_chr.items():
# sort the list of introns in this chromosome by start
# this will make it easier for us to avoid iterating through every intron for every gene
intronList.sort(key=lambda x: x.start)
# I'm going to query just once per chromosome and iterate through the list of genes
for gene in db.region(seqid=chr,featuretype='protein_coding_gene', completely_within=True):
#adding gene to master list
#creating the key
key = str(gene.id) + ':' + str(gene.start) + '-' + str(gene.end) + ':' + str(gene.strand)
#adding to list
genes_master.append(key)
# I believe this list of genes is ordered already
# this list will help us to avoid upstream introns in future iterations
introns_to_remove = 0
# Note that we are only looking at introns that we know are already on the same contig as the current gene
for intron in intronList:
# one end of the intron is in a gene
# we only want to add the gene to the intron if they're both on the same strand.
if ((gene.end >= intron.start and intron.start >= gene.start and str(gene.strand) == str(intron.strand)) or (gene.end >= intron.end and intron.end >= gene.start and str(gene.strand) == str(intron.strand))):
# note that we add the whole gene feature object here - this contains the location and strand information
# you can use this later to determine if the intron is incompletely within a gene and if the strand matches
dict_all[intron.key].add_genes(gene)
# if the end of this intron is before the end of the current gene, the intron cannot span to the next gene
# we will remove it so that we do not waste time considering introns that cannot be in genes
if intron.end <= gene.end:
introns_to_remove += 1
# if the intron starts AFTER this gene, there is no point in going through the rest of the list, so stop this loop
if intron.start >= gene.end:
break
# we remove introns that we know must be upstream of the next gene before the next loop so that we don't consider them
# this must be done outside of the loop that iterates this list
del intronList[0:introns_to_remove]
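#a small illustration of the sweep with hypothetical coordinates: with genes G1 at 1-1000 and G2 at 2000-3000
#on one contig, an intron at 100-200 is assigned to G1 and then dropped from the sorted list (its end is before
#G1's end, so it cannot reach the next gene), while an intron at 1500-2500 is kept for G2's pass, and G1's scan
#stops as soon as an intron starting at or after position 1000 is reached.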
# a list of the genes is now stored in each Intron object; to access introns by gene instead, a gene-keyed dict is built below
#creating a dict to store information on genes
gene_dict = {}
for intron in dict_all.values():
if intron.gene_list:
for gene in intron.gene_list:
#creating the key
key = str(gene.id) + ':' + str(gene.start) + '-' + str(gene.end) + ':' + str(gene.strand)
#example key - TGME49_318880:5521-10311:+
if key in gene_dict:
gene_dict[key].append(intron)
else:
gene_dict[key] = [intron]
#setting a cutoff value
#calculating int/intronX over whole genome
int_by_intX = []
for gene in gene_dict.keys():
#assigning 'Intron X' to a variable
IntronX = max(gene_dict[gene], key=attrgetter('unique'))
for intron in gene_dict[gene]:
int_by_intX.append(int(intron.unique)/int(IntronX.unique))
#plotting this in a univariate plot
sns.violinplot(y=int_by_intX).figure.savefig('cutoff.png') #doing violin instead of swarmplot because the latter was taking hours and hours to run due to too many data points
#the plot object is erased.
plt.clf()
#based on this plot, the default relative cutoff is set at 0.2 for genes with a maximum read depth up to 5000; above that, introns with more than 1000 unique reads are counted as valid instead. #this will be added as a default to this program's command line.
#report the creation of plot in the logger
file_logger.info(f"a violin plot of distribution of cutoff values across the whole genome was created. this is named 'cutoff.png' and may be found in the working directory.")
#deleting extra stuff from memory
del int_by_intX, dict_ann_only, dict_both, dict_read_only
#creating counts for each flag
likely_valid_count = 0
likely_invalid_count = 0
unsure_count = 0
#now going gene by gene and writing both the gene output file, and appending predictions in gff format to the main supplied gff file.
with open(gff_whole,'a') as gff_file:
with open('gene_info.csv','w') as out_file:
#writing the header in the output file (note that the fields are tab-separated, despite the .csv extension)
out_file.write(f"Gene\tFlag\tCase\tRef Transcript\tAED Scores\tTotal Introns Found\tNumber of Valid Introns\tMean Read Depth Before Filtering\tMean Read Depth After Filtering\tMaximum Read Depth\tCutoff Used\tEvidence of Validity of Ref Transcript?\tEvidence of Existence of Alternative Transcript?\tBest Alternative Model\n")
#here, adding to gene_dict the genes which didn't coincide with *any* intron from the junction files.
#because of the way gene_dict has been built, so far it only includes genes for which at least one intron was found in the junctions;
#we also want to report information on genes for which no introns were found.
for gene in genes_master:
    if gene not in gene_dict:
        gene_dict[gene] = [] #the value is an empty list, which represents that no introns were found
#iterating through genes. writing output accordingly.
for gene, intron_list in gene_dict.items():
#defining some variables
gene_ID = gene.split(':')[0]
gene_start = int(gene.split(':')[1].split('-')[0])
gene_end = int(gene.split(':')[1].split('-')[1])
gene_strand = gene.split(':')[2]
total_introns = len(intron_list)
#checking if any introns were found in junction files for this gene. if not, we can write this in with the 'unsure' flag
if len(intron_list) == 0:
case_ID = "C"
flag = "Unsure - Intron Evidence not Found in Supplied Dataset, No Alternatives"
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t0\t-\t-\t-\t-\t-\tFalse\t-\n")
unsure_count = unsure_count + 1
continue
#assigning 'Intron X'
IntronX = max(intron_list, key=attrgetter('unique'))
#sorting the intron_list by start
intron_list.sort(key=lambda x: x.start)
#defining some more variables
contig = intron_list[0].contig #all introns in this list lie on the same contig, so the first one is representative
read_depth_values = [o.unique for o in intron_list]
mean_read_depth_all = mean(read_depth_values)
max_read_depth = IntronX.unique
#here we need to consider the special case of exon-only genes - since IntronX will not be a true representation
#of the expression of such a gene, we need to impose an absolute metric. for our purposes, an intron with more
#than 1000 reads is good enough.
exon_only = False
if len(list(db.children(id=gene_ID, completely_within= True, featuretype= 'intron'))) == 0:
exon_only = True
#filtering out invalid introns
filtered_intron_list = []
for intron in intron_list:
if IntronX.unique > 5000 or exon_only:
cutoff_used = "More than 1000 (absolute)"
if intron.unique > 1000:
filtered_intron_list.append(intron)
else:
cutoff_used = f"More than or equal to {cutoff} * max read depth"
if int(intron.unique)/int(IntronX.unique) >= cutoff:
filtered_intron_list.append(intron)
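#e.g. (hypothetical numbers): with the default cutoff of 0.2 and IntronX.unique = 1000, an intron with
#250 unique reads is kept (250/1000 = 0.25 >= 0.2) while one with 150 is dropped; if IntronX.unique were
#6000 (or the gene were exon-only), the absolute rule applies instead and only introns with more than
#1000 unique reads are kept.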
del intron_list
#defining some more variables
total_introns_after_filtering = len(filtered_intron_list)
read_depth_values = [o.unique for o in filtered_intron_list]
if total_introns_after_filtering == 0:
mean_read_depth_filtered = 0
else:
mean_read_depth_filtered = mean(read_depth_values)
#for any exon-only gene for which no introns remained after filtering, we can say that they're valid
if exon_only and len(filtered_intron_list) == 0:
flag = "Likely Valid - Exon Only, No Alternatives"
case_ID = "A"
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\tTrue\t-\t-\n")
likely_valid_count = likely_valid_count + 1
continue
#if exon_only is False (the gene has an intron) but filtered_intron_list is still empty, the record contains a gene with an intron that was not found in the junction files.
# KC: it likely means that the intron has been added based on other RNA-seq experiments that you are not considering here. It does not necessarily mean it is manually added
if not exon_only and len(filtered_intron_list) == 0:
flag = "Unsure - Intron Evidence not Found in Supplied Dataset, No Alternatives"
case_ID = "C"
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\tTrue\t-\t-\n")
unsure_count = unsure_count + 1
continue
#iterating through introns to find and check flanking introns. the moment the first valid one is found, the loop is stopped and the information is recorded.
#if there is a flanking intron, then either the start of the first intron will be out of bounds of the gene, or the end of one of the introns in the list will be out of bounds
#checking if start is out of bounds
if filtered_intron_list[0].start <= gene_start:
flag = "Likely Invalid - Flanking Intron, No Alternatives"
case_ID = "D"
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\tFalse\t-\t-\n")
likely_invalid_count = likely_invalid_count + 1
continue
#checking if end is out of bounds
# KC: When you checked the start above, you just checked the first intron in the list # why not just check the last intron in the list here? Faster than iterating....
#MG: The reason for iterating is that the list is sorted by start position (ascending), not end position.
#Further, this is a list of all (filtered) introns, not a gene model, so there may well be clashes: introns in
#the middle of the list may be longer than subsequent ones and run out of bounds, while shorter introns towards
#the end stay within bounds. For this reason, I iterate and break the loop at the first flanking intron found.
#MG: The other option was to rearrange this list by end position in descending order, but I didn't want to mess with an order that is uniform and works.
continue_to_next_gene = False #initialized before the loop so that the flag is always defined when checked below
for intron in filtered_intron_list:
if intron.end >= gene_end:
flag = "Likely Invalid - Flanking Intron, No Alternatives"
case_ID = "E"
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\tFalse\t-\t-\n")
continue_to_next_gene = True #once info has been written, we can move onto the next gene
likely_invalid_count = likely_invalid_count + 1
break
if continue_to_next_gene:
continue
#if exon-only is true, for which at least one intron remained after filtering, we can say that it is probably not valid as it is not accounting for what the evidence suggests
if exon_only and len(filtered_intron_list) != 0:
flag = "Likely Invalid - Alternative(s) Available"
case_ID = "B"
#creating predictions
predicted_gene_models = gene_models_predictor(filtered_intron_list)
#since AED will be 1 (the annotation contains no introns, so it is completely different from the evidence), we will not calculate it at all
#instead, we'll just write these predictions into the gff file and find the best alternative model (the code for this is repeated below; it was a last-minute addition, so there was little time for refactoring)
#writing the gff file, and finding mean read depth (an example of the rows produced is sketched after this block)
ID_counter = 0
mean_read_depth_list = []
for prediction in predicted_gene_models:
ID_counter = ID_counter + 1
gff_file.write(f"{contig}\tVEuPathDB_Prediction\tPredicted_Transcript\t{gene_start}\t{gene_end}\t.\t{gene_strand}\t.\tID={gene_ID}_p{ID_counter};Parent={gene_ID}\n")
#writing introns first, it is straightforward
for counter, intron in enumerate(prediction):
#the ID for each intron will be {prediction.id}-I{counter+1} (because counter is zero indexed but these IDs in gff files are 1 indexed)
gff_file.write(f"{contig}\tVEuPathDB_Prediction\tintron\t{intron.start}\t{intron.end}\t.\t{gene_strand}\t.\tID={gene_ID}_p{ID_counter}-I{counter+1};Parent={gene_ID}_p{ID_counter};gene_id={gene_ID}\n")
#writing exons, now, we need a counter, we need to write the first and last exon manually, and the rest within a loop over pairs of introns
#creating this counter
exon_counter = 1 #starting at one because this is for IDs and IDs are 1-indexed
#writing first exon
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{gene_start}\t{(prediction[0].start)-1}\t.\t{gene_strand}\t.\tID={gene_ID}_p{ID_counter}-E{exon_counter};Parent={gene_ID}_p{ID_counter};gene_id={gene_ID}\n")
exon_counter = 2
#writing all middling exons
for intronA, intronB in zip(prediction, prediction[1:]):
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{(intronA.end)+1}\t{(intronB.start)-1}\t.\t{gene_strand}\t.\tID={gene_ID}_p{ID_counter}-E{exon_counter};Parent={gene_ID}_p{ID_counter};gene_id={gene_ID}\n")
exon_counter = exon_counter + 1
#writing the last exon
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{(prediction[-1].end)+1}\t{(gene_end)}\t.\t{gene_strand}\t.\tID={gene_ID}_p{ID_counter}-E{exon_counter};Parent={gene_ID}_p{ID_counter};gene_id={gene_ID}\n")
#gff written
read_depth = []
for intron in prediction:
read_depth.append(intron.unique)
mean_read_depth_list.append(mean(read_depth))
#best model suggestion
#finding which index has the highest number in mean_read_depth_list
index_max = np.argmax(mean_read_depth_list)
#this is the index of the model in predicted_gene_models_list that is the best alt model
best_alt_model_ID = f"{gene_ID}_p{index_max+1}" #+1 added because index will be zero indexed but IDs are 1 indexed
#writing in gene_info
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t-\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\tTrue\tTrue\t{best_alt_model_ID}\n")
likely_invalid_count = likely_invalid_count + 1
continue
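#for one hypothetical single-intron prediction on gene GENE at 1-1000 with an intron at 101-200,
#the rows appended to the gff above look like (tab-separated):
#   chr	VEuPathDB_Prediction	Predicted_Transcript	1	1000	.	+	.	ID=GENE_p1;Parent=GENE
#   chr	VEuPathDB_Prediction	intron	101	200	.	+	.	ID=GENE_p1-I1;Parent=GENE_p1;gene_id=GENE
#   chr	VEuPathDB_Prediction	exon	1	100	.	+	.	ID=GENE_p1-E1;Parent=GENE_p1;gene_id=GENE
#   chr	VEuPathDB_Prediction	exon	201	1000	.	+	.	ID=GENE_p1-E2;Parent=GENE_p1;gene_id=GENE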
#here on out, we only have genes for which ALL the filtered introns are found within bounds
# for all of these genes, we want to go transcript-wise.
for transcript in (list(db.children(id=gene_ID, completely_within= True, featuretype= 'mRNA'))):
ref_transcript_ID = transcript.id
#for this transcript, finding the beginning position of the first CDS and the end position of the last CDS to find beginning and end of coding sequence
#strand does not matter right now - we're just interested in the beginning and end; strand is accounted for later, when the DNA sequence is extracted with use_strand=True
CDS_start = int((list(db.children(id=transcript.id, completely_within= True, featuretype= 'CDS'))[0]).start)
CDS_end = int((list(db.children(id=transcript.id, completely_within= True, featuretype= 'CDS'))[-1]).end)
#creating predictions
predicted_gene_models = gene_models_predictor(filtered_intron_list)
#checking these predictions and removing any model that is incompatible with the annotated CDS
predictions_to_remove = [] #initialized once, before the loop, so that removals are collected across all predictions rather than reset each iteration
for prediction in predicted_gene_models:
    #a model is declared invalid if the CDS start OR end falls within the bounds of any of its introns. checking this first
    for intron in prediction:
try:
assert CDS_start < intron.start or CDS_start > intron.end
assert CDS_end < intron.start or CDS_end > intron.end
except AssertionError:
predictions_to_remove.append(prediction)
break
for prediction in predictions_to_remove:
if prediction in predicted_gene_models:
predicted_gene_models.remove(prediction)
#now for the actual sequence extraction - this is discussed in the report's future work section
#formatting this annotated transcript
ann_gene_model = []
for ann_intron in list(db.children(id=transcript.id, completely_within= True, featuretype= 'intron')):
ann_intron = Intron(contig, int(ann_intron.start), int(ann_intron.end), ann_intron.strand, 0, 0) #unique and non-unique values set to zero because these are annotations, so this information does not apply
ann_gene_model.append(ann_intron)
#checking if ann_gene_model is valid
ann_seq = ""
if gene_strand == "+":
CDSes = db.children(id=transcript.id,featuretype='CDS',order_by='start') #all the CDS children of the transcript are found, and are ordered by their start position.
elif gene_strand == "-":
CDSes = db.children(id=transcript.id,featuretype='CDS',order_by='start',reverse=True) #all the CDS children of the transcript are found, and are ordered by their start position, and then reversed because the strand is negative.
for CDS in CDSes:
ann_seq = ann_seq + str(CDS.sequence(genome_fasta,use_strand=True)) #use_strand = True ensures that the coding sequence is read with respect to the strand directionality.
#the length of our sequence should be divisible by 3. if not, it is not valid.
if len(ann_seq) % 3 != 0:
    ann_seq_valid = False
else: #it can be translated and checked for stop codons
    try:
        protein = translate(ann_seq) #translating once and reusing the result, rather than translating the sequence twice
        assert protein.count("*") == 1 and protein.endswith("*")
        ann_seq_valid = True
    except AssertionError:
        ann_seq_valid = False
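#a quick illustration of the stop-codon rule: translate("ATGGCCTAA") == "MA*" has exactly one stop, at the end,
#so the sequence is valid; translate("ATGTAAGCC") == "M*A" contains a premature stop, so it is not.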
#filtering out based on AED
predictions_to_remove = []
for prediction in predicted_gene_models:
if float(AED_calculator(ann_gene_model,prediction)) > AED_threshold:
predictions_to_remove.append(prediction)
for prediction in predictions_to_remove:
if prediction in predicted_gene_models:
predicted_gene_models.remove(prediction)
#if AED based filtering filtered out *all* the models, this could mean two things - 1) either there wasn't enough evidence to validate if annotation was correct or not, or 2) there was plenty of evidence but the annotation was *so* incorrect that it can't be used as even a rough yardstick to compare predictions to, and so the script is unable to recommend predictions.
#in the former case, the annotation may be correct or incorrect, in the latter, the annotation is wrong. but there is no way to know which it is (at least programmatically within the paradigm of this tool)
#for this reason, the flag unsure, no alternatives is allotted.
if len(predicted_gene_models) == 0:
flag = "Unsure - Alternative(s) Not Available"
case_ID = "O"
unsure_count = unsure_count + 1
out_file.write(f"{gene_ID}\t{flag}\t{case_ID}\t{ref_transcript_ID}\t-\t{total_introns}\t{total_introns_after_filtering}\t{mean_read_depth_all}\t{mean_read_depth_filtered}\t{max_read_depth}\t{cutoff_used}\t-\tFalse\t-\n")
continue
#out of the models we have left, creating pandas dataframe to store information
#class objects can't be properly stored in a pandas dataframe, so storing indexes to the predicted_gene_models list instead
indices = list(range(len(predicted_gene_models))) #one positional index per remaining model
#initiating dataframe
df = pd.DataFrame(indices, columns=['Gene_Model'])
#adding AED and mean read depth info
AED_list = []
mean_read_depth_list = []
for prediction in predicted_gene_models:
AED_list.append(float(AED_calculator(ann_gene_model,prediction)))
read_depth = []
for intron in prediction:
read_depth.append(intron.unique)
mean_read_depth_list.append(mean(read_depth))
df.insert(0,'AED',AED_list,True)
df.insert(0,'Mean_Read_Depth',mean_read_depth_list,True)
#sorting models based on AED. note that sort_values returns a new dataframe, so the result must be assigned;
#the index is also reset so that the positional lookups below (iloc with idxmax) stay consistent with the new order.
df = df.sort_values('AED').reset_index(drop=True)
#adding a column for IDs - the ID for each prediction will be {transcript.id}+'_p'+{index} (this may also be considered AED-based ranking)
prediction_IDs = []
for i in range(1,len(df.index)+1,):
prediction_IDs.append(f"{transcript.id}_p{i}")
df.insert(0,'IDs',prediction_IDs,True)
#writing this df into the gff file
for index, row in df.iterrows():
#each row is one predicted transcript
gff_file.write(f"{contig}\tVEuPathDB_Prediction\tPredicted_Transcript\t{gene_start}\t{gene_end}\t.\t{gene_strand}\t.\tID={row['IDs']};Parent={gene_ID}\n")
model = predicted_gene_models[int(row['Gene_Model'])]
#writing introns first, it is straightforward
for counter, intron in enumerate(model):
#the ID for each intron will be {prediction.id}-I{counter+1} (because counter is zero indexed but these IDs in gff files are 1 indexed)
gff_file.write(f"{contig}\tVEuPathDB_Prediction\tintron\t{intron.start}\t{intron.end}\t.\t{gene_strand}\t.\tID={row['IDs']}-I{counter+1};Parent={row['IDs']};gene_id={gene_ID}\n")
#writing exons, now, we need a counter, we need to write the first and last exon manually, and the rest within a loop over pairs of introns
#creating this counter
exon_counter = 1 #starting at one because this is for IDs and IDs are 1-indexed
#writing first exon
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{gene_start}\t{(model[0].start)-1}\t.\t{gene_strand}\t.\tID={row['IDs']}-E{exon_counter};Parent={row['IDs']};gene_id={gene_ID}\n")
exon_counter = 2
#writing all middling exons
for intronA, intronB in zip(model, model[1:]):
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{(intronA.end)+1}\t{(intronB.start)-1}\t.\t{gene_strand}\t.\tID={row['IDs']}-E{exon_counter};Parent={row['IDs']};gene_id={gene_ID}\n")
exon_counter = exon_counter + 1
#writing the last exon
gff_file.write(f"{contig}\tVEuPathDB_Prediction\texon\t{(model[-1].end)+1}\t{(gene_end)}\t.\t{gene_strand}\t.\tID={row['IDs']}-E{exon_counter};Parent={row['IDs']};gene_id={gene_ID}\n")
#gff written
#finally writing in the gene_info
#creating a string to input as AED result
AED_output = ""
for index, row in df.loc[:, ['AED', 'IDs']].iterrows():
AED_result = str(row['IDs']) + " : " + str(row['AED']) + "; "
AED_output = AED_output + AED_result
#finding the best alternative model using (highest) mean read depth
try:
best_alt_model_ID = df.iloc[(df['Mean_Read_Depth'].idxmax())]['IDs']
except ValueError: #if pandas is throwing a ValueError, that means that there were no predictions. in this case, there is no ID. this should be extremely rare.
best_alt_model_ID = "-"
#let's consider cases to allot an appropriate flag, we have the following bits of information - 1) validity of ann_sequence, 2) AEDs, 3) Mean Read Depth of each predicted transcript, 4) Number of Predictions
# KC: These make sense - but I would be a little cautious about saying "Valid" or "Invalid" - the language is a bit strong given the caveats. It's more like "this is the most likely" or "there is an alternative that is more likely" - you need to find a concise way to phrase that.
#MG: adding a unique case ID to output and a table in the report for additional context, instead of making flags too long/complicated/diverse outside of three major categories - valid, invalid, unsure.
#Case I - (Validity and AED == 0.0 found) + only one prediction - this means the annotation is highly likely to be valid, all evidence points to it
#Case II - (Validity and AED == 0.0 found) + more than one prediction - if the highest mean read depth is also p1, then same as above, else it is likely valid with alternatives available
#Case III - Validity is true, but AED == 0.0 was not found, but there was only one prediction - this means that the prediction and ann are different, but ann is valid. in this case, we're not sure
#Case IV - validity is false, but AED == 0.0 and there was only one prediction - this means that our prediction was the same as the (invalid) annotation so we don't know the real answer. hence, we're not sure
#Case V - (validity and AED == 0.0) both are false, and there is only one prediction - in this case the program has found decisively that the ann is probably not valid and instead there is a prediction available
#Case VI - validity is true but AED == 0.0 is false (i.e. the ann was not in predictions), and there are multiple predictions - unsure about ann's validity since it is not found in evidence-based predictions, but it is valid, hence this is unsure
#Case VII - validity is false, AED == 0.0 was found, there are multiple predictions - likely invalid with alternatives available
#Case VIII - (validity and AED == 0.0) both false, multiple predictions found, same as above - likely invalid with alternatives available
#Case I
if ann_seq_valid and df.iloc[0]['AED'] == 0.0 and len(df.index) == 1:
flag = "Likely Valid - No Alternatives"
case_ID = "F"