-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRPT.py
425 lines (271 loc) · 14.2 KB
/
RPT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# RPT.py
"""Generate RNA sensors for the isolation of PASTE recombinants by reverse promoter trapping (RPT)."""
import argparse
from DNA import rev_comp
import RADAR
import pandas as pd
def parse_args():
"""
Parse input FASTA and output CSV filenames.
Parse attB sequence for PASTE.
"""
parser = argparse.ArgumentParser(description = "Generate RNA sensors for the isolation of PASTE recombinants by reverse promoter trapping (RPT).")
parser.add_argument("-i", "--input", type = str, metavar = "", required = True, help = "Input FASTA file name.")
parser.add_argument("-o", "--output", type = str, metavar = "", required = True, help = "Output CSV file name.")
parser.add_argument("-a", "--attb", type = str, metavar = "", required = True, help = "AttB sequence.")
parser.add_argument("-n", "--number", type = int, metavar = "", required = False, default = 5, help = "Number of transcripts per target (default: 5).")
args = parser.parse_args()
return args
def find_terminator(sequence: str, start: int = 0) -> int:
"""
Find the first RNAPIII termination signal in a sequence.
"""
terminator = sequence.find("T"*7, start) # 7 T termination signal
return terminator
def find_antisense_protospacer(sequence: str, terminator: int) -> int:
"""
Find the first Cas9 protospacer in the antisense strand of a sequence upstream of an RNAPIII terminator.
"""
min_length = 200 # minimum length of transcript region for effective RNA sensor targeting
max_length = 350 # maximum length of transcript region for effective RNA sensor targeting
pam_proximal = 6 # length of PAM proximal region (excluded from the transcript)
min_distance_to_pam = min_length + pam_proximal # minimum distance from terminator to the end of the PAM
max_distance_to_pam = max_length + pam_proximal # maximum distance from terminator to the end of the PAM
start_index = terminator - max_distance_to_pam # start search here
end_index = terminator - min_distance_to_pam # end search here
if end_index < 0:
return -1
elif start_index < 0:
start_index = 0
antisense_pam = sequence.rfind("CC", start_index, end_index) # index of PAM
if antisense_pam == -1:
return -1
antisense_protospacer = antisense_pam + 22 # index of first base in antisense protospacer
return antisense_protospacer
def find_sense_protospacer(sequence: str, antisense_protospacer: int) -> int:
"""
Find the first Cas9 protospacer in the sense strand of a sequence 90-150 bp upstream of an antisense protospacer.
"""
min_cut_distance = 90 # minimum distance between Cas9 cleavage sites for efficient insertion of attB site by twinPE
max_cut_distance = 150 # maximum distance between Cas9 cleavage sites for efficient insertion of attB site by twinPE
pam_distal = 17 # length of PAM distal region
distance_to_GG = 4 # distance from cut site to G dinucleotides in PAM
min_distance = pam_distal + min_cut_distance - distance_to_GG # minimum distance to G dinucleotides in distal protospacer
max_distance = pam_distal + max_cut_distance - distance_to_GG # maximum distance to G dinucleotides in distal protospacer
start_index = antisense_protospacer - max_distance # start search here
end_index = antisense_protospacer - min_distance # end search here
if end_index < 0:
return -1
elif start_index < 0:
start_index = 0
sense_pam = sequence.rfind("GG", start_index, end_index) # index of PAM
if sense_pam == -1:
return -1
sense_protospacer = sense_pam - 21 # index of first base in sense protospacer
if sense_protospacer < 0:
return -1
return sense_protospacer
def find_next_antisense_protospacer(sequence: str, antisense_protospacer: int) -> int:
"""
Find the next antisense protospacer after a given one.
"""
antisense_pam = antisense_protospacer - 22
next_antisense_pam = sequence.rfind("CC", 0, antisense_pam) # index next PAM
if next_antisense_pam == -1:
return -1
next_antisense_protospacer = next_antisense_pam + 22 # index of first base in next protospacer
return next_antisense_protospacer
def find_end_of_terminator(sequence: str, terminator: int = 0) -> int:
"""
Find the end of an RNAPIII termination sequence.
"""
end = 0
for i in range(terminator, len(sequence)):
if sequence[i] == "T":
continue
else:
end = terminator + i
break
return end
def contains_repeats(sequence: str) -> bool:
"""
Determine whether a sequence contains any repeats.
"""
counter = 1
base = "N"
for i in range(1, len(sequence)):
next_base = sequence[i]
if next_base == base:
counter += 1
if next_base == "A" and counter == 10:
return True
if next_base == "T" and counter == 5:
return True # avoid repeats of 5 or more T's which can pause or terminate RNAPIII transcription
if (next_base == "G" or next_base == "C") and counter == 6:
return True
else:
counter = 1
base = next_base
return False
def calculate_gc_content(sequence: str) -> float:
"""
Calculate the GC content of a sequence.
"""
gc_count = 0
for i in range(len(sequence)):
if sequence[i] == "G" or sequence[i] == "C":
gc_count += 1
else:
continue
gc_content = gc_count/len(sequence)
return gc_content
def has_optimal_gc_content(sequence: str, lower_limit: float = 0.25, upper_limit: float = 0.75) -> bool:
"""
Determine whether the GC content of a sequence falls within an optimal range.
Optimal range is defined between a lower limit and upper limit (default: 0.25, 0.75).
"""
gc_content = calculate_gc_content(sequence)
if lower_limit <= gc_content <= upper_limit:
return True
else:
return False
def score_stops(transcript: str) -> float:
"""
Score a transcript based on the number of translational stops in any frame of the antisense sequence.
"""
stops = [rev_comp(s) for s in ["TAG", "TAA", "TGA"]] # antisense sequence of stops
stop_count = 0
for i in range(0, len(transcript)-2):
if transcript[i:i+3] in stops: # count number of stops in any frame
stop_count += 1
else:
continue
stop_score = stop_count / len(transcript)
# number of stops as a proportion of total codon space (number of codons across all frames of transcript)
# a lower score predicts fewer stops that have to be removed from an RNA sensor targeting the transcript
return stop_score
def find_target_transcripts(sequence: str, k: int) -> list:
"""
Find target transcripts within a sequence based on stop scores.
Return the coordinates of up to k transcripts with the lowest stop scores and their protospacers.
"""
top_coords = [None]*k
top_scores = [1]*k
i = 0
terminator = 0
while True:
start_index = find_end_of_terminator(sequence, terminator) # start search downstream of terminator
terminator = find_terminator(sequence, start_index) # find the first terminator
if terminator == -1:
break
antisense_protospacer = find_antisense_protospacer(sequence, terminator) # find proximal protospacer
while antisense_protospacer >= 0:
sense_protospacer = find_sense_protospacer(sequence, antisense_protospacer) # find distal protospacer
if sense_protospacer == -1:
antisense_protospacer = find_next_antisense_protospacer(sequence, antisense_protospacer)
continue
transcript_start = antisense_protospacer - 16 # start index of new transcript from Cas9 cleavage site of proximal protospacer
transcript_end = terminator
new_transcript = sequence[transcript_start: transcript_end] # new transcript
if contains_repeats(new_transcript): # ignore transcripts with repeats (sensors will be difficult to synthesize)
break
if not has_optimal_gc_content(new_transcript): # ignore transcripts with extreme GC content (sensors will be difficult to synthesize)
antisense_protospacer = find_next_antisense_protospacer(sequence, antisense_protospacer)
continue
new_score = score_stops(new_transcript) # score new transcript by number of stops
if top_scores[i] < new_score:
antisense_protospacer = find_next_antisense_protospacer(sequence, antisense_protospacer)
continue
top_scores[i] = new_score # if new score is better than worst top score, update top scores
top_coords[i] = (sense_protospacer, antisense_protospacer, terminator) # add coordinates of protospacer and transcript terminator for new score
_, i = max((score, index) for index, score in enumerate(top_scores)) # update index of worst top score
antisense_protospacer = find_next_antisense_protospacer(sequence, antisense_protospacer)
top_coords = [coord for coord in top_coords if coord] # take coordinates of top k or fewer transcripts and their protospacers
return top_coords
def generate_twinpe_guide(protospacer: str, template: str, pbs_length: int = 13, overlap_length: int = 30) -> str:
"""
Generate guide RNA to insert attB template at a target protospacer using twinPE.
"""
motif = "CGCGGTTCTATCTAGTTACGCGTTAAACCAACTAGAA" # tevopreq1 motif
scaffold = "GTTTAAGAGCTAAGCTGGAAACAGCATAGCAAGTTTAAATAAGGCTAGTCCGTTATCAACTCGAAAGAGTGGCACCGAGTCGGTGCT" # F+E tracrRNA
if not protospacer[0] == "G": # add 5' G to spacer if necessary
spacer = "G" + protospacer
else:
spacer = protospacer
pbs = spacer[:-3][-pbs_length:]
rtt_length = (len(template)+overlap_length)//2 # length of RTT for twinPE insertion based on overlap between guides
rtt = template[:rtt_length]
guide = spacer + scaffold + rev_comp(pbs + rtt) + motif
return guide
def generate_sensors_and_guides(input_filename, output_filename, attb_sequence: str, number_of_transcripts: int) -> None:
"""
Generate RNA sensors for selection of PASTE recombinants based on top RNAPIII transcripts.
"""
fasta_names = []
fasta_sequences = []
print(f"Opening file {input_filename} ...")
with open(input_filename) as input_fasta:
for line in input_fasta:
if line[0] == ">":
fasta_names.append(line[1:].rstrip())
fasta_sequences.append("")
else:
fasta_sequences[-1] += line.rstrip()
targets = []
regions = []
transcripts = []
sensors = []
sense_protospacers = []
antisense_protospacers = []
sense_guides = []
antisense_guides = []
for name, sequence in zip(fasta_names, fasta_sequences):
print(f"Finding up to {number_of_transcripts} transcripts for {name} ...")
target_coords = find_target_transcripts(sequence, number_of_transcripts)
total_transcripts = len(target_coords)
print(f"{total_transcripts} transcripts found for {name}.")
print(f"Generating sensors and guides for {name} ...")
for coord in target_coords:
targets.append(name)
sense_index, antisense_index, termination = coord
# index of sense protospacer, antisense protospacer and termination of transcript
region = sequence[sense_index: termination+7]
regions.append(region)
target_transcript = sequence[antisense_index-16: termination]
transcripts.append(target_transcript)
target_sensor = RADAR.generate_sensor(target_transcript) # generate RNA sensor against transcript
sensors.append(target_sensor)
sense_protospacer = sequence[sense_index: sense_index+20]
sense_protospacers.append(sense_protospacer)
antisense_protospacer = rev_comp(sequence[antisense_index-19: antisense_index+1])
antisense_protospacers.append(antisense_protospacer)
sense_guide = generate_twinpe_guide(sense_protospacer, attb_sequence)
sense_guides.append(sense_guide) # generate sense guide
antisense_guide = generate_twinpe_guide(antisense_protospacer, rev_comp(attb_sequence))
antisense_guides.append(antisense_guide) # generate antisense guide
print(f"Sensors and guides generated for {name}.")
print("Sensors and guides generated for all transcripts.")
df = pd.DataFrame({"target": targets,
"region": regions,
"transcript": transcripts,
"sensor": sensors,
"sense_protospacer": sense_protospacers,
"antisense_protospacer": antisense_protospacers,
"sense_guide": sense_guides,
"antisense_guide": antisense_guides})
print(f"Writing to {output_filename}.csv ...")
df.to_csv(output_filename + ".csv", sep = ",", encoding = "utf-8", index = False)
print("Completed successfully!")
if __name__ == "__main__":
args = parse_args()
input_filename = args.input
output_filename = args.output
attb_sequence = args.attb.upper()
number = args.number
print("-------------------------------------------------------------------")
print("Reverse Promoter Trapping (RPT) for Isolation of PASTE Recombinants")
print("-------------------------------------------------------------------")
if input_filename.split(".")[1] in ["fasta", "fa"]: # check input filename suffix is FASTA
generate_sensors_and_guides(input_filename, output_filename, attb_sequence, number)
else:
print("ERROR: Input file should be FASTA format.")