-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild_alignment_dict_from_peptides.py
80 lines (60 loc) · 2.75 KB
/
build_alignment_dict_from_peptides.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Build a dictionary of alignments based on current and all previous peptide coordinates.
The input is the all_previous_seqs.tsv file generated in https://github.com/pombase/all_previous_sgd_peptide_sequences
The dictionary structure, where keys are the systematic_id of genes:
"SPAC23E2.02": [{
"new_alignment": "--------------------------------------MNTSENDP ... GYNGTRY*",
"old_alignment": "MPLGRSSWICCAKYFVNTKSRFNEILPPRFTLIVSFYSMNTSENDP ... SGYNGTRY*"
}],
In the alignment gaps have the maximum penalty, to avoid scattered matches.
-----CAV
MAACATAV
And not
---C--AV
MAACATAV
The reason for this is that the alignment is based on changing / removing introns or changing the start of ending
coordinate of the start or end of the CDS, so you want maximal identity with minimum number of gaps.
"""
import argparse
from Bio import pairwise2, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import json
def main(previous_seqs_file, current_seqs_file, output_file):
old_seq_dict = dict()
# This one is a tsv file
with open(previous_seqs_file) as ins:
for line in map(str.rstrip, ins):
ls = line.split('\t')
gene_id = ls[0]
if gene_id in old_seq_dict:
old_seq_dict[ls[0]].append((Seq(ls[1]), ls[2]))
else:
old_seq_dict[ls[0]] = [(Seq(ls[1]), ls[2])]
changes_dict = dict()
record: SeqRecord
for record in SeqIO.parse(current_seqs_file, 'fasta'):
if record.id not in old_seq_dict:
continue
changes_dict[record.id] = list()
for old_seq, revision in old_seq_dict[record.id]:
alignments = pairwise2.align.globalms(record.seq, old_seq, match=1, mismatch=-2, open=-2, extend=0, penalize_end_gaps=False)
if len(alignments) == 0:
print('> No alignment found for {}, skipping'.format(record.id))
continue
changes_dict[record.id].append({
'revision': revision,
'new_alignment': alignments[0].seqA,
'old_alignment': alignments[0].seqB
})
with open(output_file, 'w') as out:
json.dump(changes_dict, out, indent=4)
if __name__ == '__main__':
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--previous_seqs', default='data/sgd/all_previous_seqs.tsv')
parser.add_argument('--current_seqs', default='data/sgd/current_protein_seqs.fasta')
parser.add_argument('--output', default='data/sgd/coordinate_changes_dict.json')
args = parser.parse_args()
main(args.previous_seqs, args.current_seqs, args.output)