-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbuild_alignment_dict_from_genome.py
186 lines (137 loc) · 7.67 KB
/
build_alignment_dict_from_genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Build a dictionary of alignments based on the updated coordinates of genes, and store it as json.
The input is a coordinate changes file generated with https://github.com/pombase/genome_changelog
The dictionary structure, where keys are the systematic_id of genes:
"SPAC23E2.02": [{
"revision": "revision",
"new_coord": "join(446770..449241,449295..450530)",
"old_coord": "join(446491..446513,446679..449241,449295..450530)",
"new_alignment": "--------------------------------------MNTSENDP ... GYNGTRY*",
"old_alignment": "MPLGRSSWICCAKYFVNTKSRFNEILPPRFTLIVSFYSMNTSENDP ... SGYNGTRY*"
}],
In the alignment gaps have the maximum penalty, to avoid scattered matches.
-----CAV
MAACATAV
And not
---C--AV
MAACATAV
The reason for this is that the alignment is based on changing / removing introns or changing the start of ending
coordinate of the start or end of the CDS, so you want maximal identity with minimum number of gaps.
"""
import argparse
import pandas
import pickle
from Bio import pairwise2, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from genome_functions import get_feature_location_from_string
import json
from pathlib import Path
import glob
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--genome', default='data/genome.pickle')
parser.add_argument('--coords', default='data/only_modified_coordinates.tsv')
parser.add_argument('--output', default='data/coordinate_changes_dict.json')
parser.add_argument('--old_genomes', default='data/old_genome_versions/*/*.contig')
parser.add_argument('--genome_sequence_changes', default='data/genome_sequence_changes.tsv')
args = parser.parse_args()
with open('config.json') as ins:
config = json.load(ins)
def read_old_genomes(files, format):
"""
Create a dictionary of dictionaries where the keys are contig_name and revision and value is
the sequence of the given contig at a given revision, such that to access the sequence of
chromosome X at revision Y, you do output[x][y].
It relies on a file structure where files are like this:
├── chromosome1
│ ├── 20011004.contig
│ ├── 20020322.contig
....
"""
out_dict = dict()
for f in files:
splitted_path = Path(f).parts
revision = splitted_path[-1].split('.')[-2]
contig = splitted_path[-2]
if contig not in out_dict:
out_dict[contig] = dict()
with open(f, errors='replace') as ins:
# We store it as seq because we don't want to store
out_dict[contig][revision] = Seq(SeqIO.read(ins, format)[0])
return out_dict
def choose_old_genome(previous_coordinate, latest_genome_seq, old_genomes_dict, genome_seq_changes: pandas.DataFrame):
if any(genome_seq_changes[['chromosome', 'date']].duplicated()):
raise ValueError('The script cannot handle two revisions made on the same date')
# Sort by date
genome_seq_changes = genome_seq_changes.sort_values(['date'], ascending=False).copy()
# Select which genome should be used
contig = config['chromosome2file'][previous_coordinate['chromosome']]
changes_this_contig = genome_seq_changes[genome_seq_changes['chromosome'] == contig]
newer_genome_sequences = previous_coordinate['date'] <= changes_this_contig['date']
# All revisions where changes were made are newer, use the oldest genome (they are sorted)
chromosome_revisions_dictionary = old_genomes_dict[contig]
if all(newer_genome_sequences):
return SeqRecord(chromosome_revisions_dictionary[changes_this_contig['revision'].iloc[-1]])
# All revisions where changes were made are older, use the newest genome
elif all(~newer_genome_sequences):
return latest_genome_seq
# revision of the next version
chosen_revision = changes_this_contig.loc[newer_genome_sequences, 'revision'].iloc[-1]
return SeqRecord(chromosome_revisions_dictionary[chosen_revision])
# Load info about changes in genome sequence
genome_seq_changes = pandas.read_csv(args.genome_sequence_changes, sep='\t', na_filter=False, dtype=str)
# We skip current versions
genome_seq_changes = genome_seq_changes[genome_seq_changes['chromosome'].duplicated()].copy()
print('\033[0;32mreading old genomes...\033[0m')
old_genomes_dict = read_old_genomes(glob.glob(args.old_genomes), 'embl')
print('\033[0;32mold genomes read\033[0m')
with open(args.genome, 'rb') as ins:
latest_genome = pickle.load(ins)
# Load coordinate changes
coordinate_data = pandas.read_csv(args.coords, delimiter='\t', na_filter=False)
# We only consider CDS features in genes that have alleles with sequence errors
coordinate_data = coordinate_data[(coordinate_data['feature_type'] == 'CDS')].copy()
# Make sure the order is right
coordinate_data.sort_values(['date', 'revision', 'chromosome', 'systematic_id', 'feature_type', 'added_or_removed'], inplace=True, ascending=[False, False, True, True, True, True])
changes_dict = dict()
for systematic_id in sorted(list(set(coordinate_data.systematic_id))):
if systematic_id not in latest_genome or 'contig' not in latest_genome[systematic_id]:
print(systematic_id, 'skipped, probably to do with alt-splicing')
continue
data_subset = coordinate_data[coordinate_data.systematic_id == systematic_id].copy()
# First added is the latest coordinates
latest_coordinate = data_subset[data_subset['added_or_removed'] == 'added'].iloc[0, :]
# The rest of "removed" are the older ones
previous_coordinates = data_subset[data_subset['added_or_removed'] == 'removed']
changes_dict[systematic_id] = list()
new_feature_loc = get_feature_location_from_string(latest_coordinate['value'])
if systematic_id.startswith(config['mitochondrial_prefix']):
new_seq = new_feature_loc.extract(latest_genome[systematic_id]['contig']).translate(table=config['mitochondrial_table'])
else:
new_seq = new_feature_loc.extract(latest_genome[systematic_id]['contig']).translate()
for i, previous_coordinate in previous_coordinates.iterrows():
this_change = dict()
this_change['revision'] = previous_coordinate['revision']
this_change['new_coord'] = latest_coordinate['value']
this_change['old_coord'] = previous_coordinate['value']
old_feature_loc = get_feature_location_from_string(previous_coordinate['value'])
# The genome on which a feature was defined might not have the same sequence, if so use the old sequence
old_genome = choose_old_genome(previous_coordinate, latest_genome[systematic_id]['contig'], old_genomes_dict, genome_seq_changes)
if systematic_id.startswith(config['mitochondrial_prefix']):
old_seq = old_feature_loc.extract(old_genome).translate(table=config['mitochondrial_table'])
else:
old_seq = old_feature_loc.extract(old_genome).translate()
# This can happen when the sequence was equal to the current sequence in the past, then changed to something else, then reverted again to what it currently is.
if new_seq.seq == old_seq.seq:
continue
# An alignment where gaps have the maximum penalty, to avoid scattered matches.
alignments = pairwise2.align.globalms(new_seq.seq, old_seq.seq, match=1, mismatch=-2, open=-2, extend=0, penalize_end_gaps=False)
if len(alignments) == 0:
continue
this_change['new_alignment'] = alignments[0].seqA
this_change['old_alignment'] = alignments[0].seqB
changes_dict[systematic_id].append(this_change)
with open(args.output, 'w') as out:
json.dump(changes_dict, out, indent=4)