-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort_seqs_by_clade.py
188 lines (158 loc) · 7.24 KB
/
sort_seqs_by_clade.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env python3
"""Given a tree topology (with at least three monophyletic clades of interest),
a set of (at least three) sequence names defining clades of interest in the
tree topology, and a FASTA file containing all sequences represented in the
tree, this script generates a FASTA file for sequences contained in each of the
clades.
Usage:
python3 sort_seqs_by_clade.py <topology file in newick format> \
<file listing reference sequence names> \
<fasta file with all sequences> \
<path to output directory to be generated>
"""
import os
import sys
import shutil
from ete3 import Tree
from Bio import SeqIO
def retrieve_clade_seqs(topology_newick_file,
reference_seq_names_file,
input_fasta_file,
output_fasta_directory):
"""Main function for retrieving sequences for specific clades.
"""
# Parse the resulting tree to determine in which clade of interest, if any,
# the sequence was placed.
# Parse tree using ete3.
# Note: parentheses and commas get replaced with underscores.
t1 = Tree(topology_newick_file, quoted_node_names=True)
# Print tree.
#print('Full tree:')
#print(t1)
# Make a copy of the tree object.
t2 = t1.copy()
# Get list of reference sequences from input.
ref_seq_list = []
for i in open(reference_seq_names_file):
ref_seq_list.append(i.strip().split(',')[0])
print('\nReference sequences:')
for x in ref_seq_list:
print(x)
print()
# Check that there are at least three reference sequences defined.
assert len(ref_seq_list) >= 3, """Less than three reference sequences are
named in the file %s.""" % reference_seq_names_file
# Check that all the reference sequences are all in the tree topology.
all_leaf_names = [x.name for x in t2.get_leaves()]
for ref_seq_name in ref_seq_list:
assert ref_seq_name in all_leaf_names, """Reference sequence %s was not
found among the sequences represented in the tree %s.""" % \
(ref_seq_name, topology_newick_file)
# Initiate dict to store lists of seq names by ref seq name.
seq_names_by_ref_seq_name = {}
# For each reference sequence, traverse all nodes and find the node with
# the largest number of child nodes that are leaf (terminal) nodes,
# containing the reference sequence of interest, but not containing any of
# the other reference sequences.
ts_num = 0
first_ref_seq_node_name = None
ts_that_additional_seq_was_placed_in = None
for ts in ref_seq_list:
ts_num += 1
if ts_num == 1:
first_ref_seq_node_name = ts
# Root on another reference sequence for the first reference sequence in
# the list to get whole clade, then root the tree on the ancestor
# node of that first clade.
# Get a node name for a node corresponding to a different
# reference sequence.
other_ref_seq_node_name = None
for i in ref_seq_list:
if i != ts:
other_ref_seq_node_name = i
break
# Get node corresponding to a different reference sequence.
other_ref_seq_node = None
for node in t2.traverse():
if node.name == other_ref_seq_node_name:
other_ref_seq_node = node
break
assert other_ref_seq_node is not None
# Root on the other reference sequence node.
t2.set_outgroup(other_ref_seq_node)
elif ts_num == 2:
# Root on the first reference sequence node for all subsequent
# clades.
first_ref_seq_node = None
for node in t2.traverse():
leaf_list = []
for leaf in node.get_leaves():
if leaf.name == first_ref_seq_node_name:
first_ref_seq_node = node
break
t2.set_outgroup(first_ref_seq_node)
# Make a copy of the tree topology to work with for each run
# through this loop.
t3 = t2.copy()
# Make a list of nodes that contain reference seq, but not any others.
nodes_of_interest = []
for node in t3.traverse():
# Search in nodes that contain the reference sequence.
if node.search_nodes(name=ts):
# Search in nodes that don't contain other reference sequences.
contains_other_ref_seqs = False
for ts2 in ref_seq_list:
if not ts2 == ts:
if node.search_nodes(name=ts2):
contains_other_ref_seqs = True
if not contains_other_ref_seqs:
# Add nodes of interest to list.
nodes_of_interest.append(node)
# find the node with the most child leaf nodes.
node_w_most_leaves = sorted(nodes_of_interest, key=lambda x:\
len(x.get_leaves()), reverse=True)[0]
node_w_most_leaves.name = 'X'
print('\n\nClade defined by sequence ' + ts + ':')
print(node_w_most_leaves)
# Add list of leaf names from this clade to the dict.
seq_names_by_ref_seq_name[ts] = [x.name for x in \
node_w_most_leaves.get_leaves()]
# Print number of sequences in each clade.
print('\nNumber of sequences in each clade:')
sorted_keys = sorted(seq_names_by_ref_seq_name.keys(),
key=lambda x: len(seq_names_by_ref_seq_name[x]),
reverse=True)
for key in sorted_keys:
print(key + ':\t' + str(len(seq_names_by_ref_seq_name[key])))
# Parse input FASTA file.
all_seqs = list(SeqIO.parse(input_fasta_file, 'fasta'))
# Make output directory.
if os.path.isdir(output_fasta_directory):
shutil.rmtree(output_fasta_directory)
os.mkdir(output_fasta_directory)
# Write relevant sequences to output FASTA files.
for ref_seq_name in ref_seq_list:
# Define list of sequence objects to be written to output.
clade_specific_seq_objs = [x for x in all_seqs if x.id in \
seq_names_by_ref_seq_name[ref_seq_name]]
# Define path to output FASTA file.
output_fasta_file = os.path.join(output_fasta_directory,
ref_seq_name.replace(' ',
'_').replace('/', '_') +\
'__clade.faa'
)
# Open output FASTA file.
with open(output_fasta_file, 'w') as o:
# Write relevant sequences to the file.
SeqIO.write(clade_specific_seq_objs, o, 'fasta')
if __name__ == '__main__':
# Parse command line arguments.
cmdln = sys.argv
topology_newick_file = cmdln[1]
reference_seq_names_file = cmdln[2]
input_fasta_file = cmdln[3]
output_fasta_directory = cmdln[4]
retrieve_clade_seqs(topology_newick_file,
reference_seq_names_file,
input_fasta_file,
output_fasta_directory)