-
Notifications
You must be signed in to change notification settings - Fork 0
/
weightingWithDeClone.py
383 lines (349 loc) · 18.3 KB
/
weightingWithDeClone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
import sys,os,subprocess,tempfile
import argparse
#from ete2 import Tree
#convert NEWICK tree notation with weights into nhx notation
#parameter: file - the path to the file with the tree in NEWICK-notation
# ignore_weights - boolean, if the parsed nhx-tree shall include weights or not
# set_minimum - str for minimal edge length (default: 0.0)
def parse_NEWICK_to_nhx(file,ignore_weights,minimal_branch_length=0.0):
print "Parsing Newick into nhx format"
nhx_tree = '' # empty tree, will be filled according to nhx-format
charpos = 0 # current charpos in line
numberUnamedNodes = 0 # current unamed internal nodes
numberOfSpecies = -1 # numberOfSpecies is equal to the number of colons
nameStartPos = 0 # startposition of the last species name
weightStartPos=0 #startposition of the last weight
isLeaf = False # current species is a leaf or not
newick_signs = [';', ',', '(', ')', ':'] # signs which have a distinct function in the NEWICK-Format
isWeight=False
includesNoWeight=False
f = open(file, "r") # file contains only one line
for line in f:
# if a : is in the Newick-Tree the tree is very likely weighted
if ':' not in line:
includesNoWeight=True
#for each character
for char in line:
if char == '(':
nameStartPos = charpos #the current species name begins
isLeaf = True # a species directly after a ( is a leaf
isWeight=False # after a (, there's never a weight
elif char == ')':
weightEndPos=charpos
if not includesNoWeight:
nhx_tree=check_edge_length(minimal_branch_length,nhx_tree,line,weightStartPos,weightEndPos)
nameEndPos = charpos # the current species name ends
if line[charpos - 1] not in newick_signs: # if there was a species id directly before this ) ...
# ...the nhx-block needs to be added
nhx_tree += '[&&NHX:D=?:Ev='
if isLeaf: # was the last species a leaf or an internal node?
nhx_tree += 'Extant'
else:
nhx_tree += 'Spec'
numberOfSpecies += 1 # the number of species is increased
lastSpeciesName=construct_species_name(line,nameStartPos,nameEndPos,numberUnamedNodes)
nhx_tree += ':S=' + str(numberOfSpecies) + ':ND=' + str(lastSpeciesName) + ']'
isLeaf = False # the next species directly behind a ) isn't a leaf
nameStartPos=charpos
isWeight=False # after a ), there's never directly a weight
elif char == ',':
weightEndPos=charpos
if not includesNoWeight:
nhx_tree=check_edge_length(minimal_branch_length,nhx_tree,line,weightStartPos,weightEndPos)
nameEndPos = charpos # the current species name ended
if line[charpos - 1] not in newick_signs: # if there was a species id directly before this ( ...
# ...the nhx-block needs to be added
nhx_tree += '[&&NHX:D=?:Ev='
if isLeaf: # was the last species a leaf or an internal node
nhx_tree += 'Extant'
else:
nhx_tree += 'Spec'
numberOfSpecies += 1 # the number of species is increased
lastSpeciesName=construct_species_name(line,nameStartPos,nameEndPos,numberUnamedNodes)
nhx_tree += ':S=' + str(numberOfSpecies) + ':ND=' + str(lastSpeciesName) + ']'
isLeaf = True
nameStartPos = charpos # the current species name begins
isWeight=False
elif char == ':':
weightStartPos=charpos
#setting temporarily a species name to check if it's empty
tempSpeciesName=line[nameStartPos+1:charpos]
#print 'temp: '+'start: '+str(nameStartPos)+' end: '+str(charpos)+' name: '+str(tempSpeciesName)
if len(tempSpeciesName) == 0:
numberUnamedNodes += 1
tempSpeciesName = "node" + str(numberUnamedNodes)
nhx_tree+=tempSpeciesName
if ignore_weights:
isWeight=True
elif char == ';': # the end of the Newick-tree is reached and a root node needs to be added
if line[charpos-1] != ')': #if a identifier for root node is given
nameEndPos=charpos
lastSpeciesName=construct_species_name(line,nameStartPos,nameEndPos,numberUnamedNodes)
nhx_tree+="[&&NHX:D=?:Ev=Spec:S=" + str(numberOfSpecies + 1) + ":ND="+ str(lastSpeciesName) +"]"
else:# if no identifier for root node is given
nhx_tree += "root[&&NHX:D=?:Ev=Spec:S=" + str(numberOfSpecies + 1) + ":ND=root]"
isWeight=False
#if the given Newick -tree doesn't include :, the check for unnamed internal nodes is here
elif includesNoWeight and line[charpos-1]==')':
#checking if the current internal node is unamed:
if line[charpos]==',' or line[charpos]==')':
numberUnamedNodes += 1
tempSpeciesName = "node" + str(numberUnamedNodes)
nhx_tree+=tempSpeciesName
#if the nhx-Tree shall not include weights, skip adding the current char to the tree, if it belongs to a weight
if not isWeight:
nhx_tree += char # append the current char to the nhx_tree
charpos += 1
nhx_tree.strip()
print nhx_tree
f.close()
#write the nhx-tree into a file
f = open(nhxFileOut,"w")
f.write(nhx_tree)
f.close()
def check_edge_length(minimal_branch_length,nhx_tree,line,weightStartPos,weightEndPos):
#if current edge length is smaller than demanded minimal edge length
#replace it with minimal edge length
weight_str=line[weightStartPos+1:weightEndPos]
weight=float(weight_str)
if weight<minimal_branch_length:
last_chars=len(weight_str)
nhx_tree=nhx_tree[:-last_chars] #removing last characters from nhx_tree
nhx_tree+=str(minimal_branch_length) # and replacing it with the minimal_branch_length
return nhx_tree
#constructs the name of the current species and gives it back.
def construct_species_name(line,nameStartPos,nameEndPos,numberUnamedNodes):
lastSpeciesName=line[nameStartPos+1:nameEndPos]
if ':' in lastSpeciesName:
lastSpeciesName=lastSpeciesName.split(':')[0]
# if the current species is an unnamed internal node, name it node_number
if len(lastSpeciesName) == 0:
lastSpeciesName = "node" + str(numberUnamedNodes)
return lastSpeciesName
#retrieve extant adjacencies
def readAdjacencyFile(file):
print "collect extant adjacencies from provided adjacency file"
# keys: (left marker, right marker), value: [(species,chromosome),...]
adjacencies = {}
f = open(file,"r")
for line in f:
fields = line.rstrip("\n").split("#")
adjas = fields[0].split(":")
left = adjas[1].split(" ")[0]
right = adjas[1].split(" ")[1]
specieslist = []
for extant in fields[1:]:
species = extant.split(":")[0].split(".")[0]
specieslist.append((species,"mockchrom"))
specieslist = set(specieslist)
adjacencies[(left,right)] = specieslist
return adjacencies
# double marker id to account for orientation
def doubleMarker(marker):
if "-" in marker:
return int(marker[1:]) * 2, (int(marker[1:]) * 2) - 1
else:
return (int(marker) * 2) - 1, int(marker) * 2
# double marker and find adjacencies in chromosomes
def findAdjacencies(speciesHash):
print "collect extant adjacencies from marker file..."
# keys: (left marker, right marker), value: [(species,chromosome),...]
adjacencies = {}
for species in speciesHash:
chromosomes = speciesHash[species]
for chrom in chromosomes:
orderList = chromosomes[chrom]
for i in range(0,len(orderList)-1):
first = orderList[i]
second = orderList[i+1]
doubleFirst = doubleMarker(first)
doubleSecond = doubleMarker(second)
#take right extrem of first and left extrem of second for adj tuple
adj = (doubleFirst[1],doubleSecond[0])
rev = (doubleSecond[0],doubleFirst[1])
if adj in adjacencies:
adjacencies[adj].append((species,chrom))
elif rev in adjacencies:
adjacencies[rev].append((species,chrom))
else:
adjacencies[adj] = [(species,chrom)]
return adjacencies
#for each extant adjacency, use declone to compute probability that the
#adjacency is present in an adjacency forest sampled randomly from a
#Boltzmann distribution. Then assign adjacency if it is above threshold to internal node of the tree.
def deCloneProbabilities(extantAdjacencies, kT, listOfInternalNodes, treefile):
print "Compute probabilities with DeClone..."
locateDeClone = os.path.dirname(sys.argv[0])
adjacenciesPerNode = {}
singleLeafAdj={}
extantWeightedAdjacencies={}
for adj in extantAdjacencies:
for species in extantAdjacencies[adj]:
node=species[0]
if node in extantWeightedAdjacencies:
extantWeightedAdjacencies[node].add(adj)
else:
adjset = set()
adjset.add(adj)
extantWeightedAdjacencies[node] = adjset
for adjacency in extantAdjacencies:
#produce list of extant adjacencies for declone
path = os.path.dirname(os.path.realpath(__file__))
tmpfile=tempfile.NamedTemporaryFile(delete=True) #create a temporary named file (appears with a arbitrary name in the directory)
species = extantAdjacencies[adjacency]
for spec in species:
tmpfile.write(spec[0]+" "+spec[0]+"\n")
tmpfile.seek(0) #go to the beginning of the tmpfile
command = locateDeClone+'/DeClone -t1 '+treefile+' -t2 '+treefile+' -a '+tmpfile.name+' -i -kT '+str(kT)
#use declone to compute probs
output = subprocess.check_output(command, shell=True)
#output is just matrix with probabilities
#each line of the output should contain max one number greater 0, save for internal nodes
lines = output.split("\n")
for line in lines:
if not line == "" and not line.startswith("\t") and not line.startswith(">"):
node = line.split("\t")[0] #for each internal node,find the prob for the current adj to be at this node
probs = line.split("\t")[1]
probs = probs.rstrip(" ")
probArray = probs.split(" ")
probArrayFl = [float(x) for x in probArray]
probability = max(probArrayFl, key=float)
#print probability
if node in listOfInternalNodes: #if node is an internal one
if len(species)>1: #if an adjacency occurs just in one external leaf,
# it's ignored for internal nodes (just evolved at this leaf)
if node in adjacenciesPerNode:
adjacenciesPerNode[node].add((adjacency,probability))
else:
adjset = set()
adjset.add((adjacency,probability))
adjacenciesPerNode[node] = adjset
else:
#ignored adjacencies with only one leaf occuring in
singleLeafAdj.update({adjacency:(species[0],probability)})
tmpfile.close() #tmpfile is closed and immediately deleted
#ignored adjacencies are written into a special file
f=open(singleLeafAdjOut,'w')
print('Removed '+str(len(singleLeafAdj))+' Adjacencies occurring just in one external node/leaf')
for adj in singleLeafAdj:
f.write('('+str(adj[0])+','+str(adj[1])+')'+'\t'+str(singleLeafAdj[adj][0])+'\t'+str(singleLeafAdj[adj][1])+'\n')
f.close()
print "Generating output..."
# output format: >internal node adjacency weight
# >external node adjacency
#internal nodes and root
file=open(listOfIntWeightOut, 'w')
for node in adjacenciesPerNode:
for adj_weight in adjacenciesPerNode[node]: #for each adjacency tuple with weight
file.write('>'+str(node)+'\t') #write the node's name
adj_L=str(adj_weight[0][0])
adj_R=str(adj_weight[0][1])
weight=str(adj_weight[1])
file.write('('+adj_L+','+adj_R+')\t') #write the adjacency tuple
file.write(weight+'\n') #write the adjacency weight
file.close()
#external leaves
file=open(listOfExtWeightOut, 'w')
for leaf in extantWeightedAdjacencies:
for adj in extantWeightedAdjacencies[leaf]:
file.write('>' + str(leaf) + '\t' + str(adj) + '\n')
file.close()
#get the internal nodes form the given nhx-treefile
# -seperatingChar: either ':', if tree includes weights or '[', if not
# -treefile: the path to the treefile
def get_internal_nodes_from_treefile(seperatingChar,treefile):
print "Get internal nodes from treefile"
listOfInternalNodes=[]
#read the nhx-tree
file=open(treefile,'r')
line=file.readline() #treefile consists only of one line
file.close()
#split the tree at each ')', because each internal node's name starts right after one
line_splitted_at_closed_brackets=line.strip().split(')')
for element in line_splitted_at_closed_brackets[1:]:
internal_node=element.strip().split(seperatingChar)[0]
#in case of the root node the '[&&NHX...'-block needs to be removed
listOfInternalNodes.append(internal_node.split('[')[0])
return listOfInternalNodes
def read_Marker_file(marker):
#compute adjacencies, compute weights with DeClone
print "Number of undoubled marker: "
# read undoubled marker, for each species
species_marker_order = {}
file = open(marker, "r")
species = ""
for line in file:
# new species
if line.startswith(">"):
if not species == "":
species_marker_order[species] = chromosomes
print species
print markerCount
species = line.split("\t")[0][1:].rstrip("\n")
chromosomes = {}
markerCount = 0
# new chromosome
elif line.startswith("#"):
chrom = line.rstrip("\n")[2:]
elif not line == "\n":
order = line.rstrip("\n")[:-2].split(" ")
markerCount += len(order)
chromosomes[chrom] = order
species_marker_order[species] = chromosomes
print species
print markerCount
file.close()
return species_marker_order
#parsing the input parameters
parser = argparse.ArgumentParser(description='Weights given tree in nhx-format with DeClone. Also converts tree in NEWICK-format into tree in nhx-format')
groupFormat = parser.add_mutually_exclusive_group(required=True)
groupFormat.add_argument("-nhx", "--nhx_Tree", type=str, help="path to the file with nhx-tree")
groupFormat.add_argument("-nf",'--Newick', type=str,help='path to the file with NEWICK-tree')
parser.add_argument("-i","--ignore_weights",action='store_const', const=True, help="boolean, for either ignore or consider edge length/weights, when parsing Newick Tree into nhx Tree")
parser.add_argument("-sm","--set_minimum", type=float, help="minimal value for any edge length, when parsing Newick Tree into nhx Tree", default=0.0)
groupAM = parser.add_mutually_exclusive_group(required=True)
groupAM.add_argument("-a","--adjacencies",type=str, help="path to adjacency-file")
groupAM.add_argument("-m","--markers",type=str,help="path to marker-file")
parser.add_argument("-kT",type=float,help="deClone constant", default=0.1)
parser.add_argument("-jP","--just_Parse",action='store_const', const=True, help="boolean, for either just parse the Newick-file or run DeClone after it.")
parser.add_argument("-out","--output",type=str, help="output directory, current directory as default", default=".")
args = parser.parse_args()
#global variables
#path for output file in Nhx-Format
nhxFileOut = args.output+'/nhx_tree'
listOfExtWeightOut = args.output+'/extant_adjacencies'
listOfIntWeightOut = args.output+'/weighted_internal_adjacencies'
singleLeafAdjOut = args.output+'/single_leaf_adjacencies'
if args.nhx_Tree:
# get List of internal nodes from treefile
listOfInternalNodes=get_internal_nodes_from_treefile(':',args.nhx_Tree)
if args.adjacencies:
extantAdjacencies=readAdjacencyFile(args.adjacencies)
deCloneProbabilities(extantAdjacencies, args.kT,listOfInternalNodes, args.nhx_Tree)
elif args.markers:
extantAdjacencies=findAdjacencies(read_Marker_file(args.markers))
deCloneProbabilities(extantAdjacencies, args.kT,listOfInternalNodes, args.nhx_Tree)
else:
parser.error('Error: wrong parameter number or usage.')
if args.Newick:
if args.ignore_weights:
parse_NEWICK_to_nhx(args.Newick, True,args.set_minimum)
listOfInternalNodes=get_internal_nodes_from_treefile('[',nhxFileOut) #when no weights given, after a node's name comes a [
else:
parse_NEWICK_to_nhx(args.Newick,False,args.set_minimum)
listOfInternalNodes=get_internal_nodes_from_treefile(':',nhxFileOut) #when weights given, after a node's name comes a :
if args.just_Parse:
print 'Parsed Newick-File to nhx-File.'
else:
if args.adjacencies:
extantAdjacencies=readAdjacencyFile(args.adjacencies)
deCloneProbabilities(extantAdjacencies, args.kT,listOfInternalNodes, nhxFileOut)
elif args.markers:
extantAdjacencies=findAdjacencies(read_Marker_file(args.markers))
deCloneProbabilities(extantAdjacencies, args.kT, listOfInternalNodes,nhxFileOut)
else:
parser.error('Error: wrong parameter number or usage.')
if not args.adjacencies and not args.markers:
parser.error('Error: wrong parameter number or usage.')
if not args.nhx_Tree and not args.Newick:
parser.error('Error: wrong parameter number or usage.')