-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathConstrainedMultiSample.py
650 lines (543 loc) · 35.3 KB
/
ConstrainedMultiSample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
# Copyright 2013 Massachusetts Institute of Technology
# BSD-2-Clause license https://github.com/agitter/MultiPCSF/blob/master/LICENSE
from optparse import OptionParser
import sys
import os
import shutil
import itertools
import multiprocessing
import time
import random
import NetworkUtil
__author__ = "Anthony Gitter"
__version__ = "0.1.0"
# Iterate between finding a common set of nodes among the samples
# and learning Steiner forests constrained to use those
# common nodes whenever possible
def main(argList):
# Parse the arguments, which either come from the command line or a list
# provided by the Python code calling this function
parser = CreateParser()
(opts, args) = parser.parse_args(argList)
print "Starting constrained multi-sample Steiner forest %s" % time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
print "Multi-PCSF version %s" % __version__
print "Parameters: %s" % opts
# TODO Add error checking of inputs
if opts.iterations < 1:
raise RuntimeError("Must have at least 1 iteration")
# TODO Should allow the option to run serially without the pool because a
# pool with 1 worker is not efficient
if opts.workers < 1:
opts.workers = multiprocessing.cpu_count()
# Assume negative prizes to implement the common set
# and change if using positive common set prizes
negativePrizes = True
if "positive" in opts.artificialPrizes:
negativePrizes = False
# Assume unweighted prizes
weightedPrizes = False
if "Weighted" in opts.artificialPrizes:
weightedPrizes = True
# Assume batch mode
batchMode = True
if opts.iterMode == "random":
batchMode = False
# Load all of the proteins in the interactome, ignoring
# genes. The artificial prizes will be created for a subset of these nodes.
allProts = LoadProteins(opts.interactomePath, opts.undirectedFile, opts.directedFile, opts.tfdnaFile)
# Load the negative prizes for the degree penalties or an empty dictionary
# if they aren't being used
directedFile = "None"
if opts.directedFile != "None":
directedFile = os.path.join(opts.interactomePath, opts.directedFile)
degPenalties = NetworkUtil.DegreePenalties(opts.mu, os.path.join(opts.interactomePath, opts.undirectedFile), directedFile)
# Create the initial stp files
# New directory to hold the original data before the iterations begin
# These stp files will be read and updated at subsequent iterations
initPath = os.path.join(opts.resultPath,"initial")
if not os.path.exists(initPath):
os.makedirs(initPath)
# Load the list of terminal files and the sample-to-group mapping
terminalMap, sampleMap, countMap = LoadTerminalFiles(opts.terminalPath, opts.masterTerminalFile)
# Store the groups in a fixed order
groups = sorted(terminalMap.iterkeys())
for group in groups:
print "%d samples in group %s" % (countMap[group], group)
# Create a pool for creating .stp files and learning Steiner forests in parallel
# using the specified number of workers. Use it to create the initial
# .stp files. Even when running the subsequent iterations in random sequential
# order, create a pool to learn the initial trees and final pruned trees (if applicable).
print "Creating a pool with %d workers" % opts.workers
pool = multiprocessing.Pool(opts.workers)
initialStpMap = dict()
for group in groups:
terminalFiles = terminalMap[group]
sampleNames = sampleMap[group]
# opts and initPath are invariant arguments for each sample
zippedArgs = itertools.izip(itertools.repeat(opts), itertools.repeat(initPath), terminalFiles, sampleNames)
initialStpMap[group] = pool.map(CreateStpHelper, zippedArgs) # Blocks until all are finished
# Store which proteins don't have prizes for each patient.
# These are the nodes that could potentially be Steiner nodes for
# each sample. This can't be recovered from the stp files at later
# iterations because both original prizes and artificial prizes will exist.
# Also track how the dummy node will be connected
# to the networks, either all prizes or all non-prizes (potential Steiner nodes)
potentialSteinerMap = dict()
dummyNeighborMap = dict()
for group in groups:
numSamples = countMap[group]
sampleNames = sampleMap[group]
initialStps = initialStpMap[group]
potentialSteiner = [] # A list of sets
dummyNeighborFiles = [] # A list of filenames
for i in range(numSamples):
dnFile = sampleNames[i] + "_dummyNeighbors.txt"
dummyNeighborFiles.append(dnFile)
potentialSteiner.append(DummyNeighbors(allProts, initPath, initialStps[i], dnFile, opts.dummyNeighbors))
potentialSteinerMap[group] = potentialSteiner
dummyNeighborMap[group] = dummyNeighborFiles
itrPath = os.path.join(opts.resultPath,"itr1")
if not os.path.exists(itrPath):
os.makedirs(itrPath)
# Initialize the artificial prizes to be an empty dictionary so that
# we learn the initial trees independently
artificialPrizes = dict()
# Write the unused itr1 artificial prizes so that the files exist for post-processing
for group in groups:
NetworkUtil.WriteDict(os.path.join(itrPath,"artificialPrizes_%s.txt" % group), artificialPrizes)
print "%d artificial prizes at iteration 1" % len(artificialPrizes)
# Add the degree penalties to the initial stp files. Pass in the empty artificial prize
# dictionary, which won't have an effect.
for group in groups:
sampleNames = sampleMap[group]
numSamples = countMap[group]
potentialSteiner = potentialSteinerMap[group]
dummyNeighborFiles = dummyNeighborMap[group]
for i in range(numSamples):
# Copy the dummy neighbors, which must be in the same directory as the stp file
UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i], initPath, itrPath, sampleNames[i])
shutil.copyfile(os.path.join(initPath,dummyNeighborFiles[i]), os.path.join(itrPath,dummyNeighborFiles[i]))
# Learn the first iteration Steiner forests in parallel
# Run single-threaded belief propagation when using the worker pool
lastForestMap = dict()
for group in groups:
numSamples = countMap[group]
sampleNames = sampleMap[group]
dummyNeighborFiles = dummyNeighborMap[group]
zippedArgs = itertools.izip(itertools.repeat(opts), itertools.repeat(itrPath), itertools.repeat(itrPath), sampleNames, dummyNeighborFiles, itertools.repeat(1))
pool.map(LearnSteinerHelper, zippedArgs)
lastForests = [] # A list of sets, where each set contains the Steiner forest nodes
for i in range(numSamples):
lastForests.append(LoadForestNodes("%s/symbol_%s_%s_1.0_%d.txt" % (itrPath, sampleNames[i], str(opts.W), opts.depth)))
lastForestMap[group] = lastForests
# Learn the forests at all remaining iterations and return the directory
# that contains the forests from the last iteration.
if opts.iterations > 1:
if batchMode:
itrPath = Batch(opts, pool, initPath, allProts, sampleMap, potentialSteinerMap, dummyNeighborMap, lastForestMap, countMap, weightedPrizes, negativePrizes, degPenalties)
else:
itrPath = RandSequential(opts, initPath, allProts, sampleMap, potentialSteinerMap, dummyNeighborMap, lastForestMap, countMap, weightedPrizes, negativePrizes, degPenalties)
# Prune Steiner nodes from the forests that are not used to reach any prizes and
# are only present because they were in the common set.
# This is not necessary if only 1 iteration was run because in that case there
# is no common set.
# It is also not necessary if negative prizes were used.
if opts.iterations > 1 and (not negativePrizes):
print "Learning final forests"
print "Pruning forests from %s" % itrPath
finalPath = os.path.join(opts.resultPath,"final")
if not os.path.exists(finalPath):
os.makedirs(finalPath)
# Nothing is returned by these operations so they can be performed
# simultaneously independent of the groupings
sampleNames = FlattenDict(sampleMap, groups)
dummyNeighborFiles = FlattenDict(dummyNeighborMap, groups)
potentialSteiner = FlattenDict(potentialSteinerMap, groups)
for i in range(len(sampleNames)):
forestFile = "%s/symbol_%s_%s_1.0_%d.txt" % (itrPath, sampleNames[i], str(opts.W), opts.depth)
FilterStpEdges(forestFile, initPath, finalPath, sampleNames[i], degPenalties, potentialSteiner[i])
shutil.copyfile(os.path.join(initPath,dummyNeighborFiles[i]), os.path.join(finalPath,dummyNeighborFiles[i]))
zippedArgs = itertools.izip(itertools.repeat(opts), itertools.repeat(finalPath), itertools.repeat(finalPath), sampleNames, dummyNeighborFiles, itertools.repeat(1))
pool.map(LearnSteinerHelper, zippedArgs)
print "Finishing constrained multi-sample Steiner forest %s" % time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
pool.close()
# Learn all forests in an iteration sequentially in a random order, asssigning
# prizes for a fixed sample given all of the forests for the other samples
# in the same group.
# Those other forests will be the most recently forest learned for the sample
# regardless of whether it was learned at iteration i or i-1.
def RandSequential(opts, initPath, allProts, sampleMap, potentialSteinerMap, dummyNeighborMap, lastForestMap, countMap, weightedPrizes, negativePrizes, degPenalties):
print "Learning forests in random sequential mode"
# Iterate (rounds 2+)
itrPath = initPath
for itr in range(2,opts.iterations+1):
#lastPath = itrPath
itrPath = os.path.join(opts.resultPath,"itr%d" % itr)
if not os.path.exists(itrPath):
os.makedirs(itrPath)
# Only constrain the Steiner forests to be similar to other samples in the same group
for group in sampleMap.iterkeys():
sampleNames = sampleMap[group]
numSamples = countMap[group]
potentialSteiner = potentialSteinerMap[group]
dummyNeighborFiles = dummyNeighborMap[group]
lastForests = lastForestMap[group]
if len(sampleNames) != numSamples or len(potentialSteiner) != numSamples or len(dummyNeighborFiles) != numSamples or len(lastForests) != numSamples:
raise RuntimeError("Must have the same number of samples in group %s" % group)
# Randomly choose the order in which to learn forests at this iteration
order = range(numSamples)
random.shuffle(order)
# Write the order to a file
with open(os.path.join(itrPath, "sampleOrder_%s.txt" % group), "w") as f:
for index in order:
f.write("%d\t%s\n" % (index, sampleNames[index]))
# Iterate over all samples in the random order
for index in order:
# Create artificial prizes for this sample using all N-1 lastForests
otherLastForests = list(lastForests)
otherLastForests.pop(index)
if weightedPrizes:
# lambda2 is used as the alpha parameter
artificialPrizes = CreateWgtPrizes(allProts, otherLastForests, opts.lambda1, opts.lambda2, negativePrizes)
else:
# Use all N-1 other sets of potential Steiner nodes
otherPotentialSteiner = list(potentialSteiner)
otherPotentialSteiner.pop(index)
artificialPrizes = CreateUnwgtPrizes(allProts, otherPotentialSteiner, otherLastForests, opts.lambda1, opts.lambda2, negativePrizes)
NetworkUtil.WriteDict(os.path.join(itrPath,"%s_artificialPrizes.txt" % sampleNames[index]), artificialPrizes)
# Update the stp file based on the artificial prizes and degree penalties and copy the dummy neighbors
UpdateStp(artificialPrizes, degPenalties, potentialSteiner[index], initPath, itrPath, sampleNames[index])
shutil.copyfile(os.path.join(initPath,dummyNeighborFiles[index]), os.path.join(itrPath,dummyNeighborFiles[index]))
# Learn a new forest for this sample and update lastForests
# All samples (besides the first and last in the random order) will use last forests
# that are a mix of forests from this iteration and the previous iteration
LearnSteiner(opts, itrPath, itrPath, sampleNames[index], dummyNeighborFiles[index], opts.workers)
lastForests[index] = LoadForestNodes("%s/symbol_%s_%s_1.0_%d.txt" % (itrPath, sampleNames[index], str(opts.W), opts.depth))
# Store all forests learned for this group at this iteration so they can be
# retreived at the next iteration
lastForestMap[group] = lastForests
return itrPath
# Learn all forests in an iteration in batch in parallel using the multiprocessing pool.
# Update the artificial prizes once per iteration using the forests from the previous
# iteration in the same group
def Batch(opts, pool, initPath, allProts, sampleMap, potentialSteinerMap, dummyNeighborMap, lastForestMap, countMap, weightedPrizes, negativePrizes, degPenalties):
print "Learning forests in parallel batch mode"
# Iterate (rounds 2+)
itrPath = initPath
for itr in range(2,opts.iterations+1):
#lastPath = itrPath
itrPath = os.path.join(opts.resultPath,"itr%d" % itr)
if not os.path.exists(itrPath):
os.makedirs(itrPath)
# Only constrain the Steiner forests to be similar to other samples in the same group
for group in sampleMap.iterkeys():
sampleNames = sampleMap[group]
numSamples = countMap[group]
potentialSteiner = potentialSteinerMap[group]
dummyNeighborFiles = dummyNeighborMap[group]
lastForests = lastForestMap[group]
if len(sampleNames) != numSamples or len(potentialSteiner) != numSamples or len(dummyNeighborFiles) != numSamples or len(lastForests) != numSamples:
raise RuntimeError("Must have the same number of samples in group %s" % group)
# Update artificial prizes based on the forests from the previous iteration
if weightedPrizes:
# lambda2 is used as the alpha parameter
artificialPrizes = CreateWgtPrizes(allProts, lastForests, opts.lambda1, opts.lambda2, negativePrizes)
else:
artificialPrizes = CreateUnwgtPrizes(allProts, potentialSteiner, lastForests, opts.lambda1, opts.lambda2, negativePrizes)
NetworkUtil.WriteDict(os.path.join(itrPath,"artificialPrizes_%s.txt" % group), artificialPrizes)
print "%d artificial prizes in group %s at iteration %d" % (len(artificialPrizes), group, itr)
# Update the stp files based on the new artificial prizes and degree penalties
# and copy the potential Steiner node files, which need to be in itrPath
for i in range(numSamples):
UpdateStp(artificialPrizes, degPenalties, potentialSteiner[i], initPath, itrPath, sampleNames[i])
shutil.copyfile(os.path.join(initPath,dummyNeighborFiles[i]), os.path.join(itrPath,dummyNeighborFiles[i]))
# Learn new Steiner forests in parallel
zippedArgs = itertools.izip(itertools.repeat(opts), itertools.repeat(itrPath), itertools.repeat(itrPath), sampleNames, dummyNeighborFiles, itertools.repeat(1))
pool.map(LearnSteinerHelper, zippedArgs)
lastForests = []
for i in range(numSamples):
lastForests.append(LoadForestNodes("%s/symbol_%s_%s_1.0_%d.txt" % (itrPath, sampleNames[i], str(opts.W), opts.depth)))
lastForestMap[group] = lastForests
return itrPath
# TODO Could reduce the edge costs so that subtrees that have some high weight
# artificial prizes and low weight real prizes are not pruned
# Filter the initial stp file so that it includes only edges
# that were selected in a previous run of the Steiner forest
# algorithm. Add the node degree penalties.
def FilterStpEdges(forestFile, inPath, outPath, sampleName, degreePenalties, potentialSteiner):
edges = LoadForestEdges(forestFile)
with open(os.path.join(inPath,sampleName + ".stp")) as inStp:
with open(os.path.join(outPath,sampleName + ".stp"), "w") as outStp:
for line in inStp:
parts = line.split()
if parts[0] == "D":
# If the edge is directed it must have been used in
# the same direction in the forest. Both the stp
# file and the forest output follow the <biologicalTarget> <biologicalParent>
# convention, i.e. the edge points toward the root in the
# belief propagation not the biological parent.
if "%s %s" % (parts[1], parts[2]) in edges:
outStp.write(line)
elif parts[0] == "E":
# If the edge is undirected allow it to be used in
# either direction in the forest
if ("%s %s" % (parts[1], parts[2]) in edges) or ("%s %s" % (parts[2], parts[1]) in edges):
outStp.write(line)
else:
outStp.write(line)
# Write the degree penalties
for node in degreePenalties.iterkeys():
if node in potentialSteiner:
outStp.write("W %s %f\n" % (node, degreePenalties[node]))
# Update the initial stp file by adding artificial prizes present in the
# dictionary. Add the negative prizes used to implement penalties on
# node degree as well.
def UpdateStp(artificialPrizes, degreePenalties, potentialSteiner, inPath, outPath, sampleName):
newStpFile = os.path.join(outPath,sampleName + ".stp")
shutil.copyfile(os.path.join(inPath,sampleName + ".stp"), newStpFile)
with open(newStpFile, "a") as outStp:
# Create a set of nodes with an artificial prize or a degree penalty
keys = set(artificialPrizes.iterkeys()).union(degreePenalties.iterkeys())
# Append the new prizes to the end of the file summing the two types of
# artificial prizes if both are present
for node in keys:
if node in potentialSteiner:
artificialPrize = 0
degreePenalty = 0
if node in artificialPrizes:
artificialPrize = artificialPrizes[node]
if node in degreePenalties:
degreePenalty = degreePenalties[node]
outStp.write("W %s %f\n" % (node, artificialPrize + degreePenalty))
# Created weighted artificial prizes that are derived from the frequency
# of a node in the collection of Steiner forests.
def CreateWgtPrizes(allProts, lastForests, lambda1, alpha, negativePrizes):
forestFreq = NetworkUtil.SetFrequency(lastForests)
artificialPrizes = {}
if negativePrizes:
# Need to iterate over all proteins when creating negative prizes
for node in allProts:
freq = 0
if node in forestFreq:
freq = forestFreq[node]
# Only create non-zero prizes, i.e. for nodes that are not in all
# forests
if freq < 1:
artificialPrizes[node] = -lambda1 * ((1-freq) ** alpha)
else:
# For positive prizes only need to iterate over the nodes that appear
# in some forest
for node in forestFreq.iterkeys():
freq = forestFreq[node]
# Frequently is guaranteed to be > 0 because the keys are only
# the union of all forest nodes
artificialPrizes[node] = lambda1 * (freq ** alpha)
return artificialPrizes
# Create unweighted artificial prizes by explicitly constructing a common
# set and then generating prizes for all proteins accordingly depending
# on whether positive or negative prizes are being used.
def CreateUnwgtPrizes(allProts, potentialSteiner, lastForests, lambda1, lambda2, negativePrizes):
# Construct the common set
commonSet = set()
for prot in allProts:
# The same scoring function works for both positive and negative prizes
inScore = Penalty(prot, potentialSteiner, lastForests)
if lambda1 * inScore < lambda2:
commonSet.add(prot)
# Build the prizes based on the common set. With positive artificial prizes
# add prizes when a node is in the common set. With negative artificial
# prizes add a prize when a node is not in the common set.
if negativePrizes:
artificialPrizes = dict.fromkeys(allProts.difference(commonSet), -lambda1)
else:
artificialPrizes = dict.fromkeys(commonSet, lambda1)
return artificialPrizes
# This penalty function charges a cost of 1 for each sample in which the
# protein is a potential Steiner node (i.e. not a prize node) but
# not in the Steiner forest. The same function is used to calculate
# the reward when using negative prizes.
def Penalty(prot, potentialSteiner, lastForests):
if len(potentialSteiner) != len(lastForests):
raise RuntimeError("Must have the same number of samples")
count = 0
for i in range(len(lastForests)):
if (prot not in lastForests[i]) and (prot in potentialSteiner[i]):
count += 1
return count
# Load all edges of the Steiner forest as set whose members of of the form
# "<protein1> <protien2>". Expects the "symbol_*" output from the
# message passing algorithm
def LoadForestEdges(forestFile):
forest = set()
with open(forestFile) as f:
for line in f:
forest.add(line.strip())
return forest
# Load all members of the Steiner forest, both Steiner nodes and prizes,
# and return them as a set. Expects the "symbol_*" output from the
# message passing algorithm
def LoadForestNodes(forestFile):
forest = set()
with open(forestFile) as f:
for line in f:
forest.update(line.split())
return forest
# A helper function that takes all of the LearnSteiner functions as a tuple
def LearnSteinerHelper(args):
return LearnSteiner(*args)
# Learn a Steiner forest
# TODO call the Python function directly instead of using a system call
def LearnSteiner(opts, stpPath, outPath, sampleName, targetFile, threads):
print "Learning Steiner forest for %s" % sampleName
pyFile = os.path.join(os.path.dirname(__file__),"PCSF.py")
os.system("python %s --outputpath=%s --msgpath=%s --depth=%d --conn=3 --targetfile=%s --stppath=%s/ --stpfile=%s --W=%s_%s_%s --beta=1.0_1.0_1.0 --exclude=2 --species=human --threads=%s" % (pyFile, outPath, opts.msgPath, opts.depth, targetFile, stpPath, sampleName, str(opts.W), str(opts.W), str(opts.W), str(threads)))
# Find and return a set of the potential Steiner nodes, the non-prize
# proteins (not mRNAs) that are present in the network. Also
# write a list of dummy node neighbors, which can be either prizes or
# non-prizes.
def DummyNeighbors(allProts, path, stpFile, dnFile, neighborType):
prizes = set()
with open(os.path.join(path,stpFile)) as f:
for line in f:
parts = line.split()
# mRNAs can't be Steiner nodes
if parts[0] == "W" and not parts[1].endswith("_MRNA"):
prizes.add(parts[1])
psNodes = allProts.difference(prizes)
if neighborType == "prizes":
NetworkUtil.WriteCollection(os.path.join(path,dnFile), prizes)
elif neighborType == "nonprizes":
NetworkUtil.WriteCollection(os.path.join(path,dnFile), psNodes)
else:
raise RuntimeError("%s is not a valid type of dummy node neighbor connection" % neighborType)
return psNodes
# A helper function that takes all of the CreateStp args as a tuple
def CreateStpHelper(args):
return CreateStp(*args)
# Create the initial .stp file
# TODO call the Python function directly instead of using a system call
def CreateStp(opts, initPath, terminalFile, sampleName):
stpFile = sampleName + ".stp"
print "Creating %s" % stpFile
pyFile = os.path.join(os.path.dirname(__file__),"MessagePassingInput.py")
os.system("python %s --interactomepath=%s --terminalpath=%s --resultpath=%s --indirectedfilename=%s --beta=%f --terminalfile=%s --resultfilename=%s --directedfilename=%s --tfdnafilename=%s --mrnabeta=%s" % (pyFile, opts.interactomePath, opts.terminalPath, initPath, opts.undirectedFile, opts.beta, terminalFile, stpFile, opts.directedFile, opts.tfdnaFile, opts.mrnaBeta))
return stpFile
# Create a single list of all items in a dictionary of iterables. Takes a
# list of keys so that the order of items in the returned list
# is deterministic.
def FlattenDict(dictionary, keys):
if not len(keys) == len(dictionary.keys()):
raise RuntimeError("Dictionary does not have %d keys" % len(keys))
flat = []
for key in keys:
flat.extend(dictionary[key])
return flat
# Load all terminal files from the master file and create a map from
# groups to member samples. Default to placing all samples in a single
# group. Returns a map from groups to terminal files, a map from
# groups to sample names, and a map from groups to sample counts
def LoadTerminalFiles(terminalPath, masterTerminalFile):
terminalMap = dict()
sampleMap = dict()
defaultCount = 0
with open(os.path.join(terminalPath,masterTerminalFile)) as f:
for line in f:
curSample = line.strip().split("\t")
if len(curSample) > 2:
raise RuntimeError("%s has a line with %d columns" % (masterTerminalFile, len(curSample)))
elif len(curSample) == 1:
group = "defaultGroup"
defaultCount += 1
else:
group = curSample[1]
# Create lists for the group if it does not yet exist
if group not in terminalMap:
terminalMap[group] = []
sampleMap[group] = []
terminalMap[group].append(curSample[0])
name = curSample[0]
if name.endswith(".txt"):
name = name[0:-4] # Remove ".txt"
sampleMap[group].append(name)
# Count the number of samples in each group
countMap = dict()
for group in terminalMap.iterkeys():
countMap[group] = len(terminalMap[group])
print "Assigned %d samples to the default group" % defaultCount
return terminalMap, sampleMap, countMap
# Load all proteins in the interaction network but not the genes
# (the targets of protein-DNA interactions)
def LoadProteins(interactomePath, undirectedFile, directedFile, tfdnaFile):
prots = set()
# Load a proteins in the PPI
with open(os.path.join(interactomePath,undirectedFile)) as f:
for line in f:
parts = line.strip().upper().split()
if len(parts) == 3:
prots.add(parts[0])
prots.add(parts[1])
# Load the kinases and substrates if these interactions exist
if directedFile != "None":
with open(os.path.join(interactomePath,directedFile)) as f:
for line in f:
parts = line.strip().upper().split()
if len(parts) == 3:
prots.add(parts[0])
prots.add(parts[1])
# Load the TFs
if tfdnaFile != "None":
with open(os.path.join(interactomePath,tfdnaFile)) as f:
for line in f:
parts = line.strip().upper().split()
if len(parts) == 3:
# Load only the TF, not the gene
prots.add(parts[0])
return prots
# Setup the option parser
def CreateParser():
# Options that are fixed for the time being:
# Connection type for the artificial node / exclude / targetfile
# Output file names
# Beta (only allowed to be set once initially, range of Beta is not supported)
# Species
# W cannot be given as a range of values
parser = OptionParser(version="Multi-PCSF version %s" % __version__)
parser.add_option("--interactomepath", type="string", dest="interactomePath", help="This path points to the directory that contains the interaction network files",default='None')
parser.add_option("--terminalpath",type="string",dest="terminalPath",help="This path points to the directory that contains the terminal (node prize) files",default='None')
parser.add_option("--resultpath",type="string",dest="resultPath",help="This path points to the directory where the output files will be written.",default='None')
parser.add_option("--undirectedfile",type="string",dest="undirectedFile",help="The name of the protein-protein interaction file in the interactomepath directory. The file is expected to contain undirected interactions with probabilistic weights (e.g in [0,1]). Columns should be ordered [prot1 prot2 weight].",default='None')
parser.add_option("--terminalfile",type="string",dest="masterTerminalFile",help="A file in the terminalpath directory that lists the files that give the node prizes for each sample. All listed filenames should be relative to terminal path. If gene penalties are given in the terminal files, gene names should end with '_MRNA'. Optionally can include a tab-separated second column that assigns each sample to a group so the forests are only constrained to be similar to other samples in the same group.",default='None')
#parser.add_option("--resultfilename",type="string",dest="resultfilename",help="The name of the file where the combined information will be written to be used as input in the message passing tool.",default='None')
parser.add_option("--directedfile",type="string",dest="directedFile",help="Optional: The name of the interaction file where directed interactions (e.g. kinase-substrate) with probabilistic weights are available. Columns should be ordered [substrate kinase weight].",default='None')
parser.add_option("--tfdnafile",type="string",dest="tfdnaFile",help="Optional: The name of the interaction file where TF-DNA interactions with probabilistic weights are available. Columns should be ordered [TF gene weight]. Gene names should not end with '_MRNA' because '_MRNA' is automatically appended to them.",default='None')
parser.add_option("--mrnabeta",type="string",dest="mrnaBeta",help="Optional: The beta parameter for gene terminal nodes. Its default value is equal to the --beta value. It is applied to all terminals whose name ends with '_MRNA'. The scaling is only performed once when the initial stp files are created.",default='None')
#parser.add_option("--outputpath", type="string", dest="outputpath", help="This path points to the directory where the output files will be written",default='None')
parser.add_option("--msgpath",type="string",dest="msgPath",help="The path and file name of the msgsteiner executable",default='./msgsteiner')
parser.add_option("--depth",type="int",dest="depth",help="Depth parameter that limits the maximum depth from the Steiner tree root to the leaves",default=10)
#parser.add_option("--conn",type="string", dest="conn", help="How to connect the artificial node to the interactome: 1 to all nodes in the interactome, 2 to all node in the interactome, except the terminals, 3 to a given set of nodes in the interactome, 4 to a given set of nodes in the interactome, except the terminals.")
#parser.add_option("--stppath", type="string", dest="stppath", help="This path points to the directory where the stp file is available.")
#parser.add_option("--stpfile", type="string", dest="stpfile", help="The name of the stp file (without the file extension)")
parser.add_option("--W", type="float", dest="W",help="The cost of the edges from the artificial root node to its neighbors.",default=1.0)
#parser.add_option("--beta", type="string", dest="beta",help="the range and increment for the beta parameter. format: start_end_increment")
#parser.add_option("--exclude", type="string", dest="exclude", help="If set to 1 and the connection type is 2, 3, or 4 then remove prizes from all nodes that are connected to the artificial node")
#parser.add_option("--targetfile", type="string", dest="targetfile", help="a subset of the proteins in the interactome that will be connected to the artificial root in connection types 3 and 4",default='None')
#parser.add_option("--species", type="string", dest="species", help="the organism that you are working on",default='None')
parser.add_option("--beta",type="float",dest="beta",help="The scaling factor applied to the node prizes, which is used to control the relative strength of node prizes and edge costs. This scaling is only performed once when the initial stp files are created.",default=1.0)
parser.add_option("--lambda", type="float", dest="lambda1",help="The tradeoff coefficient for the penalty incurred by nodes in the Steiner forests that are not in the set of common nodes.",default=1.0)
# Redefine lambda2 in terms of how many trees should contain a node before you include it in the common set?
parser.add_option("--alpha", type="float", dest="lambda2",help="The tradeoff coefficient for the reward on the size of the set of common nodes when using unweighted artificial prizes or the power to which the node frequency is taken for weighted prizes.",default=1.0)
parser.add_option("--mu", type="float", dest="mu", default=0, help="A parameter used to penalize high-degree nodes from being selected as Steiner nodes. Does not affect prize nodes but does affect artificial prizes. The penalty is -mu*degree. Set mu <= 0 to disable the penalty (default).")
# Fix the number of iterations until other convergence criteria is implemented
parser.add_option("--iterations", type="int", dest="iterations", help="The number of iterations to run", default=10)
# Non-positive values will map to cpu_count()
parser.add_option("--workers", type="int", dest="workers", help="The number of worker processes to use in the multiprocessing pool or threads to use in multi-threaded belief propagation. Should not exceed the number of cores available. Defaults to the number of CPUs.", default=-1)
# Use positive or negative artificial prizes to encourage use of the common set or weighted prizes to avoid
# constructing a common set
parser.add_option("--artificialprizes", type="choice", dest="artificialPrizes", default="negativeWeighted", choices=["positive", "negative", "positiveWeighted", "negativeWeighted"], help="Use 'positive' or 'negative' prizes to encourage trees to include common set proteins. Use 'positiveWeighted' or 'negativeWeighted' (default) prizes to construct weighted artificial prizes based on the node frequency in the most recent forests.")
# Determine how to connect the dummy node
parser.add_option("--dummyneighbors", type="choice", dest="dummyNeighbors", default="prizes", choices=["prizes", "nonprizes"], help="Connect the dummy node to all 'prizes' (default) or 'nonprizes'.")
parser.add_option("--itermode", type="choice", dest="iterMode", default="batch", choices=["batch", "random"], help="Learn forests simultaneously in 'batch' (default) or sequentially in 'random' order. Batch mode computes artificial prizes with respect to all forests at the previous iteration. Random mode computes prizes for a specific sample given the most recent forests for all other samples.")
return parser
if __name__=="__main__":
# Use the command line arguments to setup the options (the same as the default OptionParser behavior)
main(sys.argv[1:])