-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenrich_pvalues.py
executable file
·940 lines (794 loc) · 29.7 KB
/
enrich_pvalues.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Compare one dataset to another at a variety of p-value cutoffs.
Author: Michael D Dacre (Fraser Lab, Stanford University)
License: MIT
Version: 1.0b2
Created: 2018-05-30
Updated: 2018-05-31
See the README at:
https://github.com/TheFraserLab/enrich_pvalues/blob/master/README.rst
This software is split into four modes to allow easy p-value cutoff enrichment
plotting for any two datasets that share names (e.g. SNPs) and have p-values.
"""
from __future__ import print_function
import os as _os
import sys as _sys
import bz2 as _bz2
import gzip as _gzip
import json as _json
import argparse as _argparse
import math
import numpy as np
import pandas as pd
from tabulate import tabulate as _tab
from tqdm import tqdm as _tqdm
__version__ = '1.0b2'
# Number of tick marks to draw on each of the two y-axes when plotting
# (used by set_yspace / plot_scores)
YTICK_COUNT = 15
###############################################################################
# Core Enrichment Algorithm #
###############################################################################
def enrich_study(dataset, sig_comp, nsig_comp, simp_fl=False,
                 low_mem=False, conf=None):
    """Compute enrichment of significant data in sig_comp and nsig_comp.

    This is the core algorithm of this script.

    Read in all data from dataset and then take all names that beat a
    significance cutoff (set in conf) and compare to names in sig_comp and
    nsig_comp, computing an enrichment score (percentage in each, sig/nsig).

    Repeats this for every p-value cutoff between conf['max_pval'] (default
    0.05) and conf['min_pval'] (default 1e-15). We check each p-value cutoff
    at intervals of 1e-x and 5e-x for each exponent x between the max and min.

    Params
    ------
    dataset : str
        Path to data table to test, must contain names and p-values. Parse
        instructions in the config
    sig_comp : str or set
        Set of names that are in the significant set, or path to a newline
        separated set (made by split_study)
    nsig_comp : str or set
        Same as sig_comp, but for the non-significant comparison set
    simp_fl : bool, optional
        Treat the study_file as a two column file with no header, where the
        columns are name and pvalue, separated by a tab
    low_mem : bool, optional, not implemented
        Do not load the whole set of names and p-values at the same time,
        parse file over again for every comparison. Slower, but saves memory
        for very large datasets.
    conf : dict, optional
        A config file to use for parsing.

    Returns
    -------
    scores : pandas.DataFrame
        One row per cutoff with counts, overlaps, and enrichment score.
        Cutoffs where no record survives get NaN scores instead of raising.
    """
    # Build a range of cutoffs to use: 5e-x and 1e-x for every exponent x
    # between the configured max and min p-values
    max_exp = abs(math.floor(math.log10(conf['max_pval'])))
    min_exp = abs(math.floor(math.log10(conf['min_pval'])))
    cutoffs = []
    for exp in range(max_exp, min_exp+1):
        # Round-trip through exponent formatting to dodge floating point
        # representation errors (e.g. 5*10**-3 -> 0.005000000000000001)
        cutoffs.append(float('{0:1e}'.format(5*(10**-exp))))
        cutoffs.append(float('{0:1e}'.format(1*(10**-exp))))
    _sys.stderr.write('Testing cutoffs:\n{0}\n'.format(cutoffs))
    # Get the comparison sets
    _sys.stderr.write('Getting comparison data\n')
    scmp = get_set(sig_comp)
    ncmp = get_set(nsig_comp)
    del sig_comp
    del nsig_comp
    _sys.stderr.write(
        'Got {0} sig names and {1} nonsig names.\n'
        .format(len(scmp), len(ncmp))
    )
    # Choose the right row iterator for the test dataset
    if simp_fl:
        sit = simple_iterator(dataset)
    else:
        sit = study_iterator(
            dataset, conf['test_sep'], conf['test_has_header'],
            conf['test_name_col'], conf['test_pval_col']
        )
    # Keep only the two columns we care about, and only rows that beat
    # the loosest p-value we will test
    data = []
    add_data = data.append  # local binding: faster in the hot loop
    max_cutoff = conf['max_pval']
    _sys.stderr.write(
        'Reading data file, keeping those less than P {0}\n'.format(max_cutoff)
    )
    for name, pval in _tqdm(sit, unit=' rows', disable=None):
        if pval <= max_cutoff:
            add_data([name, pval])
    # Make our dataframe
    _sys.stderr.write('Converting to DataFrame\n')
    data = pd.DataFrame(data, columns=['name', 'pval'])
    # Make names unique, keeping the most significant p-value for each.
    # (was drop_duplicates(['name', 'pval']), which kept the same name
    # several times when it appeared with different p-values)
    data = data.sort_values('pval').drop_duplicates('name')
    # Compute score
    scores = {}
    _sys.stderr.write('Calculating enrichments\n')
    for cutoff in _tqdm(cutoffs, unit=' cutoffs', disable=None):
        test_set = frozenset(data[data.pval <= cutoff].name)
        test_len = len(test_set)
        if test_len == 0:
            # Nothing beats this cutoff: record NaN scores instead of
            # raising ZeroDivisionError
            sigyl = nsigyl = 0
            sigy = nsigy = score = np.nan
        else:
            sigyl = len(test_set & scmp)
            nsigyl = len(test_set & ncmp)
            sigy = sigyl/test_len
            nsigy = nsigyl/test_len
            # Ratio is undefined when there is no non-significant overlap
            score = np.nan if nsigy == 0 else sigy/nsigy
        scores['{0:1e}'.format(cutoff)] = {
            'cutoff': cutoff,
            'sig_data': test_len,
            'sig_overlap': sigyl,
            'nonsig_overlap': nsigyl,
            'sig_score': sigy,
            'nonsig_score': nsigy,
            'enrichment_score': score
        }
    # Free memory
    _sys.stderr.write('Done. Clearing memory\n')
    del data, scmp, ncmp
    _sys.stderr.write('Making DataFrame\n')
    scores = pd.DataFrame.from_dict(scores, orient='index')
    scores = scores.sort_values('cutoff', ascending=False)
    print(scores)
    return scores
def plot_scores(scores, outfile=None, figsize=(14,10), comp_prefix=None,
                raw=False, show_one=False, lock_grid=False):
    """Plot enrichment score and sig count against cutoff.

    Enrichment scores end up on left y axis, total number of significant right
    y axis. x axis is the cutoffs.

    Params
    ------
    scores : DataFrame
        from enrich_study
    outfile : str, optional
        Path to write figure to
    figsize : tuple, optional
        Size of figure
    comp_prefix : str, optional
        The prefix of the comparison data to use for title creation
    raw : bool, optional
        Plot raw counts instead of percentages
    show_one : bool, optional
        Draw and label a horizontal reference line at enrichment == 1.0
    lock_grid : bool, optional
        Use a fixed tick count on both y-axes so the grids line up

    Returns
    -------
    fig, [ax1, ax2]
    """
    # Imported here so the rest of the module is usable without matplotlib
    from matplotlib import pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import seaborn as sns
    if comp_prefix:
        title = 'Enrichment vs {0} Data'.format(comp_prefix)
    else:
        title = 'Enrichment Scores at Various P-Value Cutoffs'
    # NOTE(review): mutates the caller's DataFrame in place (reset_index and,
    # below, possibly a new 'perc' column)
    scores.reset_index(inplace=True)
    sns.set_style('darkgrid')
    sns.set_palette('deep')
    fig, ax1 = plt.subplots(figsize=figsize)
    # Plot counts on the left y-axis: raw counts or percentage of max kept
    if raw:
        cnt = YTICK_COUNT
        scl = 0.01
        scores.plot.line(y='sig_data', color='orange', ax=ax1, legend=False)
        mx = scores.sig_data.max()
        ax1.set_ylabel(
            'Number Kept\n(Max {0:,}, Min {1:,})'.format(
                mx, scores.sig_data.min()
            ), fontsize=14
        )
        set_yspace(ax1, start=0, end=mx, count=cnt, fixed=lock_grid, scale=scl)
    else:
        cnt = 10
        scl = 0.01
        # Fraction of the loosest cutoff's count surviving each cutoff
        scores['perc'] = scores.sig_data/scores.sig_data.max()
        scores.plot.line(y='perc', color='orange', ax=ax1, legend=False)
        ax1.set_ylabel(
            'Percentage Kept vs P={0}'.format(scores.cutoff.max()),
            fontsize=14
        )
        ax1.yaxis.set_major_formatter(
            FuncFormatter(lambda y, _: '{0:.0%}'.format(y))
        )
        set_yspace(
            ax1, start=0, end=1, count=cnt, fixed=True, scale=scl
        )
    # Format and label x-axis
    ax1.set_xlabel('p-value cutoff', fontsize=14)
    ax1.set_xticks(range(0, len(scores)+1, 1))
    ax1.set_xticklabels(
        scores.cutoff.apply(
            lambda x: '{0}'.format(float('{0:1e}'.format(x)))
        ), rotation=45
    )
    # Plot enrichment score on the opposite (right) y-axis
    ax2 = ax1.twinx()
    scores.plot.line(
        y='enrichment_score', color='orchid', ax=ax2, legend=False
    )
    ax2.set_ylabel(
        'Enrichment Score\n(sig:non-sig enrichment)',
        fontsize=14
    )
    set_yspace(
        ax2, start=0, end=scores.enrichment_score.max(), count=cnt,
        fixed=lock_grid, scale=scl
    )
    # Re-draw the portion of the enrichment line that sits below 1.0 in grey
    ymin, ymax = ax2.get_ylim()
    if ymax >= 1.1:
        x = np.array(scores.index.to_series())
        y = np.array(scores.enrichment_score)
        # Insert an interpolated point exactly at y=1.0 so the grey segment
        # ends precisely on the line
        x, y = get_x_y_at_one(x, y)
        y_mask = np.ma.masked_greater(y, 1.001)  # Hide everything above 1.0
        ax2.plot(x, y_mask, color='grey')
    # Highlight the 1.0 line
    if show_one:
        xmin, xmax = ax2.get_xlim()
        ax2.plot((xmin, xmax), (1.0, 1.0), c='purple', alpha=0.2, zorder=-10)
        ax2.text(xmax, 0.98, '1.0', fontdict={'fontsize': 12, 'weight': 'bold'})
        ax2.set_xlim(xmin, xmax)
    # Only write the grid once
    ax2.grid(None)
    # Write the title
    ax2.set_title(title, fontsize=17)
    # Format and add legend
    fig.tight_layout()
    fig.legend(
        labels=('Percentage Kept', 'Enrichment Score'),
        loc='lower right'
    )
    if outfile:
        fig.savefig(outfile)
    return fig, [ax1, ax2]
def set_yspace(ax, start, end, count=YTICK_COUNT, fixed=False, scale=None):
    """Set the matplotlib spacing for the y-axis.

    Rounds ``end`` up (and ``start`` down) to a multiple of the leading
    power of ten of ``end``, sets the axis limits, installs either a fixed
    or automatic tick locator, and optionally pads the visible range by
    ``scale * end`` while keeping the tick positions unchanged.

    Params
    ------
    ax : matplotlib Axes
    start, end : numeric
        Desired y range before rounding/padding
    count : int, optional
        Tick count (exact when ``fixed``, a maximum otherwise)
    fixed : bool, optional
        Use exactly ``count`` evenly spaced ticks (LinearLocator)
    scale : float, optional
        Fractional padding added beyond the rounded limits

    Returns
    -------
    numpy.ndarray
        ``np.linspace(start, end, count)`` — should match the tick
        positions, though matplotlib does not guarantee placement
    """
    from matplotlib.ticker import LinearLocator
    from matplotlib.ticker import MaxNLocator
    # Float everything
    start = float(start)
    end = float(end)
    # Round up the end to a multiple of its leading power of ten
    nlen = len(str(int(end)))
    lp = float(10**(nlen-1))
    end = math.ceil(end/lp)*lp
    # Round down the start the same way
    start = math.floor(start/lp)*lp
    # Set axes limits
    ax.set_ylim(start, end)
    # Set the tick counts
    if fixed:
        ax.yaxis.set_major_locator(
            LinearLocator(count)
        )
    else:
        ax.yaxis.set_major_locator(
            MaxNLocator(nbins=count)
        )
    if scale:
        # Pad the visible range slightly without moving the ticks
        scl = end*scale
        ystart = start-scl
        yend = end+scl
        yticks = ax.get_yticks()
        ax.set_ylim(ystart, yend)
        ax.set_yticks(yticks)
    # Return the linspace version of above, should be the same, but position
    # is not guaranteed
    return np.linspace(start, end, count)
def get_x_y_at_one(x, y):
    """Return x, y with an interpolated point at y=1.0 inserted if needed.

    If ``y`` already contains an exact 1.0, or never rises above 1.0 (so
    there is no crossing), the inputs are returned unchanged. Otherwise the
    two points flanking the first upward crossing are linearly interpolated
    (via scipy.stats.linregress) to find the x where y == 1.0, and that
    point is inserted.

    Params
    ------
    x, y : numpy.ndarray
        Equal-length coordinate arrays.

    Returns
    -------
    x, y : numpy.ndarray
        Possibly with one extra point inserted at the crossing.
    """
    from scipy import stats as sts
    xl = list(x)
    yl = list(y)
    crossing = None
    for idx, val in enumerate(yl):
        if val == 1.0:
            # An exact 1.0 point already exists; nothing to insert
            return x, y
        if val > 1.0:
            crossing = idx
            break
    if crossing is None or crossing == 0:
        # Either y never exceeds 1.0 (the original code hit a NameError
        # here: tx/ty were never bound), or it starts above 1.0 so there
        # is no flanking pair to interpolate between (the original wrapped
        # around to xl[-1]). Return unchanged in both cases.
        return x, y
    tx = (xl[crossing-1], xl[crossing])
    ty = (yl[crossing-1], yl[crossing])
    reg = sts.linregress(tx, ty)
    # Solve y = slope*x + intercept for y == 1.0
    new_x = (1.0-reg.intercept)/reg.slope
    xl.insert(crossing, new_x)
    yl.insert(crossing, 1.0)
    return np.array(xl), np.array(yl)
def get_set(x):
    """Coerce *x* into a frozenset.

    A string is treated as a path to a (possibly compressed) newline
    separated file of names; anything else is assumed to be an iterable
    of names.
    """
    if not isinstance(x, str):
        return frozenset(x)
    with open_zipped(x) as handle:
        contents = handle.read()
    return frozenset(contents.strip().split('\n'))
###############################################################################
# Splitting Comparison Study #
###############################################################################
def split_study(study_file, prefix=None, simp_fl=False, low_mem=False,
                conf=None):
    """Split a sample into a significant file and a non-sig file.

    Names whose p-value falls strictly between the two cutoffs are dropped
    entirely (they end up in neither output file).

    Params
    ------
    study_file : str
        Path to the file to parse, can be gzipped.
    prefix : str, optional
        A prefix to use for output files, default is study_file name.
    simp_fl : bool, optional
        Treat the study_file as a two column file with no header, where the
        columns are name and pvalue, separated by a tab
    low_mem : bool, optional, not implemented
        Parse file line by line, instead of using pandas. Currently only
        low-mem works
    conf : dict, optional
        A config file to use for parsing.

    Writes
    ------
    <prefix>.sig.txt.gz, <prefix>.nonsig.txt.gz
        Two newline separated files of names that are significant or
        non-significant.
    """
    prefix = prefix if prefix else str(study_file)
    sig_fl = prefix + '.sig.txt.gz'
    nsig_fl = prefix + '.nonsig.txt.gz'
    # Cutoffs from the config; everything in between is discarded
    sig_p = float(conf['comp_sig_pval'])
    non_sig_p = float(conf['comp_nonsig_pval'])
    _sys.stderr.write(
        'Significant set is P <= {0}, non-sig is P >= {1}\n'
        .format(sig_p, non_sig_p)
    )
    if simp_fl:
        sample = simple_iterator(study_file)
    else:
        sample = study_iterator(
            study_file, conf['comp_sep'], conf['comp_has_header'],
            conf['comp_name_col'], conf['comp_pval_col']
        )
    sig_set = set()
    nsig_set = set()
    # Local bindings avoid repeated attribute lookups in the hot loop
    add_to_sig = sig_set.add
    add_to_nsig = nsig_set.add
    _sys.stderr.write('Splitting dataset\n')
    for name, p_val in _tqdm(sample, unit=' rows', disable=None):
        if p_val <= sig_p:
            add_to_sig(name)
        elif p_val >= non_sig_p:
            add_to_nsig(name)
    _sys.stderr.write('Sorting results and writing\n')
    with open_zipped(sig_fl, 'w') as sigf, open_zipped(nsig_fl, 'w') as nsgf:
        sigf.write('\n'.join(sorted(sig_set)))
        nsgf.write('\n'.join(sorted(nsig_set)))
    _sys.stderr.write(
        'Splitting done, written {} rows to {} and {} rows to {}\n'
        .format(len(sig_set), sig_fl, len(nsig_set), nsig_fl)
    )
###############################################################################
# File Iterators #
###############################################################################
def study_iterator(infile, sep, has_header, name_col, pval_col):
    """Yield ``(name, p-value)`` tuples from a delimited study file.

    Params
    ------
    infile : str
        Path to a file to work on (may be compressed).
    sep : str
        Single character to split on.
    has_header : bool
        Is first line a header.
    name_col : str or int
        Column name as str if has_header is True, else 0-based column index.
    pval_col : str or int
        Column name as str if has_header is True, else 0-based column index.

    Yields
    ------
    name : str
        Name of record
    p_value : float
        P-Value of record
    """
    with open_zipped(infile) as handle:
        if has_header:
            columns = handle.readline().strip().split(sep)
            name_idx = columns.index(name_col)
            pval_idx = columns.index(pval_col)
        else:
            # Without a header, column specifiers must be integer indices
            try:
                name_idx, pval_idx = int(name_col), int(pval_col)
            except ValueError:
                _sys.stderr.write(
                    'Comp column names must be numbers if no header\n'
                )
                raise
        for row in handle:
            fields = row.rstrip().split(sep)
            yield fields[name_idx], float(fields[pval_idx])
def simple_iterator(infile):
    """Yield ``(name, float(p-value))`` per line of a two-column TSV file."""
    with open_zipped(infile) as handle:
        for row in handle:
            fields = row.rstrip().split('\t')
            assert len(fields) == 2
            yield fields[0], float(fields[1])
###############################################################################
# Config #
###############################################################################
# Contains defaults and help, used to generate a simple key: value dictionary
# Each entry maps a config key to its default value and its help text;
# get_default_conf() flattens this into a plain {key: default} dict and
# conf_help() renders the help table from it.
DEFAULT_CONFIG = {
    # --- Parsing the test (primary) dataset ---
    'test_sep': {
        'default': '\t', 'help': 'Separator used in the test dataset'
    },
    # --- Parsing the comparison dataset ---
    'comp_sep': {
        'default': '\t', 'help': 'Separator used in the comparison dataset'
    },
    'test_has_header': {
        'default': 1, 'help': '1 if has header, 0 if does not'
    },
    'test_name_col': {
        'default': 'name',
        'help': 'Column name (or number if no header) for names in test data'
    },
    'test_pval_col': {
        'default': 'p_value',
        'help': 'Column name (or number if no header) for pvals in test data'
    },
    'comp_has_header': {
        'default': 1, 'help': '1 if has header, 0 if does not'
    },
    'comp_name_col': {
        'default': 'name',
        'help': 'Column name (or number if no header) for names in comparison data'
    },
    'comp_pval_col': {
        'default': 'p_value',
        'help': 'Column name (or number if no header) for pvals in comparison data'
    },
    # --- P-value window swept by enrich_study (run mode) ---
    'max_pval': {
        'default': 0.05, 'help': 'Max pvalue to test enrichment for'
    },
    'min_pval': {
        'default': 1e-15, 'help': 'Min pvalue to test enrichment for'
    },
    # --- Cutoffs used by split_study (split mode) ---
    'comp_sig_pval': {
        'default': 1e-4,
        'help': 'pvalue to use as significant cutoff when splitting comparison data'
    },
    'comp_nonsig_pval': {
        'default': 0.98,
        'help': 'pvalue to use as not-significant cutoff when splitting comparison data'
    }
}
def get_default_conf():
    """Return a plain ``{key: default}`` dict built from DEFAULT_CONFIG."""
    defaults = {}
    for key, info in DEFAULT_CONFIG.items():
        defaults[key] = info['default']
    return defaults
def conf_help(outstream=_sys.stdout):
    """Print config help to outstream (default STDOUT).

    Renders a variable/default/help table from DEFAULT_CONFIG, writes it to
    *outstream* when one is given, and returns the text either way.
    """
    rows = [
        [key, repr(info['default']), info['help']]
        for key, info in DEFAULT_CONFIG.items()
    ]
    table = _tab(rows, headers=['variable', 'default', 'help'])
    help_text = 'Config file can contain the following values:\n\n{}\n'.format(
        table
    )
    if outstream:
        outstream.write(help_text)
    return help_text
def parse_config_file(conf_file=None):
    """Load a dict from a json file and overlay it onto the defaults.

    Params
    ------
    conf_file : str, optional
        Path to a json file. If None (or empty), just return default conf.

    Returns
    -------
    config : dict
    """
    config = get_default_conf()
    if not conf_file:
        return config
    with open_zipped(conf_file) as fin:
        user_settings = _json.load(fin)
    config.update(user_settings)
    return config
###############################################################################
# Helpers #
###############################################################################
def open_zipped(infile, mode='r'):
    """Return file handle of file regardless of compressed or not.

    Also returns already opened files unchanged, text mode automatic for
    compatibility with python2.

    Params
    ------
    infile : str or file-like
        Path to open ('-' means stdin/stdout), or an already-open handle,
        which is returned unchanged.
    mode : str, optional
        Open mode; a bare 'r'/'w' gets 't' appended so compressed files
        open in text mode too.

    Returns
    -------
    file-like

    Raises
    ------
    ValueError
        If infile is neither a string nor an open file object.
    """
    # return already open files unchanged
    if hasattr(infile, 'write'):
        return infile
    # make text mode automatic
    if len(mode) == 1:
        mode = mode + 't'
    # refuse to handle non-strings that aren't files.
    if not isinstance(infile, str):
        raise ValueError("I cannot open a filename that isn't a string.")
    # treat '-' as stdin/stdout
    if infile == '-':
        if 'w' in mode:
            return _sys.stdout
        return _sys.stdin
    # if possible open zipped files
    if infile.endswith('.gz'):
        return _gzip.open(infile, mode)
    if infile.endswith('.bz2'):
        if hasattr(_bz2, 'open'):
            return _bz2.open(infile, mode)
        # python2's bz2 has no open(); use the class directly.
        # (was `_bz2.bz2file`, an AttributeError — the class is BZ2File)
        return _bz2.BZ2File(infile, mode)
    # fall back on regular open
    return open(infile, mode)
###########################################################################
# Command Line Parsing #
###########################################################################
# Help text constants for the argparse epilog and per-mode descriptions.
# Fixed to match the code: mode names are 'split'/'dump-config', the split
# output suffix is 'nonsig.txt.gz' (see split_study), the non-sig default
# is 0.98 (see DEFAULT_CONFIG), and the default prefix is 'comp_data'.
EPILOG = """
dump-config
-----------
The two datasets are described in an optional json config (controlled by the
DEFAULT_CONFIG constant in this script). To edit these settings, run
dump-config to get a json file to edit
split
-----
This mode takes a comparison dataset (described by the config file) and splits
it into two simple sets—a significant set, and a non-significant set. The
p-values that describe this division are set in the config.
The files from this step are required for the main algorithm, although they
don't have to be made with this algorithm, you can make them yourself.
run
---
Run the main algorithm—for each p-value cutoff (controlled by config), get
percentage of names in significant set from split mode (sig_overlap), and another
percentage of names in non-significant set (nsig_overlap) and calculate an
enrichment score from the ratio of the two (sig_overlap:nsig_overlap).
Returns a pandas table with counts and enrichment scores for each cutoff.
Writes this as either a text file, a pandas file, or an excel file
plot
----
Create a plot from the table generated in run mode. The plot has the cutoffs
on the x-axis, the total number of names kept in the cutoff on the left y-axis,
and the enrichment score on the right y-axis.
"""
# Mode descriptions
MAIN_DESC = """\
Run the enrichment.
Requires a split comparison dataset, similar to the one produced by the
split mode. Inputs are (optionally compressed) tables. The default is to
expect a tab-delimited table with a header line where the column names are
'name' and 'p-value'. If ``--simple-file`` is passed, the expected input is a
two column file of name\tp-value, with no header row.
To customize this, pass a json config file, the defaults can be written out by
running dump-config.
"""
SPLIT_DESC = """\
Split an existing study into a highly significant set and a non-significant set.
The expected input is an (optionally compressed) tab delimited file with a
header line and one column labelled 'name' and another labelled 'p-value'. If
``--simple-file`` is passed, the expected input is a two column file of
name\tp-value, with no header row.
To customize this, pass a json config file, the defaults can be written out by
running dump-config.
The outputs are two newline-separated compressed files where each line is a
name to compare to (case-sensitive). The prefix can be specified with
'--prefix' (default is 'comp_data'), the two suffices are
'sig.txt.gz' and 'nonsig.txt.gz', this is non-configurable.
By default, sig gets everything with a p-value smaller than 1e-4, non-sig gets
everything with a p-value greater than 0.98. These values can be configured
in the config.
"""
CONF_DESC = """\
Dump a default config to config file.
Optionally, you can pass an existing config file, and that one will be used
to update the defaults before writing the output conf file.
"""
def core_args(args):
    """Run the enrichment (the 'run' sub-command).

    Builds the config (applying --max-p/--min-p overrides), checks that both
    split comparison files (<prefix>.sig.txt.gz and <prefix>.nonsig.txt.gz)
    exist, runs enrich_study, then optionally writes the score table and a
    plot.

    Params
    ------
    args : argparse.Namespace
        Parsed arguments from the 'run' sub-command.

    Raises
    ------
    OSError
        If either split comparison file is missing.
    """
    conf = parse_config_file(args.config_file)
    # Command-line p-value bounds override the config
    if args.max_p:
        conf['max_pval'] = args.max_p
    if args.min_p:
        conf['min_pval'] = args.min_p
    sig_comp = args.prefix + '.sig.txt.gz'
    nsig_comp = args.prefix + '.nonsig.txt.gz'
    bad = []
    for fl in [sig_comp, nsig_comp]:
        if not _os.path.isfile(fl):
            bad.append(fl)
    if bad:
        raise OSError(
            'Need both {0} and {1} to exist, missing {2}'
            .format(sig_comp, nsig_comp, bad)
        )
    scores = enrich_study(
        args.data, sig_comp, nsig_comp, simp_fl=args.simple_file,
        low_mem=False, conf=conf
    )
    if args.output:
        _sys.stderr.write('Writing score table to {0}\n'.format(args.output))
        # Output format is inferred from the file extension
        if args.output.endswith('xls') or args.output.endswith('xlsx'):
            scores.to_excel(
                args.output, index=False, sheet_name='Enrichment Scores'
            )
        elif args.output.endswith('pd') or args.output.endswith('pickle'):
            scores.to_pickle(args.output)
        else:
            with open_zipped(args.output, 'w') as fout:
                scores.to_csv(fout, sep=conf['test_sep'], index=False)
    if args.plot:
        _sys.stderr.write('Plotting scores to {0}\n'.format(args.plot))
        plot_scores(scores, outfile=args.plot, comp_prefix=args.prefix)
def plot_args(args):
    """Run plotting only (the 'plot' sub-command).

    Loads a previously written score table (format inferred from the file
    extension: Excel, pickle, or delimited text), optionally filters the
    rows by the --min-p/--max-p window, and writes the plot.

    Params
    ------
    args : argparse.Namespace
        Parsed arguments from the 'plot' sub-command.
    """
    conf = parse_config_file(args.config_file)
    _sys.stderr.write('Getting scores\n')
    if args.scores.endswith('xls') or args.scores.endswith('xlsx'):
        scores = pd.read_excel(args.scores, sheet_name='Enrichment Scores')
    elif args.scores.endswith('pd') or args.scores.endswith('pickle'):
        scores = pd.read_pickle(args.scores)
    else:
        with open_zipped(args.scores) as fin:
            scores = pd.read_csv(fin, sep=conf['test_sep'])
    # Keep only cutoffs inside the requested p-value window
    if args.min_p:
        scores = scores[scores.cutoff <= args.min_p]
    if args.max_p:
        scores = scores[scores.cutoff >= args.max_p]
    _sys.stderr.write('Plotting scores to {0}\n'.format(args.plot_output))
    plot_scores(
        scores, outfile=args.plot_output, comp_prefix=args.prefix,
        raw=args.raw, show_one=args.show_one, lock_grid=args.fix_grid
    )
def split_args(args):
    """Handle the 'split' sub-command: split a comparison study in two."""
    conf = parse_config_file(args.config_file)
    # Command-line cutoffs override the config values
    overrides = (
        ('comp_sig_pval', args.sig_p),
        ('comp_nonsig_pval', args.non_sig_p),
    )
    for key, value in overrides:
        if value:
            conf[key] = float(value)
    # Run the splitting algorithm
    split_study(
        args.data_file, prefix=args.prefix,
        simp_fl=args.simple_file, conf=conf
    )
def conf_args(args):
    """Run the config subcommand ('dump-config').

    Writes the current defaults (updated from an existing config file, if
    one was given with -c) to ``args.outfile`` as JSON, then prints the
    config help table to STDOUT.

    Params
    ------
    args : argparse.Namespace
        Parsed arguments from the 'dump-config' sub-command.
    """
    conf = parse_config_file(args.config_file)
    _sys.stderr.write('Writing default config to {0}\n\n'.format(args.outfile))
    with open_zipped(args.outfile, 'w') as fout:
        _json.dump(conf, fout, indent=4)
    conf_help(_sys.stdout)
    _sys.stdout.write(
        '\nNone of the above are required, and any extras will be ignored\n\n'
    )
def main(argv=None):
    """Parse command line arguments and dispatch to the chosen sub-command.

    Params
    ------
    argv : list of str, optional
        Argument list to parse; defaults to sys.argv[1:].

    Returns
    -------
    int
        0 on success (sub-commands raise on failure).
    """
    if not argv:
        argv = _sys.argv[1:]
    parser = _argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=_argparse.RawDescriptionHelpFormatter
    )
    # Shared arguments, attached to the sub-commands via parents=[...]
    conf_parser = _argparse.ArgumentParser(add_help=False)
    conf_parser.add_argument(
        '-c', dest='config_file',
        help='Optional config file, get by running `dump-config`'
    )
    file_args = _argparse.ArgumentParser(add_help=False)
    file_grp = file_args.add_argument_group('File Overrides')
    file_grp.add_argument(
        '--simple-file', action='store_true',
        help='Treat input file as tab-sep two column file with no header. ' +
        'First column is name, second is p-value.'
    )
    prefix_args = _argparse.ArgumentParser(add_help=False)
    prefix_args.add_argument(
        '-p', '--prefix', default='comp_data',
        help='Optional comparison study prefix'
    )
    # Subparsers — one per mode; each sets args.func to its handler
    subparsers = parser.add_subparsers(dest='mode')
    # dump-config mode
    conf_mode = subparsers.add_parser(
        'dump-config', description=CONF_DESC, epilog=conf_help(None),
        help='Dump a json config file', parents=[conf_parser],
        formatter_class=_argparse.RawDescriptionHelpFormatter
    )
    conf_mode.add_argument('outfile', help='File to write json config to')
    conf_mode.set_defaults(func=conf_args)
    # split mode
    split_mode = subparsers.add_parser(
        'split', description=SPLIT_DESC,
        parents=[conf_parser, file_args, prefix_args],
        help='Split an existing dataset for enrichment',
        formatter_class=_argparse.RawDescriptionHelpFormatter
    )
    split_mode.add_argument(
        'data_file', help='The dataset to parse, must have name and p-value'
    )
    conf_override = split_mode.add_argument_group(
        'Config Overrides (Optional, in config file)'
    )
    conf_override.add_argument(
        '--sig-p', help='P-Value to choose significant records',
        metavar='sigP', type=float
    )
    conf_override.add_argument(
        '--non-sig-p', help='P-Value to chose non-significant records',
        metavar='unsigP', type=float
    )
    split_mode.set_defaults(func=split_args)
    # run mode (the main enrichment algorithm)
    main_mode = subparsers.add_parser(
        'run', description=MAIN_DESC, help='Run the enrichment',
        parents=[conf_parser, file_args, prefix_args],
        formatter_class=_argparse.RawDescriptionHelpFormatter
    )
    files = main_mode.add_argument_group('Optional Output Files')
    files.add_argument(
        '-o', '--output',
        help='Write table to this file in addition to STDOUT, can be Excel fl'
    )
    files.add_argument(
        '--plot', help='Write a plot to this file'
    )
    files2 = main_mode.add_argument_group('Required Inputs')
    files2.add_argument(
        'data', help='Described by config file (dump-config)'
    )
    mconf_override = main_mode.add_argument_group(
        'Config Overrides (Optional, set in config file)'
    )
    mconf_override.add_argument(
        '--max-p', help='Max p-value to consider',
        metavar='maxP', type=float
    )
    # NOTE(review): metavar 'maxP' below looks copy-pasted from --max-p;
    # it only affects the help text, not parsing
    mconf_override.add_argument(
        '--min-p', help='Min p-value to consider',
        metavar='maxP', type=float
    )
    main_mode.set_defaults(func=core_args)
    # plot mode
    plot_mode = subparsers.add_parser(
        'plot', parents=[conf_parser],
        description='Plot results', help='Plot results'
    )
    plot_mode.add_argument('scores', help='Scores table from run mode')
    plot_mode.add_argument(
        'plot_output', metavar="plot-output-file", help='File to write plot to'
    )
    plot_fmt = plot_mode.add_argument_group('format options')
    plot_fmt.add_argument(
        '--raw', action='store_true',
        help="Plot raw counts instead of percentages"
    )
    plot_fmt.add_argument(
        '-p', '--prefix', default='comp_data',
        help='Optional comparison study prefix, only used for plot title'
    )
    plot_fmt.add_argument(
        '--show-one', action='store_true',
        help='Highlight enrichment >= 1.0 on the plot'
    )
    plot_fmt.add_argument(
        '--fix-grid', action='store_true',
        help='Force the y-grid to match on both axes'
    )
    plot_filt = plot_mode.add_argument_group('filter options')
    plot_filt.add_argument(
        '--min-p', metavar='P-VAL', type=float,
        help="Limit points to those with a cutoff less than this P"
    )
    plot_filt.add_argument(
        '--max-p', metavar='P-VAL', type=float,
        help="Limit point to those with a cutoff greater than this P"
    )
    plot_mode.set_defaults(func=plot_args)
    args = parser.parse_args(argv)
    # No sub-command selected means no 'func' attribute: print help instead
    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()
    return 0
# Run as a script; the '__file__' guard keeps this from firing in contexts
# where no file is being executed (e.g. some interactive interpreters)
if __name__ == '__main__' and '__file__' in globals():
    _sys.exit(main())