Skip to content

Commit fc23a1d

Browse files
committed
Code from a couple months ago for sign test stuff
1 parent b1a1d71 commit fc23a1d

File tree

3 files changed

+383
-0
lines changed

3 files changed

+383
-0
lines changed

FlyBaseAssocToGMT.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import print_function
2+
from sys import argv, stdout
3+
from collections import defaultdict
4+
5+
def parse_obo(obo_file):
6+
children_of = defaultdict(list)
7+
current_id = ''
8+
for line in obo_file:
9+
if line.strip() == '':
10+
current_id = ''
11+
elif line.startswith('id'):
12+
current_id = line.strip().split()[-1]
13+
elif current_id and line.startswith('is_a'):
14+
parent = line.split()[1]
15+
children_of[parent].append(current_id)
16+
elif current_id and line.startswith('relationship'):
17+
parent = line.split()[2]
18+
children_of[parent].append(current_id)
19+
20+
return children_of
21+
22+
def all_children_of(children_of, term):
23+
retval = set(children_of[term])
24+
for child_term in children_of[term]:
25+
retval |= all_children_of(children_of, child_term)
26+
return retval
27+
28+
def all_terms_of(goterms, children_of, term):
29+
genes = set(goterms[term])
30+
for term in all_children_of(children_of, term):
31+
genes.update(set(goterms[term]))
32+
return genes
33+
34+
35+
if __name__ == "__main__":
36+
if len(argv) > 2:
37+
out = open(argv[2], 'w')
38+
else:
39+
out = stdout
40+
children_of = parse_obo(open('prereqs/go-basic.obo'))
41+
goterms = defaultdict(list)
42+
pfcs = {}
43+
for line in open(argv[1]):
44+
if line.startswith('!'): continue
45+
data = line.split('\t')
46+
FBgn = data[1]
47+
symbol = data[2]
48+
GOterm = data[4]
49+
PFC = data[8] # Process/Function/Component
50+
goterms[GOterm].append(symbol)
51+
pfcs[GOterm] = PFC
52+
53+
for GOterm in goterms.copy():
54+
goterm_genes = set(goterms[GOterm])
55+
for child_GOterm in all_children_of(children_of, GOterm):
56+
goterm_genes |= set(goterms[child_GOterm])
57+
print(GOterm, pfcs[GOterm], *goterm_genes, sep='\t', file=out)
58+
if out is not stdout:
59+
out.close()
60+

TrevorSignTest.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
from scipy.stats import ttest_1samp
2+
from scipy.stats import t as t_distribution
3+
from numpy import array, isnan, nan, average, sqrt, inf
4+
from progressbar import ProgressBar as pb
5+
from collections import namedtuple
6+
from argparse import ArgumentParser
7+
import pandas as pd
8+
import Utils as ut
9+
10+
def parse_args():
11+
parser = ArgumentParser()
12+
parser.add_argument('--min-genes', default=20, type=int)
13+
parser.add_argument('--test-only', default=False,
14+
help="Test only go terms with a given code in second column")
15+
parser.add_argument('--drop-genes', default=None)
16+
parser.add_argument('--print-header', default=False, action='store_true')
17+
parser.add_argument('--print-counts', default=False, action='store_true')
18+
parser.add_argument('--pseudocounts', default=1, type=int)
19+
parser.add_argument('--smart-drop', default=False, action='store_true',
20+
help='Use what we know about this dataset to remove '
21+
'genes on the X in males')
22+
parser.add_argument('ase')
23+
parser.add_argument('categories')
24+
parser.add_argument('outfile')
25+
args = parser.parse_args()
26+
return args
27+
28+
def weighted_var(x, w, dropna=False):
29+
'''Translation of code from R to do wighted variance
30+
31+
https://www.r-bloggers.com/weighted-t-test-in-r/
32+
33+
# weighted variance, inspired by a function from Gavin Simpson on R-Help
34+
var.wt <- function(x, w, na.rm = FALSE) {
35+
if (na.rm) {
36+
w <- w[i <- !is.na(x)]
37+
x <- x[i]
38+
}
39+
sum.w <- sum(w)
40+
return((sum(w*x^2) * sum.w - sum(w*x)^2) / (sum.w^2 - sum(w^2)))
41+
}
42+
'''
43+
x = array(x)
44+
w = array(w)
45+
if dropna:
46+
i = ~(isnan(x) | isnan(w))
47+
x = x[i]
48+
w = w[i]
49+
ws = sum(w)
50+
return ( sum(w * (x**2)) * ws - sum(w * x)**2) / (ws**2 - sum(w**2))
51+
52+
weighted_ttest_result = namedtuple('weighted_ttest_result',
53+
['estimate', 'se', 'conf_interval', 'tstat', 'df',
54+
'pvalue'])
55+
56+
def weighted_ttest(x, w, mu, conflevel=0.95, alternative='twosided', dropna=True):
57+
''' Weighted t-test.
58+
59+
alternative: 'less', 'greater', 'twosided'
60+
61+
weighted.t.test <- function(x, w, mu, conf.level = 0.95,
62+
alternative="two.sided", na.rm=TRUE) {
63+
64+
if(!missing(conf.level) &
65+
(length(conf.level) != 1 || !is.finite(conf.level) ||
66+
conf.level < 0 || conf.level > 1))
67+
stop("'conf.level' must be a single number between 0 and 1")
68+
69+
if (na.rm) {
70+
w <- w[i <- !is.na(x)]
71+
x <- x[i]
72+
}
73+
74+
# to achieve consistent behavior in loops, return NA-structure in case of
75+
# complete missings
76+
if (sum(is.na(x)) == length(x))
77+
return(list(estimate=NA, se=NA, conf.int=NA, statistic=NA, df=NA, p.value=NA))
78+
79+
# if only one value is present: this is the best estimate, no significance
80+
# test provided
81+
if (sum(!is.na(x)) == 1) {
82+
warning("Warning weighted.t.test: only one value provided; this value is
83+
returned without test of significance!", call.=FALSE)
84+
return(list(estimate=x[which(!is.na(x))], se=NA, conf.int=NA, statistic=NA,
85+
df=NA, p.value=NA))
86+
}
87+
88+
x.w <- weighted.mean(x,w, na.rm=na.rm)
89+
var.w <- var.wt(x,w, na.rm=na.rm)
90+
df <- length(x)-1
91+
t.value <- sqrt(length(x))*((x.w-mu)/sqrt(var.w))
92+
se <- sqrt(var.w)/sqrt(length(x))
93+
94+
if (alternative == "less") {
95+
pval <- pt(t.value, df)
96+
cint <- c(-Inf, x.w + se*qt(conf.level, df) )
97+
}
98+
else if (alternative == "greater") {
99+
pval <- pt(t.value, df, lower.tail = FALSE)
100+
cint <- c(x.w - se * qt(conf.level, df), Inf)
101+
}
102+
else {
103+
pval <- 2 * pt(-abs(t.value), df)
104+
alpha <- 1 - conf.level
105+
cint <- x.w + se*qt(1 - alpha/2, df)*c(-1,1)
106+
}
107+
108+
names(t.value) <- "t"
109+
return(list(estimate=x.w, se=se, conf.int=cint, statistic=t.value, df=df,
110+
p.value=pval))
111+
}
112+
'''
113+
x = array(x)
114+
w = array(w)
115+
if dropna:
116+
i = ~(isnan(x) | isnan(w))
117+
x = x[i]
118+
w = w[i]
119+
good_values = sum(~isnan(x))
120+
if good_values == 0:
121+
return weighted_ttest_result(estimate=nan, se=nan, conf_interval=nan,
122+
tstat=nan, df=nan, pvalue=nan)
123+
elif good_values == 1:
124+
# if only one value is present: this is the best estimate, no
125+
# significance test provided
126+
return weighted_ttest_result(estimate=x[~isnan(x)], se=nan, conf_interval=nan,
127+
tstat=nan, df=nan, pvalue=nan)
128+
x_w = average(x, weights=w)
129+
var_w = weighted_var(x, w, dropna=dropna)
130+
df = len(x) - 1
131+
t_stat = sqrt(len(x)) * (x_w - mu) / sqrt(var_w)
132+
se = sqrt(var_w) / sqrt(len(x))
133+
if alternative == 'less':
134+
pval = t_distribution.cdf(t_stat, df)
135+
cint = (-inf, x_w + se * t_distribution.ppf(conflevel, df))
136+
elif alternative == 'greater':
137+
pval = t_distribution.sf(t_stat, df)
138+
cint = (x_w - se * t_distribution.ppf(conflevel, df), inf)
139+
else:
140+
pval = 2 * t_distribution.sf(abs(t_stat), df)
141+
alpha = 1-conflevel
142+
qt = t_distribution.ppf(1 - alpha/2, df)
143+
cint = (x_w - se * qt, x_w + se * qt)
144+
return weighted_ttest_result(estimate=x_w, se=se, conf_interval=cint, tstat=t_stat, df
145+
= df, pvalue=pval)
146+
147+
148+
149+
if __name__ == "__main__":
150+
args = parse_args()
151+
drop_genes = set()
152+
if args.drop_genes:
153+
for gene in open(args.drop_genes):
154+
drop_genes.add(gene.strip())
155+
ase = (pd
156+
.read_table(args.ase, **ut.pd_kwargs)
157+
.select(**ut.sel_startswith(('melXsim', 'simXmel')))
158+
.rename(columns=lambda x: x if x.endswith('_ase_value') else x + '_ase_value')
159+
.rename(columns=lambda x: x.replace('melXsim_cyc14C_rep3',
160+
'melXsim_cyc14C_rep0'))
161+
.sort_index(axis=1)
162+
)
163+
ase.drop(drop_genes, inplace=True, errors='ignore')
164+
if args.smart_drop:
165+
chrom_of = ut.get_chroms()
166+
males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2')
167+
is_male = [col.startswith(males) for col in ase.columns]
168+
ase.ix[chrom_of[ase.index] == 'X', is_male] = nan
169+
170+
171+
categories = {}
172+
for line in open(args.categories):
173+
line = line.strip().split()
174+
if ((len(line[2:]) < args.min_genes)
175+
or (args.test_only and line[1] != args.test_only)):
176+
continue
177+
categories[line[0]] = set(line[2:])
178+
179+
tstats = pd.DataFrame(index=categories, columns=ase.columns, data=nan)
180+
pvals = pd.DataFrame(index=categories, columns=ase.columns, data=nan)
181+
ngenes = pd.DataFrame(index=categories, columns=ase.columns, data=nan)
182+
for sample in pb()(ase.columns):
183+
for category in categories:
184+
ase_vals = ase.ix[categories[category], sample].dropna()
185+
ngenes.ix[category, sample] = ase_vals.count()
186+
if ase_vals.count() < args.min_genes:
187+
continue
188+
t_result = ttest_1samp(ase_vals, 0)
189+
tstats.ix[category, sample] = t_result.statistic
190+
pvals.ix[category, sample] = t_result.pvalue

sign_test.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Performs sign test using a .gmt file and an ase file (containing gene names and ase values)
2+
3+
from __future__ import print_function
4+
import scipy.stats as stats
5+
from argparse import ArgumentParser
6+
7+
parser = ArgumentParser()
8+
parser.add_argument('--min-genes', default=20, type=int)
9+
parser.add_argument('--test-only', default=False,
10+
help="Test only go terms with a given code in second column")
11+
parser.add_argument('--drop-genes', default=None)
12+
parser.add_argument('--print-header', default=False, action='store_true')
13+
parser.add_argument('--print-counts', default=False, action='store_true')
14+
parser.add_argument('--pseudocounts', default=1, type=int)
15+
parser.add_argument('ase')
16+
parser.add_argument('categories')
17+
parser.add_argument('outfile')
18+
parser.add_argument('cutoff', type=int)
19+
20+
args = parser.parse_args()
21+
ase = args.ase
22+
categories = args.categories
23+
outfile = args.outfile
24+
CUTOFF = args.cutoff
25+
26+
27+
28+
top_genes = []
29+
top_values = []
30+
bottom_genes = []
31+
bottom_values = []
32+
33+
drop_genes = set()
34+
if args.drop_genes:
35+
for gene in open(args.drop_genes):
36+
drop_genes.add(gene.strip())
37+
38+
with open(ase, 'r') as ase:
39+
ase.readline()
40+
41+
counter = 0
42+
for line in ase:
43+
print(line)
44+
line = line.strip().split('\t')
45+
gene = line[0]
46+
if gene in drop_genes: continue
47+
ase_value = line[-1]
48+
print(ase_value)
49+
assert False
50+
if ase_value == "NA":
51+
continue
52+
else:
53+
ase_value = float(ase_value)
54+
55+
if counter < CUTOFF:
56+
if ase_value > 0:
57+
top_genes.append(gene)
58+
top_values.append(ase_value)
59+
else:
60+
bottom_genes.append(gene)
61+
bottom_values.append(ase_value)
62+
63+
else:
64+
if ase_value > min(top_values):
65+
del top_genes[top_values.index(min(top_values))]
66+
top_genes.append(gene)
67+
top_values.remove(min(top_values))
68+
top_values.append(ase_value)
69+
70+
elif ase_value < max(bottom_values):
71+
del bottom_genes[bottom_values.index(max(bottom_values))]
72+
bottom_genes.append(gene)
73+
bottom_values.remove(max(bottom_values))
74+
bottom_values.append(ase_value)
75+
76+
counter += 1
77+
78+
79+
out = open(outfile, 'w')
80+
if args.print_header:
81+
if args.print_counts:
82+
print("category\toddsratio\tpval\ttop_yes\ttop_no\tbottom_yes\tbottom_no\tty_genes\tby_genes", file=out)
83+
else:
84+
print("category\toddsratio\tpval", file=out)
85+
86+
tests = 0
87+
88+
with open(categories, 'r') as categories:
89+
for line in categories:
90+
line = line.strip().split('\t')
91+
category = line[0]
92+
genes = set(line[2:])
93+
if args.test_only and line[1] != args.test_only: continue
94+
95+
bottom_yes = args.pseudocounts
96+
bottom_no = args.pseudocounts
97+
top_yes = args.pseudocounts
98+
top_no = args.pseudocounts
99+
100+
ty = []
101+
by = []
102+
for i in top_genes:
103+
if i in genes:
104+
top_yes += 1
105+
ty.append(i)
106+
else:
107+
top_no += 1
108+
for i in bottom_genes:
109+
if i in genes:
110+
bottom_yes += 1
111+
by.append(i)
112+
else:
113+
bottom_no += 1
114+
115+
if bottom_yes + top_yes >= args.min_genes:
116+
tests += 1
117+
oddsratio, pvalue = stats.fisher_exact([[bottom_yes, top_yes], [bottom_no, top_no]])
118+
outlist = [category, str(oddsratio), str(pvalue)]
119+
if args.print_counts:
120+
outlist.extend([top_yes, top_no, bottom_yes,
121+
bottom_no, ','.join(ty),
122+
','.join(by)])
123+
print(*outlist, sep='\t', file=out)
124+
125+
out.close()
126+
127+
if tests > 0:
128+
print("Num top genes:", len(top_genes))
129+
print("Num bottom genes:", len(bottom_genes))
130+
print("Total number of tests: " + str(tests))
131+
print("Adjusted significance level: " + str(0.05 / tests))
132+
else:
133+
print("Not enough genes tested.")

0 commit comments

Comments
 (0)