Skip to content

Commit 59f40bc

Browse files
DEV: Create no hail lfdr class (#237)
FEAT: Implement function for manhattan plotting negative log p values
1 parent 3356d9a commit 59f40bc

File tree

1 file changed

+204
-0
lines changed

1 file changed

+204
-0
lines changed

python/varspark/lfdrvsnohail.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import regex as re
2+
import pandas as pd
3+
from varspark.stats.lfdr import *
4+
5+
class LocalFdrVs:
    """
    Local FDR analysis of variable importances held in a plain pandas dataframe.

    The instance keeps the importance table in ``df_`` (sorted by ``logImportance``). After
    ``compute_fdr`` has run it also keeps the fitted ``LocalFdr`` object in ``local_fdr``, the
    cutoff in ``local_fdr_cutoff`` and the per-variant p-values in ``pvalsDF``.
    """

    local_fdr: object
    df_: object

    # Layout expected for variant ids: <chrom>_<locus>_<allele>_<allele>, chromosome numeric.
    _VARIANT_ID_PATTERN = re.compile(r'(\d+)_([\d]+)_([A-Z]*)_([A-Z]*)')

    def __init__(self, df):
        """
        Constructor class
        :param df: Takes a pandas dataframe as argument with three columns: variant_id,
            logImportance and splitCount.
        """
        self.df_ = df.sort_values('logImportance', ascending=True)

    @classmethod
    def from_imp_df(cls, df):
        """
        Alternative class instantiation from a pandas dataframe of raw importances.
        Rows with splitCount < 1 are dropped and the natural log of the importance is stored
        as logImportance. The input dataframe is not modified.
        :param cls: LocalFdrVs class
        :param df: Pandas dataframe with columns variant_id, importance, and splitCount.
        :return: Initialized class instance.
        """
        df = df[df['splitCount'] >= 1]
        df = df.assign(logImportance=np.log(df['importance']))
        return cls(df[['variant_id', 'logImportance', 'splitCount']])

    def _log_importance_series(self, min_split_count):
        """
        Log importances of the variants with splitCount >= min_split_count.
        :param min_split_count: Minimum split count for a variant to be kept.
        :return: Series of logImportance indexed by variant_id.
        """
        subset = self.df_[self.df_.splitCount >= min_split_count]
        # Select the column directly: DataFrame.squeeze() would collapse a one-row selection
        # into a scalar instead of a Series.
        return subset.set_index('variant_id')['logImportance']

    def find_split_count_th(self, cutoff_list=(1, 2, 3, 4, 5, 10, 15, 20), quantile=0.75,
                            bins=120):
        """
        Finds the ideal threshold for the splitCount. Ideal being the lowest differences between
        the fitted skewed normal distribution vs the real data
        :param cutoff_list: List of all values to be tried for the cutoff in the splitCount
        :param quantile: Quantile to evaluate the distribution
        :param bins: Number of bins for the distribution
        :return: best splitCount threshold (1 when no candidate gives a finite residual)
        """
        best_split, best_residual = 1, np.inf

        for split in cutoff_list:
            log_importances = self._log_importance_series(split) + sys.float_info.epsilon

            local_fdr = LocalFdr()
            local_fdr.bins = bins

            x, f_observed_y = local_fdr._observed_density(log_importances)
            # The fitted density is not used here; the call is kept in case it updates state
            # on the LocalFdr instance -- TODO confirm and drop if it is side-effect free.
            local_fdr._fit_density(x, f_observed_y)

            # Only the part of the distribution below the chosen quantile is compared.
            C = np.quantile(log_importances, q=quantile)
            below_quantile = x < C

            f0_params = local_fdr._estimate_skewnorm_params(
                x[below_quantile], f_observed_y[below_quantile],
                SkewnormParams.initial_list(log_importances))

            residuals = skewnorm.pdf(x, a=f0_params.a, loc=f0_params.loc,
                                     scale=f0_params.scale) - f_observed_y
            squared_error = np.sum(residuals[below_quantile] ** 2)
            if best_residual > squared_error:
                best_split, best_residual = split, squared_error

        return best_split

    def plot_log_densities(self, ax, cutoff_list=(1, 2, 3, 4, 5, 10, 15, 20), palette='Set1',
                           find_automatic_best=False, xLabel='log(importance)',
                           yLabel='density'):
        """
        Plotting the log densities to visually identify the unimodal distributions.
        :param ax: Matplotlib axis as a canvas for this plot.
        :param cutoff_list: list of potential splitCount thresholds
        :param find_automatic_best: The user may let the computer highlight the potential best
            option; it is drawn as a thick dotted line and starred in the legend.
        :param palette: Matplotlib color palette used for the plotting.
        :param xLabel: Label on the x-axis of the plot.
        :param yLabel: Label on the y-axis of the plot.
        """
        assert isinstance(palette, str), 'palette should be a string'
        assert isinstance(xLabel, str), 'xLabel should be a string'
        assert isinstance(yLabel, str), 'yLabel should be a string'

        cutoffs = list(cutoff_list)
        colors = sns.mpl_palette(palette, len(cutoffs))
        df = self.df_
        for cutoff, color in zip(cutoffs, colors):
            sns.kdeplot(df.logImportance[df.splitCount >= cutoff],
                        ax=ax, c=color, bw_adjust=0.5)  # bw low show sharper distributions

        if find_automatic_best:
            potential_best = self.find_split_count_th(cutoff_list=cutoffs)
            highlight_style = {'bw_adjust': 0.5, 'lw': 8, 'linestyle': ':'}
            # Colours are positional in `cutoffs`, so look the colour up by the position of
            # the chosen cutoff rather than by its value (10/15/20 are not valid indices).
            if potential_best in cutoffs:
                highlight_style['c'] = colors[cutoffs.index(potential_best)]
            sns.kdeplot(df.logImportance[df.splitCount >= potential_best],
                        ax=ax, **highlight_style)
            labels = [str(x) if x != potential_best else str(x) + '*' for x in cutoffs]
        else:
            labels = cutoffs

        # A single legend call: a second ax.legend() would replace the first and drop the title.
        ax.legend(labels=labels, title='Minimum split counts in distribution',
                  bbox_to_anchor=(1, 1))
        ax.set_xlabel(xLabel)
        ax.set_ylabel(yLabel)

    def plot_log_hist(self, ax, split_count, bins=120, xLabel='log(importance)', yLabel='count'):
        """
        Plotting the log histogram for the chosen split_count
        :param ax: Matplotlib axis as a canvas for this plot.
        :param split_count: Minimum split count threshold for the plot.
        :param bins: Number of bins in the histogram
        :param xLabel: Label on the x-axis of the plot.
        :param yLabel: Label on the y-axis of the plot.
        """
        assert bins > 0, 'bins should be bigger than 0'
        assert split_count > 0, 'split_count should be bigger than 0'
        assert isinstance(xLabel, str), 'xLabel should be a string'
        assert isinstance(yLabel, str), 'yLabel should be a string'

        df = self.df_
        sns.histplot(df.logImportance[df.splitCount >= split_count], ax=ax, bins=bins)
        ax.set_xlabel(xLabel)
        ax.set_ylabel(yLabel)

    def plot(self, ax):
        """
        Delegates to the plot method of the fitted LocalFdr object. Requires compute_fdr to
        have been called first, since that is where local_fdr is created.
        :param ax: Matplotlib axis as a canvas for this plot.
        """
        self.local_fdr.plot(ax)

    def compute_fdr(self, countThreshold=2, local_fdr_cutoff=0.05, bins=120):
        """
        Compute the FDR and p-values of the SNPs. The fitted LocalFdr object, the cutoff and
        the resulting dataframe are stored on the instance (local_fdr, local_fdr_cutoff,
        pvalsDF) for the plotting methods.
        :param countThreshold: The split count threshold for the SNPs to be considered.
        :param local_fdr_cutoff: Threshold of False positives over total of genes
        :param bins: number of bins to which the log importances will be aggregated
        :return: A tuple with a dataframe containing the SNPs and their p-values,
            and the expected FDR for the significant genes.
        """
        assert countThreshold > 0, 'countThreshold should be bigger than 0'
        assert 0 < local_fdr_cutoff < 1, 'local_fdr_cutoff threshold should be between 0 and 1'

        self.local_fdr_cutoff = local_fdr_cutoff

        log_importances = self._log_importance_series(countThreshold)

        self.local_fdr = LocalFdr()
        self.local_fdr.fit(log_importances, bins)
        pvals = self.local_fdr.get_pvalues()
        fdr, mask = self.local_fdr.get_fdr(local_fdr_cutoff)
        self.pvalsDF = log_importances.reset_index().assign(pvalue=pvals, is_significant=mask)
        return (
            self.pvalsDF,
            fdr
        )

    @staticmethod
    def _parse_variant_id(variant_id):
        """
        Extracts chromosome, locus, and alleles from the variant_id field using regex
        :param variant_id: Feature label, expected as <chrom>_<locus>_<allele>_<allele>.
        :return: Series with entries chrom (int), locus (int) and alleles (list of two strings);
            all three are None when the label does not match the expected layout.
        """
        fields = ['chrom', 'locus', 'alleles']
        match = LocalFdrVs._VARIANT_ID_PATTERN.match(variant_id)
        if not match:
            return pd.Series([None, None, None], index=fields)
        chrom, locus, ref, alt = match.groups()
        return pd.Series([int(chrom), int(locus), [ref, alt]], index=fields)

    def plot_manhattan_imp(self, fdr=None, gap_size=None):
        """
        Displays manhattan plot of the negative log10 p-values of each feature, as well as the
        significance cutoff. Categorises features in respective chromosomes, ordered by locus.
        Requires compute_fdr to have been called first.
        :param fdr: Cutoff drawn as a horizontal line at -log10(fdr). Defaults to the
            local_fdr_cutoff given to the last compute_fdr call.
        :param gap_size: The size of gap between each chromosome.
            Included as an adjustable parameter as this value scales with the total number of loci
        :return: Copy of the p-value dataframe sorted by chromosome and locus, with the columns
            used for plotting (chrom, locus, alleles, -logp, i, chrom_idx, x) added.
        """
        # Work on a copy so plotting does not add columns to the dataframe that compute_fdr
        # stored on the instance and handed back to the caller.
        pvals = self.pvalsDF.copy()
        # Estimate appropriate size for gap between chromosomes based on number of loci to plot
        if gap_size is None:
            gap_size = int(np.ceil(pvals.shape[0] / 80))
        cutoff = self.local_fdr_cutoff if fdr is None else fdr

        pvals[['chrom', 'locus', 'alleles']] = pvals['variant_id'].apply(self._parse_variant_id)
        pvals['-logp'] = -np.log10(pvals.pvalue)

        sorted_pvals = pvals.sort_values(by=['chrom', 'locus'])
        sorted_pvals.reset_index(inplace=True, drop=True)
        sorted_pvals['i'] = sorted_pvals.index
        sorted_pvals['chrom'] = sorted_pvals['chrom'].astype('category')
        sorted_pvals['chrom_idx'] = sorted_pvals['chrom'].cat.codes.astype(int)
        # Shift every chromosome right by one extra gap so the groups are visually separated.
        sorted_pvals['x'] = sorted_pvals['chrom_idx'] * gap_size + sorted_pvals['i']

        plot = sns.relplot(data=sorted_pvals, x='x', y='-logp', aspect=3.7,
                           hue='chrom', palette='bright', legend=None)
        cutoff_logp = -np.log10(cutoff)
        plot.ax.axhline(y=cutoff_logp, color='black', linestyle='--')
        plot.ax.text(plot.ax.get_xlim()[1] + 0.1, cutoff_logp, f"FDR Cutoff = {cutoff:.6f}")

        # One tick per chromosome, placed at the median x position of its variants.
        chrom_df = sorted_pvals.groupby('chrom', observed=True)['x'].median()
        plot.ax.set_xlabel('chrom')
        plot.ax.set_xticks(chrom_df)
        plot.ax.set_xticklabels(chrom_df.index)
        plot.figure.suptitle('Manhattan plot of p values')
        return sorted_pvals

0 commit comments

Comments
 (0)