|
| 1 | +import regex as re |
| 2 | +import pandas as pd |
| 3 | +from varspark.stats.lfdr import * |
| 4 | + |
class LocalFdrVs:
    """
    Local FDR analysis of per-variant log importances.

    Holds a dataframe of variants (``variant_id``, ``logImportance``, ``splitCount``) sorted by
    ``logImportance`` and delegates density fitting, p-values and FDR estimation to ``LocalFdr``.
    """
    # LocalFdr instance fitted by compute_fdr(); not set until compute_fdr() has been called.
    local_fdr: object
    # Pandas dataframe of variants, sorted by ascending logImportance.
    df_: object

    # variant_id layout expected by plot_manhattan_imp: <chrom>_<locus>_<allele>_<allele>,
    # with a numeric chromosome. Compiled once instead of once per row.
    _VARIANT_ID_PATTERN = re.compile(r'(\d+)_([\d]+)_([A-Z]*)_([A-Z]*)')

    def __init__(self, df):
        """
        Constructor class
        :param df: Takes a pandas dataframe as argument with three columns: variant_id,
        logImportance and splitCount.
        """
        self.df_ = df.sort_values('logImportance', ascending=True)

    @classmethod
    def from_imp_df(cls, df):
        """
        Alternative class instantiation from a pandas dataframe of raw importances.
        Rows with splitCount < 1 are dropped and logImportance is computed as the natural
        logarithm of the importance column.
        :param cls: LocalFdrVs class
        :param df: Pandas dataframe with columns variant_id, importance, and splitCount.
        The variant_id column must already be present; it is not derived here.
        :return: Initialized class instance.
        """
        used = df[df['splitCount'] >= 1]
        used = used.assign(logImportance=np.log(used['importance']))
        return cls(used[['variant_id', 'logImportance', 'splitCount']])

    def _log_importance_series(self, min_split_count):
        """
        Select the log importances of the variants with at least min_split_count splits.
        :param min_split_count: Minimum splitCount (inclusive) for a variant to be kept.
        :return: Pandas series of logImportance indexed by variant_id. Always a series,
        even when a single variant is selected.
        """
        selected = self.df_[self.df_.splitCount >= min_split_count]
        # Column selection (rather than DataFrame.squeeze()) keeps a one-row result a Series
        # instead of collapsing it to a scalar.
        return selected.set_index('variant_id')['logImportance']

    @classmethod
    def _parse_variant_id(cls, variant_id):
        """
        Extracts chromosome, locus, and alleles from the variant_id field using regex
        :param variant_id: Feature label
        :return: Pandas series with entries chrom (int), locus (int) and alleles (list of two
        strings); all three are None when the label does not match the expected layout.
        """
        match = cls._VARIANT_ID_PATTERN.match(variant_id)
        if match is None:
            return pd.Series([None, None, None], index=['chrom', 'locus', 'alleles'])
        alleles = [match.group(3), match.group(4)]
        return pd.Series([int(match.group(1)), int(match.group(2)), alleles],
                         index=['chrom', 'locus', 'alleles'])

    def find_split_count_th(self, cutoff_list=(1, 2, 3, 4, 5, 10, 15, 20), quantile=0.75,
                            bins=120):
        """
        Finds the ideal threshold for the splitCount. Ideal being the lowest differences between
        the fitted skewed normal distribution vs the real data
        :param cutoff_list: List of all values to be tried for the cutoff in the splitCount
        :param quantile: Quantile to evaluate the distribution
        :param bins: Number of bins for the distribution
        :return: best splitCount threshold (1 if no candidate yields a finite residual)
        """
        best_split, best_residual = 1, np.inf

        for split in cutoff_list:
            log_importances = self._log_importance_series(split)

            local_fdr = LocalFdr()
            local_fdr.bins = bins

            log_importances = log_importances + sys.float_info.epsilon
            x, f_observed_y = local_fdr._observed_density(log_importances)
            # Return value is not used here; the call is kept in case it updates state on
            # local_fdr that the parameter estimation relies on -- TODO confirm.
            local_fdr._fit_density(x, f_observed_y)

            # Only the part of the density below the chosen quantile is used for the fit.
            C = np.quantile(log_importances, q=quantile)
            below_quantile = x < C

            initial_f0_params = local_fdr._estimate_skewnorm_params(
                x[below_quantile], f_observed_y[below_quantile],
                SkewnormParams.initial_list(log_importances))

            residuals = skewnorm.pdf(x, a=initial_f0_params.a, loc=initial_f0_params.loc,
                                     scale=initial_f0_params.scale) - f_observed_y
            # Sum of squared differences between fitted and observed density.
            residual = sum(residuals[below_quantile] ** 2)
            if best_residual > residual:
                best_split, best_residual = split, residual

        return best_split

    def plot_log_densities(self, ax, cutoff_list=(1, 2, 3, 4, 5, 10, 15, 20), palette='Set1',
                           find_automatic_best=False, xLabel='log(importance)',
                           yLabel='density'):
        """
        Plotting the log densities to visually identify the unimodal distributions.
        :param ax: Matplotlib axis as a canvas for this plot.
        :param cutoff_list: list of potential splitCount thresholds
        :param find_automatic_best: The user may let the computer highlight the potential best option.
        :param palette: Matplotlib color palette used for the plotting.
        :param xLabel: Label on the x-axis of the plot.
        :param yLabel: Label on the y-axis of the plot.
        """
        assert isinstance(palette, str), 'palette should be a string'
        assert isinstance(xLabel, str), 'xLabel should be a string'
        assert isinstance(yLabel, str), 'yLabel should be a string'

        cutoffs = list(cutoff_list)
        colors = sns.mpl_palette(palette, len(cutoffs))
        df = self.df_
        for cutoff, color in zip(cutoffs, colors):
            sns.kdeplot(df.logImportance[df.splitCount >= cutoff],
                        ax=ax, c=color, bw_adjust=0.5)  # bw low show sharper distributions

        if find_automatic_best:
            potential_best = self.find_split_count_th(cutoff_list=cutoffs)
            # The colour is looked up by the position of the chosen cutoff in the list; the
            # cutoff value itself is not a valid palette index (e.g. 20 with 8 colours).
            # find_split_count_th may fall back to 1 even if 1 is not a candidate.
            best_idx = cutoffs.index(potential_best) if potential_best in cutoffs else 0
            sns.kdeplot(df.logImportance[df.splitCount >= potential_best],
                        ax=ax, c=colors[best_idx], bw_adjust=0.5, lw=8, linestyle=':')
            # The automatically selected cutoff is marked with '*' in the legend.
            labels = [str(c) + '*' if c == potential_best else str(c) for c in cutoffs]
        else:
            labels = [str(c) for c in cutoffs]

        # Single legend call: a second ax.legend() would replace the first and drop the title.
        ax.legend(labels=labels, title='Minimum split counts in distribution',
                  bbox_to_anchor=(1, 1))
        ax.set_xlabel(xLabel)
        ax.set_ylabel(yLabel)

    def plot_log_hist(self, ax, split_count, bins=120, xLabel='log(importance)', yLabel='count'):
        """
        Plotting the log histogram for the chosen split_count
        :param ax: Matplotlib axis as a canvas for this plot.
        :param split_count: Minimum split count threshold for the plot.
        :param bins: Number of bins in the histogram
        :param xLabel: Label on the x-axis of the plot.
        :param yLabel: Label on the y-axis of the plot.
        """
        assert bins > 0, 'bins should be bigger than 0'
        assert split_count > 0, 'split_count should be bigger than 0'
        assert isinstance(xLabel, str), 'xLabel should be a string'
        assert isinstance(yLabel, str), 'yLabel should be a string'

        df = self.df_
        sns.histplot(df.logImportance[df.splitCount >= split_count], ax=ax, bins=bins)
        ax.set_xlabel(xLabel)
        ax.set_ylabel(yLabel)

    def plot(self, ax):
        """
        Delegates to the plot method of the fitted LocalFdr instance.
        compute_fdr must have been called first, as it creates that instance.
        :param ax: Matplotlib axis as a canvas for this plot.
        """
        self.local_fdr.plot(ax)

    def compute_fdr(self, countThreshold=2, local_fdr_cutoff=0.05, bins=120):
        """
        Compute the FDR and p-values of the SNPs.
        The cutoff and the resulting dataframe are also stored on the instance
        (local_fdr_cutoff, pvalsDF) for later use by plot_manhattan_imp.
        :param countThreshold: The split count threshold for the SNPs to be considered.
        :param local_fdr_cutoff: Threshold of False positives over total of genes
        :param bins: number of bins to which the log importances will be aggregated
        :return: A tuple with a dataframe containing the SNPs and their p-values,
        and the expected FDR for the significant genes.
        """
        assert countThreshold > 0, 'countThreshold should be bigger than 0'
        assert 0 < local_fdr_cutoff < 1, 'local_fdr_cutoff threshold should be between 0 and 1'

        self.local_fdr_cutoff = local_fdr_cutoff

        log_importances = self._log_importance_series(countThreshold)

        self.local_fdr = LocalFdr()
        self.local_fdr.fit(log_importances, bins)
        pvals = self.local_fdr.get_pvalues()
        fdr, mask = self.local_fdr.get_fdr(local_fdr_cutoff)
        self.pvalsDF = log_importances.reset_index().assign(pvalue=pvals, is_significant=mask)
        return self.pvalsDF, fdr

    def plot_manhattan_imp(self, fdr=None, gap_size=None):
        """
        Displays manhattan plot of -log10 p-values for each feature, as well as significance cutoff.
        Categorises features in respective chromosomes, ordered by locus.
        compute_fdr must have been called first, as it provides the p-values.
        :param fdr: Value at which the cutoff line is drawn (at -log10 of it). Defaults to the
        local_fdr_cutoff passed to the last compute_fdr call.
        :param gap_size: The size of gap between each chromosome.
        Included as an adjustable parameter as this value scales with the total number of loci
        :return: Dataframe of the plotted features sorted by chromosome and locus, with the
        extra columns chrom, locus, alleles, -logp, i, chrom_idx and x.
        """
        # Work on a copy so the dataframe returned by compute_fdr is not modified by plotting.
        pvals = self.pvalsDF.copy()
        # Estimate appropriate size for gap between chromosomes based on number of loci to plot
        if gap_size is None:
            gap_size = int(np.ceil(pvals.shape[0] / 80))
        cutoff = self.local_fdr_cutoff if fdr is None else fdr

        pvals[['chrom', 'locus', 'alleles']] = pvals['variant_id'].apply(self._parse_variant_id)
        pvals['-logp'] = -np.log10(pvals.pvalue)

        sorted_pvals = pvals.sort_values(by=['chrom', 'locus'])
        sorted_pvals.reset_index(inplace=True, drop=True)
        sorted_pvals['i'] = sorted_pvals.index
        sorted_pvals['chrom'] = sorted_pvals['chrom'].astype('category')
        sorted_pvals['chrom_idx'] = sorted_pvals['chrom'].cat.codes.astype(int)
        # Shift every chromosome by a multiple of gap_size so chromosomes are visually separated.
        sorted_pvals['x'] = sorted_pvals['chrom_idx'] * gap_size + sorted_pvals['i']

        plot = sns.relplot(data=sorted_pvals, x='x', y='-logp', aspect=3.7,
                           hue='chrom', palette='bright', legend=None)
        cutoff_logp = -np.log10(cutoff)
        plot.ax.axhline(y=cutoff_logp, color='black', linestyle='--')
        plot.ax.text(plot.ax.get_xlim()[1] + 0.1, cutoff_logp, f"FDR Cutoff = {cutoff:.6f}")

        # One tick per chromosome, placed at the median x position of its features.
        # All categories come from the data itself, so observed=True does not change the result.
        chrom_ticks = sorted_pvals.groupby('chrom', observed=True)['x'].median()
        plot.ax.set_xlabel('chrom')
        plot.ax.set_xticks(chrom_ticks)
        plot.ax.set_xticklabels(chrom_ticks.index)
        plot.figure.suptitle('Manhattan plot of p values')
        return sorted_pvals
0 commit comments