#!/usr/bin/env python3
"""
Python functions that implement the core MAMA processing
"""
import logging
from typing import Any, Dict, List, Tuple, Union
import warnings
import numpy as np
import pandas as pd
from scipy.stats import norm
from core_mama import (create_omega_matrix, create_sigma_matrix, run_mama_method, qc_omega,
qc_sigma)
from reg_mama import (REG_INT_OPT_NAME, REG_LD_OPT_NAME, REG_LD_SCALE_FACTOR_NAME, REG_SE_OPT_NAME,
MAMA_REG_OPT_ALL_FREE, run_ldscore_regressions)
from util.df import Filter, intersect_indices
from util.sumstats import (SNP_COL, BP_COL, CHR_COL, BETA_COL, FREQ_COL, SE_COL, A1_COL,
A2_COL, P_COL, INFO_COL, N_COL, Z_COL, COMPLEMENT, BASES,
MAX_RSID_LOGGING, create_freq_filter, create_chr_filter,
standardize_all_sumstats, process_sumstats)
# Constants / Parameters / Types #############
# Pylint upper-case errors disabled here to adhere to Python typing module conventions
AncestryId = Any # pylint: disable=invalid-name
PhenotypeId = Any # pylint: disable=invalid-name
PopulationId = Tuple[AncestryId, PhenotypeId]
# Columns that MAMA requires
MAMA_REQ_COLS_MAP = {
SNP_COL : str,
CHR_COL : str,
BETA_COL : float,
FREQ_COL : float,
SE_COL : float,
A1_COL : str,
A2_COL : str,
BP_COL : int,
P_COL : float
}
MAMA_REQ_STD_COLS = set(MAMA_REQ_COLS_MAP.keys())
# Map of default regular expressions used to convert summary stat column names to standardized names
MAMA_RE_EXPR_MAP = {
SNP_COL : '.*SNP.*|.*RS.*',
BP_COL : '.*BP.*|.*POS.*',
CHR_COL : '.*CHR.*',
BETA_COL : '.*BETA.*',
FREQ_COL : '.*FREQ.*|.*FRQ.*|.*AF',
SE_COL : '.*SE.*',
A1_COL : '.*A1.*|.*MAJOR.*|.*EFFECT.*ALL.*|REF.*',
A2_COL : '.*A2.*|.*MINOR.*|.*OTHER.*ALL.*|ALT.*',
P_COL : 'P|P.*VAL.*',
INFO_COL : 'INFO',
N_COL : 'N',
}
# Frequency filtering defaults
DEFAULT_MAF_MIN = 0.01
DEFAULT_MAF_MAX = 0.99
# Chromosome filtering defaults
DEFAULT_CHR_LIST = [str(cnum) for cnum in range(1, 23)] + ['X', 'Y']
# Filter-related materials
NAN_FILTER = 'NO NAN'
FREQ_FILTER = 'FREQ BOUNDS'
SE_FILTER = 'SE BOUNDS'
CHR_FILTER = 'CHR VALUES'
SNP_DUP_ALL_FILTER = 'DUPLICATE ALLELE SNPS'
SNP_PALIN_FILT = 'PALINDROMIC SNPS'
SNP_INVALID_ALLELES_FILTER = 'INVALID ALLELES'
SNP_NEGATIVE_P_FILTER = 'NEGATIVE GWAS P'
MAMA_STD_FILTER_FUNCS = {
NAN_FILTER :
{
'func' : lambda df: df[list(MAMA_REQ_STD_COLS)].isnull().any(axis=1),
'description' : "Filters out SNPs with any NaN values in required "
f"columns {MAMA_REQ_STD_COLS}"
},
FREQ_FILTER :
{
'func' : create_freq_filter(DEFAULT_MAF_MIN, DEFAULT_MAF_MAX),
'description' : "Filters out SNPs with FREQ values outside of "
f"[{DEFAULT_MAF_MIN}, {DEFAULT_MAF_MAX}]"
},
SE_FILTER :
{
'func' : lambda df: df[SE_COL].le(0.0),
'description' : "Filters out SNPs with non-positive SE values"
},
CHR_FILTER :
{
'func' : create_chr_filter(DEFAULT_CHR_LIST),
'description' : f"Filters out SNPs with listed chromosomes not in {DEFAULT_CHR_LIST}"
},
SNP_DUP_ALL_FILTER :
{
'func' : lambda df: df[A1_COL] == df[A2_COL],
'description' : "Filters out SNPs with major allele = minor allele"
},
SNP_PALIN_FILT :
{
'func' : lambda df: df[A1_COL].replace(COMPLEMENT) == df[A2_COL],
'description' : "Filters out SNPs where major / minor alleles are a base pair"
},
SNP_INVALID_ALLELES_FILTER :
{
'func' : lambda df: ~df[A1_COL].isin(BASES) | ~df[A2_COL].isin(BASES),
'description' : f"Filters out SNPs with alleles not in {BASES}"
},
SNP_NEGATIVE_P_FILTER :
{
'func' : lambda df: df[P_COL].lt(0.0),
'description' : "Filters out SNPs with negative GWAS P values"
},
}
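# Illustrative sketch of the filter contract (not executed; `df` is a hypothetical
# sumstats DataFrame): each filter maps a DataFrame to a boolean mask marking rows to DROP:
#
#     se_mask = MAMA_STD_FILTER_FUNCS[SE_FILTER]['func'](df)  # True where SE <= 0
#     df = df[~se_mask]  # keep only the SNPs that pass the filter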
# Name to which an existing N column is renamed (to avoid ambiguity with the
# effective N column that is added to the results)
ORIGINAL_N_COL_RENAME = "N_ORIG"
# Name of the effective N column added to the results
N_EFF_COL = "N_EFF"
# Derived Constants###########################
# Filter function dictionaries (name to function mapping or description) for MAMA
MAMA_STD_FILTERS = {fname : (finfo['func'], finfo['description'])
for fname, finfo in MAMA_STD_FILTER_FUNCS.items()}
# Regular expression indicating columns to keep in harmonized summary statistics
MAMA_HARM_COLS_RE = '|'.join(MAMA_REQ_STD_COLS)
# Calculate constants used in determination of P values for MAMA
ln = np.log # pylint: disable=invalid-name
LN_2 = ln(2.0)
RECIP_LN_10 = np.reciprocal(ln(10.0))
# Functions ##################################
#################################
def obtain_df(possible_df: Union[str, pd.DataFrame], id_val: Any, sep_arg: Union[None, str] = None
) -> pd.DataFrame:
"""
Small helper function that handles functionality related to reading in a DataFrame
:param possible_df: Should either be a string indicating the full path to a file to be
read into a DataFrame or the DataFrame itself. All other possibilities will
result in this function raising an error
    :param id_val: Used for logging / error-reporting to identify the data being read / checked
    :param sep_arg: Optional separator passed to pandas read_csv (None lets pandas infer it)
:raises RuntimeError: If possible_df is not a string or pd.DataFrame
:return pd.DataFrame: Returns a DataFrame
"""
# If this is (presumably) a filename, read in the file
if isinstance(possible_df, str):
logging.info("Reading in %s file: %s", id_val, possible_df)
# Catch ParserWarning that warns of switch to Python engine if that happens
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=pd.errors.ParserWarning,
message="Falling back to the \'python\' engine")
possible_df = pd.read_csv(possible_df, sep=sep_arg, comment='#',
dtype=MAMA_REQ_COLS_MAP)
# If neither a string (presumed to be a filename) nor DataFrame are passed in, throw error
elif not isinstance(possible_df, pd.DataFrame):
raise RuntimeError(f"ERROR: Either pass in filename or DataFrame for {id_val} "
f"rather than [{type(possible_df)}]")
return possible_df
#################################
def qc_ldscores(ldscores_df: pd.DataFrame) -> pd.DataFrame:
"""
Runs QC steps on LD scores. This will be much lighter-weight than what is done on summary
statistics, as it assumes that the LD score file was generated using this software.
:param ldscores_df: Dataframe holding ldscores
:return pd.DataFrame: DataFrame containing the QC'ed LD scores
"""
# Make copy of the dataframe (this copy will be modified)
df = ldscores_df.copy()
# Drop any lines with NaN
nan_drops = df.isnull().any(axis=1)
df.drop(df.index[nan_drops], inplace=True)
# Make sure SNP IDs are lower case ("rs..." rather than "RS...")
df[SNP_COL] = df[SNP_COL].str.lower()
# Set SNP column to be the index and sort
df.set_index(SNP_COL, inplace=True)
df.sort_index(inplace=True)
return df
#################################
def harmonize_all(sumstats: Dict[PopulationId, pd.DataFrame], ldscores: pd.DataFrame,
snp_list: pd.Index = None):
"""
Does the harmonization between the QC'ed input summary statistics and the LD scores. The
DataFrames are all modified in place (SNPs/rows dropped and reference alleles transformed
as needed), and all inputs are expected to have indices = SNP ID (beginning with "rs")
:param sumstats: Dictionary mapping a population id to a DataFrame holding the summary
stat information. The DFs should all have been QCed already.
:param ldscores: DataFrame of LD score information
    :param snp_list: If specified, an Index containing rsIDs to which to restrict analysis
"""
# Intersect all the SNP lists to get the SNPs all data sources have in common
snp_intersection = intersect_indices(sumstats.values(), ldscores)
if snp_list is not None:
logging.info("Restricting to user-supplied SNP list (%s SNPs)...", len(snp_list))
snp_intersection = snp_intersection.intersection(snp_list)
logging.info("\n\nNumber of SNPS in initial intersection of all sources: %s",
len(snp_intersection))
# Reduce each DF down to the SNP intersection (and drop extraneous columns, too)
for pop_df in sumstats.values():
snps_to_drop = pop_df.index.difference(snp_intersection)
pop_df.drop(snps_to_drop, inplace=True)
pop_df.drop(pop_df.columns.difference(list(MAMA_RE_EXPR_MAP.keys())), axis=1, inplace=True)
snps_to_drop = ldscores.index.difference(snp_intersection)
ldscores.drop(snps_to_drop, inplace=True)
# Standardize alleles in the summary statistics
logging.info("\nStandardizing reference alleles in summary statistics.")
ref_popid, tot_drop_indices, drop_dict, ref_flip_dict = standardize_all_sumstats(sumstats) # pylint: disable=unused-variable,line-too-long
logging.info("Standardized to population: %s", ref_popid)
logging.info("Dropped %s SNPs during reference allele standardization.", tot_drop_indices.sum())
if logging.root.level <= logging.DEBUG:
logging.debug("RS IDs of drops during standardization: %s",
sumstats[ref_popid].index[tot_drop_indices].to_list())
# Drop SNPs as a result of standardization of reference alleles
for pop_df in sumstats.values():
pop_df.drop(pop_df.index[tot_drop_indices], inplace=True)
ldscores.drop(ldscores.index[tot_drop_indices], inplace=True)
#################################
def write_sumstats_to_file(filename: str, df: pd.DataFrame):
"""
Helper function that writes a summary statistics DataFrame to disk
:param filename: Full path to output file
:param df: DataFrame holding the summary statistics
"""
df.to_csv(filename, sep="\t", index_label=SNP_COL, na_rep="NaN")
#################################
def collate_df_values(sumstats: Dict[PopulationId, pd.DataFrame], ldscores: pd.DataFrame,
ordering: List[PopulationId] = None) -> Tuple[np.ndarray, np.ndarray,
np.ndarray]:
"""
Function that gathers data from DataFrames (betas, ses, etc.) into ndarrays for use in
vectorized processing
:param sumstats: Dictionary of population identifier -> DataFrame
:param ldscores: DataFrame for the LD scores
:param ordering: Optional parameter indicating the order in which populations should be arranged
(if not specified, the ordering of the sumstats dictionary keys will be used)
:return: Betas (MxP), SEs (MxP), and LD scores (MxPxP)
"""
# Make sure ordering is specified
if not ordering:
ordering = list(sumstats.keys())
# Move summary statistic data into arrays to allow for vectorized operations
# 1) Gather important numbers to use for shapes and dimensions
num_pops = len(sumstats)
num_snps = len(ldscores)
# 2) Create empty arrays so the memory can be allocated all at once
beta_arr = np.zeros((num_snps, num_pops))
se_arr = np.zeros((num_snps, num_pops))
ldscore_arr = np.zeros((num_snps, num_pops, num_pops))
# 3) Copy data into place
for pop_num, pop_id in enumerate(ordering):
pop_df = sumstats[pop_id]
ancestry_id = pop_id[0]
beta_arr[:, pop_num] = pop_df[BETA_COL].to_numpy()
se_arr[:, pop_num] = pop_df[SE_COL].to_numpy()
# For LD scores, need to iterate through populations a second time to process pairs
for second_pop_num, second_pop_id in enumerate(ordering):
second_ancestry_id = second_pop_id[0]
ldscore_col_name = "_".join((str(ancestry_id), str(second_ancestry_id)))
if ldscore_col_name not in ldscores.columns:
ldscore_col_name = "_".join((str(second_ancestry_id), str(ancestry_id)))
ldscore_arr[:, pop_num, second_pop_num] = ldscores[ldscore_col_name].to_numpy()
return beta_arr, se_arr, ldscore_arr
#################################
def calculate_z(betas: np.ndarray, ses: np.ndarray) -> np.ndarray:
"""
Function that calculates Z scores from betas and standard errors. Shape
of betas and ses must be the same / broadcastable.
    :param betas: Array of beta values
:param ses: Array of standard errors
:return: The array of Z scores
"""
return np.divide(betas, ses)
#################################
def calculate_mean_chi_sq(z_scores: np.ndarray) -> float:
"""
Function that calculates a mean chi squared statistic from
Z scores
:param z_scores: Array of Z scores
:return: The mean chi squared statistic
"""
return np.square(z_scores).mean()
#################################
def calculate_n_eff(pop: int, n_orig: np.ndarray, sigma: np.ndarray, ses: np.ndarray) -> np.ndarray:
"""
Function that calculates effective N
:param pop: Number of the population (used to index into sigma)
:param n_orig: Array of original per-SNP N values for this population
:param sigma: MxPxP matrix of Sigma values
:param ses: Array of length M of standard errors
:return: The array of per-SNP effective N's
"""
return n_orig * sigma[:, pop, pop] * np.reciprocal(np.square(ses))
#################################
def calculate_p(z_scores: np.ndarray) -> np.ndarray:
"""
Function that calculates P for the MAMA results
:param z_scores: Z scores
:return: P values for MAMA
(as strings, to allow for very large negative exponents)
"""
# Since P = 2 * normal_cdf(-|Z|), P = e ^ (log_normal_cdf(-|Z|) + ln 2)
# This can be changed to base 10 as P = 10 ^ ((log_normal_cdf(-|Z|) + ln 2) / ln 10)
log_10_p = RECIP_LN_10 * (norm.logcdf(-np.abs(z_scores)) + LN_2)
# Break up the log based 10 of P values into the integer and fractional part
# To handle the case of Z = 0 (and not result in "10e-1"), set initial values to (-1.0, 1.0)
frac_part, int_part = np.full_like(z_scores, -1.0), np.full_like(z_scores, 1.0)
np.modf(log_10_p, out=(frac_part, int_part), where=(z_scores != 0.0))
# Construct strings for the P values
# 1) Add one to the fractional part to ensure that the result mantissa is between 1 and 10
# 2) Subtract one from the integer part to compensate and keep the overall value correct
result = np.char.add(np.char.add(np.power(10.0, (frac_part + 1.0)).astype(str), 'e'),
(int_part - 1).astype(int).astype(str))
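    # Worked example of the construction above (values approximate): for Z = 40,
    # log10(P) = (norm.logcdf(-40) + ln 2) / ln 10 ~= -349.13, so modf gives
    # (-0.13, -349.0); the mantissa is 10^(-0.13 + 1) ~= 7.4 and the exponent is
    # -349 - 1 = -350, yielding "7.4e-350", a value that would underflow a plain float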
return result
#################################
def mama_pipeline(sumstats: Dict[PopulationId, Any], ldscore_list: List[Any], snp_list: str = None,
column_maps: Dict[PopulationId, Dict[str, str]] = None,
re_expr_map: Dict[str, str] = None,
filters: Dict[str, Tuple[Filter, str]] = MAMA_STD_FILTERS,
ld_opt: Any = MAMA_REG_OPT_ALL_FREE,
se_prod_opt: Any = MAMA_REG_OPT_ALL_FREE,
int_opt: Any = MAMA_REG_OPT_ALL_FREE,
                  ld_corr_scale_factor: float = 1.0,
std_units: bool = False,
harmonized_file_fstr: str = "",
reg_coef_fstr: str = "",
sep: Union[None, str] = None) -> Dict[PopulationId, pd.DataFrame]:
"""
Runs the steps in the overall MAMA pipeline
:param sumstats: Dictionary of population identifier -> filename or DataFrame
:param ldscore_list: List of filenames and/or DataFrames of LD scores (will be concatenated)
:param snp_list: Path to file containing list of rsIDs to which to restrict analysis
:param column_maps: Dictionary containing any column mappings indexed by population identifier
(same as used for sumstats parameter). If none exists, the re_expr_map
will be used to determine column mappings
:param re_expr_map: Regular expressions used to map column names to standard columns
:param filters: Map of filter name to a (function, description) tuple, used to filter
summary statistics
    :param ld_opt: Regression option controlling the LD score coefficient matrix
    :param se_prod_opt: Regression option controlling the SE^2 product coefficient matrix
    :param int_opt: Regression option controlling the intercept coefficient matrix
    :param ld_corr_scale_factor: Scale factor passed through to the LD score regressions
    :param std_units: If True, converts betas / SEs to standardized units before processing
                      (results are converted back before being returned)
    :param harmonized_file_fstr: If non-empty, format string (taking ancestry and phenotype)
                                 used to write the harmonized summary statistics to disk
    :param reg_coef_fstr: If non-empty, format string (taking a coefficient name) used to
                          write the regression coefficients to disk
    :param sep: Optional separator passed through when reading input files
    # TODO(jonbjala) Switch to kwargs?
:return Dict[PopulationId, pd.DataFrame]: Result summary statistics dictionary (reference to
the same dictionary passed in, but with updated
summary statistics)
"""
# Get number of populations
num_pops = len(sumstats)
# If column maps are not specified, set to empty dictionary
if not column_maps:
column_maps = {}
# If regular expression map isn't specified, use MAMA default
if not re_expr_map:
re_expr_map = MAMA_RE_EXPR_MAP.copy()
# Check / read in LD scores and then QC
logging.info("\n\nReading in and running QC on LD Scores\n")
logging.debug("\nList of files: %s", ldscore_list)
ldscores = pd.concat((obtain_df(f, "LD Scores", sep) for f in ldscore_list), ignore_index=True)
ldscores = qc_ldscores(ldscores)
# Check / read in summary stats and then QC
logging.info("\n\nReading in summary statistics.")
for pop_id in sumstats.keys():
# Read in if necessary (and update dictionary)
logging.info("\n\n")
sumstats[pop_id] = obtain_df(sumstats[pop_id], str(pop_id) + " sumstats", sep)
# QC summary stats (along with some light validation and some logging of drops)
pop_df = sumstats[pop_id]
col_map = column_maps.get(pop_id, None) # If a column map exists for this pop, use that
logging.info("\nRunning QC on %s summary statistics", pop_id)
logging.debug("\tColumn mapping = %s\n", col_map)
sumstats[pop_id] = process_sumstats(pop_df, re_expr_map, MAMA_REQ_STD_COLS,
filters, col_map)
# If a SNP list is given, read that in
if snp_list:
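        # One rsID per line; note that squeeze=True (used to get a Series from the single
        # column) was removed in pandas 2.0, so this call assumes an older pandas version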
snp_list = pd.read_csv(snp_list, sep='\n', engine='c', comment='#',
dtype=str, names=[SNP_COL], squeeze=True)
snp_list = pd.Index(data=snp_list, dtype=str)
# Harmonize the summary stats and LD scores (write to disk if requested)
harmonize_all(sumstats, ldscores, snp_list)
if harmonized_file_fstr:
logging.info("\n\nWriting harmonized summary statistics to disk.\n")
for (ancestry, phenotype), harm_ss_df in sumstats.items():
filename = harmonized_file_fstr % (ancestry, phenotype)
logging.debug("\t%s", filename)
write_sumstats_to_file(filename, harm_ss_df)
# Log mean chi squared statistics for each population if needed
if logging.root.level <= logging.DEBUG:
logging.debug("\n")
for (ancestry, phenotype), harm_ss_df in sumstats.items():
mean_chi_sq = calculate_mean_chi_sq(calculate_z(harm_ss_df[BETA_COL].to_numpy(),
harm_ss_df[SE_COL].to_numpy()))
logging.debug("Harmonized %s %s mean chi squared: %s",
ancestry, phenotype, mean_chi_sq)
logging.debug("\n")
# If using a standardized units model, convert to stdized units here (convert back later)
if std_units:
for pop_id, pop_df in sumstats.items():
logging.debug("Converting %s to standardized units", pop_id)
conversion_factor_col = np.sqrt(2.0 * pop_df[FREQ_COL] * (1.0 - pop_df[FREQ_COL]))
pop_df[BETA_COL] = pop_df[BETA_COL] * conversion_factor_col
pop_df[SE_COL] = pop_df[SE_COL] * conversion_factor_col
# Copy values to numpy ndarrays to use in vectorized processing
beta_arr, se_arr, ldscore_arr = collate_df_values(sumstats, ldscores)
# Run LD score regressions
logging.info("\n\nRunning LD Score regression.")
fixed_opts = {REG_LD_OPT_NAME : ld_opt,
REG_SE_OPT_NAME : se_prod_opt,
REG_INT_OPT_NAME : int_opt,
REG_LD_SCALE_FACTOR_NAME : ld_corr_scale_factor}
logging.debug("\tOptions = %s", fixed_opts)
ld_coef, const_coef, se2_coef = run_ldscore_regressions(beta_arr, se_arr,
ldscore_arr, **fixed_opts)
# Log coefficients at debug level (and write coefficients to disk if option is selected)
logging.info("Regression coefficients (LD):\n%s", ld_coef)
logging.info("Regression coefficients (Intercept):\n%s", const_coef)
logging.info("Regression coefficients (SE^2):\n%s", se2_coef)
if reg_coef_fstr:
logging.info("\nWriting regression coefficients to disk.\n")
ld_coef.tofile(reg_coef_fstr % "ld", sep='\t')
const_coef.tofile(reg_coef_fstr % "int", sep='\t')
se2_coef.tofile(reg_coef_fstr % "se2", sep='\t')
# Calculate Omegas and Sigmas
logging.info("\n\nCreating omega and sigma matrices.")
omega = create_omega_matrix(ldscore_arr, ld_coef)
sigma = create_sigma_matrix(se_arr, se2_coef, const_coef)
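    # Omega holds the per-SNP genetic (co)variance implied by the LD scores and LD
    # coefficients; Sigma holds the per-SNP sampling (co)variance built from the SEs,
    # SE^2 coefficients, and intercepts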
# Check omega and sigma for validity based on positive (semi-)definiteness
# Create drop arrays shapes that allow for broadcasting and comparison later
omega_pos_semi_def, tweaked_omegas = qc_omega(omega) \
if num_pops > 1 else (np.full(omega.shape[0], True), np.full(omega.shape[0], False))
omega_drops = np.logical_not(omega_pos_semi_def)
sigma_drops = np.logical_not(qc_sigma(sigma))
omega_sigma_drops = np.logical_or(omega_drops, sigma_drops)
logging.info("Average Omega (including dropped slices) =\n%s", omega.mean(axis=0))
logging.info("Average Sigma (including dropped slices) =\n%s", sigma.mean(axis=0))
if num_pops > 1:
logging.info("\nAdjusted %s SNPs to make omega positive semi-definite.",
tweaked_omegas.sum())
if logging.root.level <= logging.DEBUG:
omega_tweaked_rsids = ldscores.index[tweaked_omegas].to_list()
logging.debug("\tRS IDs = %s", omega_tweaked_rsids[:MAX_RSID_LOGGING] + ["..."]
if len(omega_tweaked_rsids) > MAX_RSID_LOGGING else omega_tweaked_rsids)
logging.info("\nDropped %s SNPs due to non-positive-semi-definiteness of omega.",
omega_drops.sum())
if logging.root.level <= logging.DEBUG:
omega_drop_rsids = ldscores.index[omega_drops].to_list()
logging.debug("\tRS IDs = %s", omega_drop_rsids[:MAX_RSID_LOGGING] + ["..."]
if len(omega_drop_rsids) > MAX_RSID_LOGGING else omega_drop_rsids)
else:
logging.info("\nSkipping positive-semi-definiteness check of Omega due to the "
"presence of only one population.\n")
logging.info("Dropped %s SNPs due to non-positive-definiteness of sigma.",
sigma_drops.sum())
if logging.root.level <= logging.DEBUG:
sigma_drop_rsids = ldscores.index[sigma_drops].to_list()
logging.debug("\tRS IDs = %s", sigma_drop_rsids[:MAX_RSID_LOGGING] + ["..."]
if len(sigma_drop_rsids) > MAX_RSID_LOGGING else sigma_drop_rsids)
logging.info("Dropped %s total SNPs due to non-positive-(semi)-definiteness of omega / sigma.",
omega_sigma_drops.sum())
if logging.root.level <= logging.DEBUG:
os_drop_rsids = ldscores.index[omega_sigma_drops].to_list()
logging.debug("\tRS IDs = %s", os_drop_rsids[:MAX_RSID_LOGGING] + ["..."]
if len(os_drop_rsids) > MAX_RSID_LOGGING else os_drop_rsids)
num_snps = ldscore_arr.shape[0] # pylint: disable=unsubscriptable-object
os_drops_reshaped = omega_sigma_drops.reshape((num_snps, 1, 1))
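    # The (M, 1, 1) shape lets the boolean drop mask broadcast against the (M, P, P)
    # omega and sigma stacks in the np.where() calls below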
# Run the MAMA method
# Use identity matrices for "bad" SNPs to allow vectorized operations without having to copy
logging.info("\n\nRunning main MAMA method.")
new_betas, new_beta_ses = run_mama_method(beta_arr,
np.where(os_drops_reshaped,
np.identity(omega.shape[1]), omega),
np.where(os_drops_reshaped,
np.identity(sigma.shape[1]), sigma))
# Copy values back to the summary statistics DataFrames
# Also, perform any remaining calculations / formatting
logging.info("\nPreparing results for output.\n")
final_snp_count = 0
for pop_num, ((ancestry, phenotype), ss_df) in enumerate(sumstats.items()):
logging.info("\tPopulation %s: %s", pop_num, (ancestry, phenotype))
new_df_data_list = [(CHR_COL, ss_df[CHR_COL]),
(BP_COL, ss_df[BP_COL]), (A1_COL, ss_df[A1_COL]),
(A2_COL, ss_df[A2_COL]), (FREQ_COL, ss_df[FREQ_COL])]
# Update the betas and standard errors
new_df_data_list.append((BETA_COL, new_betas[:, pop_num]))
new_df_data_list.append((SE_COL, new_beta_ses[:, pop_num]))
# Calculate Z score
z_scores = calculate_z(new_betas[:, pop_num], new_beta_ses[:, pop_num])
new_df_data_list.append((Z_COL, z_scores))
# Calculate P column
new_df_data_list.append((P_COL, calculate_p(z_scores)))
# Calculate effective N
if N_COL in ss_df.columns:
new_df_data_list.append((N_EFF_COL, calculate_n_eff(pop_num, ss_df[N_COL].to_numpy(),
sigma, new_beta_ses[:, pop_num])))
new_df_data_list.append((ORIGINAL_N_COL_RENAME, ss_df[N_COL]))
# Construct the output dataframe
new_df = pd.DataFrame(data=dict(new_df_data_list), index=ss_df.index)
# Drop SNPs due to omega / sigma, and sort by BP and CHR
new_df.drop(new_df.index[omega_sigma_drops], inplace=True)
new_df.sort_values(by=[CHR_COL, BP_COL], inplace=True)
final_snp_count = len(new_df)
# Replace the old dataframe with the new one
sumstats[(ancestry, phenotype)] = new_df
# Report mean chi squared
mean_chi_2 = calculate_mean_chi_sq(new_df[Z_COL].to_numpy())
logging.info("\t\tMean Chi^2 for %s = %s", (ancestry, phenotype), mean_chi_2)
# If using a standardized units model, convert back from stdized units here
if std_units:
logging.debug("\t\tConverting %s from standardized units", (ancestry, phenotype))
conversion_factor_col = np.reciprocal(np.sqrt(
2.0 * new_df[FREQ_COL] * (1.0 - new_df[FREQ_COL])))
new_df[BETA_COL] = new_df[BETA_COL] * conversion_factor_col
new_df[SE_COL] = new_df[SE_COL] * conversion_factor_col
logging.info("\nFinal SNP count = %s", final_snp_count)
return sumstats
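

#################################
# Minimal usage sketch (hypothetical paths and population IDs, assuming the summary
# statistics files follow the MAMA_RE_EXPR_MAP naming conventions and the LD score file
# carries "EUR_EUR", "EUR_EAS", and "EAS_EAS" pair columns plus a SNP column):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    example_sumstats = {
        ("EUR", "height"): "eur_height_sumstats.txt",  # hypothetical input file
        ("EAS", "height"): "eas_height_sumstats.txt",  # hypothetical input file
    }

    # Run the full pipeline and write one result file per population
    results = mama_pipeline(example_sumstats, ["eur_eas_ldscores.txt"])  # hypothetical file
    for (anc, phen), res_df in results.items():
        write_sumstats_to_file(f"{anc}_{phen}.res", res_df)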