From 8b17b4ec87fde11f8d216c049c08d1420cd25de6 Mon Sep 17 00:00:00 2001 From: Omer Weissbrod Date: Mon, 9 Dec 2024 22:59:06 +0200 Subject: [PATCH] More robust conversion of chromosome numbers to int --- ldsc_polyfun/jackknife.py | 2 +- ldsc_polyfun/parse.py | 12 ++++++++++-- ldsc_polyfun/sumstats.py | 3 ++- polyfun.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ldsc_polyfun/jackknife.py b/ldsc_polyfun/jackknife.py index 1671d3f..9656e48 100644 --- a/ldsc_polyfun/jackknife.py +++ b/ldsc_polyfun/jackknife.py @@ -574,7 +574,7 @@ def __init__(self, x, y, n_blocks=None, separators=None, chr_num=None, verbose=T num_lambdas=100, approx_ridge=False, ridge_lambda=None, use_1se=False, has_intercept=False, standardize=True, skip_ridge_jackknife=True, num_chr_sets=2, num_chr=22): - + #sanity checks assert chr_num is not None # # # chr_num[:100000]=1 diff --git a/ldsc_polyfun/parse.py b/ldsc_polyfun/parse.py index 2577e71..2e8a6c6 100644 --- a/ldsc_polyfun/parse.py +++ b/ldsc_polyfun/parse.py @@ -32,12 +32,20 @@ def read_csv(fh, **kwargs): return df def set_snpid_index(df): + + def float_to_int(c): + try: + c = int(c) + except ValueError: + pass + return c + df['A1_first'] = (df['A1'] < df['A2']) | (df['A1'].str.len()>1) | (df['A2'].str.len()>1) df['A1s'] = df['A2'].copy() df.loc[df['A1_first'], 'A1s'] = df.loc[df['A1_first'], 'A1'].copy() df['A2s'] = df['A1'].copy() df.loc[df['A1_first'], 'A2s'] = df.loc[df['A1_first'], 'A2'].copy() - s_chr = df['CHR'].map(lambda c: int(c) if str(c)[0] in ['0','1','2','3','4','5,','6','7','8','9'] else c).astype(str) + s_chr = df['CHR'].map(float_to_int).astype(str) s_bp = df['BP'].astype(int).astype(str) df.index = s_chr + '.' + s_bp + '.' + df['A1s'] + '.' + df['A2s'] df.index.name = 'snpid' @@ -116,7 +124,7 @@ def sumstats(fh, alleles=True, dropna=True): if dropna: x = x.dropna(how='any') - x = set_snpid_index(x) + x = set_snpid_index(x) x.drop(columns=['CHR', 'BP'], inplace=True) diff --git a/ldsc_polyfun/sumstats.py b/ldsc_polyfun/sumstats.py index 0a15206..f8793d6 100644 --- a/ldsc_polyfun/sumstats.py +++ b/ldsc_polyfun/sumstats.py @@ -236,7 +236,7 @@ def _print_part_delete_values(ldscore_reg, ofh, log): def _merge_and_log(ld, sumstats, noun, log): '''Wrap smart merge with log messages about # of SNPs.''' - sumstats = smart_merge(ld, sumstats) + sumstats = smart_merge(ld, sumstats) msg = 'After merging with {F}, {N} SNPs remain.' if len(sumstats) == 0: msg += ' Please make sure that your annotation files include the SNPs in your sumstats files (please see the PolyFun wiki for details on downloading functional annotations)' @@ -275,6 +275,7 @@ def _read_ld_sumstats(args, log, fh, alleles=True, dropna=True): M_annot, ref_ld, novar_cols = _check_variance(log, M_annot, ref_ld) w_ld = _read_w_ld(args, log) + sumstats = _merge_and_log(ref_ld, sumstats, 'reference panel LD', log) sumstats = _merge_and_log(sumstats, w_ld, 'regression SNP LD', log) w_ld_cname = sumstats.columns[-1] diff --git a/polyfun.py b/polyfun.py index e17f432..130113f 100644 --- a/polyfun.py +++ b/polyfun.py @@ -188,7 +188,7 @@ def run_ldsc(self, args, use_ridge, nn, keep_large, evenodd_split, n_blocks=2): df_sumstats = pd.read_table(args.sumstats, sep='\s+') ###merge everything together... - #prepare LD-scores for S-LDSC run + #prepare LD-scores for S-LDSC run ref_ld = np.array(df_sumstats[ref_ld_cnames], dtype=np.float32) sumstats._check_ld_condnum(args, log, ref_ld_cnames) if df_sumstats.shape[0] < 200000: