-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpcgc_r2.py
149 lines (119 loc) · 6.68 KB
/
pcgc_r2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import numpy as np; np.set_printoptions(precision=3, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import os
import time
import sys
import ldsc_parse as ps
import logging
import ldscore_r2
import pcgc_utils
import imp
def compute_r2_prod(args):
# read bim/snp
array_snps = ps.PlinkBIMFile(args.bfile+'.bim')
snpnames = array_snps.df['SNP']
#read annotations file
if args.annot is None and args.annot_chr is None:
df_annotations = pd.DataFrame(np.ones(snpnames.shape[0]), index=snpnames, columns=['base'])
else:
df_annotations = pcgc_utils.load_dfs(args.annot, args.annot_chr, 'annot.gz', 'annot', 'annot', join_axis=0, index_col='SNP')
df_annotations.drop(columns=['CHR', 'CM', 'BP'], inplace=True)
#apply min_annot correction to ensure no negative values
category_names = df_annotations.columns
if args.sync is None:
raise ValueError('--annot must be used together with --sync')
df_sync = pd.read_table(args.sync+'sync', index_col='Category')
if df_sync.shape[0] != len(category_names) or not np.all(df_sync.index == category_names):
raise ValueError('Annotations in sync file do not match those in annotations/prodr2 files')
min_annot = df_sync['min_annot'].values
df_annotations -= min_annot
#mark which SNPs to keep
is_good_snp = np.ones(len(snpnames), dtype=bool)
if args.exclude is not None:
df_exclude = pd.read_table(args.exclude, squeeze=True, header=None)
is_good_snp = is_good_snp & (~snpnames.isin(df_exclude))
logging.info('Excluding %d SNPs'%(np.sum(~snpnames.isin(df_exclude))))
if args.extract is not None:
df_extract = pd.read_table(args.extract, squeeze=True, header=None)
is_good_snp = is_good_snp & (snpnames.isin(df_extract))
logging.info('Extracting %d SNPs'%(np.sum(snpnames.isin(df_extract))))
if np.sum(is_good_snp) == 0:
raise ValueError('No SNPs selected')
if args.ld_all:
keep_snps = None
is_r2_snp = is_good_snp
else:
keep_snps = np.where(is_good_snp)[0]
is_r2_snp = np.ones(len(keep_snps), dtype=bool)
snpnames = snpnames.iloc[keep_snps]
#keep only annotations of SNPs in plink file
if not snpnames.isin(df_annotations.index).all():
raise ValueError('not all SNPs have annotations')
df_annotations = df_annotations.loc[snpnames]
logging.info('Computing r^2 products for %d SNPs'%(len(snpnames)))
#find #individuals in bfile
df_fam = pd.read_table(args.bfile+'.fam', header=None)
n = df_fam.shape[0]
#read plink file
keep_indivs = None
mafMin = None
imp.reload(ldscore_r2)
logging.info('Loading SNP file...')
geno_array = ldscore_r2.PlinkBEDFile(args.bfile+'.bed', n, array_snps, is_r2_snp, keep_snps=keep_snps,
keep_indivs=keep_indivs, mafMin=mafMin)
#df_annotations = df_annotations.iloc[geno_array.kept_snps]
assert (df_annotations.index == snpnames[geno_array.kept_snps]).all()
# determine block widths
num_wind_args = np.array((args.ld_wind_snps, args.ld_wind_kb, args.ld_wind_cm), dtype=bool)
if np.sum(num_wind_args) != 1:
raise ValueError('Must specify exactly one --ld-wind option')
if args.ld_wind_snps:
max_dist = args.ld_wind_snps
coords = np.array(list(range(geno_array.m)))
elif args.ld_wind_kb:
max_dist = args.ld_wind_kb*1000
coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
if len(np.unique(coords)) == 1:
raise ValueError('bim file has no basepair data --- please use a different ld-wind option')
elif args.ld_wind_cm:
max_dist = args.ld_wind_cm
coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
if len(np.unique(coords)) == 1:
raise ValueError('bim file has no CM data --- please use a different ld-wind option')
#compute r2_prod_table
logging.info('Computing r2 prod...')
block_left = ldscore_r2.getBlockLefts(coords, max_dist)
if block_left[len(block_left)-1] == 0:
error_msg = 'Only a single block selected - this is probably a mistake'
raise ValueError(error_msg)
t0 = time.time()
geno_array._currentSNP = 0
r2prod_table = geno_array.ldScoreVarBlocks(block_left, args.chunk_size, annot=df_annotations.values)
df_r2prod_table = pd.DataFrame(r2prod_table, index=df_annotations.columns, columns=df_annotations.columns)
return df_r2prod_table
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--bfile', required=True, help='plink file')
parser.add_argument('--annot', default=None, help='prefix of annotations file')
parser.add_argument('--annot-chr', default=None, help='Multi-chromosome prefix of annotations file')
parser.add_argument('--sync', default=None, help='Prefix of pcgc sync file, created by pcgc_sync.py')
parser.add_argument('--extract', default=None, help='Name of files with names of SNPs to use. Other SNPs will not be used')
parser.add_argument('--exclude', default=None, help='Name of files with names of SNPs to exclude')
parser.add_argument('--out', required=True, help='output file prefix')
parser.add_argument('--ld-wind-cm', type=float, default=None, help='window size to be used for estimating r2 products in units of centiMorgans (cM).')
parser.add_argument('--ld-wind-kb', type=int, default=None, help='window size to be used for estimating r2 products in units of Kb.')
parser.add_argument('--ld-wind-snps', type=int, default=None, help='window size to be used for estimating r2 products in units of SNPs.')
parser.add_argument('--chunk-size', type=int, default=50, help='chunk size for r2 product calculation')
parser.add_argument('--ld-all', default=False, action='store_true', help='If this is set, the script will compute sum of r^2 between extracted SNPs and all SNPs')
args = parser.parse_args()
#Use default ld-wind parameter if none specified
if args.ld_wind_cm is None and args.ld_wind_kb is None and args.ld_wind_snps is None:
args.ld_wind_cm = 1.0
logging.warning('no ld-wind argument specified so I''m using --ld-cm 1.0')
#check that the output directory exists
if os.path.isabs(args.out) and not os.path.exists(os.path.dirname(args.out)):
raise ValueError('output directory %s doesn''t exist'%(os.path.dirname(args.out)))
pcgc_utils.configure_logger(args.out)
df_prod_r2 = compute_r2_prod(args)
df_prod_r2.to_csv(args.out+'.prodr2', sep='\t', float_format='%0.5e')