-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwithin_family_simulation.py
139 lines (115 loc) · 5.85 KB
/
within_family_simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Purpose: To simulate expected educational attainment gains from embryo selection between families.
Date: 10/09/2019
"""
import numpy as np
import pandas as pd
from scipy.stats import norm
from between_family_ea_simulation import (
get_random_index,
get_max_pgs_index,
select_embryos_by_index,
calc_phenotype_diffs
)
from scipy.stats import norm
import argparse
def calc_within_family_values(n, num_embryos, heritability, correlation_mz, rsquared):
"""
Purpose: To get the ghat_i and y_i for each family pair, where i={1,...,num_embryos}.
Arguments:
n: integer number of parent pairs
num_embryos: integer number of embryos for each parent pair
heritability: heritability of clinical trait
correlation_mz: twin correlation of clinical trait
rsquared: Ancestry-specific R^2 value for PGS prediction of trait
Returns:
{'pgs':df_pgs, 'liability':df_liability}.
Each dataframe has size (n x num_embryos) and holds polygenic scores and phenotype liability values, respectively.
"""
df_a = pd.DataFrame()
df_pgs = pd.DataFrame()
df_liability = pd.DataFrame()
# calculate average parent additive component
a_mean = np.random.normal(loc=0, scale=(heritability/2)**0.5, size=int(n))
# calculate shared environmental component (constant within-family)
c = np.random.normal(loc=0, scale=(correlation_mz - heritability)**0.5, size=int(n))
# calculate liability and PGS for clinical trait, for each embryo
for i in range(num_embryos):
# generate individual embryo additive components
df_a['a_{}'.format(i)] = np.random.normal(loc=a_mean, scale=(heritability/2)**0.5, size=int(n))
# calculate PGS
df_pgs['ghat_{}'.format(i)] = np.random.normal(loc=df_a['a_{}'.format(i)], \
scale=(heritability**2/rsquared - heritability)**0.5, size=int(n))
# calculate phenotypic liability
df_liability['liability_{}'.format(i)] = np.random.normal(loc=(df_a['a_{}'.format(i)] + c), \
scale=(1 - correlation_mz)**0.5, size=int(n))
return {'pgs':df_pgs, 'liability':df_liability}
def process_arguments():
"""
Parses command line arguments.
Args:
-----
None
Returns:
--------
parser: :class: argparse.Namespace
arguments passed in from command line
"""
parser = argparse.ArgumentParser()
parser.add_argument('--n', default=1000000, type=int,
help='Number of parent pairs to simulate. Defaults to 1e6.')
parser.add_argument('--embryos', default=10, type=int,
help='Number of embryos from which to choose. Defaults to 10.')
parser.add_argument('--ancestry', required=True, type=str,
help='Ancestry of interest.')
parser.add_argument('--heritability', required=True, type=float, nargs='+',
help='List of heritabilities on the liability scale for conditions. \
Index of heritabilities must match index of rsquared, correlation_mz, and prevalence.')
parser.add_argument('--rsquared', required=True, type=float, nargs='+',
help='List of r2 for conditions. Index of r2 must match index of \
heritability, correlation_mz, and prevalence.')
parser.add_argument('--correlation_mz', required=True, type=float, nargs='+',
help='List of monozygotic twin correlations on the liability scale for conditions. \
Index of correlations must match index of rsquared, heritabilities, and prevalence.')
parser.add_argument('--prevalence', required=True, type=float, nargs='+',
help='List of prevalences for conditions. \
Index of prevalences must match index of rsquared, heritabilities, and correlations.')
parser.add_argument('--condition', required=True, type=str, nargs='+',
help='Name of conditions. \
Index must match index of rsquared, heritabilities, and correlations.')
return parser.parse_args()
if __name__ == "__main__":
# import arguments
args = process_arguments()
N = args.n
NUM_EMBRYOS = args.embryos
HERITABILITY = args.heritability # list
RSQUARED = args.rsquared # list
CORR_MZ = args.correlation_mz # list
PREVALENCE = args.prevalence # list
CONDITION = args.condition # list
# check lengths
assert len(HERITABILITY) == len(RSQUARED), 'Your lists aren\'t the same length!'
assert len(HERITABILITY) == len(CORR_MZ), 'Your lists aren\'t the same length!'
assert len(HERITABILITY) == len(PREVALENCE), 'Your lists aren\'t the same length!'
assert len(HERITABILITY) == len(CONDITION), 'Your lists aren\'t the same length!'
# print intro
print('This is a simulation for within-family selection of embryos.')
print('This analysis is for parents of ' + args.ancestry + ' ancestry who are choosing from ' + str(NUM_EMBRYOS) + ' embryos.')
#### begin simualation
for i in range(len(args.condition)):
# calculate values
values = calc_within_family_values(n=N, num_embryos=NUM_EMBRYOS,
heritability=HERITABILITY[i], correlation_mz=CORR_MZ[i], rsquared=RSQUARED[i])
# generate indices
rand_index = get_random_index(NUM_EMBRYOS, N)
max_index = get_max_pgs_index(values['pgs'])
# get max/random liability values
max_liability = select_embryos_by_index(values['liability'], max_index)
rand_liability = select_embryos_by_index(values['liability'], rand_index)
# get fraction of individuals who will have disease (convert liability to binary trait)
max_frac = (max_liability <= norm.ppf(PREVALENCE[i])).astype(int).mean()
rand_frac = (rand_liability <= norm.ppf(PREVALENCE[i])).astype(int).mean()
# print summary
print('For ' + CONDITION[i] + ', the within-family prevalence is ' + str(rand_frac['selected_values']) + \
' in random embryos and ' + str(max_frac['selected_values']) + ' in selected embryos.')