Skip to content

Commit

Permalink
filter input vcf for missingness
Browse files Browse the repository at this point in the history
  • Loading branch information
Sarah D Turner committed Dec 7, 2021
1 parent 635c691 commit b957450
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 15 deletions.
17 changes: 13 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ CHR = ['DCARv3_Chr' + str(n) for n in range(1,10)]
dist_ind = pd.read_table(config['distinguished_inds'])

# dist_ind = dist_ind[dist_ind.Population != 'Improved_Cultivar']
# dist_ind = dist_ind[(dist_ind.Population == 'Landrace_A') | (dist_ind.Population == 'Landrace_B')]

dist_list = dist_ind.groupby('Population')['Sample_ID'].apply(lambda x: x.tolist())

Expand Down Expand Up @@ -104,9 +105,16 @@ def pop_choose(wildcards):
# 'Eastern_Cultivar':['Eastern_Cultivated', 'Cultivar'],
# 'Western_Cultivar':['Western_Cultivated', 'Cultivar']}

pop_pair_dict = {'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'],
'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']}
pop_pair_dict = {'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'],
'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
'LandraceA_Wild':['Landrace_A', 'Wild'],
'LandraceA_EarlyCultivar':['Landrace_A', 'Early_Cultivar'],
'LandraceA_ImprovedCultivar':['Landrace_A', 'Improved_Cultivar'],
'EarlyCultivar_ImprovedCultivar':['Early_Cultivar', 'Improved_Cultivar'],
'EarlyCultivar_Wild':['Early_Cultivar', 'Wild'],
'ImprovedCultivar_Wild':['Improved_Cultivar', 'Wild']}
# 'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
# 'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']}

# def pop_pair_choose(wildcards):
# list = popdict[wildcards.pop_pair]
Expand Down Expand Up @@ -151,7 +159,7 @@ smc_split_input_files21 = [expand('models/smc_split/input/{pop_pair}_21.{disting

def smc_split_input(wildcards):
pops = pop_pair_dict[wildcards.pop_pair]
models = expand("models/smc_cv/{population}/model.final.json", population = pops)
models = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = pops)
pop1_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[0], distinguished_ind=dist_dict[pops[0]], chr=CHR)
pop2_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[1], distinguished_ind=dist_dict[pops[1]], chr=CHR)
joint_input12 = expand("models/smc_split/input/{pop_pair}_12.{distinguished_ind}.{chr}.smc.gz", pop_pair = wildcards.pop_pair, distinguished_ind=dist_dict[pops[0]], chr = CHR)
Expand All @@ -168,6 +176,7 @@ rule all:
input:
vcf2smc = smc_input_files,
# smc_cv = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys()),
smc_estimate = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = dist_dict.keys()),
# plot_estimate = "reports/smc_cv_no_timepoints_results.png",
# smc_bootstrap = [expand('models/smc/bootstrap_input/{population}/{distinguished_ind}_{n_bootstrap}/bootstrap_{chr}.smc.gz',
# population = key, distinguished_ind = value, chr = CHR)
Expand Down
6 changes: 4 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ distinguished_inds: data/distinguished_individuals_admix10.txt

# VCF file & index
# can create index by using tabix -p vcf yourfile.vcf.gz
vcf: data/kevinG.d3.recode.vcf.gz
vcf_index: data/kevinG.d3.recode.vcf.gz.tbi
# vcf: data/kevinG.d3.recode.vcf.gz
# vcf_index: data/kevinG.d3.recode.vcf.gz.tbi
vcf: data/kevinG.d3.recode.missing10.vcf.gz
vcf_index: data/kevinG.d3.recode.missing10.vcf.gz.tbi

# mask file to exclude poorly mapped regions (BED format)
# should be a text file with a column for CHR, start, & stop positions
Expand Down
34 changes: 26 additions & 8 deletions rules/01-population_size_history.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,42 @@ rule vcf2smc:
# cv method uses cross-validation to obtain model parameters
################################################################################

rule smc_cv:
# rule smc_cv:
# input:
# smc_cv_input
# output:
# cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']),
# final_model = "models/smc_cv_no_timepoints/{population}/model.final.json"
# threads: 25
# params:
# model_in = "models/smc/input/{population}.*",
# model_out_dir = "models/smc_cv_no_timepoints/{population}",
# mu = config['mu']
# singularity:
# "docker://terhorst/smcpp:latest"
# shell:
# "smc++ cv \
# --cores {threads} \
# --spline cubic \
# --ftol 0.001 \
# -o {params.model_out_dir} {params.mu} {params.model_in}"

rule smc_estimate:
input:
smc_cv_input
output:
cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']),
final_model = "models/smc_cv_no_timepoints/{population}/model.final.json"
final_model = "models/smc_estimate_no_timepoints/{population}/model.final.json"
threads: 25
params:
model_in = "models/smc/input/{population}.*",
model_out_dir = "models/smc_cv_no_timepoints/{population}",
model_out_dir = "models/smc_estimate_no_timepoints/{population}",
mu = config['mu']
singularity:
"docker://terhorst/smcpp:latest"
shell:
"smc++ cv \
"smc++ estimate \
--cores {threads} \
--spline cubic \
--ftol 0.001 \
-o {params.model_out_dir} {params.mu} {params.model_in}"

################################################################################
Expand All @@ -69,9 +87,9 @@ rule smc_cv:

rule plot_estimate:
input:
smc_out = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys())
smc_out = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = popdict.keys())
output:
"reports/smc_cv_no_timepoints_results.png"
"reports/smc_estimate_no_timepoints_results.png"
params:
gen = config['gen']
singularity:
Expand Down
1 change: 1 addition & 0 deletions rules/02-population_splits.smk
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ rule smc_split:
"docker://terhorst/smcpp:latest"
shell:
"smc++ split \
--cores 2 \
-o {params.model_out_dir} \
{input}"

Expand Down
7 changes: 6 additions & 1 deletion submit.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
"mem" : "260G",
"p" : "bigmemm"
},
"smc_estimate" :
{
"mem" : "200G",
"p" : "med2"
},
"plot_estimate" :
{
"mem" : "1G",
Expand Down Expand Up @@ -43,7 +48,7 @@
},
"smc_split" :
{
"mem" : "600G",
"mem" : "250G",
"p" : "bigmemm"
},
"plot_split" :
Expand Down

0 comments on commit b957450

Please sign in to comment.