diff --git a/Snakefile b/Snakefile index f820ce3..b9cc845 100644 --- a/Snakefile +++ b/Snakefile @@ -30,6 +30,7 @@ CHR = ['DCARv3_Chr' + str(n) for n in range(1,10)] dist_ind = pd.read_table(config['distinguished_inds']) # dist_ind = dist_ind[dist_ind.Population != 'Improved_Cultivar'] +# dist_ind = dist_ind[(dist_ind.Population == 'Landrace_A') | (dist_ind.Population == 'Landrace_B')] dist_list = dist_ind.groupby('Population')['Sample_ID'].apply(lambda x: x.tolist()) @@ -104,9 +105,16 @@ def pop_choose(wildcards): # 'Eastern_Cultivar':['Eastern_Cultivated', 'Cultivar'], # 'Western_Cultivar':['Western_Cultivated', 'Cultivar']} -pop_pair_dict = {'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'], -'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'], -'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']} +pop_pair_dict = {'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'], +'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'], +'LandraceA_Wild':['Landrace_A', 'Wild'], +'LandraceA_EarlyCultivar':['Landrace_A', 'Early_Cultivar'], +'LandraceA_ImprovedCultivar':['Landrace_A', 'Improved_Cultivar'], +'EarlyCultivar_ImprovedCultivar':['Early_Cultivar', 'Improved_Cultivar'], +'EarlyCultivar_Wild':['Early_Cultivar', 'Wild'], +'ImprovedCultivar_Wild':['Improved_Cultivar', 'Wild']} +# 'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'], +# 'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']} # def pop_pair_choose(wildcards): # list = popdict[wildcards.pop_pair] @@ -151,7 +159,7 @@ smc_split_input_files21 = [expand('models/smc_split/input/{pop_pair}_21.{disting def smc_split_input(wildcards): pops = pop_pair_dict[wildcards.pop_pair] - models = expand("models/smc_cv/{population}/model.final.json", population = pops) + models = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = pops) pop1_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[0], distinguished_ind=dist_dict[pops[0]], chr=CHR) pop2_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[1], distinguished_ind=dist_dict[pops[1]], chr=CHR) joint_input12 = expand("models/smc_split/input/{pop_pair}_12.{distinguished_ind}.{chr}.smc.gz", pop_pair = wildcards.pop_pair, distinguished_ind=dist_dict[pops[0]], chr = CHR) @@ -168,6 +176,7 @@ rule all: input: vcf2smc = smc_input_files, # smc_cv = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys()), + smc_estimate = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = dist_dict.keys()), # plot_estimate = "reports/smc_cv_no_timepoints_results.png", # smc_bootstrap = [expand('models/smc/bootstrap_input/{population}/{distinguished_ind}_{n_bootstrap}/bootstrap_{chr}.smc.gz', # population = key, distinguished_ind = value, chr = CHR) diff --git a/config.yaml b/config.yaml index 55da64e..ad14d88 100644 --- a/config.yaml +++ b/config.yaml @@ -14,8 +14,10 @@ distinguished_inds: data/distinguished_individuals_admix10.txt # VCF file & index # can create index by using tabix -p vcf yourfile.vcf.gz -vcf: data/kevinG.d3.recode.vcf.gz -vcf_index: data/kevinG.d3.recode.vcf.gz.tbi +# vcf: data/kevinG.d3.recode.vcf.gz +# vcf_index: data/kevinG.d3.recode.vcf.gz.tbi +vcf: data/kevinG.d3.recode.missing10.vcf.gz +vcf_index: data/kevinG.d3.recode.missing10.vcf.gz.tbi # mask file to exclude poorly mapped regions (BED format) # should be a text file with a column for CHR, start, & stop positions diff --git a/rules/01-population_size_history.smk b/rules/01-population_size_history.smk index 65dcd90..d7e96cd 100644 --- a/rules/01-population_size_history.smk +++ b/rules/01-population_size_history.smk @@ -40,24 +40,42 @@ rule vcf2smc: # cv method uses cross-validation to obtain model parameters ################################################################################ -rule smc_cv: +# rule smc_cv: +# input: +# smc_cv_input +# output: +# cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']), +# final_model = "models/smc_cv_no_timepoints/{population}/model.final.json" +# threads: 25 +# params: +# model_in = "models/smc/input/{population}.*", +# model_out_dir = "models/smc_cv_no_timepoints/{population}", +# mu = config['mu'] +# singularity: +# "docker://terhorst/smcpp:latest" +# shell: +# "smc++ cv \ +# --cores {threads} \ +# --spline cubic \ +# --ftol 0.001 \ +# -o {params.model_out_dir} {params.mu} {params.model_in}" + +rule smc_estimate: input: smc_cv_input output: - cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']), - final_model = "models/smc_cv_no_timepoints/{population}/model.final.json" + final_model = "models/smc_estimate_no_timepoints/{population}/model.final.json" threads: 25 params: model_in = "models/smc/input/{population}.*", - model_out_dir = "models/smc_cv_no_timepoints/{population}", + model_out_dir = "models/smc_estimate_no_timepoints/{population}", mu = config['mu'] singularity: "docker://terhorst/smcpp:latest" shell: - "smc++ cv \ + "smc++ estimate \ --cores {threads} \ --spline cubic \ - --ftol 0.001 \ -o {params.model_out_dir} {params.mu} {params.model_in}" ################################################################################ @@ -69,9 +87,9 @@ rule smc_cv: rule plot_estimate: input: - smc_out = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys()) + smc_out = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = popdict.keys()) output: - "reports/smc_cv_no_timepoints_results.png" + "reports/smc_estimate_no_timepoints_results.png" params: gen = config['gen'] singularity: diff --git a/rules/02-population_splits.smk b/rules/02-population_splits.smk index e9c3be0..f8d99dd 100644 --- a/rules/02-population_splits.smk +++ b/rules/02-population_splits.smk @@ -53,6 +53,7 @@ rule smc_split: "docker://terhorst/smcpp:latest" shell: "smc++ split \ + --cores 2 \ -o {params.model_out_dir} \ {input}" diff --git a/submit.json b/submit.json index f76e504..5223574 100644 --- a/submit.json +++ b/submit.json @@ -16,6 +16,11 @@ "mem" : "260G", "p" : "bigmemm" }, + "smc_estimate" : + { + "mem" : "200G", + "p" : "med2" + }, "plot_estimate" : { "mem" : "1G", @@ -43,7 +48,7 @@ }, "smc_split" : { - "mem" : "600G", + "mem" : "250G", "p" : "bigmemm" }, "plot_split" :