filter input vcf for missingness

mishaploid · Dec 7, 2021 · b957450 · b957450
1 parent 635c691
commit b957450
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 15 deletions.
diff --git a/Snakefile b/Snakefile
@@ -30,6 +30,7 @@ CHR = ['DCARv3_Chr' + str(n) for n in range(1,10)]
 dist_ind = pd.read_table(config['distinguished_inds'])
 
 # dist_ind = dist_ind[dist_ind.Population != 'Improved_Cultivar']
+# dist_ind = dist_ind[(dist_ind.Population == 'Landrace_A') | (dist_ind.Population == 'Landrace_B')]
 
 dist_list = dist_ind.groupby('Population')['Sample_ID'].apply(lambda x: x.tolist())
 
@@ -104,9 +105,16 @@ def pop_choose(wildcards):
 # 'Eastern_Cultivar':['Eastern_Cultivated', 'Cultivar'],
 # 'Western_Cultivar':['Western_Cultivated', 'Cultivar']}
 
-pop_pair_dict = {'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
-'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'],
-'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']}
+pop_pair_dict = {'LandraceA_LandraceB':['Landrace_A', 'Landrace_B'],
+'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
+'LandraceA_Wild':['Landrace_A', 'Wild'],
+'LandraceA_EarlyCultivar':['Landrace_A', 'Early_Cultivar'],
+'LandraceA_ImprovedCultivar':['Landrace_A', 'Improved_Cultivar'],
+'EarlyCultivar_ImprovedCultivar':['Early_Cultivar', 'Improved_Cultivar'],
+'EarlyCultivar_Wild':['Early_Cultivar', 'Wild'],
+'ImprovedCultivar_Wild':['Improved_Cultivar', 'Wild']}
+# 'LandraceA_LAwild':['Landrace_A', 'LandraceAWild'],
+# 'LandraceB_LAwild':['Landrace_B', 'LandraceAWild']}
 
 # def pop_pair_choose(wildcards):
 # 	list = popdict[wildcards.pop_pair]
@@ -151,7 +159,7 @@ smc_split_input_files21 = [expand('models/smc_split/input/{pop_pair}_21.{disting
 
 def smc_split_input(wildcards):
     pops = pop_pair_dict[wildcards.pop_pair]
-    models = expand("models/smc_cv/{population}/model.final.json", population = pops)
+    models = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = pops)
     pop1_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[0], distinguished_ind=dist_dict[pops[0]], chr=CHR)
     pop2_input = expand("models/smc/input/{population}.{distinguished_ind}.{chr}.smc.gz", population=pops[1], distinguished_ind=dist_dict[pops[1]], chr=CHR)
     joint_input12 = expand("models/smc_split/input/{pop_pair}_12.{distinguished_ind}.{chr}.smc.gz", pop_pair = wildcards.pop_pair, distinguished_ind=dist_dict[pops[0]], chr = CHR)
@@ -168,6 +176,7 @@ rule all:
 	input:
 		vcf2smc = smc_input_files,
 		# smc_cv = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys()),
+		smc_estimate = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = dist_dict.keys()),
 		# plot_estimate = "reports/smc_cv_no_timepoints_results.png",
         # smc_bootstrap = [expand('models/smc/bootstrap_input/{population}/{distinguished_ind}_{n_bootstrap}/bootstrap_{chr}.smc.gz',
         # population = key, distinguished_ind = value, chr = CHR)

diff --git a/config.yaml b/config.yaml
@@ -14,8 +14,10 @@ distinguished_inds: data/distinguished_individuals_admix10.txt
 
 # VCF file & index
 # can create index by using tabix -p vcf yourfile.vcf.gz
-vcf: data/kevinG.d3.recode.vcf.gz
-vcf_index: data/kevinG.d3.recode.vcf.gz.tbi
+# vcf: data/kevinG.d3.recode.vcf.gz
+# vcf_index: data/kevinG.d3.recode.vcf.gz.tbi
+vcf: data/kevinG.d3.recode.missing10.vcf.gz
+vcf_index: data/kevinG.d3.recode.missing10.vcf.gz.tbi
 
 # mask file to exclude poorly mapped regions (BED format)
 # should be a text file with a column for CHR, start, & stop positions

diff --git a/rules/01-population_size_history.smk b/rules/01-population_size_history.smk
@@ -40,24 +40,42 @@ rule vcf2smc:
 #   cv method uses cross-validation to obtain model parameters
 ################################################################################
 
-rule smc_cv:
+# rule smc_cv:
+#     input:
+#     	smc_cv_input
+#     output:
+#     	cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']),
+#         final_model = "models/smc_cv_no_timepoints/{population}/model.final.json"
+#     threads: 25
+#     params:
+#     	model_in = "models/smc/input/{population}.*",
+#     	model_out_dir = "models/smc_cv_no_timepoints/{population}",
+#     	mu = config['mu']
+#     singularity:
+#         "docker://terhorst/smcpp:latest"
+#     shell:
+#     	"smc++ cv \
+#         --cores {threads} \
+#         --spline cubic \
+#         --ftol 0.001 \
+#     	-o {params.model_out_dir} {params.mu} {params.model_in}"
+
+rule smc_estimate:
     input:
     	smc_cv_input
     output:
-    	cv_folds = expand("models/smc_cv_no_timepoints/{{population}}/fold{fold}/model.final.json", fold = ['0','1']),
-        final_model = "models/smc_cv_no_timepoints/{population}/model.final.json"
+        final_model = "models/smc_estimate_no_timepoints/{population}/model.final.json"
     threads: 25
     params:
     	model_in = "models/smc/input/{population}.*",
-    	model_out_dir = "models/smc_cv_no_timepoints/{population}",
+    	model_out_dir = "models/smc_estimate_no_timepoints/{population}",
     	mu = config['mu']
     singularity:
         "docker://terhorst/smcpp:latest"
     shell:
-    	"smc++ cv \
+    	"smc++ estimate \
         --cores {threads} \
         --spline cubic \
-        --ftol 0.001 \
     	-o {params.model_out_dir} {params.mu} {params.model_in}"
 
 ################################################################################
@@ -69,9 +87,9 @@ rule smc_cv:
 
 rule plot_estimate:
     input:
-        smc_out = expand("models/smc_cv_no_timepoints/{population}/model.final.json", population = popdict.keys())
+        smc_out = expand("models/smc_estimate_no_timepoints/{population}/model.final.json", population = popdict.keys())
     output:
-        "reports/smc_cv_no_timepoints_results.png"
+        "reports/smc_estimate_no_timepoints_results.png"
     params:
         gen = config['gen']
     singularity:

diff --git a/rules/02-population_splits.smk b/rules/02-population_splits.smk
@@ -53,6 +53,7 @@ rule smc_split:
         "docker://terhorst/smcpp:latest"
     shell:
         "smc++ split \
+        --cores 2 \
         -o {params.model_out_dir} \
         {input}"
 

diff --git a/submit.json b/submit.json
@@ -16,6 +16,11 @@
       "mem" : "260G",
       "p" : "bigmemm"
     },
+    "smc_estimate" :
+    {
+      "mem" : "200G",
+      "p" : "med2"
+    },
     "plot_estimate" :
     {
       "mem" : "1G",
@@ -43,7 +48,7 @@
     },
     "smc_split" :
     {
-      "mem" : "600G",
+      "mem" : "250G",
       "p" : "bigmemm"
     },
     "plot_split" :