HighlanderLab · gmafrafortuna · Nov 20, 2023 · Apr 12, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml
@@ -0,0 +1,53 @@
+PROJECT: "global_cattle"
+work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence"
+species: "Bos taurus"
+ploidy: 2
+no_chromosomes: 29
+start_chromosome: 1
+end_chromosome: 29
+vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global"
+genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2"
+reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa"
+ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt"
+meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv"
+phase_only: False
+remove_indel: True
+missing_data: "0.2"
+truncate_upper: "0.4"
+truncate_lower: "0.2"
+recombination_rate_tree: False # "1e-8"
+recombination_rate_anc: False
+mismatch_ratio_anc: False
+mismatch_ratio_tree: False # "1"
+mutation_rate: "1e-8"
+generation_interval: 5
+chromosome_length:
+  1 : 158534110
+  2 : 136231102
+  3 : 121005158
+  4 : 120000601
+  5 : 120089316
+  6 : 117806340
+  7 : 110682743
+  8 : 113319770
+  9 : 105454467
+  10 : 103308737
+  11 : 106982474
+  12 : 87216183
+  13 : 83472345
+  14 : 82403003
+  15 : 85007780
+  16 : 81013979
+  17 : 73167244
+  18 : 65820629
+  19 : 63449741
+  20 : 71974595
+  21 : 69862954
+  22 : 60773035
+  23 : 52498615
+  24 : 62317253
+  25 : 42350435
+  26 : 51992305
+  27 : 45612108
+  28 : 45940150
+  29 : 51098607
diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile
@@ -11,17 +11,13 @@ from os.path import isfile, join
 
 
 ##### setup report #####
-report: "reports/tsinfer.rst"
-
-# specify config files on the command line using --configfile
 #configfile: '../config/ancestral_Eddie.yaml'
-#configfile: '../config/tsinfer_Eddie.yaml'
-
+configfile: '../config/glob_cattle.yaml'
+report: "reports/tsinfer.rst"
 
 ##### setup singularity #####
 # Can we set conda env here???
 
-
 ##### Define some variables #####
 # all files should be in the vcfDir/RawVCF folder
 # files that need spliting are named: Combined_something.vcf.gz
@@ -30,47 +26,45 @@ report: "reports/tsinfer.rst"
 # (2) sort into the correct Chr folder in vcfDir.
 
 # defining variables that will be used throughout the script
-
-# the final directory name is now editable -- so can be named according to specific project
-# mapdir indicates where to find the genetic map for phasing (full path in the config)
-mapdir = config['genmap']
-oDir = Path(config['o_dir'], config['PROJECT'])
-
-vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process
-# the path where to save intermetdiate and final VCFs
-if config['process_vcf_in_original_dir']:
-    vcfOut = config['vcf_dir'] # same as above
-else:
-    vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir
-
-
+vcfdir=config['vcf_dir'] # from config file, the path to vcf
 # this folder can be any where but must have the following structure:
 # vcfDir:
 #   - RawVCF
 #   - Chr1
 #   -Chr...
 
+# the final directory name is now editable -- so can be named according to specific project
+# mapdir indicates where to find the genetic map for phasing (full path in the config)
+Project = config['PROJECT']
+projdir = config['work_dir'] + '/' + Project
+mapdir = config['genmap']
 
-# if want to split multiallelic sites set to TRUE:
-#multiallelic = False
-
-chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)]
+chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)]
 # set a list of 'Chr' + chr# combinations to be used as wildcards
 
+wildcard_constraints:
+    chromosome = r'chr\d+'
+# constraining the wildcard to avoid errors 
+
 ##### load rules #####
-include: "rules/ancestral_inference.smk"
+#include: "rules/ancestral_inference.smk"
 include: "rules/split.smk"
+include: "rules/filter.smk"
 include: "rules/merge_samples_vcf.smk"
 include: "rules/phasing.smk"
-include: "rules/prepare_files_for_tsinfer.smk"
-#include: "rules/multiallelic.smk"
-include: "rules/infer_trees.smk"
+include: "rules/ancestral_info_to_vcf.smk"
+include: "rules/new_infer_trees.smk"
+
+##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] #####
+if not config['phase_only']:
+    rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes)
+else:
+    rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes)
+
 
 ##### target rules #####
 # At the end it should generate a phased vcf file that combines all files for
 # each chromosome. These files are stored in the vcf folder declaired in the
 # configuration file. It also checks that vcfs have been filtered before merged
 rule all:
-    input:
-        expand(f'{oDir}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes)
-        # tree inference
+    input: rule_all_input
diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json
@@ -1,55 +1,55 @@
 {
  "__default__": 
  {
-   "mem": "64G",
+   "mem": "8G",
    "threads": "1",
    "time": "10:00:00"
  },
- "rename":
+ "compare_and_filter_vcfs":
  {
-  "mem": "64G",
-  "threads": "1",
-  "time": "03:00:00"
+  "mem": "16G",
+  "threads": "8",
+  "time": "20:00:00"
+ },
+ "merge":
+ {
+  "mem": "16G",
+  "threads": "8",
+  "time": "20:00:00"
+ },
+ "remove_missing_indels":
+ {
+  "mem": "16G",
+  "threads": "16",
+  "time": "05:00:00"
  },
  "phase":
-  {
-    "mem": "350G",
-    "threads": "25",
-    "time": "336:00:00"
-  },
- "rename_phased":
   {
     "mem": "64G",
-    "threads": "1",
-    "time": "03:00:00"
+    "threads": "32",
+    "time": "240:00:00"
   },
-  "get_af":
+  "get_vcf_positions":
   {
-    "mem": "64G",
-    "threads": "1",
-    "time": "03:00:00"
-  },
-  "get_major":
-  {
-    "mem": "64G",
-    "threads": "1",
-    "time": "03:00:00"
+    "mem": "16G",
+    "threads": "32",
+    "time": "05:00:00"
   },
-  "extract_ancestral_chromosome":
+  "get_ancestral_alleles":
   {
-    "mem": "64G",
+    "mem": "32G",
     "threads": "1",
     "time": "02:00:00"
   },
-  "join_major_ancestral":
+  "update_regions_file":
   {
-    "mem": "64G",
+    "mem": "32G",
     "threads": "1",
     "time": "02:00:00"
   },
-  "determine_ancestral_major":
+  "add_info_to_vcf":
   {
-    "mem": "64G",
+    "mem": "32G",
     "threads": "1",
     "time": "04:00:00"
   },
@@ -59,118 +59,40 @@
     "threads": "1",
     "time": "03:00:00"
   },
-  "sort_ancestral_to_vcf":
-  {
-    "mem": "64G",
-    "threads": "1",
-    "time": "03:00:00"
-  },
-  "decompress":
-  {
-    "mem": "64G",
-    "threads": "1",
-    "time": "03:00:00"
-  },
-  "change_infoAA_vcf":
-  {
-    "mem": "128G",
-    "threads": "1",
-    "time": "05:00:00"
-  },
   "prepare_sample_file":
-  {
-    "mem": "128G",
-    "threads": "1",
-    "time": "10:00:00"
-  },
-  "infer":
-  {
-    "mem": "128G",
-    "threads": "1",
-    "time": "40:00:00"
-  },
- "extract_aligned_alleles_from_vcf":
- {
-  "mem": "32G",
-  "threads": "1",
-  "time": "03:00:00"
- },
- "extract_snps_from_info":
   {
     "mem": "32G",
-    "threads": "1",
-    "time": "03:00:00"
+    "threads": "16",
+    "time": "336:00:00"
   },
-  "extract_pos_aligned_alleles_from_vcf":
+  "generate_ancestors":
   {
     "mem": "32G",
-    "threads": "1",
-    "time": "03:00:00"
-  },
-  "extract_vcf_alleles_from_aligned":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time" : "03:00:00"
-  },
-  "create_estsfs_dicts":
-  {
-    "mem": "16G",
-    "threads": 1",
-    "time" : "05:00:00"
-  },
-  "edit_estsfs_dicts":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "01:00:00"
-  },
-  "combine_estsfs_dicts":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
+    "threads": "16",
+    "time": "336:00:00"
   },
-  "run_estsfs":
+  "truncate_ancestors":
   {
     "mem": "32G",
-    "threads": "1",
-    "time": "20:00:00"
-  },
-  "extract_major":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
-  },
-  "extract_minor":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
-  },
-  "extract_major_outgroup":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
+    "threads": "16",
+    "time": "336:00:00"
   },
-  "extract_estsfs_prob":
+  "ancestors_trees":
   {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
+    "mem": "32G",
+    "threads": "16",
+    "time": "336:00:00"
   },
-  "determine_ancestral":
+  "infer_trees":
   {
-    "mem": "16G",
-    "threads": "1",
-    "time": "02:00:00"
+    "mem": "32G",
+    "threads": "16",
+    "time": "336:00:00"
   },
-  "move_and_rename_aa":
-  {
-    "mem": "16G",
-    "threads": "1",
-    "time": "05:00:00"
-  }
+ "simplify_trees":
+ {
+  "mem": "32G",
+  "threads": "16",
+  "time": "03:00:00"
+ }
 }
diff --git a/Snakemake/workflow/profile/config.yaml b/Snakemake/workflow/profile/config.yaml
@@ -3,10 +3,8 @@ use-conda: true
 jobscript: "jobscript.sh"
 cluster-config: "cluster.json"
 jobname: "{rule}.{jobid}"
-jobs: 16
-#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab"
+drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y"
 rerun-incomplete: true
-rerun-triggers: mtime
-cores: 16 # how many jobs you want to submit to your cluster queue
-local-cores: 1
+rerun-triggers: [mtime, input]
+local-cores: 99
 keep-going: false