diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml new file mode 100644 index 0000000..7c8a537 --- /dev/null +++ b/Snakemake/config/glob_cattle.yaml @@ -0,0 +1,53 @@ +PROJECT: "global_cattle" +work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence" +species: "Bos taurus" +ploidy: 2 +no_chromosomes: 29 +start_chromosome: 1 +end_chromosome: 29 +vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" +genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" +reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" +ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" +meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +phase_only: False +remove_indel: True +missing_data: "0.2" +truncate_upper: "0.4" +truncate_lower: "0.2" +recombination_rate_tree: False # "1e-8" +recombination_rate_anc: False +mismatch_ratio_anc: False +mismatch_ratio_tree: False # "1" +mutation_rate: "1e-8" +generation_interval: 5 +chromosome_length: + 1 : 158534110 + 2 : 136231102 + 3 : 121005158 + 4 : 120000601 + 5 : 120089316 + 6 : 117806340 + 7 : 110682743 + 8 : 113319770 + 9 : 105454467 + 10 : 103308737 + 11 : 106982474 + 12 : 87216183 + 13 : 83472345 + 14 : 82403003 + 15 : 85007780 + 16 : 81013979 + 17 : 73167244 + 18 : 65820629 + 19 : 63449741 + 20 : 71974595 + 21 : 69862954 + 22 : 60773035 + 23 : 52498615 + 24 : 62317253 + 25 : 42350435 + 26 : 51992305 + 27 : 45612108 + 28 : 45940150 + 29 : 51098607 \ No newline at end of file diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index d4e5888..1f6de40 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -11,17 +11,13 @@ from os.path import isfile, join ##### setup report ##### -report: "reports/tsinfer.rst" - -# specify config files on the command line using --configfile #configfile: '../config/ancestral_Eddie.yaml' -#configfile: '../config/tsinfer_Eddie.yaml' - +configfile: '../config/glob_cattle.yaml' +report: "reports/tsinfer.rst" ##### setup singularity ##### # Can we set conda env here??? - ##### Define some variables ##### # all files should be in the vcfDir/RawVCF folder # files that need spliting are named: Combined_something.vcf.gz @@ -30,47 +26,45 @@ report: "reports/tsinfer.rst" # (2) sort into the correct Chr folder in vcfDir. # defining variables that will be used throughout the script - -# the final directory name is now editable -- so can be named according to specific project -# mapdir indicates where to find the genetic map for phasing (full path in the config) -mapdir = config['genmap'] -oDir = Path(config['o_dir'], config['PROJECT']) - -vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process -# the path where to save intermetdiate and final VCFs -if config['process_vcf_in_original_dir']: - vcfOut = config['vcf_dir'] # same as above -else: - vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir - - +vcfdir=config['vcf_dir'] # from config file, the path to vcf # this folder can be any where but must have the following structure: # vcfDir: # - RawVCF # - Chr1 # -Chr... 
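# A quick config sanity check (a sketch, not part of the workflow): every
# chromosome in the requested range should have a length entry, e.g.
#   for n in range(config['start_chromosome'], config['end_chromosome'] + 1):
#       assert n in config['chromosome_length'], f'no length for chromosome {n}'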
+# the final directory name is now editable -- so can be named according to specific project +# mapdir indicates where to find the genetic map for phasing (full path in the config) +Project = config['PROJECT'] +projdir = config['work_dir'] + '/' + Project +mapdir = config['genmap'] -# if want to split multiallelic sites set to TRUE: -#multiallelic = False - -chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards +wildcard_constraints: + chromosome = r'chr\d+' +# constraining the wildcard to avoid errors + ##### load rules ##### -include: "rules/ancestral_inference.smk" +#include: "rules/ancestral_inference.smk" include: "rules/split.smk" +include: "rules/filter.smk" include: "rules/merge_samples_vcf.smk" include: "rules/phasing.smk" -include: "rules/prepare_files_for_tsinfer.smk" -#include: "rules/multiallelic.smk" -include: "rules/infer_trees.smk" +include: "rules/ancestral_info_to_vcf.smk" +include: "rules/new_infer_trees.smk" + +##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] ##### +if not config['phase_only']: + rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes) +else: + rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes) + ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: - expand(f'{oDir}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes) - # tree inference + input: rule_all_input diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index f4b023c..2670dad 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -1,55 +1,55 @@ { "__default__": { - "mem": "64G", + "mem": "8G", "threads": "1", "time": "10:00:00" }, - "rename": + "compare_and_filter_vcfs": { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, + "merge": + { + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, + "remove_missing_indels": + { + "mem": "16G", + "threads": "16", + "time": "05:00:00" }, "phase": - { - "mem": "350G", - "threads": "25", - "time": "336:00:00" - }, - "rename_phased": { "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "240:00:00" }, - "get_af": + "get_vcf_positions": { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "get_major": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "32", + "time": "05:00:00" }, - "extract_ancestral_chromosome": + "get_ancestral_alleles": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, - "join_major_ancestral": + "update_regions_file": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, - "determine_ancestral_major": + "add_info_to_vcf": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "04:00:00" }, @@ -59,118 +59,40 @@ "threads": "1", "time": "03:00:00" }, - "sort_ancestral_to_vcf": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "decompress": - { - "mem": "64G", - "threads": "1", - 
"time": "03:00:00" - }, - "change_infoAA_vcf": - { - "mem": "128G", - "threads": "1", - "time": "05:00:00" - }, "prepare_sample_file": - { - "mem": "128G", - "threads": "1", - "time": "10:00:00" - }, - "infer": - { - "mem": "128G", - "threads": "1", - "time": "40:00:00" - }, - "extract_aligned_alleles_from_vcf": - { - "mem": "32G", - "threads": "1", - "time": "03:00:00" - }, - "extract_snps_from_info": { "mem": "32G", - "threads": "1", - "time": "03:00:00" + "threads": "16", + "time": "336:00:00" }, - "extract_pos_aligned_alleles_from_vcf": + "generate_ancestors": { "mem": "32G", - "threads": "1", - "time": "03:00:00" - }, - "extract_vcf_alleles_from_aligned": - { - "mem": "16G", - "threads": "1", - "time" : "03:00:00" - }, - "create_estsfs_dicts": - { - "mem": "16G", - "threads": 1", - "time" : "05:00:00" - }, - "edit_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "01:00:00" - }, - "combine_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "threads": "16", + "time": "336:00:00" }, - "run_estsfs": + "truncate_ancestors": { "mem": "32G", - "threads": "1", - "time": "20:00:00" - }, - "extract_major": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_minor": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_major_outgroup": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "threads": "16", + "time": "336:00:00" }, - "extract_estsfs_prob": + "ancestors_trees": { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "mem": "32G", + "threads": "16", + "time": "336:00:00" }, - "determine_ancestral": + "infer_trees": { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "mem": "32G", + "threads": "16", + "time": "336:00:00" }, - "move_and_rename_aa": - { - "mem": "16G", - "threads": "1", - "time": "05:00:00" - } + "simplify_trees": + { + "mem": "32G", + "threads": "16", + "time": "03:00:00" + } } diff --git a/Snakemake/workflow/profile/config.yaml b/Snakemake/workflow/profile/config.yaml index 1dc5486..e7a1c31 100644 --- a/Snakemake/workflow/profile/config.yaml +++ b/Snakemake/workflow/profile/config.yaml @@ -3,10 +3,8 @@ use-conda: true jobscript: "jobscript.sh" cluster-config: "cluster.json" jobname: "{rule}.{jobid}" -jobs: 16 -#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab" +drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y" rerun-incomplete: true -rerun-triggers: mtime -cores: 16 # how many jobs you want to submit to your cluster queue -local-cores: 1 +rerun-triggers: [mtime, input] +local-cores: 99 keep-going: false diff --git a/Snakemake/workflow/rules/add_ancestral.smk b/Snakemake/workflow/rules/add_ancestral.smk new file mode 100644 index 0000000..dd404f1 --- /dev/null +++ b/Snakemake/workflow/rules/add_ancestral.smk @@ -0,0 +1,167 @@ + +ancestral_file = config['ancestral_allele'] + +# input and output +infile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +infileidx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +outfile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz' +outidx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + +# temporary files +filtered_samples = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sam.vcf.gz' +filtered_positions = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sampos.vcf' +ancestral_not_compressed = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' +regfile = 
+aaInfo = f'{vcfdir}/{{chromosome}}_aa.info'
+
+if config['PROJECT'] == "global_cattle":
+    rule remove_samples:
+        input: infile
+        output: filtered_samples
+        params:
+            samples_to_remove = f'{vcfdir}/{{chromosome}}/remove'
+        conda: 'bcftools'
+        threads: 32
+        resources: cpus=32, mem_mb=2048000, time_min=1200
+        shell:
+            r"""
+            bcftools view -S ^{params.samples_to_remove} {input} --threads {threads} -O z -o {output}
+
+            bcftools index {output}
+            """
+
+
+# we need some helper files:
+# 1) create a chr pos file for filtering
+# 2) create an info file for adding ancestrals
+# filter vcf positions so that it has the same as the aa file
+# add ancestral allele to INFO
+
+rule make_regions_file:
+    input:
+        anc = ancestral_file,
+        vcf = filtered_samples
+    output: temp(regfile)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        cut -d' ' -f1,2 {input.anc} | awk '$1 == {params.chrnum}' | sed -e $'s/ /\t/g' > chr{params.chrnum}.anc
+
+        cut -f2 chr{params.chrnum}.anc > chr{params.chrnum}.compare
+
+        # get positions in vcf
+        bcftools query -f '%POS\n' {input.vcf} > chr{params.chrnum}.file
+
+        # match positions ancestral and vcf
+        comm -12 <( sort chr{params.chrnum}.file ) <( sort chr{params.chrnum}.compare ) > chr{params.chrnum}.keep
+
+        # number of lines
+        l=($(wc -l chr{params.chrnum}.keep))
+
+        # create final regions file
+        yes {params.chrnum} | head -n $l > chr{params.chrnum}
+        paste chr{params.chrnum} chr{params.chrnum}.keep > {output}
+
+        rm chr{params.chrnum} chr{params.chrnum}.keep chr{params.chrnum}.file chr{params.chrnum}.compare chr{params.chrnum}.anc
+        """
+#grep -vw 4579381 3565339
+
+rule filter_positions:
+    input:
+        vcf = filtered_samples,
+        idx = f'{filtered_samples}.csi',
+        regions = regfile,
+    output: filtered_positions
+    conda: 'bcftools'
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 300
+    shell:
+        r"""
+        bcftools view -R {input.regions} {input.vcf} \
+            --threads {threads} \
+            -o {output}
+        """
+
+# take only the rows where col1 == current_chromosome
+# add AA= in front of the aa col
+# use only col pos & aa
+# match with the left pos in filtered vcf
+
+rule make_aaInfo_file:
+    input:
+        aa_file = ancestral_file,
+        vcf = filtered_positions
+    output: temp(aaInfo)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter by chromosome and add AA= to the allele col
+        awk '$1 == {params.chrnum}' {input.aa_file} | awk '$3="AA="$3' | cut -d' ' -f2,3 > {wildcards.chromosome}.aa
+
+        # get the positions in the vcf
+        bcftools query -f '%POS\n' {input.vcf} > {wildcards.chromosome}.pos
+
+        # add allele col to the pos in the vcf
+        awk 'NR==FNR{{a[$1];next}}($1 in a){{print $0}}' {wildcards.chromosome}.pos {wildcards.chromosome}.aa > {wildcards.chromosome}.pos.aa
+
+        cut -d' ' -f2 {wildcards.chromosome}.pos.aa | sed '1s/^/INFO\n/' > {output}
+
+        rm {wildcards.chromosome}.aa {wildcards.chromosome}.pos {wildcards.chromosome}.pos.aa
+        """
+
+# need to run shapeit first to know line numbers
+rule add_ancestral:
+    input:
+        vcf = filtered_positions,
+        aa_info = aaInfo,
+    output: ancestral_not_compressed
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<13{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>12{{$8=a[FNR-11]; print}}' OFS="\t" {input.aa_info} {input.vcf} > {output}
+        """
+
+
+# HEADERNUM="$(( $(bcftools view -h {input.vcf} | wc -l) - 1 ))"
+
+# INFOLINE=$(( $(bcftools view -h {input.vcf} | awk 
'/INFO/{{print NR}}' | head -n 1) )) + +# awk -v OFS="\t" -v HEADER=$HEADERNUM -v INFO=$INFOLINE 'NR==FNR{{{{a[FNR] = $2; next}}}} FNR<=HEADER{{{{print}}}}; \ +# FNR==INFO{{{{printf "##INFO=\\n"}}}}; \ +# FNR>HEADER{{{{$8=a[FNR-HEADER]; print}}}}' OFS="\t" {input.aa_info} {input.vcf} > {output} +# All_AAInfo.txt AllFiltered.recode.vcf > AllFiltered.recode.Ancestral.vcf + +# compress and index final vcf +# rule match_samples: +# input: ancestral_not_compressed +# output: outfile +# params: +# samples_to_keep = f'{vcfdir}/{{chromosome}}/keep.samples' +# conda: 'bcftools' +# shell: +# r""" +# bcftools view -S {params.samples_to_keep} {input} -O z -o {output} +# """ + +rule compress_vcf: + input: ancestral_not_compressed + output: outfile, + conda: 'bcftools' + shell: + r""" + bgzip -c {input} > {output} + """ + +rule index_output: + input: outfile + output: outidx + conda: 'bcftools' + shell: + r""" + bcftools index {input} + """ diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk new file mode 100644 index 0000000..1aeb6c0 --- /dev/null +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -0,0 +1,141 @@ + +# extract all positions in vcf file using bcftools and save to regions file +# for rows in ancestral_alleles file where col 1 match wildcard.chromosome +# for each row in regions file, if value is in ancestral_alleles col 2, add +# ancestral_alleles col 3 to regions file col 2 +# else add -1 + +ancestral_file = config['ancestral_allele'] + +if "global_cattle" in config['PROJECT']: + # rule list_samples: + # input: + # vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', + # metadata = config['meta'] + # output: + # samples = f'{vcfdir}/{{chromosome}}/samples_to_remove' + # conda: 'bcftools' + # shell: + # r""" + # bcftools query -l {input.vcf_file} > {wildcards.chromosome}.samples + # cut -d',' -f1 {input.metadata} | tail -n +2 > metadata.samples + # grep -v -f metadata.samples vcf.samples > unique.samples + # """ + + rule remove_samples: + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf', + params: + samples_to_remove = f'{vcfdir}/samples.remove' + conda: 'bcftools' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.remove.benchmark.txt' + shell: + r""" + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output} + """ + +else: + rule decompress: + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' + shell: + r""" + bcftools view {input.vcf_file} -O v -o {output} + """ + +input_ancestral = f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + +rule get_vcf_positions: + input: + vcf_file = input_ancestral, + output: + regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.positions.benchmark.txt' + shell: + r""" + bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file} + """ + +rule get_ancestral_alleles: + input: + ancestral_alleles_file = ancestral_file + output: + ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc') + params: + chrnum = lambda wc: wc.get('chromosome')[3:] + benchmark: 'benchmarks/{chromosome}.getaa.benchmark.txt' + 
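+    # Assumed layout of the ancestral-alleles file (space-separated): CHR POS AA,
+    # e.g. "1 12345 A" -- the awk/cut below keeps the POS and AA columns for the
+    # chromosome being processed.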
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals}
+        """
+
+rule update_regions_file:
+    input:
+        regions_file = rules.get_vcf_positions.output.regions_file,
+        ancestrals = rules.get_ancestral_alleles.output.ancestrals,
+    output:
+        updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'),
+        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude'),
+    benchmark: 'benchmarks/{chromosome}.regions.benchmark.txt'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file}
+
+        awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude}
+        """
+
+rule add_AA_to_regions_file:
+    input:
+        updated_regions_file = rules.update_regions_file.output.updated_regions_file,
+    output:
+        new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    benchmark: 'benchmarks/{chromosome}.addaaregions.benchmark.txt'
+    shell:
+        r"""
+        echo "INFO" > {output.new_regions_file}
+        awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
+        """
+
+rule add_info_to_vcf:
+    input:
+        vcf_file = input_ancestral,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
+    output:
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf')
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.aainfo.benchmark.txt'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
+
+        rm {input.vcf_file}
+        """
+
+rule compress_index_vcf:
+    input:
+        large_vcf = rules.add_info_to_vcf.output.large_vcf
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
+    output:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.compress.benchmark.txt'
+    shell:
+        r"""
+        bgzip -c {input} > {output.vcf_file}
+        bcftools index {output.vcf_file}
+        """
+input_ts = rules.compress_index_vcf.output.vcf_file
diff --git a/Snakemake/workflow/rules/date.smk b/Snakemake/workflow/rules/date.smk
new file mode 100644
index 0000000..a9afb85
--- /dev/null
+++ b/Snakemake/workflow/rules/date.smk
@@ -0,0 +1,19 @@
+# Script for (iteratively) dating the tree sequence
+# trees must be dated, re-inferred and dated again
+# to get the correct dates (test number of iterations)
+
+# if this is the first iteration, start from the existing tree sequence;
+# otherwise, re-infer the trees using the dated sampleFile and date the trees again
+
+# work in progress -- input_ts must point at the tree sequence to date, e.g.:
+# input_ts = rules.simplify_trees.output
+
+# print(f'Starting dating of tree sequence {input_ts}...')
+
+# rule date_trees:
+#     input: rules.simplify_trees.output
+#     output: f'{input_ts}.dated'
+#     conda: ''
+
+# print('A rough estimate of the effective population size is', ts.diversity() / (4 * 1e-6))
+
diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk
new file mode 100644
index 0000000..54a596a
--- /dev/null
+++ b/Snakemake/workflow/rules/filter.smk
@@ -0,0 +1,159 @@
+# if there is only one vcf file, use the output from split_and_move as input
+# for the next step
+# else, compare the files, filter out overlapping samples, then merge the vcfs
+# into one, index it, and use the merged output as input for the next step
+if len(allFiles) == 1:
+    input_vcf = rules.split_and_move_vcfs.output
+    input_index = f'{rules.split_and_move_vcfs.output}.csi'
+    input_summary = []  # no summary file in the single-VCF case
+
+else:
+    input_to_filter = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.vcf.gz' for file in allFiles]
+
+    filtered_output = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz' for file in allFiles]
+    filtered_output_idx = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi'
+                           for file in allFiles]
+    merged_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz'
+
+    rule compare_and_filter_vcfs:
+        input: input_to_filter
+        output: temp(filtered_output)
+        params:
+            file_names = [f'{{chromosome}}_{file}' for file in allFiles],
+            path = f'{vcfdir}/{{chromosome}}'
+        benchmark: 'benchmarks/{chromosome}.sample.benchmark.txt'
+        run:
+            import os
+            import shlex
+            import itertools
+            import subprocess
+            from glob import glob
+
+            path_names = [f'{params.path}/{name}' for name in params.file_names]
+            pairs = list(itertools.combinations(path_names, 2))
+            new_ext = '.filtered.vcf.gz'
+            to_rename = path_names.copy()
+
+            # to avoid shell injection, sanitize the file names
+            # -- ensure special shell characters are escaped
+            sanitized_path = shlex.quote(params.path)
+            sanitized_new_ext = shlex.quote(new_ext)
+            sanitized_idx = shlex.quote(new_ext + '.csi')
+
+            for i, pair in enumerate(pairs):
+                print(f'pair {i} {pair}')
+
+                # Run vcftools to find if there are overlapping samples between pairs
+                command = f"""
+                module load anaconda
+                source activate bcftools
+                vcftools --gzvcf {os.path.join(pair[0] + '.vcf.gz')} \
+                --gzdiff {os.path.join(pair[1] + '.vcf.gz')} \
+                --diff-indv
+                """
+                subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+
+                # if vcftools finds any overlapping samples, remove them from the second vcf file
+                if "B" in subprocess.run("awk '{if ($2 == \"B\") print $2}' out.diff.indv_in_files",
+                                         shell=True,
+                                         capture_output=True,
+                                         text=True,
+                                         cwd=sanitized_path).stdout:
+                    print(f'Overlapping samples found!! Removing samples from {pair[1]}')
+                    command = f"""
+                    grep B out.diff.indv_in_files | cut -f1 > samples.remove
+                    rm out.diff.indv_in_files
+                    """
+                    subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+                    command = f"""
+                    bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf.gz')} \
+                    -Oz -o {os.path.join(pair[1] + sanitized_new_ext)}
+
+                    bcftools index -f {os.path.join(pair[1] + sanitized_new_ext)}
+
+                    rm samples.remove
+                    """
+                    subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+                    # remove the file that has been sampled from the renaming list
+                    # (guard: it may already have been removed by an earlier pair)
+                    if pair[1] in to_rename:
+                        to_rename.remove(pair[1])
+                    print(f'Files to rename are: {to_rename}')
+
+                # if there are no overlapping samples,
+                # just rename the files that still have the original extension
+                else:
+                    print('No overlapping samples')
+
+            print('Finished comparing and sampling vcfs')
+
+            # rename the files that have not been sampled
+            for name in to_rename:
+                sanitized_name = shlex.quote(name)
+                print(f'Renaming {sanitized_name}')
+                command = f"""
+                mv {os.path.join(sanitized_name + '.vcf.gz')} {os.path.join(sanitized_name + sanitized_new_ext)}
+                mv {os.path.join(sanitized_name + '.vcf.gz.csi')} {os.path.join(sanitized_name + sanitized_idx)}
+                """
+                subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+
+            # Remove all files that do not contain the string new_ext
+            command = f"""
+            find {sanitized_path} -type f ! 
-name '*{sanitized_new_ext}*' -exec rm -f {{}} \; + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + + rule merge: + input: rules.compare_and_filter_vcfs.output + output: merged_output + params: + files_path = f'{vcfdir}/{{chromosome}}/', + indexes = filtered_output_idx + conda: "bcftools" + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' + shell: + r""" + printf '%s\n' {params.files_path}*filtered*.gz > {wildcards.chromosome}.merge + + bcftools merge \ + -m snps \ + -l {wildcards.chromosome}.merge \ + --threads {threads} -O z -o {output} + + rm {wildcards.chromosome}.merge {params.indexes} + """ + + rule index_merged: + input: rules.merge.output, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.indxmerged.benchmark.txt' + shell: + r""" + bcftools index -f {input} + """ + + rule vcf_summary: + input: + vcf_file = rules.merge.output, + index = rules.index_merged.output, + output: + summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk', + # done = touch(f'{vcfdir}/{{chromosome}}/merged_summary.done') + params: + plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' + shell: + r""" + bcftools stats {input.vcf_file} > {output.summary} + plot-vcfstats -p {params.plots} {output.summary} + """ + + input_vcf = rules.merge.output + input_index = rules.index_merged.output + input_summary = rules.vcf_summary.output.summary + +# else, use the output from split_and_move as input for the next step +# else: +# input_vcf = rules.split_and_move_vcfs.output \ No newline at end of file diff --git a/Snakemake/workflow/rules/infer_trees.smk b/Snakemake/workflow/rules/infer_trees.smk index 93231af..cf36dcc 100644 --- a/Snakemake/workflow/rules/infer_trees.smk +++ b/Snakemake/workflow/rules/infer_trees.smk @@ -1,36 +1,38 @@ import re -# if multiallelic: -# vcf_4_inference = f'{vcfOut}/{{chromosome}}_allbi.vcf.gz' -# else: -# vcf_4_inference = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz' - rule prepare_sample_file: input: - #vcf=vcf_4_inference, - vcf = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz', - meta = Path(vcfdir, config['meta']) + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + meta = config['meta'] output: - f"{oDir}/Tsinfer/samples/{{chromosome}}.samples" + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: chrLength= lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])], ploidy=config['ploidy'] conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=200 - log: 'logs/Prepare_sample_file_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=768000, time_min=200 + benchmark: + 'benchmarks/{chromosome}.prep_sample.benchmark.txt' + #log: 'logs/Prepare_sample_file_{chromosome}.log' shell: "python scripts/Prepare_tsinfer_sample_file.py " "{input.vcf} {input.meta} {output} {params.ploidy} {params.chrLength}" rule infer: input: - f"{oDir}/Tsinfer/samples/{{chromosome}}.samples" - conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=300 - log: 'logs/Infer_{chromosome}.log' + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', output: - f"{oDir}/Tsinfer/trees/{{chromosome}}.trees" - script: - "../scripts/Infer_trees.py" # path to script is relative to the file containing this rule. 
Hence the prepended "../" + f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + conda: "HLab_tsinfer" + threads: 16 + resources: cpus=16, mem_mb=1024000, time_min=10080 + benchmark: + 'benchmarks/{chromosome}.infer.benchmark.txt' + #log: 'logs/Infer_{chromosome}.log' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/Infer_trees.py {input} {output} + """ diff --git a/Snakemake/workflow/rules/keep_snp.smk b/Snakemake/workflow/rules/keep_snp.smk new file mode 100644 index 0000000..323bd1b --- /dev/null +++ b/Snakemake/workflow/rules/keep_snp.smk @@ -0,0 +1,43 @@ +# rule gatk_index: +# input: +# vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', +# idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.tbi'), +# conda: 'gatk-env' +# benchmark: +# 'benchmark/{chromosome}.gatk_index.benchmark.txt' +# shell: +# r""" +# gatk IndexFeatureFile -I {input.vcf} +# """ + +# rule filter_snps: +# input: +# vcf = rules.phase.output, +# idx = rules.index_phased.output, +# output: +# vcf_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz'), +# idx_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi'), +# params: config['reference_genome'], +# conda: 'gatk-env' +# benchmark: 'benchmarks/{chromosome}.filter_snps.benchmark.txt' +# shell: +# r""" +# gatk SelectVariants \ +# -R {params} \ +# -V {input.vcf} \ +# --select-type-to-include SNP \ +# -O {output.vcf_file} + +# bcftools index {output.vcf_file} +# """ + +# rule index: +# input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz', +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi') +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.index_filtered_snps.benchmark.txt' +# shell: +# r""" +# bcftools index {input} +# """ diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index b6894d7..72f74c1 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,103 +1,32 @@ -import yaml - - -if len(allFiles) != 1: - rule get_samples: - input: - f'{vcfOut}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - output: temp(f'{vcfOut}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=30 - log: 'logs/Get_samples_{chromosome}_{file}.log' - shell: - """ - bcftools query -l {input} > {output} - """ - - rule filter: - input: - [f'{vcfOut}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - file=allFiles)], - output: - temp([f'{vcfOut}' + x for x in - expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) - params: - duplicated = '{chromosome}.ids' - # conda: 'bcftools' - resources: cpus=1, mem_mb=64000, time_min=60 - run: - import os - filtered = [] - - for i in range(len(input)): - print(i) - for j, file in enumerate(input): - if file != input[i] and j > i: - file1 = input[i] - file2 = input[j] - print(file1, file2) - - shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") - - remove = params.duplicated - lremove = os.stat(f'{remove}').st_size - - if lremove == 0: - print(f'Nothing to remove {lremove}') - else: - print(f'{lremove} samples to remove') - if file2 not in filtered: - filtered.append(file2) - - file2 = file2.split('.')[0] - print(f'file {file2} filtered') - shell('bcftools view -S 
^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz')
-                        shell('bcftools index -f {file2}.filtered.vcf.gz')
-                        shell('rm {params.duplicated}')
-
-            for file in input:
-                if file not in filtered:
-                    file1 = file.split('.')[0]
-                    print(f'file {file1} not filtered')
-                    shell("cp {file1}.vcf.gz {file1}.filtered.vcf.gz && bcftools index -f {file1}.filtered.vcf.gz")
-            shell('rm {params.duplicated}')
-
-    rule merge:
-        input:
-            [f'{vcfOut}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)]
-        output:
-            vcf=temp(f'{vcfOut}/{{chromosome}}_final.vcf.gz'),
-            index=temp(f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi')
-        conda: "bcftools"
-        threads: 1
-        resources: cpus=1, mem_mb=32000, time_min=60
-        log: 'logs/Merge_{chromosome}.log'
-        shell:
-            """
-            bcftools merge {input} -O z -o {output.vcf}
-            bcftools index -f {output.vcf}
-            """
-
-else: # i.e. len(allFiles) == 1
-    rule rename:
-        input:
-            [f'{vcfOut}' + x
-             for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz',
-                             suffixOne = splitFiles)] if len(splitFiles) != 0 else [],
-            [f'{vcfOut}' + x
-             for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz',
-                             suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []
-        output:
-            vcf = (f'{vcfOut}/{{chromosome}}_final.vcf.gz'),
-            idx = (f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi')
-        conda: "bcftools"
-        log: 'logs/Rename_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=30
-        shell:
-            """
-            ln -s $( realpath {input} ) {output.vcf}
-            ln -s $( realpath {input} ).csi {output.idx}
-            """
-
+# Filter INDELs and excess missing data out of the vcf file
+# (missingness threshold comes from config['missing_data'], passed to vcftools --max-missing)
+# The input vcf file is the merged vcf file from the previous step
+# or the output from split and move (if only one file is present)
+
+rule remove_missing_indels:
+    input:
+        vcf = input_vcf,
+        index = input_index,
+        summary = input_summary,
+    output:
+        vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'),
+        index = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi')
+    params:
+        prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}',
+        missing = config['missing_data'],
+        indel = '--remove-indels' if config['remove_indel'] else ''
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.missindel.benchmark.txt'
+    shell:
+        r"""
+        vcftools --gzvcf {input.vcf} \
+            {params.indel} \
+            --max-missing {params.missing} \
+            --remove-filtered-all \
+            --recode \
+            --out {params.prefix}
+
+        bgzip {params.prefix}.recode.vcf
+        bcftools index -f {output.vcf}
+
+        # rm {params.prefix}.recode.vcf
+        """
diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk
new file mode 100644
index 0000000..0dec01f
--- /dev/null
+++ b/Snakemake/workflow/rules/new_infer_trees.smk
@@ -0,0 +1,139 @@
+# Convert vcf to samples file
+# generate ancestors samples based only on sites with ancestral allele
+# information
+# truncate ancestors to remove those spanning too many sites
+# generate ancestors trees
+# match ancestors and samples to infer trees
+# final output is a simplified tree sequence
+
+import re
+
+rule prepare_sample_file:
+    input:
+        vcf = input_ts,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        meta = config['meta']
+    output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples'
+    params:
+        chrLength = lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])]
+    conda: "HLab_tsinfer"
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 200
+    benchmark: 
'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' + shell: + r""" + ulimit -v + + python scripts/generate_samples.py \ + {input.vcf} {output} {input.meta} {params.chrLength} {threads} + + """ + +rule generate_ancestors: + input: + samples = rules.prepare_sample_file.output, + sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude', + output: + ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + conda: 'HLab_tsinfer' + threads: 64 + resources: cpus = 64, mem_mb = 2048000, time_min = 1680 + benchmark: 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} + """ + +rule truncate_ancestors: + input: + ancestors_samples = rules.generate_ancestors.output.ancestors_samples, + output: + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + params: + upper = config['truncate_upper'], + lower = config['truncate_lower'], + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_truncate.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/truncate.py {input} {params.upper} {params.lower} {output} + """ + +rule ancestors_trees: + input: + samples = rules.prepare_sample_file.output, + truncated_ancestors = rules.truncate_ancestors.output.truncated_ancestors, + output: + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + params: + re = f"--recombination-rate {config['recombination_rate_anc']}" if config['recombination_rate_anc'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_anc']}" if config['mismatch_ratio_anc'] else '', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus = 32, mem_mb = 2048000, time_min = 1680 + benchmark: 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + shell: + r""" + tsinfer match-ancestors {input.samples} \ + -A {output} \ + {params.re} \ + {params.misma} \ + --log-section tsinfer.inference \ + --num-threads {threads} \ + --progress \ + --verbosity \ + --ancestors {input.truncated_ancestors} + """ + +rule infer_trees: + input: + samples = rules.prepare_sample_file.output, + ancestors_trees = rules.ancestors_trees.output.ancestors_trees, + output: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + params: + re = f"--recombination-rate {config['recombination_rate_tree']}" + if config['recombination_rate_tree'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_tree']}" + if config['mismatch_ratio_tree'] else '', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_infer.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + tsinfer match-samples {input.samples} \ + -A {input.ancestors_trees} \ + --num-threads {threads} \ + {params.re} \ + {params.misma} \ + --progress \ + --log-section tsinfer.inference \ + --verbosity \ + -O {output} + """ + +rule simplify_trees: + input: + trees = rules.infer_trees.output.trees, + output: + f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', + params: + genint = config['generation_interval'], + mutrate = config['mutation_rate'], + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_simplify.benchmark.txt' + shell: + r""" + 
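+        # simplify.py (see scripts/) preprocesses the inferred tree sequence and
+        # dates it with tsdate, using the mutation_rate and generation_interval
+        # set in the config; `ulimit -v` below just logs the memory cap the job sees.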
+        ulimit -v
+        python scripts/simplify.py {input} {params.genint} {params.mutrate} {output}
+        """
diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk
index fbf92d3..6ea0fad 100644
--- a/Snakemake/workflow/rules/phasing.smk
+++ b/Snakemake/workflow/rules/phasing.smk
@@ -1,62 +1,66 @@
-# def getInputVcfFile_phasing(wildcards):
-#     if os.path.isfile(vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'):
-#         print("Final file found ")
-#         vcf_file = vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'
-#         print(vcf_file)
-#     else:
-#         print("No final file ")
-#         file = open(vcfdir + '/Vcf_file_' + wildcards.chromosome + '.txt')
-#         vcf_file = file.read().strip("\n")
-#
-#     return(vcf_file)
+# Phase files if species is diploid using the output from the
+# remove_missing_indels rule
+file_to_phase = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'
+file_to_phase_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi'
+
+phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
+phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi'
+
+checkpoint decide_start_point:
+    output: "workflow_start.txt"
+    shell:
+        """
+        if [ "{config[phase_only]}" == "False" ] && [ -f {phased_output} ] && [ -f {phased_output_idx} ]; then
+            echo "ancestral_info_to_vcf" > {output}
+        else
+            echo "start" > {output}
+        fi
+        """
 
 if config['ploidy'] == 1:
-    rule rename_phased:
-        input:
-            vcf = f'{vcfOut}/{{chromosome}}_final.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi'
-        output:
-            vcf = f'{vcfOut}/{{chromosome}}_phased.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_phased.vcf.gz.csi'
-        log: 'logs/Rename_phased_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=60
-        shell:
-            """
-            if [ -h {input.vcf} ]; then
-                ln -s $( realpath {input.vcf} ) {output.vcf}
-                ln -s $( realpath {input.idx} ) {output.idx}
-            else
-                ln -s {input.vcf} {output.vcf}
-                ln -s {input.idx} {output.idx}
-            fi
-            """
+    input_ancestral = file_to_phase
+    input_ancestral_idx = file_to_phase_idx
 
 else:
     rule phase:
         input:
-            vcf = f'{vcfOut}/{{chromosome}}_final.vcf.gz',
+            vcf = file_to_phase,
+            idx = file_to_phase_idx,
         output:
-            file = f'{vcfOut}/{{chromosome}}_phased.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_phased.vcf.gz.csi'
+            vcf = phased_output,
         params:
            map = f'{mapdir}/{{chromosome}}.txt',
-        #log: 'logs/phase_{chromosome}.log'
-        threads: 25
-#        resources: cpus=20, mem_mb=25000, time_min=5
+        threads: 32
+        resources: cpus = 32, mem_mb = 2048, time_min = 7200
        conda: 'shapeit4am'
-        log: 'logs/Phase_{chromosome}.log'
+        benchmark: 'benchmarks/{chromosome}.shapeit.benchmark.txt'
        shell:
-            """
+            r"""
            str='{wildcards.chromosome}'
            chr=$(echo ${{str:3}})
-            start=`date +%s`
+
            shapeit4 --input {input.vcf} \
-            --map {params.map} \
-            --region ${{chr}} \
-            --output {output} \
-            --thread {threads}
-            end=`date +%s`
-            echo Execution time was `expr $end - $start` seconds > shapeit_{wildcards.chromosome}.time
-            bcftools index -f {output}
+                --map {params.map} \
+                --region $chr \
+                --output {output.vcf} \
+                --thread {threads} \
+                --pbwt-depth 8 \
+                --sequencing \
+                --log {wildcards.chromosome}_phased.log
+            """
+
+    rule index_phased_cleanup:
+        input: rules.phase.output
+        output: phased_output_idx
+        params:
+            chr = lambda wildcards: wildcards.chromosome[3:],
+            dire = f'{vcfdir}/{{chromosome}}',
+        conda: 'bcftools'
+        benchmark: 'benchmarks/{chromosome}.bcftools_index.benchmark.txt'
+        shell:
+            r"""
+            bcftools index {input}
             """
+
+    input_ancestral = phased_output
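+    # handoff point: rules in ancestral_info_to_vcf.smk read input_ancestral and
+    # input_ancestral_idx (set here and just below) to locate the phased VCF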
+ input_ancestral_idx = phased_output_idx \ No newline at end of file diff --git a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk index c26f24b..9cac1ef 100644 --- a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk +++ b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk @@ -5,19 +5,20 @@ else: rule get_af: - input: f'{vcfOut}/{{chromosome}}_phased.vcf.gz' + input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' output: - info = temp(f'{vcfOut}/Info{{chromosome}}.INFO') + vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_info.vcf.gz'), + info = temp(f'{vcfdir}/{{chromosome}}/Info{{chromosome}}.INFO') params: - prefix=f'{vcfOut}/Info{{chromosome}}' + prefix=f'{vcfdir}/{{chromosome}}/Info{{chromosome}}' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/get_af_{chromosome}.log' shell: """ - bcftools +fill-tags {input} -Oz -o {input} -- -t AN,AC,AF - vcftools --gzvcf {input} --out {params.prefix} --get-INFO AC --get-INFO AF + bcftools +fill-tags {input} -Oz -o {output.vcf} -- -t AN,AC,AF + vcftools --gzvcf {output.vcf} --out {params.prefix} --get-INFO AC --get-INFO AF LOGFILE={params.prefix}.log if test -f "$LOGFILE"; then @@ -27,12 +28,10 @@ rule get_af: rule get_major: input: rules.get_af.output.info - output: temp(f'{vcfOut}/Major{{chromosome}}.txt') + output: temp(f'{vcfdir}/{{chromosome}}/Major{{chromosome}}.txt') threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/Get_major_{chromosome}.log' -# wildcard_constraints: -# chromosome="^chr | ^Chr" shell: """ awk '{{if (NR!=1 && $5>=0.5) {{print $1"_"$2","$4}} else if (NR!=1 && $5<0.5) {{print $1"_"$2","$3}}}}' {input} > {output} @@ -42,7 +41,7 @@ rule extract_ancestral_chromosome: input: ancestral_file output: - temp(f'{vcfOut}/Ancestral{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/Ancestral{{chromosome}}.txt') params: chrNum = lambda wc: wc.get('chromosome')[3:] log: 'logs/Extract_ancestral_chromosome_{chromosome}.log' @@ -57,7 +56,7 @@ rule join_major_ancestral: ancestral = rules.extract_ancestral_chromosome.output, major = rules.get_major.output output: - temp(f'{vcfOut}/MajAAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajAAnc_{{chromosome}}.txt') log: 'logs/Join_major_ancestral_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -69,7 +68,7 @@ rule determine_ancestral_major: input: rules.join_major_ancestral.output output: - temp(f'{vcfOut}/MajOAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajOAnc_{{chromosome}}.txt') log: 'logs/Determine_major_ancestral_{chromosome}.log' shell: """ @@ -78,9 +77,9 @@ rule determine_ancestral_major: rule decompress: input: - vcf=f'{vcfOut}/{{chromosome}}_phased.vcf.gz', - majororaa=rules.determine_ancestral_major.output - output: f'{vcfOut}/{{chromosome}}_phased.vcf' # this is removing both the .gz and the decompressed file + vcf = rules.get_af.output.vcf, + majororaa = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' threads: 1 resources: cpus=1, mem_mb=32000, time_min=30 log: 'logs/Decompress_{chromosome}.log' @@ -92,9 +91,9 @@ rule decompress: rule change_infoAA_vcf: input: - vcf=rules.decompress.output, - ancestralAllele=rules.determine_ancestral_major.output - output: f'{vcfOut}/{{chromosome}}_ancestral.vcf' + vcf = rules.decompress.output, + ancestralAllele = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' 
conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=120 @@ -109,17 +108,23 @@ rule change_infoAA_vcf: """ rule compress_vcf: - input: rules.change_infoAA_vcf.output, + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf', output: - file = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz', - idx = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz.csi' + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi', + params: + chrdir = f'{vcfdir}/{{chromosome}}/', + outdir = f'{vcfdir}/' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 log: 'logs/Compress_{chromosome}.log' shell: + r""" + bgzip {input.vcf} + bcftools index {input.vcf}.gz + + mv {input}.gz {input}.gz.csi {params.outdir} + rm -r {params.chrdir} """ - bgzip {input} - bcftools index -f {output.file} - """ -# keeping the phased files w/o ancestral alleles diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk index d6de413..8bc8a21 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -1,10 +1,29 @@ +# This script is used for managing VCF files in a directory previous to tree inference +# It identifies two types of VCF files in the 'RawVCF' directory: +# 1. Files that start with either "Chr" or "chr" and end with "vcf.gz" (referred to as 'splitFiles') +# 2. Files that start with "Combined" and end with "vcf.gz" (referred to as 'combinedFiles') +# It then creates regex patterns for these file names for later use in rules. +# The script also combines the names of both types of files into a single list (allFiles). +# If there are any 'splitFiles', it defines a rule (move_vcf) to move these files +# from the 'RawVCF' directory to a directory named after the chromosome in the file name. +# The rule also renames the files to include the chromosome name at the start of the file name. 
+import os
+import re
+
+print(chromosomes)
 
 splitFiles = list(set([f.strip("vcf.gz").split("_")[1]
                        for f in os.listdir(f'{vcfdir}/RawVCF')
-                       if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )]))
+                       if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) and "_" in f)]))
+
 combinedFiles = list(set([f.strip("vcf.gz").split("_")[1]
                           for f in os.listdir(f'{vcfdir}/RawVCF')
                           if (f.endswith("vcf.gz") and f.startswith("Combined"))]))
+
+# Prepare a regex pattern that matches any of the file names in splitFiles and
+# combinedFiles so rules only look for them
+splitFiles_pattern = "(" + "|".join(re.escape(name) for name in splitFiles) + ")"
+combinedFiles_pattern = "(" + "|".join(re.escape(name) for name in combinedFiles) + ")"
+
 allFiles = splitFiles + combinedFiles
 
 if len(splitFiles) > 0:
@@ -18,16 +37,13 @@ if len(splitFiles) > 0:
         conda: "bcftools"
         threads: 1
         resources: cpus=1, mem_mb=32000, time_min=60
-        log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log'
+        benchmark: 'benchmarks/{chromosome}.{suffixOne}.move.benchmark.txt'
+        wildcard_constraints:
+            suffixOne = splitFiles_pattern
         shell:
            """
-           if [ -h {input.vcf} ]; then
-               ln -s $( realpath {input.vcf} ) {output.vcf}
-               ln -s $( realpath {input.vcf} ).csi {output.vcf}
-           else
-               ln -s {input.vcf} {output.vcf}
-               ln -s {input.idx} {output.idx}
-           fi
+           ln -s {input.vcf} {output.vcf}
+           ln -s {input.idx} {output.idx}
           """
 
 if len(combinedFiles) > 0:
     rule split_and_move_vcfs:
         input:
             f'{vcfdir}/RawVCF/Combined_{{suffixTwo}}.vcf.gz'
         output:
-            vcf = [f'{vcfOut}' + x
+            temp([f'{vcfdir}' + x
                    for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz',
-                                   chromosome = chromosomes, suffixTwo = combinedFiles)],
-            idx = [f'{vcfOut}' + x
-                   for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz.csi',
-                                   chromosome = chromosomes, suffixTwo = combinedFiles)]
+                                   chromosome = chromosomes, suffixTwo = combinedFiles)])
         conda: "bcftools"
         threads: 1
         resources: cpus=1, mem_mb=64000, time_min=60
-        log: expand('logs/Split_and_move_vcfs_{{chromosome}}_{{suffixTwo}}.log')
+        benchmark: 'benchmarks/{chromosome}.{suffixTwo}.split.benchmark.txt'
+        wildcard_constraints:
+            suffixTwo = combinedFiles_pattern
         shell:
             """
             str='{wildcards.chromosome}'
diff --git a/Snakemake/workflow/scripts/Infer_trees.py b/Snakemake/workflow/scripts/Infer_trees.py
index 7fb0b1c..c686b8b 100644
--- a/Snakemake/workflow/scripts/Infer_trees.py
+++ b/Snakemake/workflow/scripts/Infer_trees.py
@@ -42,6 +42,8 @@
 progress_monitor=True,
 )
 
+ancestors_ts.dump(f'{outputFile}.ancestors')
+
 ts = tsinfer.match_samples(
     sampleFile,
     ancestors_ts,
@@ -51,25 +53,24 @@
 progress_monitor=True,
 ).simplify(keep_unary=False)
 
-"""
-ts = tsinfer.infer(sampleFile)
+#ts = tsinfer.infer(sampleFile)
 
 print(
     "Inferred tree sequence `{}`: {} trees over {} Mb".format(
-        "drone_ts", ts.num_trees, ts.sequence_length / 1e6
+        "ts", ts.num_trees, ts.sequence_length / 1e6
     )
 )
-"""
-# # Check the metadata
-# for sample_node_id in ts.samples():
-#     individual_id = ts.node(sample_node_id).individual
-#     population_id = ts.node(sample_node_id).population
-#     print(
-#         "Node",
-#         sample_node_id,
-#         "labels genome sampled from",
-#         json.loads(ts.individual(individual_id).metadata),
-#         "in",
-#         json.loads(ts.population(population_id).metadata)["subpop"],
-#     )
+
+# Check the metadata
+for sample_node_id in ts.samples():
+    individual_id = ts.node(sample_node_id).individual
+    population_id = ts.node(sample_node_id).population
+    print(
+        "Node",
+        sample_node_id,
+        "labels genome 
sampled from", + json.loads(ts.individual(individual_id).metadata), + "in", + json.loads(ts.population(population_id).metadata)["subpop"], + ) ts.dump(outputFile) diff --git a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py index 0064f21..3a620b2 100644 --- a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py +++ b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py @@ -8,6 +8,8 @@ import numpy as np import pandas as pd import pkg_resources +from itertools import chain +from tsinfer import MISSING_DATA print(sys.executable) @@ -70,7 +72,9 @@ def add_sites(vcf, samples, ploidy): for variant in vcf: # Loop over variants, each assumed at a unique site progressbar.update(variant.POS - pos) if pos == variant.POS: - raise ValueError("Duplicate positions for variant at position", pos) + print("Duplicate positions for variant at position", pos) + continue + else: pos = variant.POS if ploidy > 1: @@ -83,10 +87,10 @@ def add_sites(vcf, samples, ploidy): alleles = alleles + [letter for letter in string] #ALT = ALT + [letter for letter in string] - print(alleles) + #print(alleles) # ignores non-bialllelic sites if len(alleles) > 2: - print('Ignoring non-biallelic site') + # print('Ignoring non-biallelic site') continue # if len([*ancestral]) > 1: @@ -95,7 +99,7 @@ def add_sites(vcf, samples, ploidy): # #continue ancestral = variant.INFO.get("AA", variant.REF) - print(f'alleles: {alleles} | ancestal: {[*ancestral]}') + #print(f'alleles: {alleles} | ancestal: {[*ancestral]}') try: ancestral_allele = alleles.index(ancestral[0]) @@ -111,6 +115,7 @@ def add_sites(vcf, samples, ploidy): ancestral_allele=alleles.index(ancestral[0])) + def add_populations(vcf, samples, metaData): """ This is a modified add populations function for drones from David Wragg data @@ -121,10 +126,23 @@ def add_populations(vcf, samples, metaData): """ pop_lookup = {} metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all') - sample_subpop = [list(metaData.SubPop[metaData.ID == x]) for x in vcf.samples] - for subpop, pop in zip(metaPop.SubPop, metaPop.Pop): - pop_lookup[subpop] = samples.add_population(metadata={"pop": pop, "subpop": subpop}) - return [pop_lookup[subpop[0]] for subpop in sample_subpop] + + # to account for same subpop in different countries - creating a 'fake subpop' to be used as key + # if subpop is repeated new code is subpop+origin, else subpop + d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)] + metaPop['code'] = d + # add code col to the full meatafile + metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner') + + # list population per sample based on recoded subpop + sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples] + + # the dictionary key will be based on the recoding - so keeps the correct amount of population + # but the metadata takes in the real subpop value + for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + ####################################################################### # Read in the information and prepare the data @@ -138,7 +156,7 @@ def 
add_populations(vcf, samples, metaData):
 # Create samples for haploid data
 with tsinfer.SampleData(path=sampleFile, sequence_length=chrLength,
-                        num_flush_threads=20, max_file_size=2**30) as samples:
+                        num_flush_threads=16, max_file_size=2**30) as samples:
     populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile)
     print("populations determined")
     add_individuals(vcf = vcfD, samples = samples, populations = populations, metaData = metaFile, ploidy = ploidy)
@@ -148,5 +166,6 @@ def add_populations(vcf, samples, metaData):
 print(
     "Sample file created for {} samples ".format(samples.num_samples)
     + "({} individuals) ".format(samples.num_individuals)
-    + "with {} variable sites.".format(samples.num_sites)
-)
+    + "with {} variable sites.".format(samples.num_sites),
+    flush=True,
+)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py
new file mode 100644
index 0000000..9a8ef82
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_ancestors.py
@@ -0,0 +1,22 @@
+import sys
+import tsinfer
+import numcodecs
+import numpy as np
+
+args = sys.argv
+
+samples = tsinfer.load(args[1])
+sites_to_exclude = np.loadtxt(args[2])
+output = args[3]
+threads = int(args[4])
+
+gzip_compressor = numcodecs.GZip(level=1)
+
+ancestors = tsinfer.generate_ancestors(
+    sample_data=samples,
+    path=output,
+    exclude_positions=sites_to_exclude,
+    progress_monitor=True,
+    num_threads=threads,
+    compressor=gzip_compressor,
+)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py
new file mode 100644
index 0000000..3543fc7
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_samples.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import sys
+import json
+import cyvcf2
+import tsinfer
+import numcodecs
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+# create a GZip compressor instance with a specific compression level
+gzip_compressor = numcodecs.GZip(level=1)
+
+def add_populations(vcf, samples, metaData):
+    """
+    This is a modified add populations function for drones from David Wragg data
+    Add tsinfer Population objects for drone subspecies, stored in the metaPop object, and return a list of IDs corresponding to the VCF samples
+    Input: vcf = cyvcf2 VCF object, samples is tsinfer SampleData and metaData is a pandas dataframe with id in ID column and populations in Type column
+    pops = dictionary holding name and additional information about populations
+    Return: A list of population indexes
+    """
+    pop_lookup = {}
+    metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
+
+    # to account for same subpop in different countries - creating a 'fake subpop' to be used as key
+    # if subpop is repeated new code is subpop+origin, else subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add code col to the full metafile
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations
+    # but the metadata takes in the real subpop value
+    for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code):
code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + +def add_diploid_individuals(vcf, samples, populations): + for iid, population in zip(vcf.samples, populations): + samples.add_individual(ploidy=2, metadata={'ID': iid}, population=population) + + +def add_diploid_sites(vcf, samples): + """ + Read the sites in the vcf and add them to the samples object. + """ + # You may want to change the following line, e.g. here we allow + # "*" (a spanning deletion) to be a valid allele state + allele_chars = set("ATGCatgc*") + + pos = 0 + + progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp') + + for variant in vcf: # Loop over variants, each assumed at a unique site + progressbar.update(variant.POS - pos) + + if pos == variant.POS: + print(f"Duplicate entries at position {pos}, ignoring all but the first") + continue + else: + pos = variant.POS + + if any([not phased for _, _, phased in variant.genotypes]): + raise ValueError("Unphased genotypes for variant at position", pos) + + alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT] + + ancestral = variant.INFO.get("AA", ".") # "." means unknown + + if(ancestral == 'ambiguous' or ancestral not in alleles): # set ancestral == reference + ancestral_allele = 0 + elif ancestral == '-1': + ancestral_allele = -1 + else: + ancestral_allele = alleles.index(ancestral) + + # Check we have ATCG alleles + for a in alleles: + if len(set(a) - allele_chars) > 0: + print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}") + continue + + # Map original allele indexes to their indexes in the new alleles list. + genotypes = [g for row in variant.genotypes for g in row[0:2]] + samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele) + +def process_samples(file, metadata, sample_path, chr_length, compressor, threads): + with tsinfer.SampleData(path=sample_path, + sequence_length=chr_length, + compressor=compressor, + num_flush_threads=threads, max_file_size=2**30) as samples: + populations = add_populations(vcf=file, samples=samples, metaData=metadata) + print("populations determined") + add_diploid_individuals(vcf=file, samples=samples, populations=populations) + print("individuals added") + add_diploid_sites(vcf=file, samples=samples) + print("sites added") + + print( + "Sample file created for {} samples ".format(samples.num_samples) + + "({} individuals) ".format(samples.num_individuals) + + "with {} variable sites.".format(samples.num_sites), + flush=True, + ) + +# ----------------------------------------------------------------------------------------- + +args = sys.argv +vcfFile = args[1] +sampleFile = args[2] +meta = args[3] +chrLength = args[4] +nthreads = int(args[5]) + +metaFile = pd.read_csv(meta) + +# Create a population (subspecie) list for the samples in the VCF +vcfD = cyvcf2.VCF(vcfFile, strict_gt=True) + +# Create samples for haploid data +try: + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, nthreads) +except Exception as e: + print(f"Error encountered: {e}. 
+    process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, 16)
diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py
new file mode 100644
index 0000000..7659671
--- /dev/null
+++ b/Snakemake/workflow/scripts/simplify.py
@@ -0,0 +1,29 @@
+import sys
+import tskit
+import subprocess
+# upgrade to the latest version of tsdate at runtime
+subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "tsdate"])
+import tsdate
+
+args = sys.argv
+ts = tskit.load(args[1])
+generation_interval = float(args[2])
+mu = float(args[3])
+output = args[4]
+
+# Ne = ts.diversity()/4/mu
+
+processed_ts = tsdate.preprocess_ts(
+    ts,
+    split_disjoint=True,
+    keep_unary=False,
+)
+
+dated_ts = tsdate.variational_gamma(tree_sequence=processed_ts,
+    mutation_rate=mu,
+    eps=generation_interval,
+    max_iterations=100,
+    time_units='generations',
+    progress=True)
+
+dated_ts.dump(output)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py
new file mode 100644
index 0000000..1fbf0f4
--- /dev/null
+++ b/Snakemake/workflow/scripts/truncate.py
@@ -0,0 +1,17 @@
+import sys
+import tsinfer
+
+args = sys.argv
+ancestors = tsinfer.load(args[1])
+upper_time = float(args[2])
+lower_time = float(args[3])
+output = args[4]
+
+ancestors.truncate_ancestors(
+    path=output,
+    max_file_size=2**38,
+    lower_time_bound=lower_time,
+    upper_time_bound=upper_time,
+    length_multiplier=2,
+)
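
Note: generate_samples.py assumes a metadata CSV whose first column is the sample ID and whose remaining columns include Pop, SubPop, Origin, Lat and Lon; a SubPop repeated under a second Origin is recoded as SubPop + Origin. A minimal sketch of that layout, with invented breed names, origins and coordinates (none of these values come from this repository):

import pandas as pd

# one row per VCF sample; ID must match the sample names in the VCF header
meta = pd.DataFrame({
    "ID":     ["sample1", "sample2", "sample3"],
    "Pop":    ["taurine", "taurine", "indicine"],
    "SubPop": ["Angus",   "Angus",   "Gir"],
    "Origin": ["UK",      "US",      "BR"],
    "Lat":    [57.1,      41.5,      -15.6],
    "Lon":    [-2.1,      -99.9,     -47.7],
})
meta.to_csv("metadata.csv", index=False)  # hypothetical file name

With this input, "Angus" occurs under two origins, so add_populations() keys the second occurrence as "AngusUS" internally while still storing "Angus" as the subpop in the tsinfer population metadata.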
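
For orientation, a minimal sketch of how the steps these scripts implement fit together end to end. The ancestor-matching and sample-matching steps are not part of this diff, so the match_ancestors/match_samples calls below are assumed to sit in the intervening Snakemake rules; file names and the mutation rate are illustrative only:

import tsinfer
import tsdate

samples = tsinfer.load("chr1.samples")      # produced by generate_samples.py
ancestors = tsinfer.load("chr1.ancestors")  # produced by generate_ancestors.py,
                                            # optionally truncated by truncate.py

# standard tsinfer matching: build the ancestors tree sequence, then thread
# the sample haplotypes onto it
ancestors_ts = tsinfer.match_ancestors(samples, ancestors)
ts = tsinfer.match_samples(samples, ancestors_ts)

# date the inferred tree sequence, as simplify.py does
processed_ts = tsdate.preprocess_ts(ts)
dated_ts = tsdate.variational_gamma(processed_ts, mutation_rate=1e-8)
dated_ts.dump("chr1.dated.trees")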