diff --git a/Snakemake/config/glob_cattle.yaml b/Snakemake/config/glob_cattle.yaml new file mode 100644 index 0000000..7c8a537 --- /dev/null +++ b/Snakemake/config/glob_cattle.yaml @@ -0,0 +1,53 @@ +PROJECT: "global_cattle" +work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence" +species: "Bos taurus" +ploidy: 2 +no_chromosomes: 29 +start_chromosome: 1 +end_chromosome: 29 +vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global" +genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2" +reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa" +ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt" +meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv" +phase_only: False +remove_indel: True +missing_data: "0.2" +truncate_upper: "0.4" +truncate_lower: "0.2" +recombination_rate_tree: False # "1e-8" +recombination_rate_anc: False +mismatch_ratio_anc: False +mismatch_ratio_tree: False # "1" +mutation_rate: "1e-8" +generation_interval: 5 +chromosome_length: + 1 : 158534110 + 2 : 136231102 + 3 : 121005158 + 4 : 120000601 + 5 : 120089316 + 6 : 117806340 + 7 : 110682743 + 8 : 113319770 + 9 : 105454467 + 10 : 103308737 + 11 : 106982474 + 12 : 87216183 + 13 : 83472345 + 14 : 82403003 + 15 : 85007780 + 16 : 81013979 + 17 : 73167244 + 18 : 65820629 + 19 : 63449741 + 20 : 71974595 + 21 : 69862954 + 22 : 60773035 + 23 : 52498615 + 24 : 62317253 + 25 : 42350435 + 26 : 51992305 + 27 : 45612108 + 28 : 45940150 + 29 : 51098607 \ No newline at end of file diff --git a/Snakemake/workflow/Snakefile b/Snakemake/workflow/Snakefile index d4e5888..1f6de40 100644 --- a/Snakemake/workflow/Snakefile +++ b/Snakemake/workflow/Snakefile @@ -11,17 +11,13 @@ from os.path import isfile, join ##### setup report ##### -report: "reports/tsinfer.rst" - -# specify config files on the command line using --configfile #configfile: '../config/ancestral_Eddie.yaml' -#configfile: '../config/tsinfer_Eddie.yaml' - +configfile: '../config/glob_cattle.yaml' +report: "reports/tsinfer.rst" ##### setup singularity ##### # Can we set conda env here??? - ##### Define some variables ##### # all files should be in the vcfDir/RawVCF folder # files that need spliting are named: Combined_something.vcf.gz @@ -30,47 +26,45 @@ report: "reports/tsinfer.rst" # (2) sort into the correct Chr folder in vcfDir. # defining variables that will be used throughout the script - -# the final directory name is now editable -- so can be named according to specific project -# mapdir indicates where to find the genetic map for phasing (full path in the config) -mapdir = config['genmap'] -oDir = Path(config['o_dir'], config['PROJECT']) - -vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process -# the path where to save intermetdiate and final VCFs -if config['process_vcf_in_original_dir']: - vcfOut = config['vcf_dir'] # same as above -else: - vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir - - +vcfdir=config['vcf_dir'] # from config file, the path to vcf # this folder can be any where but must have the following structure: # vcfDir: # - RawVCF # - Chr1 # -Chr... 
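# A quick config sanity check (a sketch, not part of the workflow): every
# chromosome in the requested range should have a length entry, e.g.
#   for n in range(config['start_chromosome'], config['end_chromosome'] + 1):
#       assert n in config['chromosome_length'], f'no length for chromosome {n}'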
+# the final directory name is now editable -- so can be named according to specific project +# mapdir indicates where to find the genetic map for phasing (full path in the config) +Project = config['PROJECT'] +projdir = config['work_dir'] + '/' + Project +mapdir = config['genmap'] -# if want to split multiallelic sites set to TRUE: -#multiallelic = False - -chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)] +chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)] # set a list of 'Chr' + chr# combinations to be used as wildcards +wildcard_constraints: + chromosome = r'chr\d+' +# constraining the wildcard to avoid errors + ##### load rules ##### -include: "rules/ancestral_inference.smk" +#include: "rules/ancestral_inference.smk" include: "rules/split.smk" +include: "rules/filter.smk" include: "rules/merge_samples_vcf.smk" include: "rules/phasing.smk" -include: "rules/prepare_files_for_tsinfer.smk" -#include: "rules/multiallelic.smk" -include: "rules/infer_trees.smk" +include: "rules/ancestral_info_to_vcf.smk" +include: "rules/new_infer_trees.smk" + +##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] ##### +if not config['phase_only']: + rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes) +else: + rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes) + ##### target rules ##### # At the end it should generate a phased vcf file that combines all files for # each chromosome. These files are stored in the vcf folder declaired in the # configuration file. It also checks that vcfs have been filtered before merged rule all: - input: - expand(f'{oDir}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes) - # tree inference + input: rule_all_input diff --git a/Snakemake/workflow/profile/cluster.json b/Snakemake/workflow/profile/cluster.json index f4b023c..2670dad 100644 --- a/Snakemake/workflow/profile/cluster.json +++ b/Snakemake/workflow/profile/cluster.json @@ -1,55 +1,55 @@ { "__default__": { - "mem": "64G", + "mem": "8G", "threads": "1", "time": "10:00:00" }, - "rename": + "compare_and_filter_vcfs": { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, + "merge": + { + "mem": "16G", + "threads": "8", + "time": "20:00:00" + }, + "remove_missing_indels": + { + "mem": "16G", + "threads": "16", + "time": "05:00:00" }, "phase": - { - "mem": "350G", - "threads": "25", - "time": "336:00:00" - }, - "rename_phased": { "mem": "64G", - "threads": "1", - "time": "03:00:00" + "threads": "32", + "time": "240:00:00" }, - "get_af": + "get_vcf_positions": { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "get_major": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" + "mem": "16G", + "threads": "32", + "time": "05:00:00" }, - "extract_ancestral_chromosome": + "get_ancestral_alleles": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, - "join_major_ancestral": + "update_regions_file": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "02:00:00" }, - "determine_ancestral_major": + "add_info_to_vcf": { - "mem": "64G", + "mem": "32G", "threads": "1", "time": "04:00:00" }, @@ -59,118 +59,40 @@ "threads": "1", "time": "03:00:00" }, - "sort_ancestral_to_vcf": - { - "mem": "64G", - "threads": "1", - "time": "03:00:00" - }, - "decompress": - { - "mem": "64G", - "threads": "1", - 
"time": "03:00:00" - }, - "change_infoAA_vcf": - { - "mem": "128G", - "threads": "1", - "time": "05:00:00" - }, "prepare_sample_file": - { - "mem": "128G", - "threads": "1", - "time": "10:00:00" - }, - "infer": - { - "mem": "128G", - "threads": "1", - "time": "40:00:00" - }, - "extract_aligned_alleles_from_vcf": - { - "mem": "32G", - "threads": "1", - "time": "03:00:00" - }, - "extract_snps_from_info": { "mem": "32G", - "threads": "1", - "time": "03:00:00" + "threads": "16", + "time": "336:00:00" }, - "extract_pos_aligned_alleles_from_vcf": + "generate_ancestors": { "mem": "32G", - "threads": "1", - "time": "03:00:00" - }, - "extract_vcf_alleles_from_aligned": - { - "mem": "16G", - "threads": "1", - "time" : "03:00:00" - }, - "create_estsfs_dicts": - { - "mem": "16G", - "threads": 1", - "time" : "05:00:00" - }, - "edit_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "01:00:00" - }, - "combine_estsfs_dicts": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "threads": "16", + "time": "336:00:00" }, - "run_estsfs": + "truncate_ancestors": { "mem": "32G", - "threads": "1", - "time": "20:00:00" - }, - "extract_major": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_minor": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" - }, - "extract_major_outgroup": - { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "threads": "16", + "time": "336:00:00" }, - "extract_estsfs_prob": + "ancestors_trees": { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "mem": "32G", + "threads": "16", + "time": "336:00:00" }, - "determine_ancestral": + "infer_trees": { - "mem": "16G", - "threads": "1", - "time": "02:00:00" + "mem": "32G", + "threads": "16", + "time": "336:00:00" }, - "move_and_rename_aa": - { - "mem": "16G", - "threads": "1", - "time": "05:00:00" - } + "simplify_trees": + { + "mem": "32G", + "threads": "16", + "time": "03:00:00" + } } diff --git a/Snakemake/workflow/profile/config.yaml b/Snakemake/workflow/profile/config.yaml index 1dc5486..e7a1c31 100644 --- a/Snakemake/workflow/profile/config.yaml +++ b/Snakemake/workflow/profile/config.yaml @@ -3,10 +3,8 @@ use-conda: true jobscript: "jobscript.sh" cluster-config: "cluster.json" jobname: "{rule}.{jobid}" -jobs: 16 -#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab" +drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y" rerun-incomplete: true -rerun-triggers: mtime -cores: 16 # how many jobs you want to submit to your cluster queue -local-cores: 1 +rerun-triggers: [mtime, input] +local-cores: 99 keep-going: false diff --git a/Snakemake/workflow/rules/add_ancestral.smk b/Snakemake/workflow/rules/add_ancestral.smk new file mode 100644 index 0000000..dd404f1 --- /dev/null +++ b/Snakemake/workflow/rules/add_ancestral.smk @@ -0,0 +1,167 @@ + +ancestral_file = config['ancestral_allele'] + +# input and output +infile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' +infileidx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +outfile = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz' +outidx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi' + +# temporary files +filtered_samples = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sam.vcf.gz' +filtered_positions = f'{vcfdir}/{{chromosome}}/{{chromosome}}_sampos.vcf' +ancestral_not_compressed = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' +regfile = 
+aaInfo = f'{vcfdir}/{{chromosome}}_aa.info'
+
+if config['PROJECT'] == "global_cattle":
+    rule remove_samples:
+        input: infile
+        output: filtered_samples
+        params:
+            samples_to_remove = f'{vcfdir}/{{chromosome}}/remove'
+        conda: 'bcftools'
+        threads: 32
+        resources: cpus=32, mem_mb=2048000, time_min=1200
+        shell:
+            r"""
+            bcftools view -S ^{params.samples_to_remove} {input} --threads {threads} -O z -o {output}
+
+            bcftools index {output}
+            """
+
+
+# we need some helper files:
+# 1) create a chr pos file for filtering
+# 2) create an info file for adding ancestrals
+# filter vcf positions so that it has the same as the aa file
+# add ancestral allele to INFO
+
+rule make_regions_file:
+    input:
+        anc = ancestral_file,
+        vcf = filtered_samples
+    output: temp(regfile)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        cut -d' ' -f1,2 {input.anc} | awk '$1 == {params.chrnum}' | sed -e $'s/ /\t/g' > chr{params.chrnum}.anc
+
+        cut -f2 chr{params.chrnum}.anc > chr{params.chrnum}.compare
+
+        # get positions in vcf
+        bcftools query -f '%POS\n' {input.vcf} > chr{params.chrnum}.file
+
+        # match positions ancestral and vcf
+        comm -12 <( sort chr{params.chrnum}.file ) <( sort chr{params.chrnum}.compare ) > chr{params.chrnum}.keep
+
+        # number of lines
+        l=($(wc -l chr{params.chrnum}.keep))
+
+        # create final regions file
+        yes {params.chrnum} | head -n $l > chr{params.chrnum}
+        paste chr{params.chrnum} chr{params.chrnum}.keep > {output}
+
+        rm chr{params.chrnum} chr{params.chrnum}.keep chr{params.chrnum}.file chr{params.chrnum}.compare chr{params.chrnum}.anc
+        """
+#grep -vw 4579381 3565339
+
+rule filter_positions:
+    input:
+        vcf = filtered_samples,
+        idx = f'{filtered_samples}.csi',
+        regions = regfile,
+    output: filtered_positions
+    conda: 'bcftools'
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 300
+    shell:
+        r"""
+        bcftools view -R {input.regions} {input.vcf} \
+            --threads {threads} \
+            -o {output}
+        """
+
+# take only the rows where col1 == current_chromosome
+# add AA= in front of the aa col
+# use only col pos & aa
+# match with the left pos in filtered vcf
+
+rule make_aaInfo_file:
+    input:
+        aa_file = ancestral_file,
+        vcf = filtered_positions
+    output: temp(aaInfo)
+    params:
+        chrnum = lambda wc: wc.get('chromosome')[3:]
+    shell:
+        r"""
+        # filter by chromosome and add AA= to the allele col
+        awk '$1 == {params.chrnum}' {input.aa_file} | awk '$3="AA="$3' | cut -d' ' -f2,3 > {wildcards.chromosome}.aa
+
+        # get the positions in the vcf
+        bcftools query -f '%POS\n' {input.vcf} > {wildcards.chromosome}.pos
+
+        # add allele col to the pos in the vcf
+        awk 'NR==FNR{{a[$1];next}}($1 in a){{print $0}}' {wildcards.chromosome}.pos {wildcards.chromosome}.aa > {wildcards.chromosome}.pos.aa
+
+        cut -d' ' -f2 {wildcards.chromosome}.pos.aa | sed '1s/^/INFO\n/' > {output}
+
+        rm {wildcards.chromosome}.aa {wildcards.chromosome}.pos {wildcards.chromosome}.pos.aa
+        """
+
+# need to run shapeit first to know line numbers
+rule add_ancestral:
+    input:
+        vcf = filtered_positions,
+        aa_info = aaInfo,
+    output: ancestral_not_compressed
+    conda: 'bcftools'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<13{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>12{{$8=a[FNR-11]; print}}' OFS="\t" {input.aa_info} {input.vcf} > {output}
+        """
+
+
+# HEADERNUM="$(( $(bcftools view -h {input.vcf} | wc -l) - 1 ))"
+
+# INFOLINE=$(( $(bcftools view -h {input.vcf} | awk 
'/INFO/{{print NR}}' | head -n 1) )) + +# awk -v OFS="\t" -v HEADER=$HEADERNUM -v INFO=$INFOLINE 'NR==FNR{{{{a[FNR] = $2; next}}}} FNR<=HEADER{{{{print}}}}; \ +# FNR==INFO{{{{printf "##INFO=\\n"}}}}; \ +# FNR>HEADER{{{{$8=a[FNR-HEADER]; print}}}}' OFS="\t" {input.aa_info} {input.vcf} > {output} +# All_AAInfo.txt AllFiltered.recode.vcf > AllFiltered.recode.Ancestral.vcf + +# compress and index final vcf +# rule match_samples: +# input: ancestral_not_compressed +# output: outfile +# params: +# samples_to_keep = f'{vcfdir}/{{chromosome}}/keep.samples' +# conda: 'bcftools' +# shell: +# r""" +# bcftools view -S {params.samples_to_keep} {input} -O z -o {output} +# """ + +rule compress_vcf: + input: ancestral_not_compressed + output: outfile, + conda: 'bcftools' + shell: + r""" + bgzip -c {input} > {output} + """ + +rule index_output: + input: outfile + output: outidx + conda: 'bcftools' + shell: + r""" + bcftools index {input} + """ diff --git a/Snakemake/workflow/rules/ancestral_info_to_vcf.smk b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk new file mode 100644 index 0000000..1aeb6c0 --- /dev/null +++ b/Snakemake/workflow/rules/ancestral_info_to_vcf.smk @@ -0,0 +1,141 @@ + +# extract all positions in vcf file using bcftools and save to regions file +# for rows in ancestral_alleles file where col 1 match wildcard.chromosome +# for each row in regions file, if value is in ancestral_alleles col 2, add +# ancestral_alleles col 3 to regions file col 2 +# else add -1 + +ancestral_file = config['ancestral_allele'] + +if "global_cattle" in config['PROJECT']: + # rule list_samples: + # input: + # vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', + # metadata = config['meta'] + # output: + # samples = f'{vcfdir}/{{chromosome}}/samples_to_remove' + # conda: 'bcftools' + # shell: + # r""" + # bcftools query -l {input.vcf_file} > {wildcards.chromosome}.samples + # cut -d',' -f1 {input.metadata} | tail -n +2 > metadata.samples + # grep -v -f metadata.samples vcf.samples > unique.samples + # """ + + rule remove_samples: + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf', + params: + samples_to_remove = f'{vcfdir}/samples.remove' + conda: 'bcftools' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.remove.benchmark.txt' + shell: + r""" + bcftools view -S ^{params.samples_to_remove} {input.vcf_file} --threads {threads} -O v -o {output} + """ + +else: + rule decompress: + input: + vcf_file = input_ancestral, + idx_file = input_ancestral_idx + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.decompress.benchmark.txt' + shell: + r""" + bcftools view {input.vcf_file} -O v -o {output} + """ + +input_ancestral = f'{vcfdir}/{{chromosome}}/{{chromosome}}_analysis.vcf' + +rule get_vcf_positions: + input: + vcf_file = input_ancestral, + output: + regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.regions') + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.positions.benchmark.txt' + shell: + r""" + bcftools query -f '%POS\n' {input.vcf_file} | awk '{{print $1, -1}}' > {output.regions_file} + """ + +rule get_ancestral_alleles: + input: + ancestral_alleles_file = ancestral_file + output: + ancestrals = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.anc') + params: + chrnum = lambda wc: wc.get('chromosome')[3:] + benchmark: 'benchmarks/{chromosome}.getaa.benchmark.txt' + 
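+    # Assumed layout of the ancestral-alleles file (space-separated): CHR POS AA,
+    # e.g. "1 12345 A" -- the awk/cut below keeps the POS and AA columns for the
+    # chromosome being processed.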
+    shell:
+        r"""
+        # filter positions in ancestral that correspond to chr
+        awk '$1 == {params.chrnum}' {input.ancestral_alleles_file} | cut -d' ' -f2,3 > {output.ancestrals}
+        """
+
+rule update_regions_file:
+    input:
+        regions_file = rules.get_vcf_positions.output.regions_file,
+        ancestrals = rules.get_ancestral_alleles.output.ancestrals,
+    output:
+        updated_regions_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.updated.regions'),
+        positions_exclude = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude'),
+    benchmark: 'benchmarks/{chromosome}.regions.benchmark.txt'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[$1]=$2;next}} ($1 in a){{$2=a[$1]}}1' {input.ancestrals} {input.regions_file} > {output.updated_regions_file}
+
+        awk '$2 == -1' {output.updated_regions_file} | cut -d' ' -f1 > {output.positions_exclude}
+        """
+
+rule add_AA_to_regions_file:
+    input:
+        updated_regions_file = rules.update_regions_file.output.updated_regions_file,
+    output:
+        new_regions_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}.final.regions'
+    benchmark: 'benchmarks/{chromosome}.addaaregions.benchmark.txt'
+    shell:
+        r"""
+        echo "INFO" > {output.new_regions_file}
+        awk '{{print "AA="$2}}' {input.updated_regions_file} >> {output.new_regions_file}
+        """
+
+rule add_info_to_vcf:
+    input:
+        vcf_file = input_ancestral,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_correctids.vcf',
+        aa_info = rules.add_AA_to_regions_file.output.new_regions_file,
+    output:
+        large_vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf')
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.aainfo.benchmark.txt'
+    shell:
+        r"""
+        awk 'NR==FNR{{a[NR] = $1; next}} FNR<15{{print}}; \
+        FNR==6{{printf "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"}}; \
+        FNR>14{{$8=a[FNR-14]; print}}' OFS="\t" {input.aa_info} {input.vcf_file} > {output.large_vcf}
+
+        rm {input.vcf_file}
+        """
+
+rule compress_index_vcf:
+    input:
+        large_vcf = rules.add_info_to_vcf.output.large_vcf
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf'
+    output:
+        vcf_file = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        vcf_indx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz.csi'
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.compress.benchmark.txt'
+    shell:
+        r"""
+        bgzip -c {input} > {output.vcf_file}
+        bcftools index {output.vcf_file}
+        """
+input_ts = rules.compress_index_vcf.output.vcf_file
diff --git a/Snakemake/workflow/rules/date.smk b/Snakemake/workflow/rules/date.smk
new file mode 100644
index 0000000..a9afb85
--- /dev/null
+++ b/Snakemake/workflow/rules/date.smk
@@ -0,0 +1,19 @@
+# Script for (iteratively) dating the tree sequence
+# trees must be dated, re-inferred and dated again
+# to get the correct dates (test number of iterations)
+
+# if this is the first iteration, start from the existing tree sequence;
+# otherwise, re-infer the trees using the dated sampleFile and date the trees again
+
+# work in progress -- input_ts must point at the tree sequence to date, e.g.:
+# input_ts = rules.simplify_trees.output
+
+# print(f'Starting dating of tree sequence {input_ts}...')
+
+# rule date_trees:
+#     input: rules.simplify_trees.output
+#     output: f'{input_ts}.dated'
+#     conda: ''
+
+# print('A rough estimate of the effective population size is', ts.diversity() / (4 * 1e-6))
+
diff --git a/Snakemake/workflow/rules/filter.smk b/Snakemake/workflow/rules/filter.smk
new file mode 100644
index 0000000..54a596a
--- /dev/null
+++ b/Snakemake/workflow/rules/filter.smk
@@ -0,0 +1,159 @@
+# if there is only one vcf file, use the output from split_and_move as input
+# for the next step
+# else, compare the files, filter out overlapping samples, then merge the vcfs
+# into one, index it, and use the merged output as input for the next step
+if len(allFiles) == 1:
+    input_vcf = rules.split_and_move_vcfs.output
+    input_index = f'{rules.split_and_move_vcfs.output}.csi'
+    input_summary = []  # no summary file in the single-VCF case
+
+else:
+    input_to_filter = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.vcf.gz' for file in allFiles]
+
+    filtered_output = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz' for file in allFiles]
+    filtered_output_idx = [f'{vcfdir}/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz.csi'
+                           for file in allFiles]
+    merged_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz'
+
+    rule compare_and_filter_vcfs:
+        input: input_to_filter
+        output: temp(filtered_output)
+        params:
+            file_names = [f'{{chromosome}}_{file}' for file in allFiles],
+            path = f'{vcfdir}/{{chromosome}}'
+        benchmark: 'benchmarks/{chromosome}.sample.benchmark.txt'
+        run:
+            import os
+            import shlex
+            import itertools
+            import subprocess
+            from glob import glob
+
+            path_names = [f'{params.path}/{name}' for name in params.file_names]
+            pairs = list(itertools.combinations(path_names, 2))
+            new_ext = '.filtered.vcf.gz'
+            to_rename = path_names.copy()
+
+            # to avoid shell injection, sanitize the file names
+            # -- ensure special shell characters are escaped
+            sanitized_path = shlex.quote(params.path)
+            sanitized_new_ext = shlex.quote(new_ext)
+            sanitized_idx = shlex.quote(new_ext + '.csi')
+
+            for i, pair in enumerate(pairs):
+                print(f'pair {i} {pair}')
+
+                # Run vcftools to find if there are overlapping samples between pairs
+                command = f"""
+                module load anaconda
+                source activate bcftools
+                vcftools --gzvcf {os.path.join(pair[0] + '.vcf.gz')} \
+                --gzdiff {os.path.join(pair[1] + '.vcf.gz')} \
+                --diff-indv
+                """
+                subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+
+                # if vcftools finds any overlapping samples, remove them from the second vcf file
+                if "B" in subprocess.run("awk '{if ($2 == \"B\") print $2}' out.diff.indv_in_files",
+                                         shell=True,
+                                         capture_output=True,
+                                         text=True,
+                                         cwd=sanitized_path).stdout:
+                    print(f'Overlapping samples found!! Removing samples from {pair[1]}')
+                    command = f"""
+                    grep B out.diff.indv_in_files | cut -f1 > samples.remove
+                    rm out.diff.indv_in_files
+                    """
+                    subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+                    command = f"""
+                    bcftools view -S ^samples.remove {os.path.join(pair[1] + '.vcf.gz')} \
+                    -Oz -o {os.path.join(pair[1] + sanitized_new_ext)}
+
+                    bcftools index -f {os.path.join(pair[1] + sanitized_new_ext)}
+
+                    rm samples.remove
+                    """
+                    subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+                    # remove the file that has been sampled from the renaming list
+                    # (guard: it may already have been removed by an earlier pair)
+                    if pair[1] in to_rename:
+                        to_rename.remove(pair[1])
+                    print(f'Files to rename are: {to_rename}')
+
+                # if there are no overlapping samples,
+                # just rename the files that still have the original extension
+                else:
+                    print('No overlapping samples')
+
+            print('Finished comparing and sampling vcfs')
+
+            # rename the files that have not been sampled
+            for name in to_rename:
+                sanitized_name = shlex.quote(name)
+                print(f'Renaming {sanitized_name}')
+                command = f"""
+                mv {os.path.join(sanitized_name + '.vcf.gz')} {os.path.join(sanitized_name + sanitized_new_ext)}
+                mv {os.path.join(sanitized_name + '.vcf.gz.csi')} {os.path.join(sanitized_name + sanitized_idx)}
+                """
+                subprocess.run(command, shell=True, check=True, cwd=sanitized_path)
+
+            # Remove all files that do not contain the string new_ext
+            command = f"""
+            find {sanitized_path} -type f ! 
-name '*{sanitized_new_ext}*' -exec rm -f {{}} \; + """ + subprocess.run(command, shell=True, check=True, cwd=sanitized_path) + + rule merge: + input: rules.compare_and_filter_vcfs.output + output: merged_output + params: + files_path = f'{vcfdir}/{{chromosome}}/', + indexes = filtered_output_idx + conda: "bcftools" + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1200 + benchmark: 'benchmarks/{chromosome}.merge.benchmark.txt' + shell: + r""" + printf '%s\n' {params.files_path}*filtered*.gz > {wildcards.chromosome}.merge + + bcftools merge \ + -m snps \ + -l {wildcards.chromosome}.merge \ + --threads {threads} -O z -o {output} + + rm {wildcards.chromosome}.merge {params.indexes} + """ + + rule index_merged: + input: rules.merge.output, + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_merged.vcf.gz.csi' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.indxmerged.benchmark.txt' + shell: + r""" + bcftools index -f {input} + """ + + rule vcf_summary: + input: + vcf_file = rules.merge.output, + index = rules.index_merged.output, + output: + summary = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary.vchk', + # done = touch(f'{vcfdir}/{{chromosome}}/merged_summary.done') + params: + plots = f'{vcfdir}/{{chromosome}}/{{chromosome}}_summary/' + conda: 'bcftools' + benchmark: 'benchmarks/{chromosome}.summary.benchmark.txt' + shell: + r""" + bcftools stats {input.vcf_file} > {output.summary} + plot-vcfstats -p {params.plots} {output.summary} + """ + + input_vcf = rules.merge.output + input_index = rules.index_merged.output + input_summary = rules.vcf_summary.output.summary + +# else, use the output from split_and_move as input for the next step +# else: +# input_vcf = rules.split_and_move_vcfs.output \ No newline at end of file diff --git a/Snakemake/workflow/rules/infer_trees.smk b/Snakemake/workflow/rules/infer_trees.smk index 93231af..cf36dcc 100644 --- a/Snakemake/workflow/rules/infer_trees.smk +++ b/Snakemake/workflow/rules/infer_trees.smk @@ -1,36 +1,38 @@ import re -# if multiallelic: -# vcf_4_inference = f'{vcfOut}/{{chromosome}}_allbi.vcf.gz' -# else: -# vcf_4_inference = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz' - rule prepare_sample_file: input: - #vcf=vcf_4_inference, - vcf = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz', - meta = Path(vcfdir, config['meta']) + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + meta = config['meta'] output: - f"{oDir}/Tsinfer/samples/{{chromosome}}.samples" + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples' params: chrLength= lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])], ploidy=config['ploidy'] conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=200 - log: 'logs/Prepare_sample_file_{chromosome}.log' + threads: 32 + resources: cpus=32, mem_mb=768000, time_min=200 + benchmark: + 'benchmarks/{chromosome}.prep_sample.benchmark.txt' + #log: 'logs/Prepare_sample_file_{chromosome}.log' shell: "python scripts/Prepare_tsinfer_sample_file.py " "{input.vcf} {input.meta} {output} {params.ploidy} {params.chrLength}" rule infer: input: - f"{oDir}/Tsinfer/samples/{{chromosome}}.samples" - conda: "HLab_tsinfer" - threads: 1 - resources: cpus=1, mem_mb=128000, time_min=300 - log: 'logs/Infer_{chromosome}.log' + f'{projdir}/Tsinfer/samples/{{chromosome}}.samples', output: - f"{oDir}/Tsinfer/trees/{{chromosome}}.trees" - script: - "../scripts/Infer_trees.py" # path to script is relative to the file containing this rule. 
Hence the prepended "../" + f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + conda: "HLab_tsinfer" + threads: 16 + resources: cpus=16, mem_mb=1024000, time_min=10080 + benchmark: + 'benchmarks/{chromosome}.infer.benchmark.txt' + #log: 'logs/Infer_{chromosome}.log' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/Infer_trees.py {input} {output} + """ diff --git a/Snakemake/workflow/rules/keep_snp.smk b/Snakemake/workflow/rules/keep_snp.smk new file mode 100644 index 0000000..323bd1b --- /dev/null +++ b/Snakemake/workflow/rules/keep_snp.smk @@ -0,0 +1,43 @@ +# rule gatk_index: +# input: +# vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz', +# idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi' +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.tbi'), +# conda: 'gatk-env' +# benchmark: +# 'benchmark/{chromosome}.gatk_index.benchmark.txt' +# shell: +# r""" +# gatk IndexFeatureFile -I {input.vcf} +# """ + +# rule filter_snps: +# input: +# vcf = rules.phase.output, +# idx = rules.index_phased.output, +# output: +# vcf_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz'), +# idx_file = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi'), +# params: config['reference_genome'], +# conda: 'gatk-env' +# benchmark: 'benchmarks/{chromosome}.filter_snps.benchmark.txt' +# shell: +# r""" +# gatk SelectVariants \ +# -R {params} \ +# -V {input.vcf} \ +# --select-type-to-include SNP \ +# -O {output.vcf_file} + +# bcftools index {output.vcf_file} +# """ + +# rule index: +# input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz', +# output: temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_snps.vcf.gz.csi') +# conda: 'bcftools' +# benchmark: 'benchmarks/{chromosome}.index_filtered_snps.benchmark.txt' +# shell: +# r""" +# bcftools index {input} +# """ diff --git a/Snakemake/workflow/rules/merge_samples_vcf.smk b/Snakemake/workflow/rules/merge_samples_vcf.smk index b6894d7..72f74c1 100644 --- a/Snakemake/workflow/rules/merge_samples_vcf.smk +++ b/Snakemake/workflow/rules/merge_samples_vcf.smk @@ -1,103 +1,32 @@ -import yaml - - -if len(allFiles) != 1: - rule get_samples: - input: - f'{vcfOut}/{{chromosome}}/{{chromosome}}_{{file}}.vcf.gz' - output: temp(f'{vcfOut}/{{chromosome}}/{{chromosome}}_{{file}}.txt') - conda: "bcftools" - threads: 1 - resources: cpus=1, mem_mb=32000, time_min=30 - log: 'logs/Get_samples_{chromosome}_{file}.log' - shell: - """ - bcftools query -l {input} > {output} - """ - - rule filter: - input: - [f'{vcfOut}' + x - for x in expand('/{{chromosome}}/{{chromosome}}_{file}.txt', - file=allFiles)], - output: - temp([f'{vcfOut}' + x for x in - expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.{ext}', file=allFiles, ext=['gz', 'gz.csi'])]) - params: - duplicated = '{chromosome}.ids' - # conda: 'bcftools' - resources: cpus=1, mem_mb=64000, time_min=60 - run: - import os - filtered = [] - - for i in range(len(input)): - print(i) - for j, file in enumerate(input): - if file != input[i] and j > i: - file1 = input[i] - file2 = input[j] - print(file1, file2) - - shell("comm -12 <(sort {file1}) <(sort {file2}) > {params.duplicated}") - - remove = params.duplicated - lremove = os.stat(f'{remove}').st_size - - if lremove == 0: - print(f'Nothing to remove {lremove}') - else: - print(f'{lremove} samples to remove') - if file2 not in filtered: - filtered.append(file2) - - file2 = file2.split('.')[0] - print(f'file {file2} filtered') - shell('bcftools view -S 
^{params.duplicated} {file2}.vcf.gz -O z -o {file2}.filtered.vcf.gz')
-                        shell('bcftools index -f {file2}.filtered.vcf.gz')
-                        shell('rm {params.duplicated}')
-
-            for file in input:
-                if file not in filtered:
-                    file1 = file.split('.')[0]
-                    print(f'file {file1} not filtered')
-                    shell("cp {file1}.vcf.gz {file1}.filtered.vcf.gz && bcftools index -f {file1}.filtered.vcf.gz")
-            shell('rm {params.duplicated}')
-
-    rule merge:
-        input:
-            [f'{vcfOut}' + x for x in expand('/{{chromosome}}/{{chromosome}}_{file}.filtered.vcf.gz', file=allFiles)]
-        output:
-            vcf=temp(f'{vcfOut}/{{chromosome}}_final.vcf.gz'),
-            index=temp(f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi')
-        conda: "bcftools"
-        threads: 1
-        resources: cpus=1, mem_mb=32000, time_min=60
-        log: 'logs/Merge_{chromosome}.log'
-        shell:
-            """
-            bcftools merge {input} -O z -o {output.vcf}
-            bcftools index -f {output.vcf}
-            """
-
-else: # i.e. len(allFiles) == 1
-    rule rename:
-        input:
-            [f'{vcfOut}' + x
-             for x in expand('/{{chromosome}}/{{chromosome}}_{suffixOne}.vcf.gz',
-                             suffixOne = splitFiles)] if len(splitFiles) != 0 else [],
-            [f'{vcfOut}' + x
-             for x in expand('/{{chromosome}}/{{chromosome}}_{suffixTwo}.vcf.gz',
-                             suffixTwo = combinedFiles)] if len(combinedFiles) != 0 else []
-        output:
-            vcf = (f'{vcfOut}/{{chromosome}}_final.vcf.gz'),
-            idx = (f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi')
-        conda: "bcftools"
-        log: 'logs/Rename_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=30
-        shell:
-            """
-            ln -s $( realpath {input} ) {output.vcf}
-            ln -s $( realpath {input} ).csi {output.idx}
-            """
-
+# Filter INDELs and excess missing data out of the vcf file
+# (missingness threshold comes from config['missing_data'], passed to vcftools --max-missing)
+# The input vcf file is the merged vcf file from the previous step
+# or the output from split and move (if only one file is present)
+
+rule remove_missing_indels:
+    input:
+        vcf = input_vcf,
+        index = input_index,
+        summary = input_summary,
+    output:
+        vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'),
+        index = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi')
+    params:
+        prefix = f'{vcfdir}/{{chromosome}}/{{chromosome}}',
+        missing = config['missing_data'],
+        indel = '--remove-indels' if config['remove_indel'] else ''
+    conda: 'bcftools'
+    benchmark: 'benchmarks/{chromosome}.missindel.benchmark.txt'
+    shell:
+        r"""
+        vcftools --gzvcf {input.vcf} \
+            {params.indel} \
+            --max-missing {params.missing} \
+            --remove-filtered-all \
+            --recode \
+            --out {params.prefix}
+
+        bgzip {params.prefix}.recode.vcf
+        bcftools index -f {output.vcf}
+
+        # rm {params.prefix}.recode.vcf
+        """
diff --git a/Snakemake/workflow/rules/new_infer_trees.smk b/Snakemake/workflow/rules/new_infer_trees.smk
new file mode 100644
index 0000000..0dec01f
--- /dev/null
+++ b/Snakemake/workflow/rules/new_infer_trees.smk
@@ -0,0 +1,139 @@
+# Convert vcf to samples file
+# generate ancestors samples based only on sites with ancestral allele
+# information
+# truncate ancestors to remove those spanning too many sites
+# generate ancestors trees
+# match ancestors and samples to infer trees
+# final output is a simplified tree sequence
+
+import re
+
+rule prepare_sample_file:
+    input:
+        vcf = input_ts,
+        # f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf.gz',
+        meta = config['meta']
+    output: f'{projdir}/Tsinfer/samples/{{chromosome}}.samples'
+    params:
+        chrLength = lambda wildcards: config['chromosome_length'][int(re.findall(r'\d+', wildcards.chromosome)[0])]
+    conda: "HLab_tsinfer"
+    threads: 32
+    resources: cpus = 32, mem_mb = 768000, time_min = 200
+    benchmark: 
'benchmarks/{chromosome}_new.prep_sample.benchmark.txt' + shell: + r""" + ulimit -v + + python scripts/generate_samples.py \ + {input.vcf} {output} {input.meta} {params.chrLength} {threads} + + """ + +rule generate_ancestors: + input: + samples = rules.prepare_sample_file.output, + sites_to_exclude = f'{vcfdir}/{{chromosome}}/{{chromosome}}.exclude', + output: + ancestors_samples = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors', + conda: 'HLab_tsinfer' + threads: 64 + resources: cpus = 64, mem_mb = 2048000, time_min = 1680 + benchmark: 'benchmarks/{chromosome}_generate_ancestors.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + python scripts/generate_ancestors.py {input.samples} {input.sites_to_exclude} {output.ancestors_samples} {threads} + """ + +rule truncate_ancestors: + input: + ancestors_samples = rules.generate_ancestors.output.ancestors_samples, + output: + truncated_ancestors = f'{projdir}/Tsinfer/samples/{{chromosome}}.ancestors.truncated', + params: + upper = config['truncate_upper'], + lower = config['truncate_lower'], + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_truncate.benchmark.txt' + shell: + r""" + ulimit -v + python scripts/truncate.py {input} {params.upper} {params.lower} {output} + """ + +rule ancestors_trees: + input: + samples = rules.prepare_sample_file.output, + truncated_ancestors = rules.truncate_ancestors.output.truncated_ancestors, + output: + ancestors_trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.atrees', + params: + re = f"--recombination-rate {config['recombination_rate_anc']}" if config['recombination_rate_anc'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_anc']}" if config['mismatch_ratio_anc'] else '', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus = 32, mem_mb = 2048000, time_min = 1680 + benchmark: 'benchmarks/{chromosome}_ancestors_trees.benchmark.txt' + shell: + r""" + tsinfer match-ancestors {input.samples} \ + -A {output} \ + {params.re} \ + {params.misma} \ + --log-section tsinfer.inference \ + --num-threads {threads} \ + --progress \ + --verbosity \ + --ancestors {input.truncated_ancestors} + """ + +rule infer_trees: + input: + samples = rules.prepare_sample_file.output, + ancestors_trees = rules.ancestors_trees.output.ancestors_trees, + output: + trees = f'{projdir}/Tsinfer/trees/{{chromosome}}.trees', + params: + re = f"--recombination-rate {config['recombination_rate_tree']}" + if config['recombination_rate_tree'] else '', + misma = f"--mismatch-ratio {config['mismatch_ratio_tree']}" + if config['mismatch_ratio_tree'] else '', + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_infer.benchmark.txt' + shell: + r""" + # Check amount of memory (in kbytes) as seen by the job + ulimit -v + tsinfer match-samples {input.samples} \ + -A {input.ancestors_trees} \ + --num-threads {threads} \ + {params.re} \ + {params.misma} \ + --progress \ + --log-section tsinfer.inference \ + --verbosity \ + -O {output} + """ + +rule simplify_trees: + input: + trees = rules.infer_trees.output.trees, + output: + f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', + params: + genint = config['generation_interval'], + mutrate = config['mutation_rate'], + conda: 'HLab_tsinfer' + threads: 32 + resources: cpus=32, mem_mb=2048000, time_min=1680 + benchmark: 'benchmarks/{chromosome}_simplify.benchmark.txt' + shell: + r""" + 
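+        # simplify.py (see scripts/) preprocesses the inferred tree sequence and
+        # dates it with tsdate, using the mutation_rate and generation_interval
+        # set in the config; `ulimit -v` below just logs the memory cap the job sees.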
+        ulimit -v
+        python scripts/simplify.py {input} {params.genint} {params.mutrate} {output}
+        """
diff --git a/Snakemake/workflow/rules/phasing.smk b/Snakemake/workflow/rules/phasing.smk
index fbf92d3..6ea0fad 100644
--- a/Snakemake/workflow/rules/phasing.smk
+++ b/Snakemake/workflow/rules/phasing.smk
@@ -1,62 +1,66 @@
-# def getInputVcfFile_phasing(wildcards):
-#     if os.path.isfile(vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'):
-#         print("Final file found ")
-#         vcf_file = vcfdir + "/" + wildcards.chromosome + '_final.vcf.gz'
-#         print(vcf_file)
-#     else:
-#         print("No final file ")
-#         file = open(vcfdir + '/Vcf_file_' + wildcards.chromosome + '.txt')
-#         vcf_file = file.read().strip("\n")
-#
-#     return(vcf_file)
+# Phase files if species is diploid using the output from the
+# remove_missing_indels rule
+file_to_phase = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz'
+file_to_phase_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}.recode.vcf.gz.csi'
+
+phased_output = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz'
+phased_output_idx = f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi'
+
+checkpoint decide_start_point:
+    output: "workflow_start.txt"
+    shell:
+        """
+        if [ "{config[phase_only]}" == "False" ] && [ -f {phased_output} ] && [ -f {phased_output_idx} ]; then
+            echo "ancestral_info_to_vcf" > {output}
+        else
+            echo "start" > {output}
+        fi
+        """
 
 if config['ploidy'] == 1:
-    rule rename_phased:
-        input:
-            vcf = f'{vcfOut}/{{chromosome}}_final.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_final.vcf.gz.csi'
-        output:
-            vcf = f'{vcfOut}/{{chromosome}}_phased.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_phased.vcf.gz.csi'
-        log: 'logs/Rename_phased_{chromosome}.log'
-        resources: cpus=1, mem_mb=32000, time_min=60
-        shell:
-            """
-            if [ -h {input.vcf} ]; then
-                ln -s $( realpath {input.vcf} ) {output.vcf}
-                ln -s $( realpath {input.idx} ) {output.idx}
-            else
-                ln -s {input.vcf} {output.vcf}
-                ln -s {input.idx} {output.idx}
-            fi
-            """
+    input_ancestral = file_to_phase
+    input_ancestral_idx = file_to_phase_idx
 
 else:
     rule phase:
         input:
-            vcf = f'{vcfOut}/{{chromosome}}_final.vcf.gz',
+            vcf = file_to_phase,
+            idx = file_to_phase_idx,
         output:
-            file = f'{vcfOut}/{{chromosome}}_phased.vcf.gz',
-            idx = f'{vcfOut}/{{chromosome}}_phased.vcf.gz.csi'
+            vcf = phased_output,
         params:
            map = f'{mapdir}/{{chromosome}}.txt',
-        #log: 'logs/phase_{chromosome}.log'
-        threads: 25
-#        resources: cpus=20, mem_mb=25000, time_min=5
+        threads: 32
+        resources: cpus = 32, mem_mb = 2048, time_min = 7200
        conda: 'shapeit4am'
-        log: 'logs/Phase_{chromosome}.log'
+        benchmark: 'benchmarks/{chromosome}.shapeit.benchmark.txt'
        shell:
-            """
+            r"""
            str='{wildcards.chromosome}'
            chr=$(echo ${{str:3}})
-            start=`date +%s`
+
            shapeit4 --input {input.vcf} \
-            --map {params.map} \
-            --region ${{chr}} \
-            --output {output} \
-            --thread {threads}
-            end=`date +%s`
-            echo Execution time was `expr $end - $start` seconds > shapeit_{wildcards.chromosome}.time
-            bcftools index -f {output}
+                --map {params.map} \
+                --region $chr \
+                --output {output.vcf} \
+                --thread {threads} \
+                --pbwt-depth 8 \
+                --sequencing \
+                --log {wildcards.chromosome}_phased.log
+            """
+
+    rule index_phased_cleanup:
+        input: rules.phase.output
+        output: phased_output_idx
+        params:
+            chr = lambda wildcards: wildcards.chromosome[3:],
+            dire = f'{vcfdir}/{{chromosome}}',
+        conda: 'bcftools'
+        benchmark: 'benchmarks/{chromosome}.bcftools_index.benchmark.txt'
+        shell:
+            r"""
+            bcftools index {input}
             """
+
+    input_ancestral = phased_output
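+    # handoff point: rules in ancestral_info_to_vcf.smk read input_ancestral and
+    # input_ancestral_idx (set here and just below) to locate the phased VCF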
+ input_ancestral_idx = phased_output_idx \ No newline at end of file diff --git a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk index c26f24b..9cac1ef 100644 --- a/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk +++ b/Snakemake/workflow/rules/prepare_files_for_tsinfer.smk @@ -5,19 +5,20 @@ else: rule get_af: - input: f'{vcfOut}/{{chromosome}}_phased.vcf.gz' + input: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz' output: - info = temp(f'{vcfOut}/Info{{chromosome}}.INFO') + vcf = temp(f'{vcfdir}/{{chromosome}}/{{chromosome}}_info.vcf.gz'), + info = temp(f'{vcfdir}/{{chromosome}}/Info{{chromosome}}.INFO') params: - prefix=f'{vcfOut}/Info{{chromosome}}' + prefix=f'{vcfdir}/{{chromosome}}/Info{{chromosome}}' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/get_af_{chromosome}.log' shell: """ - bcftools +fill-tags {input} -Oz -o {input} -- -t AN,AC,AF - vcftools --gzvcf {input} --out {params.prefix} --get-INFO AC --get-INFO AF + bcftools +fill-tags {input} -Oz -o {output.vcf} -- -t AN,AC,AF + vcftools --gzvcf {output.vcf} --out {params.prefix} --get-INFO AC --get-INFO AF LOGFILE={params.prefix}.log if test -f "$LOGFILE"; then @@ -27,12 +28,10 @@ rule get_af: rule get_major: input: rules.get_af.output.info - output: temp(f'{vcfOut}/Major{{chromosome}}.txt') + output: temp(f'{vcfdir}/{{chromosome}}/Major{{chromosome}}.txt') threads: 1 resources: cpus=1, mem_mb=32000, time_min=60 log: 'logs/Get_major_{chromosome}.log' -# wildcard_constraints: -# chromosome="^chr | ^Chr" shell: """ awk '{{if (NR!=1 && $5>=0.5) {{print $1"_"$2","$4}} else if (NR!=1 && $5<0.5) {{print $1"_"$2","$3}}}}' {input} > {output} @@ -42,7 +41,7 @@ rule extract_ancestral_chromosome: input: ancestral_file output: - temp(f'{vcfOut}/Ancestral{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/Ancestral{{chromosome}}.txt') params: chrNum = lambda wc: wc.get('chromosome')[3:] log: 'logs/Extract_ancestral_chromosome_{chromosome}.log' @@ -57,7 +56,7 @@ rule join_major_ancestral: ancestral = rules.extract_ancestral_chromosome.output, major = rules.get_major.output output: - temp(f'{vcfOut}/MajAAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajAAnc_{{chromosome}}.txt') log: 'logs/Join_major_ancestral_{chromosome}.log' resources: cpus=1, mem_mb=32000, time_min=60 shell: @@ -69,7 +68,7 @@ rule determine_ancestral_major: input: rules.join_major_ancestral.output output: - temp(f'{vcfOut}/MajOAnc_{{chromosome}}.txt') + temp(f'{vcfdir}/{{chromosome}}/MajOAnc_{{chromosome}}.txt') log: 'logs/Determine_major_ancestral_{chromosome}.log' shell: """ @@ -78,9 +77,9 @@ rule determine_ancestral_major: rule decompress: input: - vcf=f'{vcfOut}/{{chromosome}}_phased.vcf.gz', - majororaa=rules.determine_ancestral_major.output - output: f'{vcfOut}/{{chromosome}}_phased.vcf' # this is removing both the .gz and the decompressed file + vcf = rules.get_af.output.vcf, + majororaa = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf' threads: 1 resources: cpus=1, mem_mb=32000, time_min=30 log: 'logs/Decompress_{chromosome}.log' @@ -92,9 +91,9 @@ rule decompress: rule change_infoAA_vcf: input: - vcf=rules.decompress.output, - ancestralAllele=rules.determine_ancestral_major.output - output: f'{vcfOut}/{{chromosome}}_ancestral.vcf' + vcf = rules.decompress.output, + ancestralAllele = rules.determine_ancestral_major.output + output: f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf' 
conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=120 @@ -109,17 +108,23 @@ rule change_infoAA_vcf: """ rule compress_vcf: - input: rules.change_infoAA_vcf.output, + input: + vcf = f'{vcfdir}/{{chromosome}}/{{chromosome}}_ancestral.vcf', output: - file = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz', - idx = f'{vcfOut}/{{chromosome}}_ancestral.vcf.gz.csi' + vcf = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz', + idx = f'{vcfdir}/{{chromosome}}_ancestral.vcf.gz.csi', + params: + chrdir = f'{vcfdir}/{{chromosome}}/', + outdir = f'{vcfdir}/' conda: "bcftools" threads: 1 resources: cpus=1, mem_mb=64000, time_min=60 log: 'logs/Compress_{chromosome}.log' shell: + r""" + bgzip {input.vcf} + bcftools index {input.vcf}.gz + + mv {input}.gz {input}.gz.csi {params.outdir} + rm -r {params.chrdir} """ - bgzip {input} - bcftools index -f {output.file} - """ -# keeping the phased files w/o ancestral alleles diff --git a/Snakemake/workflow/rules/split.smk b/Snakemake/workflow/rules/split.smk index d6de413..8bc8a21 100644 --- a/Snakemake/workflow/rules/split.smk +++ b/Snakemake/workflow/rules/split.smk @@ -1,10 +1,29 @@ +# This script is used for managing VCF files in a directory previous to tree inference +# It identifies two types of VCF files in the 'RawVCF' directory: +# 1. Files that start with either "Chr" or "chr" and end with "vcf.gz" (referred to as 'splitFiles') +# 2. Files that start with "Combined" and end with "vcf.gz" (referred to as 'combinedFiles') +# It then creates regex patterns for these file names for later use in rules. +# The script also combines the names of both types of files into a single list (allFiles). +# If there are any 'splitFiles', it defines a rule (move_vcf) to move these files +# from the 'RawVCF' directory to a directory named after the chromosome in the file name. +# The rule also renames the files to include the chromosome name at the start of the file name. 
+import os
+import re
+
+print(chromosomes)
 
 splitFiles = list(set([f.strip("vcf.gz").split("_")[1]
                        for f in os.listdir(f'{vcfdir}/RawVCF')
-                       if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) )]))
+                       if (f.endswith("vcf.gz") and (f.startswith("Chr") | f.startswith("chr")) and "_" in f)]))
+
 combinedFiles = list(set([f.strip("vcf.gz").split("_")[1]
                           for f in os.listdir(f'{vcfdir}/RawVCF')
                           if (f.endswith("vcf.gz") and f.startswith("Combined"))]))
+
+# Prepare a regex pattern that matches any of the file names in splitFiles and
+# combinedFiles so rules only look for them
+splitFiles_pattern = "(" + "|".join(re.escape(name) for name in splitFiles) + ")"
+combinedFiles_pattern = "(" + "|".join(re.escape(name) for name in combinedFiles) + ")"
+
 allFiles = splitFiles + combinedFiles
 
 if len(splitFiles) > 0:
@@ -18,16 +37,13 @@ if len(splitFiles) > 0:
         conda: "bcftools"
         threads: 1
         resources: cpus=1, mem_mb=32000, time_min=60
-        log: 'logs/Move_vcf_{chromosome}_{suffixOne}.log'
+        benchmark: 'benchmarks/{chromosome}.{suffixOne}.move.benchmark.txt'
+        wildcard_constraints:
+            suffixOne = splitFiles_pattern
         shell:
            """
-           if [ -h {input.vcf} ]; then
-               ln -s $( realpath {input.vcf} ) {output.vcf}
-               ln -s $( realpath {input.vcf} ).csi {output.vcf}
-           else
-               ln -s {input.vcf} {output.vcf}
-               ln -s {input.idx} {output.idx}
-           fi
+           ln -s {input.vcf} {output.vcf}
+           ln -s {input.idx} {output.idx}
           """
 
 if len(combinedFiles) > 0:
     rule split_and_move_vcfs:
         input:
             f'{vcfdir}/RawVCF/Combined_{{suffixTwo}}.vcf.gz'
         output:
-            vcf = [f'{vcfOut}' + x
+            temp([f'{vcfdir}' + x
                    for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz',
-                                   chromosome = chromosomes, suffixTwo = combinedFiles)],
-            idx = [f'{vcfOut}' + x
-                   for x in expand('/{{chromosome}}/{{chromosome}}_{{suffixTwo}}.vcf.gz.csi',
-                                   chromosome = chromosomes, suffixTwo = combinedFiles)]
+                                   chromosome = chromosomes, suffixTwo = combinedFiles)])
         conda: "bcftools"
         threads: 1
         resources: cpus=1, mem_mb=64000, time_min=60
-        log: expand('logs/Split_and_move_vcfs_{{chromosome}}_{{suffixTwo}}.log')
+        benchmark: 'benchmarks/{chromosome}.{suffixTwo}.split.benchmark.txt'
+        wildcard_constraints:
+            suffixTwo = combinedFiles_pattern
         shell:
             """
             str='{wildcards.chromosome}'
diff --git a/Snakemake/workflow/scripts/Infer_trees.py b/Snakemake/workflow/scripts/Infer_trees.py
index 7fb0b1c..c686b8b 100644
--- a/Snakemake/workflow/scripts/Infer_trees.py
+++ b/Snakemake/workflow/scripts/Infer_trees.py
@@ -42,6 +42,8 @@
 progress_monitor=True,
 )
 
+ancestors_ts.dump(f'{outputFile}.ancestors')
+
 ts = tsinfer.match_samples(
     sampleFile,
     ancestors_ts,
@@ -51,25 +53,24 @@
 progress_monitor=True,
 ).simplify(keep_unary=False)
 
-"""
-ts = tsinfer.infer(sampleFile)
+#ts = tsinfer.infer(sampleFile)
 
 print(
     "Inferred tree sequence `{}`: {} trees over {} Mb".format(
-        "drone_ts", ts.num_trees, ts.sequence_length / 1e6
+        "ts", ts.num_trees, ts.sequence_length / 1e6
     )
 )
-"""
-# # Check the metadata
-# for sample_node_id in ts.samples():
-#     individual_id = ts.node(sample_node_id).individual
-#     population_id = ts.node(sample_node_id).population
-#     print(
-#         "Node",
-#         sample_node_id,
-#         "labels genome sampled from",
-#         json.loads(ts.individual(individual_id).metadata),
-#         "in",
-#         json.loads(ts.population(population_id).metadata)["subpop"],
-#     )
+
+# Check the metadata
+for sample_node_id in ts.samples():
+    individual_id = ts.node(sample_node_id).individual
+    population_id = ts.node(sample_node_id).population
+    print(
+        "Node",
+        sample_node_id,
+        "labels genome 
sampled from", + json.loads(ts.individual(individual_id).metadata), + "in", + json.loads(ts.population(population_id).metadata)["subpop"], + ) ts.dump(outputFile) diff --git a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py index 0064f21..3a620b2 100644 --- a/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py +++ b/Snakemake/workflow/scripts/Prepare_tsinfer_sample_file.py @@ -8,6 +8,8 @@ import numpy as np import pandas as pd import pkg_resources +from itertools import chain +from tsinfer import MISSING_DATA print(sys.executable) @@ -70,7 +72,9 @@ def add_sites(vcf, samples, ploidy): for variant in vcf: # Loop over variants, each assumed at a unique site progressbar.update(variant.POS - pos) if pos == variant.POS: - raise ValueError("Duplicate positions for variant at position", pos) + print("Duplicate positions for variant at position", pos) + continue + else: pos = variant.POS if ploidy > 1: @@ -83,10 +87,10 @@ def add_sites(vcf, samples, ploidy): alleles = alleles + [letter for letter in string] #ALT = ALT + [letter for letter in string] - print(alleles) + #print(alleles) # ignores non-bialllelic sites if len(alleles) > 2: - print('Ignoring non-biallelic site') + # print('Ignoring non-biallelic site') continue # if len([*ancestral]) > 1: @@ -95,7 +99,7 @@ def add_sites(vcf, samples, ploidy): # #continue ancestral = variant.INFO.get("AA", variant.REF) - print(f'alleles: {alleles} | ancestal: {[*ancestral]}') + #print(f'alleles: {alleles} | ancestal: {[*ancestral]}') try: ancestral_allele = alleles.index(ancestral[0]) @@ -111,6 +115,7 @@ def add_sites(vcf, samples, ploidy): ancestral_allele=alleles.index(ancestral[0])) + def add_populations(vcf, samples, metaData): """ This is a modified add populations function for drones from David Wragg data @@ -121,10 +126,23 @@ def add_populations(vcf, samples, metaData): """ pop_lookup = {} metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all') - sample_subpop = [list(metaData.SubPop[metaData.ID == x]) for x in vcf.samples] - for subpop, pop in zip(metaPop.SubPop, metaPop.Pop): - pop_lookup[subpop] = samples.add_population(metadata={"pop": pop, "subpop": subpop}) - return [pop_lookup[subpop[0]] for subpop in sample_subpop] + + # to account for same subpop in different countries - creating a 'fake subpop' to be used as key + # if subpop is repeated new code is subpop+origin, else subpop + d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)] + metaPop['code'] = d + # add code col to the full meatafile + metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner') + + # list population per sample based on recoded subpop + sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples] + + # the dictionary key will be based on the recoding - so keeps the correct amount of population + # but the metadata takes in the real subpop value + for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + ####################################################################### # Read in the information and prepare the data @@ -138,7 +156,7 @@ def 
add_populations(vcf, samples, metaData):
 # Create samples for haploid data
 with tsinfer.SampleData(path=sampleFile, sequence_length=chrLength,
-                        num_flush_threads=20, max_file_size=2**30) as samples:
+                        num_flush_threads=16, max_file_size=2**30) as samples:
     populations = add_populations(vcf = vcfD, samples = samples, metaData = metaFile)
     print("populations determined")
     add_individuals(vcf = vcfD, samples = samples, populations = populations, metaData = metaFile, ploidy = ploidy)
@@ -148,5 +166,6 @@ def add_populations(vcf, samples, metaData):
 print(
     "Sample file created for {} samples ".format(samples.num_samples)
     + "({} individuals) ".format(samples.num_individuals)
-    + "with {} variable sites.".format(samples.num_sites)
-)
+    + "with {} variable sites.".format(samples.num_sites),
+    flush=True,
+)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/generate_ancestors.py b/Snakemake/workflow/scripts/generate_ancestors.py
new file mode 100644
index 0000000..9a8ef82
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_ancestors.py
@@ -0,0 +1,22 @@
+import sys
+import tsinfer
+import numcodecs
+import numpy as np
+
+args = sys.argv
+
+samples = tsinfer.load(args[1])
+sites_to_exclude = np.loadtxt(args[2])
+output = args[3]
+threads = int(args[4])
+
+gzip_compressor = numcodecs.GZip(level=1)
+
+ancestors = tsinfer.generate_ancestors(
+    sample_data=samples,
+    path=output,
+    exclude_positions=sites_to_exclude,
+    progress_monitor=True,
+    num_threads=threads,
+    compressor=gzip_compressor,
+)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/generate_samples.py b/Snakemake/workflow/scripts/generate_samples.py
new file mode 100644
index 0000000..3543fc7
--- /dev/null
+++ b/Snakemake/workflow/scripts/generate_samples.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import os
+import sys
+import json
+import cyvcf2
+import tsinfer
+import numcodecs
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+# create a GZip compressor instance with a specific compression level
+gzip_compressor = numcodecs.GZip(level=1)
+
+def add_populations(vcf, samples, metaData):
+    """
+    This is a modified add populations function for drones from David Wragg data
+    Add tsinfer Population objects for drone subspecies, stored in the metaPop object, and return a list of IDs corresponding to the VCF samples
+    Input: vcf = cyvcf2 VCF object, samples is tsinfer SampleData and metaData is a pandas dataframe with id in ID column and populations in Type column
+    pops = dictionary holding name and additional information about populations
+    Return: A list of population indexes
+    """
+    pop_lookup = {}
+    metaPop = metaData.iloc[:, 1:].drop_duplicates().dropna(axis = 0, how = 'all')
+
+    # to account for same subpop in different countries - creating a 'fake subpop' to be used as key
+    # if subpop is repeated new code is subpop+origin, else subpop
+    d = [a if not (sum(j == a for j in metaPop.SubPop[:i])) else f'{a}{c}' for (i,a),c in zip(enumerate(metaPop.SubPop),metaPop.Origin)]
+    metaPop['code'] = d
+    # add code col to the full metafile
+    metaData = pd.merge(metaData, metaPop[['Pop', 'SubPop', 'Origin', 'code']], on = ['Pop', 'SubPop', 'Origin'], how = 'inner')
+
+    # list population per sample based on recoded subpop
+    sample_subpop = [list(metaData.code[metaData.ID == x]) for x in vcf.samples]
+
+    # the dictionary key is based on the recoding - so it keeps the correct number of populations
+    # but the metadata takes in the real subpop value
+    for subpop, pop, origin, lat, lon, code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code):
code in zip(metaPop.SubPop, metaPop.Pop, metaPop.Origin, metaPop.Lat, metaPop.Lon, metaPop.code): + pop_lookup[code] = samples.add_population(metadata={"pop": pop, "subpop": subpop, "origin": origin, "coord1": lat, "coord2":lon}) + return [pop_lookup[code[0]] for code in sample_subpop] + +def add_diploid_individuals(vcf, samples, populations): + for iid, population in zip(vcf.samples, populations): + samples.add_individual(ploidy=2, metadata={'ID': iid}, population=population) + + +def add_diploid_sites(vcf, samples): + """ + Read the sites in the vcf and add them to the samples object. + """ + # You may want to change the following line, e.g. here we allow + # "*" (a spanning deletion) to be a valid allele state + allele_chars = set("ATGCatgc*") + + pos = 0 + + progressbar = tqdm(total=samples.sequence_length, desc="Read VCF", unit='bp') + + for variant in vcf: # Loop over variants, each assumed at a unique site + progressbar.update(variant.POS - pos) + + if pos == variant.POS: + print(f"Duplicate entries at position {pos}, ignoring all but the first") + continue + else: + pos = variant.POS + + if any([not phased for _, _, phased in variant.genotypes]): + raise ValueError("Unphased genotypes for variant at position", pos) + + alleles = [variant.REF.upper()] + [v.upper() for v in variant.ALT] + + ancestral = variant.INFO.get("AA", ".") # "." means unknown + + if(ancestral == 'ambiguous' or ancestral not in alleles): # set ancestral == reference + ancestral_allele = 0 + elif ancestral == '-1': + ancestral_allele = -1 + else: + ancestral_allele = alleles.index(ancestral) + + # Check we have ATCG alleles + for a in alleles: + if len(set(a) - allele_chars) > 0: + print(f"Ignoring site at pos {pos}: allele {a} not in {allele_chars}") + continue + + # Map original allele indexes to their indexes in the new alleles list. + genotypes = [g for row in variant.genotypes for g in row[0:2]] + samples.add_site(pos, genotypes, alleles, ancestral_allele=ancestral_allele) + +def process_samples(file, metadata, sample_path, chr_length, compressor, threads): + with tsinfer.SampleData(path=sample_path, + sequence_length=chr_length, + compressor=compressor, + num_flush_threads=threads, max_file_size=2**30) as samples: + populations = add_populations(vcf=file, samples=samples, metaData=metadata) + print("populations determined") + add_diploid_individuals(vcf=file, samples=samples, populations=populations) + print("individuals added") + add_diploid_sites(vcf=file, samples=samples) + print("sites added") + + print( + "Sample file created for {} samples ".format(samples.num_samples) + + "({} individuals) ".format(samples.num_individuals) + + "with {} variable sites.".format(samples.num_sites), + flush=True, + ) + +# ----------------------------------------------------------------------------------------- + +args = sys.argv +vcfFile = args[1] +sampleFile = args[2] +meta = args[3] +chrLength = args[4] +nthreads = int(args[5]) + +metaFile = pd.read_csv(meta) + +# Create a population (subspecie) list for the samples in the VCF +vcfD = cyvcf2.VCF(vcfFile, strict_gt=True) + +# Create samples for haploid data +try: + process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, nthreads) +except Exception as e: + print(f"Error encountered: {e}. 
+    process_samples(vcfD, metaFile, sampleFile, chrLength, gzip_compressor, 16)
diff --git a/Snakemake/workflow/scripts/simplify.py b/Snakemake/workflow/scripts/simplify.py
new file mode 100644
index 0000000..7659671
--- /dev/null
+++ b/Snakemake/workflow/scripts/simplify.py
@@ -0,0 +1,29 @@
+import sys
+import tskit
+import subprocess
+# upgrade to the latest version of tsdate at runtime
+subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "tsdate"])
+import tsdate
+
+args = sys.argv
+ts = tskit.load(args[1])
+generation_interval = float(args[2])
+mu = float(args[3])
+output = args[4]
+
+# Ne = ts.diversity()/4/mu
+
+processed_ts = tsdate.preprocess_ts(
+    ts,
+    split_disjoint=True,
+    keep_unary=False,
+)
+
+dated_ts = tsdate.variational_gamma(tree_sequence=processed_ts,
+    mutation_rate=mu,
+    eps=generation_interval,
+    max_iterations=100,
+    time_units='generations',
+    progress=True)
+
+dated_ts.dump(output)
\ No newline at end of file
diff --git a/Snakemake/workflow/scripts/truncate.py b/Snakemake/workflow/scripts/truncate.py
new file mode 100644
index 0000000..1fbf0f4
--- /dev/null
+++ b/Snakemake/workflow/scripts/truncate.py
@@ -0,0 +1,17 @@
+import sys
+import tsinfer
+
+args = sys.argv
+ancestors = tsinfer.load(args[1])
+upper_time = float(args[2])
+lower_time = float(args[3])
+output = args[4]
+
+ancestors.truncate_ancestors(
+    path=output,
+    max_file_size=2**38,
+    lower_time_bound=lower_time,
+    upper_time_bound=upper_time,
+    length_multiplier=2,
+)
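
Note: generate_samples.py assumes a metadata CSV whose first column is the sample ID and whose remaining columns include Pop, SubPop, Origin, Lat and Lon; a SubPop repeated under a second Origin is recoded as SubPop + Origin. A minimal sketch of that layout, with invented breed names, origins and coordinates (none of these values come from this repository):

import pandas as pd

# one row per VCF sample; ID must match the sample names in the VCF header
meta = pd.DataFrame({
    "ID":     ["sample1", "sample2", "sample3"],
    "Pop":    ["taurine", "taurine", "indicine"],
    "SubPop": ["Angus",   "Angus",   "Gir"],
    "Origin": ["UK",      "US",      "BR"],
    "Lat":    [57.1,      41.5,      -15.6],
    "Lon":    [-2.1,      -99.9,     -47.7],
})
meta.to_csv("metadata.csv", index=False)  # hypothetical file name

With this input, "Angus" occurs under two origins, so add_populations() keys the second occurrence as "AngusUS" internally while still storing "Angus" as the subpop in the tsinfer population metadata.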
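
For orientation, a minimal sketch of how the steps these scripts implement fit together end to end. The ancestor-matching and sample-matching steps are not part of this diff, so the match_ancestors/match_samples calls below are assumed to sit in the intervening Snakemake rules; file names and the mutation rate are illustrative only:

import tsinfer
import tsdate

samples = tsinfer.load("chr1.samples")      # produced by generate_samples.py
ancestors = tsinfer.load("chr1.ancestors")  # produced by generate_ancestors.py,
                                            # optionally truncated by truncate.py

# standard tsinfer matching: build the ancestors tree sequence, then thread
# the sample haplotypes onto it
ancestors_ts = tsinfer.match_ancestors(samples, ancestors)
ts = tsinfer.match_samples(samples, ancestors_ts)

# date the inferred tree sequence, as simplify.py does
processed_ts = tsdate.preprocess_ts(ts)
dated_ts = tsdate.variational_gamma(processed_ts, mutation_rate=1e-8)
dated_ts.dump("chr1.dated.trees")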