Skip to content

updated rule filter (in merge_samples) #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions Snakemake/config/glob_cattle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
PROJECT: "global_cattle"
work_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/tree_sequence"
species: "Bos taurus"
ploidy: 2
no_chromosomes: 29
start_chromosome: 1
end_chromosome: 29
vcf_dir: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global"
genmap: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ma2015_shapeit_ARS-UCD1.2"
reference_genome: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/Reference_genomes/ARS-UCD1.2_Btau5.0.1Y.fa"
ancestral_allele: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/resources/ancestral_alleles/Dorji_et_al/data/new_cattle_ancestral.txt"
meta: "/exports/cmvm/eddie/eb/groups/HighlanderLab/gfortuna/giro_popgen/genomes/global/files/metadata_correct.csv"
phase_only: False
remove_indel: True
missing_data: "0.2"
truncate_upper: "0.4"
truncate_lower: "0.2"
recombination_rate_tree: False # "1e-8"
recombination_rate_anc: False
mismatch_ratio_anc: False
mismatch_ratio_tree: False # "1"
mutation_rate: "1e-8"
generation_interval: 5
chromosome_length:
1 : 158534110
2 : 136231102
3 : 121005158
4 : 120000601
5 : 120089316
6 : 117806340
7 : 110682743
8 : 113319770
9 : 105454467
10 : 103308737
11 : 106982474
12 : 87216183
13 : 83472345
14 : 82403003
15 : 85007780
16 : 81013979
17 : 73167244
18 : 65820629
19 : 63449741
20 : 71974595
21 : 69862954
22 : 60773035
23 : 52498615
24 : 62317253
25 : 42350435
26 : 51992305
27 : 45612108
28 : 45940150
29 : 51098607
56 changes: 25 additions & 31 deletions Snakemake/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,13 @@ from os.path import isfile, join


##### setup report #####
report: "reports/tsinfer.rst"

# specify config files on the command line using --configfile
#configfile: '../config/ancestral_Eddie.yaml'
#configfile: '../config/tsinfer_Eddie.yaml'

configfile: '../config/glob_cattle.yaml'
report: "reports/tsinfer.rst"

##### setup singularity #####
# Can we set conda env here???


##### Define some variables #####
# all files should be in the vcfDir/RawVCF folder
# files that need spliting are named: Combined_something.vcf.gz
Expand All @@ -30,47 +26,45 @@ report: "reports/tsinfer.rst"
# (2) sort into the correct Chr folder in vcfDir.

# defining variables that will be used throughout the script

# the final directory name is now editable -- so can be named according to specific project
# mapdir indicates where to find the genetic map for phasing (full path in the config)
mapdir = config['genmap']
oDir = Path(config['o_dir'], config['PROJECT'])

vcfdir=config['vcf_dir'] # from config file, the directory containing the VCF(s) to process
# the path where to save intermetdiate and final VCFs
if config['process_vcf_in_original_dir']:
vcfOut = config['vcf_dir'] # same as above
else:
vcfOut = f"{oDir}/VCF" # VCF dir inside the output dir


vcfdir=config['vcf_dir'] # from config file, the path to vcf
# this folder can be any where but must have the following structure:
# vcfDir:
# - RawVCF
# - Chr1
# -Chr...

# the final directory name is now editable -- so can be named according to specific project
# mapdir indicates where to find the genetic map for phasing (full path in the config)
Project = config['PROJECT']
projdir = config['work_dir'] + '/' + Project
mapdir = config['genmap']

# if want to split multiallelic sites set to TRUE:
#multiallelic = False

chromosomes = [f'Chr{n}' for n in range(1, config['no_chromosomes'] + 1)]
chromosomes = [f'chr{n}' for n in range(config['start_chromosome'], config['end_chromosome'] + 1)]
# set a list of 'Chr' + chr# combinations to be used as wildcards

wildcard_constraints:
chromosome = r'chr\d+'
# constraining the wildcard to avoid errors

##### load rules #####
include: "rules/ancestral_inference.smk"
#include: "rules/ancestral_inference.smk"
include: "rules/split.smk"
include: "rules/filter.smk"
include: "rules/merge_samples_vcf.smk"
include: "rules/phasing.smk"
include: "rules/prepare_files_for_tsinfer.smk"
#include: "rules/multiallelic.smk"
include: "rules/infer_trees.smk"
include: "rules/ancestral_info_to_vcf.smk"
include: "rules/new_infer_trees.smk"

##### Conditional inputs for rule all based on the existence of phased VCFs and config['phase_only'] #####
if not config['phase_only']:
rule_all_input = expand(f'{projdir}/Tsinfer/trees/{{chromosome}}.dated.trees', chromosome=chromosomes)
else:
rule_all_input = expand(f'{vcfdir}/{{chromosome}}/{{chromosome}}_phased.vcf.gz.csi', chromosome=chromosomes)


##### target rules #####
# At the end it should generate a phased vcf file that combines all files for
# each chromosome. These files are stored in the vcf folder declaired in the
# configuration file. It also checks that vcfs have been filtered before merged
rule all:
input:
expand(f'{oDir}/Tsinfer/trees/{{chromosome}}.trees', chromosome=chromosomes)
# tree inference
input: rule_all_input
180 changes: 51 additions & 129 deletions Snakemake/workflow/profile/cluster.json
Original file line number Diff line number Diff line change
@@ -1,55 +1,55 @@
{
"__default__":
{
"mem": "64G",
"mem": "8G",
"threads": "1",
"time": "10:00:00"
},
"rename":
"compare_and_filter_vcfs":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
"mem": "16G",
"threads": "8",
"time": "20:00:00"
},
"merge":
{
"mem": "16G",
"threads": "8",
"time": "20:00:00"
},
"remove_missing_indels":
{
"mem": "16G",
"threads": "16",
"time": "05:00:00"
},
"phase":
{
"mem": "350G",
"threads": "25",
"time": "336:00:00"
},
"rename_phased":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
"threads": "32",
"time": "240:00:00"
},
"get_af":
"get_vcf_positions":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
},
"get_major":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
"mem": "16G",
"threads": "32",
"time": "05:00:00"
},
"extract_ancestral_chromosome":
"get_ancestral_alleles":
{
"mem": "64G",
"mem": "32G",
"threads": "1",
"time": "02:00:00"
},
"join_major_ancestral":
"update_regions_file":
{
"mem": "64G",
"mem": "32G",
"threads": "1",
"time": "02:00:00"
},
"determine_ancestral_major":
"add_info_to_vcf":
{
"mem": "64G",
"mem": "32G",
"threads": "1",
"time": "04:00:00"
},
Expand All @@ -59,118 +59,40 @@
"threads": "1",
"time": "03:00:00"
},
"sort_ancestral_to_vcf":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
},
"decompress":
{
"mem": "64G",
"threads": "1",
"time": "03:00:00"
},
"change_infoAA_vcf":
{
"mem": "128G",
"threads": "1",
"time": "05:00:00"
},
"prepare_sample_file":
{
"mem": "128G",
"threads": "1",
"time": "10:00:00"
},
"infer":
{
"mem": "128G",
"threads": "1",
"time": "40:00:00"
},
"extract_aligned_alleles_from_vcf":
{
"mem": "32G",
"threads": "1",
"time": "03:00:00"
},
"extract_snps_from_info":
{
"mem": "32G",
"threads": "1",
"time": "03:00:00"
"threads": "16",
"time": "336:00:00"
},
"extract_pos_aligned_alleles_from_vcf":
"generate_ancestors":
{
"mem": "32G",
"threads": "1",
"time": "03:00:00"
},
"extract_vcf_alleles_from_aligned":
{
"mem": "16G",
"threads": "1",
"time" : "03:00:00"
},
"create_estsfs_dicts":
{
"mem": "16G",
"threads": 1",
"time" : "05:00:00"
},
"edit_estsfs_dicts":
{
"mem": "16G",
"threads": "1",
"time": "01:00:00"
},
"combine_estsfs_dicts":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
"threads": "16",
"time": "336:00:00"
},
"run_estsfs":
"truncate_ancestors":
{
"mem": "32G",
"threads": "1",
"time": "20:00:00"
},
"extract_major":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
},
"extract_minor":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
},
"extract_major_outgroup":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
"threads": "16",
"time": "336:00:00"
},
"extract_estsfs_prob":
"ancestors_trees":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
"mem": "32G",
"threads": "16",
"time": "336:00:00"
},
"determine_ancestral":
"infer_trees":
{
"mem": "16G",
"threads": "1",
"time": "02:00:00"
"mem": "32G",
"threads": "16",
"time": "336:00:00"
},
"move_and_rename_aa":
{
"mem": "16G",
"threads": "1",
"time": "05:00:00"
}
"simplify_trees":
{
"mem": "32G",
"threads": "16",
"time": "03:00:00"
}
}
8 changes: 3 additions & 5 deletions Snakemake/workflow/profile/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@ use-conda: true
jobscript: "jobscript.sh"
cluster-config: "cluster.json"
jobname: "{rule}.{jobid}"
jobs: 16
#drmaa: " -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -P roslin_HighlanderLab"
drmaa: " -P roslin_HighlanderLab -cwd -l h_vmem={cluster.mem} -l h_rt={cluster.time} -pe sharedmem {cluster.threads} -R y"
rerun-incomplete: true
rerun-triggers: mtime
cores: 16 # how many jobs you want to submit to your cluster queue
local-cores: 1
rerun-triggers: [mtime, input]
local-cores: 99
keep-going: false
Loading