Skip to content

Commit

Permalink
Merge pull request #2 from alipirani88/master
Browse files Browse the repository at this point in the history
Updating Variant calling pipeline
  • Loading branch information
alipirani88 authored Jan 10, 2019
2 parents e225319 + f33685a commit 7649435
Show file tree
Hide file tree
Showing 227 changed files with 6,367 additions and 22,841 deletions.
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked.fa
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_invar_site_counts.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_snp-sites.vcf
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_var_sites.fa
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_invar_site_counts.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_masked_recomb_positions.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_snp-sites.vcf
modules/beast/test/input_beast.txt
modules/beast/test/invar_base_counts.txt
modules/beast/test/model_finder_IQTREE_noDec_wDates_LA.tree
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2_renamed.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed_st_invSites.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta.xml
modules/beast/test/test_commands.sh
modules/beast/test/test_fasta_path.txt
modules/beast/test/test_gff_path.txt
2 changes: 1 addition & 1 deletion .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

255 changes: 225 additions & 30 deletions .idea/workspace.xml

Large diffs are not rendered by default.

396 changes: 289 additions & 107 deletions README.md

Large diffs are not rendered by default.

100 changes: 97 additions & 3 deletions config_ali
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ base_cmd: FastTree

[raxml]
raxml_bin: /raxml/
openmpi_bin: /openmpi/bin/
# Other raxml executable available to use: raxmlHPC-PTHREADS,raxmlHPC-PTHREADS-SSE3,raxmlHPC-SSE3
base_cmd: raxmlHPC-SSE3
base_cmd: raxmlHPC-HYBRID-SSE3
parameters: -f a -x 12345 -p 12345 -N autoMRE -m GTRCAT -T 20

[iqtree]
iqtree_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
base_cmd: iqtree
parameters: -nt AUTO -bb 1000 -m GTR+G+ASC

[gubbins]
# Change this path to wherever gubbins is located/installed. Right now, installed using conda from anaconda3 package installed in bin_group.
gubbins_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
Expand Down Expand Up @@ -148,10 +154,32 @@ fq2: 0.025
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 100
# Filter variants using the GATK QualByDepth (QD) annotation; variants with QD less than the below threshold are filtered. Currently used for indel calls only.
qd: 2.00
# Filter variants with AF1 less than the below threshold
af: 0.900
# Filter variants that are proximate to each other, i.e. within this distance of one another. To turn this off, use 0 (zero).
prox: 0


[gatk_haplotypecaller_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
# Filter variants with Depth less than the below threshold
dp: 9
# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
low_depth: 2
# A value of 5 means that regions with depth greater than 5x the average coverage will fail
high_depth: 5
# FQ is not represented in the GATK HaplotypeCaller VCF format; use AF instead.
# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 2
# Filter variants with AF1 less than the below threshold
af: 0.9
# Filter Variants that are proximate to each other
prox: 10
# Filter variants that are proximate to each other, i.e. within this distance of one another. To turn this off, use 0 (zero).
prox: 0

[rna_filters]
avg_depth: no
Expand Down Expand Up @@ -268,12 +296,30 @@ Ref_Name: cdiff_630.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630/

[CDIFF_630_ncbi]
# Name of reference genome fasta file.
Ref_Name: cdiff_630.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630_ncbi/

[Cdiff_VPI10463]
# Name of reference genome fasta file.
Ref_Name: Cdiff_VPI10463.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_VPI10463/

[Cdiff_O27_R20291]
# Name of reference genome fasta file.
Ref_Name: Cdiff_O27_R20291.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_O27_R20291/

[Cdiff_O14_PH44]
# Name of reference genome fasta file.
Ref_Name: Cdiff_O14_PH44.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_O14_PH44/

[CFT073]
# Name of reference genome fasta file.
Ref_Name: EscherichiacoliCFT073.fna
Expand Down Expand Up @@ -927,3 +973,51 @@ Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_20_Rush_KPC_8
Ref_Name: SRR3334137.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/SRR3334137/



##Maddeline Helicobacter project
[helicobacter_hepaticus_ATCC_51449]
# Name of reference genome fasta file.
Ref_Name: helicobacter_hepaticus_ATCC_51449.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/helicobacter_hepaticus_ATCC_51449/


##Steph Lab Confirmation Project
[MRSA_USA300_FPR3757]
# Name of reference genome fasta file.
Ref_Name: MRSA_USA300_FPR3757.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA300_FPR3757/

[MRSA_Newman]
# Name of reference genome fasta file.
Ref_Name: MRSA_Newman.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_Newman/

[AB030]
# Name of reference genome fasta file.
Ref_Name: AB030.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/


[Steno_K279a]
# Name of reference genome fasta file.
Ref_Name: Steno_K279a.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/

[lactobacillus_crispatus_ST1]
# Name of reference genome fasta file.
Ref_Name: lactobacillus_crispatus_ST1.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/reference_genome/lactobacillus_crispatus_ST1/

[USA500-2395]
# Name of reference genome fasta file.
Ref_Name: USA500-2395.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/
141 changes: 83 additions & 58 deletions variant_calling_pipeline_dev/config → config_gatk
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
# This config file will help you set custom tools and their parameters for the pipeline
# In the below [pipeline] section, set the two main tools that will be used for variant calling. Supports bwa and samtools and fully tested. Also supports bowtie and GATK, but not fully tested.
## Set which tools to use in pipeline:
[pipeline]
# Options for Aligner: bwa / smalt / bowtie
# Options for Aligner:bwa / smalt / bowtie
aligner: bwa
# Options for variant_caller: samtoolswithpostalignbam / gatkhaplotypecaller / samtools
variant_caller: samtools

# Set bin folder path. Please make sure all the installations and executables are placed in this bin folder. Also make sure the path for individual tools are working properly.
## Set bin folder path. Please make sure all the executables are placed in the bin folder. Also make sure the paths for individual tools are correct.
[bin_path]
binbase: /nfs/esnitkin/bin_group/variant_calling_bin/

# If you are using a grid system to run jobs, change the below parameters accordingly. Supports only pbs torque system.
## Set PBS scheduler fields.
[scheduler]
# This small resource will be used for smaller jobs
resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00
large_resources: nodes=1:ppn=4,mem=47000mb,walltime=250:00:00
email: [email protected]
queue: XXX
flux_account: XXX
notification: XXX

# Large cluster resources in case of large sample size/variant call sets
large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00
email: [email protected]
queue: flux
flux_account: esnitkin_flux
notification: ae

## Tools/Module Parameters
# Set parameters for individual tools. Set the binbase of each tool: this should be the name of the folder where the executables for that tool reside.
# Set parameter for Trimmomatic
[Trimmomatic]
Expand Down Expand Up @@ -79,11 +81,11 @@ base_cmd: picard.jar
[gatk]
gatk_bin: /GenomeAnalysisTK-3.3-0/
base_cmd: GenomeAnalysisTK.jar
haplotype_parameters: -T HaplotypeCaller --genotyping_mode DISCOVERY
#changes: 12th August
gatk_filter1_parameter_expression: FQ < 40.00 && MQ > 20 && QUAL > 50 && DP > 15
gatk_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15
#changed gatk_filter2_parameter_expression DP 15 from 10 that was used in VRE samples.
haplotype_parameters: -T HaplotypeCaller
#gatk_filter1_parameter_expression: FQ < 40.00 && MQ > 20 && QUAL > 50 && DP > 15
#gatk_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15
#Deprecated. gatk_haplotypecaller Integration Pending for SNP calling.
#gatk_haplotypecaller_specific_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15

[vcftools]
#vcftools_perl_bin: /vcftools_0.1.12b/perl/
Expand Down Expand Up @@ -112,19 +114,34 @@ base_cmd: FastTree

[raxml]
raxml_bin: /raxml/
openmpi_bin: /openmpi/bin/
# Other raxml executable available to use: raxmlHPC-PTHREADS,raxmlHPC-PTHREADS-SSE3,raxmlHPC-SSE3
base_cmd: raxmlHPC-SSE3
base_cmd: raxmlHPC-HYBRID-SSE3
parameters: -f a -x 12345 -p 12345 -N autoMRE -m GTRCAT -T 20

[iqtree]
iqtree_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
base_cmd: iqtree
parameters: -nt AUTO -bb 1000 -m GTR+G+ASC

[gubbins]
# Change this path to wherever gubbins is located/installed. Right now, installed using conda from anaconda3 package installed in bin_group.
gubbins_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
base_cmd: run_gubbins.py

[mummer]
mummer_bin: /MUMmer3.23/
nucmer_base_cmd: nucmer
min_tandem_repeat_length: 20
percent_id: 95

## Variant Filters
# Select which type of filters to use. Default is snitkin_filters. See the snitkin_filters section below for more information about each filter criterion.
[SNP_filters]
filter_criteria: snitkin_filters
# Other Criterias: SPANDx_filters, loose_filters, snitkin_filters
# Other types of filters: SPANDx_filters, loose_filters, snitkin_filters, contamination_filters

## Filters used for most of the Microbial Variant calling Projects.
[snitkin_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
Expand All @@ -141,9 +158,54 @@ fq2: 0.025
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 100
# Filter variants using the GATK QualByDepth (QD) annotation; variants with QD less than the below threshold are filtered. Currently used for indel calls only.
qd: 2.00
# Filter variants with AF1 less than the below threshold
af: 0.900
# Filter variants that are proximate to each other, i.e. within this distance of one another. To turn this off, use 0 (zero).
prox: 0

[contamination_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
# Filter variants with Depth less than the below threshold
dp: 3
# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
low_depth: 2
# A value of 5 means that regions with depth greater than 5x the average coverage will fail
high_depth: 5
# Filter variants with FQ(Consensus Quality) greater than the below threshold
fq: -20.00
fq2: -20.00
# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 0
# Filter variants with AF1 less than the below threshold
af: 1
# Filter Variants that are proximate to each other
prox: 10
prox: 1

## Functional class filters: find phage and repetitive regions in the reference genome, mask regions of the reference genome so that variants are not called against them, and mask features that have mobile element annotations.
[functional_filters]
# If apply_functional_filters is set to no, the pipeline will not find Phage/Repeat regions and Mask positions provided in mask_file.
apply_functional_filters: yes
find_phage_region: yes
find_repetitive_region: yes
mask_region: no
mask_file: /nfs/esnitkin/bin_group/variant_calling_bin/reference/test_file.txt
mobile_elements: yes
# If apply_to_calls is set to no, PHAGE/REPEAT/MASK will still run but will not remove calls falling in this region.
apply_to_calls: yes

##SNP annotations
[snpeff]
snpeff_bin: /snpEff/
base_cmd: snpEff.jar
snpeff_parameters: -d -no-downstream -no-upstream
prebuild: no
db: Staphylococcus_aureus_subsp_aureus_usa300_tch1516
dataDir: /data/

#CLUSTER_SNP: 3
#CLUSTER_WINDOW_SNP: 10
Expand All @@ -152,61 +214,24 @@ prox: 10
#FS_SNP: 10.0
#HAPLO_SNP: 20.0


##SNP annotations
[snpeff]
snpeff_bin: /snpEff/
base_cmd: snpEff.jar
snpeff_parameters: -d -no-downstream -no-upstream
dataDir: /data/
########################################################################################################################

# Reference Genome to be used for pipeline
# Set path for already indexed reference genome

# Most Frequently used reference genomes

# Name of the reference genome. Provide this value with -index_name argument.
# Name of the reference genome. Provide this value with -index argument.
[KPNIH1]
# Name of reference genome fasta file.
Ref_Name: KPNIH1.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/KPNIH1/

# Name of the reference genome. Provide this value with -index_name argument.
[aus]
# Name of reference genome fasta file.
Ref_Name: Efae_aus0004_genome.fa
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/aus/

[MRSA_USA_300]
# Name of reference genome fasta file.
Ref_Name: MRSA_USA_300.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA_300/


[MRSA_USA_100]
# Name of reference genome fasta file.
Ref_Name: MRSA_USA_100_1.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA_100/
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/KPNIH1_test/

[CDIFF_630]
# Name of reference genome fasta file.
Ref_Name: cdiff_630.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630/

[CFT073]
# Name of reference genome fasta file.
Ref_Name: EscherichiacoliCFT073.fna
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CFT073

[paris]
# Name of reference genome fasta file.
Ref_Name: Paris.fna
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/legionella/paris/
Binary file modified config_settings.pyc
Binary file not shown.
Binary file added img/barplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/barplot_DP.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/core_results_dir.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/core_variants.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/filtered_positions.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/phage_fq_mq.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/pipeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/pipeline_All.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/pipeline_core_All.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/reference_allele.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/unmapped_positions.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified modules/__init__.pyc
Binary file not shown.
Loading

0 comments on commit 7649435

Please sign in to comment.