Skip to content

Commit

Permalink
Matrix changes and GATK Indel integration
Browse files Browse the repository at this point in the history
  • Loading branch information
alipirani88 committed Dec 19, 2018
1 parent e9bb7d2 commit 053aea1
Show file tree
Hide file tree
Showing 65 changed files with 5,021 additions and 3,192 deletions.
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked.fa
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_invar_site_counts.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_snp-sites.vcf
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_gubbins_masked_var_sites.fa
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_invar_site_counts.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_masked_recomb_positions.txt
modules/beast/test/2018_08_15_10_45_01_KPNIH1_ref_allele_unmapped_consensus_snp-sites.vcf
modules/beast/test/input_beast.txt
modules/beast/test/invar_base_counts.txt
modules/beast/test/model_finder_IQTREE_noDec_wDates_LA.tree
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2_renamed.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta2.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed_st_invSites.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta_renamed.xml
modules/beast/test/penn-st258_LA_bmt_ucln_bs_dta.xml
modules/beast/test/test_commands.sh
modules/beast/test/test_fasta_path.txt
modules/beast/test/test_gff_path.txt
194 changes: 77 additions & 117 deletions .idea/workspace.xml

Large diffs are not rendered by default.

207 changes: 56 additions & 151 deletions README.md

Large diffs are not rendered by default.

Binary file removed barplot.png
Binary file not shown.
Binary file removed barplot_DP.png
Binary file not shown.
49 changes: 48 additions & 1 deletion config_ali
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,31 @@ fq2: 0.025
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 100
# Filter variants with a GATK QualByDepth (QD) value less than the below threshold. Currently applied to indel calls only.
qd: 2.00
# Filter variants with AF1 less than the below threshold
af: 0.900
# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero).
prox: 0


[gatk_haplotypecaller_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
# Filter variants with Depth less than the below threshold
dp: 9
# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
low_depth: 2
# A value of 5 means that regions with 5x depth greater than the average coverage will fail
high_depth: 5
# FQ not represented in GATK Haplotype caller vcf format. Instead use AF.
# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 2
# Filter variants with AF1 less than the below threshold
af: 0.9
# Filter variants that lie within this distance of each other. To turn this off, use 0 (zero).
prox: 0

[rna_filters]
Expand Down Expand Up @@ -274,6 +296,12 @@ Ref_Name: cdiff_630.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630/

[CDIFF_630_ncbi]
# Name of reference genome fasta file.
Ref_Name: cdiff_630.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630_ncbi/

[Cdiff_VPI10463]
# Name of reference genome fasta file.
Ref_Name: Cdiff_VPI10463.fasta
Expand Down Expand Up @@ -974,3 +1002,22 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_Newman/
Ref_Name: AB030.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/


[Steno_K279a]
# Name of reference genome fasta file.
Ref_Name: Steno_K279a.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/

[lactobacillus_crispatus_ST1]
# Name of reference genome fasta file.
Ref_Name: lactobacillus_crispatus_ST1.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/reference_genome/lactobacillus_crispatus_ST1/

[USA500-2395]
# Name of reference genome fasta file.
Ref_Name: USA500-2395.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/
237 changes: 237 additions & 0 deletions config_gatk
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
## Set which tools to use in the pipeline:
[pipeline]
# Options for aligner: bwa / smalt / bowtie
aligner: bwa
# Options for variant_caller: samtoolswithpostalignbam / gatkhaplotypecaller / samtools
variant_caller: samtools

## Set the bin folder path. Please make sure all the executables are placed in the bin folder. Also make sure the paths for the individual tools are correct.
[bin_path]
binbase: /nfs/esnitkin/bin_group/variant_calling_bin/

## Set PBS scheduler fields.
[scheduler]
# This small resource will be used for smaller jobs
resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00
# Large cluster resources in case of large sample size/variant call sets
large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00
email: [email protected]
queue: flux
flux_account: esnitkin_flux
notification: ae

## Tools/Module Parameters
# Set parameters for individual tools. Set the bin folder of each tool: this should be the name of the folder where that tool's executables reside.
# Set parameters for Trimmomatic
[Trimmomatic]
trimmomatic_bin: /Trimmomatic/
adaptor_filepath: adapters/TruSeq3-Nextera_PE_combined.fa
seed_mismatches: 2
palindrome_clipthreshold: 30
simple_clipthreshold: 10
minadapterlength: 8
# NOTE(review): originally a reminder to try "true" and check the effect on alignment; the value is currently true.
keep_both_reads: true
window_size: 4
window_size_quality: 20
minlength: 40
headcrop_length: 0
colon: :
targetlength: 125
crop_length: 40
f_p: forward_paired.fq.gz
f_up: forward_unpaired.fq.gz
r_p: reverse_paired.fq.gz
r_up: reverse_unpaired.fq.gz

[bwa]
bwa_bin: /bwa-0.7.12/
cores: 8
base_cmd: bwa
algorithm: mem
index: index
RG_header: -R
Mark_splithits: -M

[bowtie]
bowtie_bin: /bowtie2-2.2.6/
cores: 8
build_cmd: bowtie2-build
align_cmd: bowtie2
parameters: -k 1 --non-deterministic --end-to-end

[samtools]
samtools_bin: /samtools-1.2/
base_cmd: samtools
# NOTE(review): leftover notes: "minimum mapping quality" and "change parameter S to -t SP and D to -t DP".
# Neither option appears in mpileup_parameters below; confirm whether these notes still apply.
mpileup_parameters: -ug -f
faiindex: faidx
# NOTE(review): unused option string kept for reference, presumably alternative mpileup flags: -q30 -B -E -C50

[bcftools]
bcftools_bin: /bcftools-1.2/
base_cmd: bcftools
call_parameters: -vg

[picard]
picard_bin: /picard-tools-2.5.0/
base_cmd: picard.jar

[gatk]
gatk_bin: /GenomeAnalysisTK-3.3-0/
base_cmd: GenomeAnalysisTK.jar
haplotype_parameters: -T HaplotypeCaller
#gatk_filter1_parameter_expression: FQ < 40.00 && MQ > 20 && QUAL > 50 && DP > 15
#gatk_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15
#Deprecated. gatk_haplotypecaller Integration Pending for SNP calling.
#gatk_haplotypecaller_specific_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15

[vcftools]
# Folder paths for vcftools (bin and perl directories) and tabix.
vcftools_bin: /vcftools_0.1.12b/bin/
tabix_bin: /tabix-0.2.6/
vcftools_perl_bin: /vcftools_0.1.12b/perl/

[qualimap]
qualimap_bin: /qualimap_v2.1/
base_cmd: qualimap

[bedtools]
bedtools_bin: /bedtools2-master/bin/
base_cmd: bedtools
version_for_coverage: /version_for_coverage/

[bioawk]
bioawk_bin: /bioawk-master/
base_cmd: bioawk

[fasttree]
fasttree_bin: /Fasttree_2.1.10/
#For Multithread fasttree; use FastTreeMP executable file
base_cmd: FastTree

[raxml]
raxml_bin: /raxml/
openmpi_bin: /openmpi/bin/
# Other raxml executable available to use: raxmlHPC-PTHREADS,raxmlHPC-PTHREADS-SSE3,raxmlHPC-SSE3
base_cmd: raxmlHPC-HYBRID-SSE3
parameters: -f a -x 12345 -p 12345 -N autoMRE -m GTRCAT -T 20

[iqtree]
iqtree_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
base_cmd: iqtree
parameters: -nt AUTO -bb 1000 -m GTR+G+ASC

[gubbins]
# Change this path to wherever gubbins is located/installed. Right now, installed using conda from anaconda3 package installed in bin_group.
gubbins_bin: /nfs/esnitkin/bin_group/anaconda3/bin/
base_cmd: run_gubbins.py

[mummer]
mummer_bin: /MUMmer3.23/
nucmer_base_cmd: nucmer
min_tandem_repeat_length: 20
percent_id: 95

## Variant Filters
# Select which type of filters to use. Default is snitkin_filters. See the snitkin_filters section below for more information about each filter criterion.
[SNP_filters]
filter_criteria: snitkin_filters
# Other types of filters: SPANDx_filters, loose_filters, snitkin_filters, contamination_filters

## Filters used for most of the Microbial Variant calling Projects.
[snitkin_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
# Filter variants with Depth less than the below threshold
dp: 9
# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
low_depth: 2
# A value of 5 means that regions with 5x depth greater than the average coverage will fail
high_depth: 5
# Filter variants with FQ(Consensus Quality) greater than the below threshold
fq: 0.025
fq2: 0.025
# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 100
# Filter variants with a GATK QualByDepth (QD) value less than the below threshold. Currently applied to indel calls only.
qd: 2.00
# Filter variants with AF1 less than the below threshold
af: 0.900
# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero).
prox: 0

[contamination_filters]
avg_depth: no
# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used.
# Filter variants with Depth less than the below threshold
dp: 3
# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail
low_depth: 2
# A value of 5 means that regions with 5x depth greater than the average coverage will fail
high_depth: 5
# Filter variants with FQ(Consensus Quality) greater than the below threshold
fq: -20.00
fq2: -20.00
# Filter variants with MQ(Root Mean Square Quality) less than the below threshold
mq: 50
# Filter variants with Variant QUAL less than the below threshold
qual: 0
# Filter variants with AF1 less than the below threshold
af: 1
# Filter variants that lie within this distance of each other. To turn this off, use 0 (zero).
prox: 1

## Functional class filters: find phage and repetitive regions in the reference genome, mask regions of the reference genome so that variants are not called against them, and mask features with mobile element annotations.
[functional_filters]
# If apply_functional_filters is set to no, the pipeline will not find Phage/Repeat regions and Mask positions provided in mask_file.
apply_functional_filters: yes
find_phage_region: yes
find_repetitive_region: yes
mask_region: no
mask_file: /nfs/esnitkin/bin_group/variant_calling_bin/reference/test_file.txt
mobile_elements: yes
# If apply_to_calls is set to no, PHAGE/REPEAT/MASK will still run but will not remove calls falling in this region.
apply_to_calls: yes

##SNP annotations
[snpeff]
snpeff_bin: /snpEff/
base_cmd: snpEff.jar
snpeff_parameters: -d -no-downstream -no-upstream
prebuild: no
db: Staphylococcus_aureus_subsp_aureus_usa300_tch1516
dataDir: /data/

#CLUSTER_SNP: 3
#CLUSTER_WINDOW_SNP: 10
#MLEAF_SNP: 0.95
#QD_SNP: 10.0
#FS_SNP: 10.0
#HAPLO_SNP: 20.0

########################################################################################################################

# Reference Genome to be used for pipeline
# Set path for already indexed reference genome

# Most Frequently used reference genomes

# Name of the reference genome. Provide this value with -index argument.
[KPNIH1]
# Name of reference genome fasta file.
Ref_Name: KPNIH1.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/KPNIH1_test/


[CFT073]
# Name of reference genome fasta file.
Ref_Name: EscherichiacoliCFT073.fna
# path to the reference genome fasta file.
# The trailing slash matches every other Ref_Path entry, so that directory + Ref_Name forms a valid file path.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CFT073/

Loading

0 comments on commit 053aea1

Please sign in to comment.