Skip to content

Commit

Permalink
Matrix changes and GATK Indel integration
Browse files Browse the repository at this point in the history
  • Loading branch information
alipirani88 committed Dec 19, 2018
2 parents 053aea1 + 7b2c92a commit a72aa46
Show file tree
Hide file tree
Showing 9 changed files with 234 additions and 2 deletions.
7 changes: 5 additions & 2 deletions config_ali
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ binbase: /nfs/esnitkin/bin_group/variant_calling_bin/
resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00
large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00
email: [email protected]
queue: flux
flux_account: esnitkin_flux
queue: fluxod
flux_account: esnitkin_fluxod
notification: ae

# Set Parameters for individual tools. Set the binbase of each tool: This should be the folder name of respective tools where the executables for each resp. tool resides.
Expand Down Expand Up @@ -1009,6 +1009,7 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/
Ref_Name: Steno_K279a.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/
<<<<<<< HEAD

[lactobacillus_crispatus_ST1]
# Name of reference genome fasta file.
Expand All @@ -1021,3 +1022,5 @@ Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/
Ref_Name: USA500-2395.fasta
# path to the reference genome fasta file.
Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/
=======
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
29 changes: 29 additions & 0 deletions modules/beast/beast.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
import argparse
from subprocess import call
import re
<<<<<<< HEAD
from fasta_functions import *
=======
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be

# Get command line arguments
parser = argparse.ArgumentParser(description='Run BEAST pipeline.')
Expand All @@ -27,17 +30,43 @@
tree = args.tree
scripts_dir = args.scripts_dir

<<<<<<< HEAD
=======
# Set paths
python = '/nfs/esnitkin/bin_group/anaconda3/bin/python'
mask_vars_path = '/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/mask_gubbins_variants.py'

>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
# Get invariant site counts
if invar_sites is not None:
if not invar_sites.endswith(tuple(['fa','fasta'])):
# Path to file with invariant site counts
invar_counts_file = invar_sites
else:
<<<<<<< HEAD
# Count invariant sites in whole genome alignment
invar_counts_file = count_invar_sites(invar_sites, gubbins_gff)
with open(invar_counts_file) as f:
invar_counts = f.readline().strip()

=======
# Mask recombinant regions before counting invariant sites
if gubbins_gff is not None:
call([python, mask_vars_path, invar_sites, gubbins_gff])
aln_file = re.split('/|\.', invar_sites)[-2] + \
'_gubbins_masked.fa'
else:
aln_file = invar_sites
# Count invariant sites in whole genome alignment
call([python, scripts_dir + '/get_num_invariant_sites_v2.py', aln_file])
invar_counts_file = re.split('/|\.', aln_file)[-2] + \
'_invar_site_counts.txt'
# Read file with invariant site counts
with open(invar_counts_file) as f:
invar_counts = f.readline().strip()


>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
# Final xmls for beast
xmls_for_beast = []

Expand Down
13 changes: 13 additions & 0 deletions modules/beast/beast_pbs_flux.pl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
#Takes as input:
# 1) config_file - A 1-column file where each line is the file name (minus the .xml extension) for a beast run
# UPDATED 2017-09-21 from beast2/2.4.5 to beast2/2.4.7
<<<<<<< HEAD
# UPDATED 2018-11-19 from beast2/2.4.7 to /nfs/esnitkin/bin_group/beast_v2.5.1/beast/bin/beast
=======
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
#PRINT USAGE
if(@ARGV == 0)
Expand Down Expand Up @@ -75,13 +78,23 @@
print PBS "cd $out_dir";
print PBS "\n";
print PBS "#Load modules\n";
<<<<<<< HEAD
#print PBS "module load beast2/2.4.7\n";
print PBS "module load beagle\n";
#print PBS "addonmanager -add BEAST_CLASSIC\n";
#print PBS "addonmanager -add SCOTTI\n";
print PBS "\n";
print PBS "# Put your job commands here:\n";
print PBS "/nfs/esnitkin/bin_group/beast_v2.5.1/beast/bin/beast -beagle_SSE -instances 12 -threads 12 beast$count\_$fnameorig\n";
=======
print PBS "module load beast2/2.4.7\n";
print PBS "module load beagle\n";
#print PBS "addonmanager -add BEAST_CLASSIC\n";
#print PBS "addonmanager -add SCOTTI\n";
print PBS "\n";
print PBS "# Put your job commands here:\n";
print PBS "beast -beagle_SSE -instances 12 -threads 12 beast$count\_$fnameorig\n";
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
print PBS "\n";
close PBS;
Expand Down
53 changes: 53 additions & 0 deletions modules/beast/fasta_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,25 @@
import re
import os

<<<<<<< HEAD
def subset_fasta(ids, fastain, fastaout):
"""Creates a new fasta file with a subset of original sequences.
Args:
idnames: List of sequence names to be extracted.
=======
def subset_fasta(idnames, fastain, fastaout):
"""Creates a new fasta file with a subset of original sequences.

Args:
idnames: Text file with list of names of sequences to be extracted.
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
fastain: Path to fasta file containing all sequences of interest.
fastaout: Name of output fasta file containing only the sequences of interest.

Output:
Fasta file containing only the sequences of interest.
<<<<<<< HEAD
"""
with open(fastaout, 'w') as f:
Expand Down Expand Up @@ -62,6 +71,28 @@ def get_fasta_subsets(csv, fasta):
fas.append(fa)
return(fas)
=======
Returns:
Path to subsetted fasta file.
"""

ids = []

for line in open(idnames, 'r'):
if line.strip() == 'x':
pass
else:
ids.append(line.strip())

with open(fastaout, 'w') as f:
for rec in SeqIO.parse(fastain, 'fasta'):
if str(rec.id) in ids:
f.write('>' + names[ids.index(rec.id)] + '\n')
f.write(str(rec.seq) + '\n')

return(fastaout)
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be


def rm_invar_sites(fasta, outfile=None, outfmt=['fasta','vcf'], outdir='.',
Expand All @@ -82,7 +113,11 @@ def rm_invar_sites(fasta, outfile=None, outfmt=['fasta','vcf'], outdir='.',
Path to output file.
"""

<<<<<<< HEAD
if len(outfmt) > 1:
=======
if len(outfmt) > 1:
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
outfmt = outfmt[0]

# Get output fasta file name
Expand Down Expand Up @@ -237,13 +272,18 @@ def count_invar_sites(fasta,gff=None,outdir='.',path={'snpsites':'/nfs/esnitkin/
if not li.startswith("#"):
positions.append(line.split('\t')[1])

<<<<<<< HEAD
# Get allele for invariant sites
invar_sites = []
=======
# Get ref allele for invariant sites
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
for record in aln:
seq_str = list(str(record.seq))
for index in positions:
index = int(index)
seq_str[index] = ''
<<<<<<< HEAD
#tmp = ''.join(seq_str)
if len(invar_sites) == 0:
invar_sites = seq_str
Expand All @@ -260,16 +300,29 @@ def count_invar_sites(fasta,gff=None,outdir='.',path={'snpsites':'/nfs/esnitkin/
# Get invariant site count for each base
#print('Counting bases.')
#invar_counts = Counter(invar_sites)
=======
invar_sites = ''.join(seq_str)
break

# Get invariant site count for each base
print('Counting bases.')
invar_counts = Counter(invar_sites)
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be

# Write base counts to files
invar_counts_file = outdir + '/' + re.split('/|\.', aln_file)[-2] + \
'_invar_site_counts.txt'
print('Writing ', invar_counts_file, ' (order: A C G T).', sep='')
with open(invar_counts_file, 'w') as f:
for base, count in sorted(invar_counts.items()):
<<<<<<< HEAD
#if base in ['A','a','C','c','G','g','T','t']:
print(base + ' ' + str(count))
f.write('%s ' % (count))
#else:
# print(base + ' ' + str(count))
=======
f.write('%s ' % (count))

>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
return(invar_counts_file)
30 changes: 30 additions & 0 deletions modules/beast/scotti.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
<<<<<<< HEAD
# PIPELINE TO RUN BEAST ON FLUX
=======
# PIPELINE TO RUN SCOTTI ON FLUX
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be

# Import modules
import sys
import os
import argparse
from subprocess import call
import re
<<<<<<< HEAD
from fasta_functions import *
from scotti_functions import *

Expand Down Expand Up @@ -106,3 +111,28 @@
print('Generating PBS script(s) and submitting jobs to flux.')
call(['perl', scripts_dir + '/beast_pbs_flux.pl', 'input_beast.txt'])

=======

# Get command line arguments
parser = argparse.ArgumentParser(description='Generate SCOTTI XMLs and run SCOTTI.')
parser.add_argument('xmls', nargs='*', help='input SCOTTI BEAST xml file(s)')
parser.add_argument('--nReps', '-n', metavar='N', type=int, help='number of beast runs for each xml', default=3)
parser.add_argument('--invarSites', '-i', metavar='FILE', help='''
option 1: fasta file to get number of invariant As, Cs, Gs, and Ts not in alignment
option 2: text file with number of invariant As, Cs, Gs, and Ts in alignment''')
parser.add_argument('--gubbinsGFF', '-g', metavar='GFF', help='''gubbins output GFF file to mask recombinant sites in alignment before counting invariant sites''')
parser.add_argument('--tree', '-t', help='starting tree file in newick format')
parser.add_argument('-scriptsDir', '-d', metavar='DIR', help='directory where all of the scripts are housed', default='/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/beast')
args = parser.parse_args()
# Rename variables
xmls = args.xmls
nReps = args.nReps
invarSites = args.invarSites
gubbinsGFF = args.gubbinsGFF
tree = args.tree
scriptsDir = args.scriptsDir

# Set paths
python = '/nfs/esnitkin/bin_group/anaconda3/bin/python'
maskVarsPath = '/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/mask_gubbins_variants.py'
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
29 changes: 29 additions & 0 deletions modules/stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,40 @@ def qualimap(out_sorted_bam, out_path, analysis, logger, Config):
""" Variant Filteration """
def filter_variants(final_raw_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']
<<<<<<< HEAD
gatk_filter_final_vcf_file = gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
#gatk_filter_final_vcf_contamination_file = gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
gatk_filter_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter_final_vcf_file, out_path, analysis, reference, logger, Config)
keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter_final_vcf_file_no_proximate_snp), logger, 'debug')

=======
gatk_filter2_final_vcf_file = gatk_filter2(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
gatk_filter2_final_vcf_contamination_file = gatk_filter2_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), logger, 'debug')
#gatk_vcf2fasta_filter2_file = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
#gatk_vcf2fasta_filter2_file_no_proximate = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
#vcftools_vcf2fasta_filter2_file = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
#vcftools_vcf2fasta_filter2_file_no_proximate = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
#keep_logging('The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), 'The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), logger, 'debug')
#keep_logging('The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
#keep_logging('The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), 'The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), logger, 'debug')
#keep_logging('The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), logger, 'debug')

def filter2_indels(final_raw_indel_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']
gatk_filter2_final_vcf_file = gatk_filter2_indel(final_raw_indel_vcf, out_path, analysis, reference, logger, Config, Avg_dp)
# gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
# keep_logging('The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), 'The vcf file with no proximate snp: {}'.format(gatk_filter2_final_vcf_file_no_proximate_snp), logger, 'debug')
# gatk_vcf2fasta_filter2_file = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
# gatk_vcf2fasta_filter2_file_no_proximate = gatk_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
# vcftools_vcf2fasta_filter2_file = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file, out_path, analysis, reference, logger, Config)
# vcftools_vcf2fasta_filter2_file_no_proximate = vcftools_vcf2fasta_filter2(gatk_filter2_final_vcf_file_no_proximate_snp, out_path, analysis, reference, logger, Config)
# keep_logging('The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), 'The final Consensus Fasta file: {}'.format(gatk_vcf2fasta_filter2_file), logger, 'debug')
# keep_logging('The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file with no proximate: {}'.format(gatk_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
# keep_logging('The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), 'The final Consensus Fasta file from VCF-consensus: {}'.format(vcftools_vcf2fasta_filter2_file), logger, 'debug')
# keep_logging('The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), 'The final Consensus Fasta file from VCF-consensus with no proximate: {}'.format(vcftools_vcf2fasta_filter2_file_no_proximate), logger, 'debug')
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be

def filter_indels(final_raw_indel_vcf, out_path, analysis, ref_index, logger, Config, Avg_dp):
reference = ConfigSectionMap(ref_index, Config)['ref_path'] + "/" + ConfigSectionMap(ref_index, Config)['ref_name']
Expand Down
14 changes: 14 additions & 0 deletions modules/variant_diagnostics/core_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3386,7 +3386,11 @@ def someOtherFunc(data, key):
files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
tabix(files_for_tabix, "vcf", logger, Config)

<<<<<<< HEAD
# # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
=======
# Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
tmp_dir = "/tmp/temp_%s/" % log_unique_time
create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)
create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir)
Expand Down Expand Up @@ -3681,6 +3685,10 @@ def someOtherFunc(data, key):
call("%s" % prepare_ref_var_consensus_input_cmd, logger)
call("%s" % prepare_var_consensus_input_cmd, logger)
call("%s" % prepare_allele_var_consensus_input_cmd, logger)
<<<<<<< HEAD
=======
#call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger)
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger)


Expand Down Expand Up @@ -3713,9 +3721,15 @@ def someOtherFunc(data, key):
# # Read new allele matrix and generate fasta; generate a seperate function
keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info')

<<<<<<< HEAD
# create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file)
#
# extract_only_ref_variant_fasta_from_reference_allele_variant()
=======
#create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file)

#extract_only_ref_variant_fasta_from_reference_allele_variant()
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be

call("cp %s %s/Logs/core/" % (
log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger)
Expand Down
4 changes: 4 additions & 0 deletions pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,11 @@ def cleanup(args, logger):
files_to_delete = []
Config = ConfigParser.ConfigParser()
Config.read(config_file)
<<<<<<< HEAD
#pipeline(args, logger)
=======
pipeline(args, logger)
>>>>>>> 7b2c92a5aa76895d2a815d185b3d861945add2be
cleanup(args, logger)
keep_logging('End: Pipeline', 'End: Pipeline', logger, 'info')
time_taken = datetime.now() - start_time_2
Expand Down
Loading

0 comments on commit a72aa46

Please sign in to comment.