diff --git a/GenomeAnalysisTK.jar b/GenomeAnalysisTK.jar new file mode 100644 index 0000000..33d2225 Binary files /dev/null and b/GenomeAnalysisTK.jar differ diff --git a/config_ali b/config_ali index e763fca..adff5d3 100755 --- a/config_ali +++ b/config_ali @@ -11,24 +11,24 @@ binbase: /nfs/esnitkin/bin_group/variant_calling_bin/ #resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 [scheduler] -resources: nodes=1:ppn=4,pmem=4000mb,walltime=50:00:00 -large_resources: nodes=1:ppn=12,mem=47gb,walltime=50:00:00 +resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00 +large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 email: apirani@med.umich.edu queue: flux flux_account: esnitkin_flux notification: a [flux] -resources: nodes=1:ppn=4,pmem=4000mb,walltime=50:00:00 -large_resources: nodes=1:ppn=12,mem=47gb,walltime=50:00:00 +resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00 +large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 email: apirani@med.umich.edu queue: flux flux_account: esnitkin_flux notification: a [slurm] -resources: nodes=1:ppn=4,pmem=4000mb,walltime=50:00:00 -large_resources: nodes=1:ppn=12,mem=47gb,walltime=50:00:00 +resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00 +large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 email: apirani@med.umich.edu queue: flux flux_account: esnitkin_flux @@ -1070,9 +1070,3 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Pt0_Chromosome_O Ref_Name: NTHi_86-028NP.fasta # path to the reference genome fasta file. Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/NTHi_86-028NP/ - -[Legionella_SG6_Thunder_Bay] -# Name of reference genome fasta file. -Ref_Name: Legionella_SG6_Thunder_Bay.fasta -# path to the reference genome fasta file. 
-Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Legionella_SG6_Thunder_Bay/ diff --git a/config_ali_backup b/config_ali_gl similarity index 66% rename from config_ali_backup rename to config_ali_gl index 30690e4..ee4b954 100755 --- a/config_ali_backup +++ b/config_ali_gl @@ -6,21 +6,43 @@ aligner: bwa variant_caller: samtools # Set bin folder path. Please make sure all the executables are placed in bin folder. Also make sure the path for individual tools are correct. +# Updated for Great Lakes and Conda Integration [bin_path] -binbase: /nfs/esnitkin/bin_group/variant_calling_bin/ +binbase: /home/apirani/.conda/envs/variantcalling_env_test/bin/ +#binbase: /nfs/esnitkin/bin_group/variant_calling_bin/ + +#resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 [scheduler] -resources: nodes=1:ppn=4,pmem=4000mb,walltime=92:00:00 +resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00 +large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 +email: apirani@med.umich.edu +queue: flux +flux_account: esnitkin_flux +notification: a + +[pbs] +resources: nodes=1:ppn=4,pmem=4000mb,walltime=250:00:00 +large_resources: nodes=1:ppn=12,mem=47gb,walltime=250:00:00 email: apirani@med.umich.edu -queue: fluxod -flux_account: esnitkin_fluxod +queue: flux +flux_account: esnitkin notification: a +[slurm] +resources: --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=5g --time=125:00:00 +large_resources: --nodes=1 --ntasks-per-node=12 --mem=47000mb --time=250:00:00 +email: apirani@med.umich.edu +partition: standard +flux_account: esnitkin1 +notification: BEGIN,END,NONE,FAIL,REQUEUE + # Set Parameters for individual tools. Set the binbase of each tool: This should be the folder name of respective tools where the executables for each resp. tool resides. 
# Set parameter for Trimmomatic +# Updated for Great Lakes and Conda Integration [Trimmomatic] -trimmomatic_bin: /Trimmomatic/ -adaptor_filepath: adapters/TruSeq3-Nextera_PE_combined.fa +trimmomatic_bin: // +adaptor_filepath: /home/apirani/.conda/envs/variantcalling_env_test/share/trimmomatic-0.39-1/adapters/NexteraPE-PE.fa seed_mismatches: 2 palindrome_clipthreshold: 30 simple_clipthreshold: 10 @@ -40,7 +62,7 @@ r_p: reverse_paired.fq.gz r_up: reverse_unpaired.fq.gz [bwa] -bwa_bin: /bwa-0.7.12/ +bwa_bin: // cores: 8 base_cmd: bwa algorithm: mem @@ -49,14 +71,14 @@ RG_header: -R Mark_splithits: -M [bowtie] -bowtie_bin: /bowtie2-2.2.6/ +bowtie_bin: // cores: 8 build_cmd: bowtie2-build align_cmd: bowtie2 parameters: -k 1 --non-deterministic --end-to-end [samtools] -samtools_bin: /samtools-1.2/ +samtools_bin: // base_cmd: samtools #minimum mapping quality #change parameter S to -t SP and D to -t DP @@ -65,62 +87,76 @@ faiindex: faidx #-q30 -B -E -C50 [bcftools] -bcftools_bin: /bcftools-1.2/ +bcftools_bin: // base_cmd: bcftools call_parameters: -vg [picard] -picard_bin: /picard-tools-2.5.0/ -base_cmd: picard.jar +picard_bin: // +base_cmd: picard [gatk] -gatk_bin: /GenomeAnalysisTK-3.3-0/ -base_cmd: GenomeAnalysisTK.jar -haplotype_parameters: -T HaplotypeCaller --genotyping_mode DISCOVERY +gatk_bin: // +base_cmd: gatk +haplotype_parameters: HaplotypeCaller #changes: 12th August gatk_filter1_parameter_expression: FQ < 40.00 && MQ > 20 && QUAL > 50 && DP > 15 gatk_filter2_parameter_expression: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15 #changed gatk_filter2_parameter_expression DP 15 from 10 that was used in VRE samples. 
+# Default Parameters: FQ < 0.025 && MQ > 50 && QUAL > 100 && DP > 15 +# gatk_filter2_parameter_expression: AF1 < 1 && DP > 1 [vcftools] #vcftools_perl_bin: /vcftools_0.1.12b/perl/ -vcftools_bin: /vcftools_0.1.12b/bin/ -tabix_bin: /tabix-0.2.6/ +vcftools_bin: // +tabix_bin: // #vcftools_bin: /vcftools_0.1.12b/bin/ vcftools_perl_bin: /vcftools_0.1.12b/perl/ [qualimap] -qualimap_bin: /qualimap_v2.1/ +qualimap_bin: // base_cmd: qualimap [bedtools] -bedtools_bin: /bedtools2-master/bin/ +bedtools_bin: // base_cmd: bedtools version_for_coverage: /version_for_coverage/ [bioawk] -bioawk_bin: /bioawk-master/ +bioawk_bin: // base_cmd: bioawk [fasttree] -fasttree_bin: /Fasttree_2.1.10/ +fasttree_bin: // #For Multithread fasttree; use FastTreeMP executable file base_cmd: FastTree [raxml] -raxml_bin: /raxml/ +raxml_bin: // +openmpi_bin: // # Other raxml executable available to use: raxmlHPC-PTHREADS,raxmlHPC-PTHREADS-SSE3,raxmlHPC-SSE3 -base_cmd: raxmlHPC-SSE3 +base_cmd: raxmlHPC-HYBRID-SSE3 parameters: -f a -x 12345 -p 12345 -N autoMRE -m GTRCAT -T 20 +[iqtree] +iqtree_bin: // +base_cmd: iqtree +parameters: -nt AUTO -bb 1000 -m GTR+G+ASC + [gubbins] # Change this path to wherever gubbins is located/installed. Right now, installed using conda from anaconda3 package installed in bin_group. -gubbins_bin: /nfs/esnitkin/bin_group/anaconda3/bin/ +gubbins_bin: // base_cmd: run_gubbins.py +[mummer] +mummer_bin: // +nucmer_base_cmd: nucmer +min_tandem_repeat_length: 20 +percent_id: 95 + [SNP_filters] filter_criteria: snitkin_filters -# Other Criterias: SPANDx_filters, loose_filters, snitkin_filters +# Other Criterias: SPANDx_filters, loose_filters, snitkin_filters, contamination_filters [snitkin_filters] avg_depth: no @@ -138,8 +174,83 @@ fq2: 0.025 mq: 50 # Filter variants with Variant QUAL less than the below threshold qual: 100 +# Filter variants with GATK QualbyDepth QD parameter; filter less than the below threshold. Currently, being used for Indel SNPS only. 
+qd: 2.00 +# Filter variants with AF1 less than the below threshold +af: 0.900 +# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero). +prox: 0 + + +[gatk_haplotypecaller_filters] +avg_depth: no +# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used. +# Filter variants with Depth less than the below threshold +dp: 9 +# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail +low_depth: 2 +# A value of 5 means that regions with 5x depth greater than the average coverage will fail +high_depth: 5 +# FQ not represented in GATK Haplotype caller vcf format. Instead use AF. +# Filter variants with MQ(Root Mean Square Quality) less than the below threshold +mq: 50 +# Filter variants with Variant QUAL less than the below threshold +qual: 2 +# Filter variants with AF1 less than the below threshold +af: 0.9 +# Filter Variants that are proximate to each other within this number of range. To turn this off, use 0(zero). +prox: 0 + +[rna_filters] +avg_depth: no +# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used. 
+# Filter variants with Depth less than the below threshold +dp: 3 +# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail +low_depth: 2 +# A value of 5 means that regions with 5x depth greater than the average coverage will fail +high_depth: 5 +# Filter variants with FQ(Consensus Quality) greater than the below threshold +fq: 0.00 +fq2: 0.00 +# Filter variants with MQ(Root Mean Square Quality) less than the below threshold +mq: 50 +# Filter variants with Variant QUAL less than the below threshold +qual: 0 +# Filter variants with AF1 less than the below threshold +af: 0.9 # Filter Variants that are proximate to each other -prox: 10 +prox: 1 + +[contamination_filters] +avg_depth: no +# If AVG_DEPTH is yes, the below DP threshold will be ignored. Instead, LOW_DEPTH and HIGH_DEPTH filter parameter will be used. +# Filter variants with Depth less than the below threshold +dp: 3 +# A value of 2 means that regions with less than half of the average coverage of the entire genome will fail +low_depth: 2 +# A value of 5 means that regions with 5x depth greater than the average coverage will fail +high_depth: 5 +# Filter variants with FQ(Consensus Quality) greater than the below threshold +fq: -20.00 +fq2: -20.00 +# Filter variants with MQ(Root Mean Square Quality) less than the below threshold +mq: 50 +# Filter variants with Variant QUAL less than the below threshold +qual: 0 +# Filter variants with AF1 less than the below threshold +af: 1 +# Filter Variants that are proximate to each other +prox: 1 + +[functional_filters] +apply_functional_filters: yes +find_phage_region: yes +find_repetitive_region: yes +mask_region: no +mask_file: /nfs/esnitkin/bin_group/variant_calling_bin/reference/test_file.txt +mobile_elements: yes +apply_to_calls: yes #CLUSTER_SNP: 3 @@ -151,9 +262,11 @@ prox: 10 ##SNP annotations [snpeff] -snpeff_bin: /snpEff/ -base_cmd: snpEff.jar +snpeff_bin: // +base_cmd: snpEff snpeff_parameters: -d 
-no-downstream -no-upstream +prebuild: no +db: Staphylococcus_aureus_subsp_aureus_usa300_tch1516 dataDir: /data/ ######################################################################################################################## @@ -168,7 +281,7 @@ dataDir: /data/ # Name of reference genome fasta file. Ref_Name: KPNIH1.fasta # path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/KPNIH1/ +Ref_Path: /scratch/esnitkin_root/esnitkin/apirani/Testing_pipelines/reference/KPNIH1/ # Name of the reference genome. Provide this value with -index_name argument. [aus] @@ -183,6 +296,7 @@ Ref_Name: MRSA_USA_300.fasta # path to the reference genome fasta file. Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA_300/ + [FPR3757] # Name of reference genome fasta file. Ref_Name: FPR3757.fasta @@ -194,6 +308,7 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/FPR3757/ Ref_Name: MRSA_USA_100_1.fasta # path to the reference genome fasta file. Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA_100/ +Gff_Name: MRSA_USA_100.gff [CDIFF_630] # Name of reference genome fasta file. @@ -201,6 +316,30 @@ Ref_Name: cdiff_630.fasta # path to the reference genome fasta file. Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630/ +[CDIFF_630_ncbi] +# Name of reference genome fasta file. +Ref_Name: cdiff_630.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/CDIFF_630_ncbi/ + +[Cdiff_VPI10463] +# Name of reference genome fasta file. +Ref_Name: Cdiff_VPI10463.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_VPI10463/ + +[Cdiff_O27_R20291] +# Name of reference genome fasta file. +Ref_Name: Cdiff_O27_R20291.fasta +# path to the reference genome fasta file. 
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_O27_R20291/ + +[Cdiff_O14_PH44] +# Name of reference genome fasta file. +Ref_Name: Cdiff_O14_PH44.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_O14_PH44/ + [CFT073] # Name of reference genome fasta file. Ref_Name: EscherichiacoliCFT073.fna @@ -353,81 +492,14 @@ Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/H358/ ######################################################################################################################## -# KPC Cluster Specific Reference Genomes - -[313] -# Name of reference genome fasta file. -Ref_Name: 313_R1_001_final_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_108] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_108_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_110] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_110_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_157] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_157_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_212] -# Name of reference genome fasta file. 
-Ref_Name: Rush_KPC_212_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_216] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_216_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_233] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_233_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_412] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_412_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_48] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_48_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_52] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_52_R1_l500_contigs.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -[Rush_KPC_586] -# Name of reference genome fasta file. -Ref_Name: Rush_KPC_586_R1_l500_contigs.fasta -# path to the reference genome fasta file. 
-Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ +##Ecoli Reference Genomes -[Rush_KPC_653] +# ST131 reference Genome +[EC958] # Name of reference genome fasta file. -Ref_Name: Rush_KPC_653_R1_l500_contigs.fasta +Ref_Name: EC958.fasta # path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/Ali/Project_KPC_LTACH_analysis/Analysis/Project_KPC_all_KPC_2008+Hiseq+Latest/2017-01-17-Variant-Calling_Strain_Specific/data/reference/ - -##Ecoli Reference Genomes +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/EC958 [Ecoli_CD306] # Name of reference genome fasta file. @@ -481,16 +553,17 @@ Ref_Name: 803-A001-120-N-MRSA-TIP__ordered.fasta # path to the reference genome fasta file. Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/803-A001-120-N-MRSA-TIP +## This Efaecium settings is an old reference genome version. Not much difference except the contigs in the latest one is combined with gene start spacers seperating them out. ##Efaecium NH cluster-based Reference Genomes -[3399-4058-0-RVRE] -# Name of reference genome fasta file. -Ref_Name: 3399-4058-0-RVRE_ordered.fasta -# path to the reference genome fasta file. -Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/3399-4058-0-RVRE +##[3399-4058-0-RVRE] +### Name of reference genome fasta file. +##Ref_Name: 3399-4058-0-RVRE_ordered.fasta +### path to the reference genome fasta file. +##Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/3399-4058-0-RVRE ##Efaecium NH cluster-based Reference Genomes -[3399-4058-0-RVRE_latest] +[3399-4058-0-RVRE] # Name of reference genome fasta file. Ref_Name: 3399-4058-0-RVRE_final_ordered.fasta # path to the reference genome fasta file. 
@@ -802,6 +875,202 @@ Ref_Path: /scratch/esnitkin_fluxod/apirani/Project_Ecoli_KO/Analysis/2017_11_07_ ##LTACH latest clusters -/home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_1_Rush_KPC_194/Rush_KPC_194_contigs_ordered.fasta -/home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_2_Rush_KPC_350/Rush_KPC_350_S8_contigs_ordered.fasta -/home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_8_Rush_KPC_21/Rush_KPC_21_S35_contigs_ordered.fasta +[cluster_1_Rush_KPC_194] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_194_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_1_Rush_KPC_194/ + +[cluster_2_Rush_KPC_350] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_350_S8_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_2_Rush_KPC_350/ + +[cluster_3_Rush_KPC_586] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_586_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_3_Rush_KPC_586/ + +[cluster_4_Rush_KPC_227] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_227_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_4_Rush_KPC_227/ + +[cluster_5_Rush_KPC_108] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_108_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_5_Rush_KPC_108/ + +[cluster_6_Rush_KPC_113] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_113_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_6_Rush_KPC_113/ + +[cluster_7_Rush_KPC_157] +# Name of reference genome fasta file. 
+Ref_Name: Rush_KPC_157_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_7_Rush_KPC_157/ + +[cluster_8_Rush_KPC_21] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_21_S35_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_8_Rush_KPC_21/ + +[cluster_9_Rush_KPC_48] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_48_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_9_Rush_KPC_48/ + +[cluster_10_Rush_KPC_52] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_52_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_10_Rush_KPC_52/ + +[cluster_11_Rush_KPC_216] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_216_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_11_Rush_KPC_216/ + +[cluster_12_Rush_KPC_790] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_790_S58_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_12_Rush_KPC_790/ + +[cluster_13_Rush_KPC_233] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_233_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_13_Rush_KPC_233/ + +[cluster_14_Rush_KPC_340] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_340_S13_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_14_Rush_KPC_340/ + +[cluster_15_Rush_KPC_38] +# Name of reference genome fasta file. 
+Ref_Name: Rush_KPC_38_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_15_Rush_KPC_38/ + +[cluster_16_Rush_KPC_430] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_430_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_16_Rush_KPC_430/ + +[cluster_17_Rush_KPC_615] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_615_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_17_Rush_KPC_615/ + +[cluster_18_Rush_KPC_574] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_574_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_18_Rush_KPC_574/ + +[cluster_19_Rush_KPC_60] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_60_S27_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_19_Rush_KPC_60/ + +[cluster_20_Rush_KPC_85] +# Name of reference genome fasta file. +Ref_Name: Rush_KPC_85_contigs_ordered.fasta +# path to the reference genome fasta file. +Ref_Path: /home/apirani/bin/reference/KPC_LTACH_clusters_2/cluster_20_Rush_KPC_85/ + +[SRR3334137] +# Name of reference genome fasta file. +Ref_Name: SRR3334137.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/SRR3334137/ + + + +##Maddeline Helicobacter project +[helicobacter_hepaticus_ATCC_51449] +# Name of reference genome fasta file. +Ref_Name: helicobacter_hepaticus_ATCC_51449.fasta +# path to the reference genome fasta file. 
+Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/helicobacter_hepaticus_ATCC_51449/ + + +##Steph Lab Confirmation Project +[MRSA_USA300_FPR3757] +# Name of reference genome fasta file. +Ref_Name: MRSA_USA300_FPR3757.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_USA300_FPR3757/ + +[MRSA_Newman] +# Name of reference genome fasta file. +Ref_Name: MRSA_Newman.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/MRSA_Newman/ + +[AB030] +# Name of reference genome fasta file. +Ref_Name: AB030.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/AB030/ + + +[Steno_K279a] +# Name of reference genome fasta file. +Ref_Name: Steno_K279a.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Steno_K279a/ + +[lactobacillus_crispatus_ST1] +# Name of reference genome fasta file. +Ref_Name: lactobacillus_crispatus_ST1.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/Project_Crispatus/Sequence_data/Project_mother_daughter/reference_genome/lactobacillus_crispatus_ST1/ + +[USA500-2395] +# Name of reference genome fasta file. +Ref_Name: USA500-2395.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/USA500-2395/ + + +[DA01014] +# Name of reference genome fasta file. +Ref_Name: DA01014.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/DA01014/ + +[Cdiff_O14_W0022a] +# Name of reference genome fasta file. +Ref_Name: Cdiff_O14_W0022a.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Cdiff_O14_W0022a/ + + +[Pt0_Chromosome_OXA232] +# Name of reference genome fasta file. 
+Ref_Name: Pt0_Chromosome_OXA232.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/Pt0_Chromosome_OXA232/ + +[NTHi_86-028NP] +# Name of reference genome fasta file. +Ref_Name: NTHi_86-028NP.fasta +# path to the reference genome fasta file. +Ref_Path: /nfs/esnitkin/bin_group/variant_calling_bin/reference/NTHi_86-028NP/ diff --git a/config_settings.pyc b/config_settings.pyc index 4aec2a8..865dea9 100755 Binary files a/config_settings.pyc and b/config_settings.pyc differ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..65eb6f7 --- /dev/null +++ b/environment.yml @@ -0,0 +1,57 @@ +name: variantcalling_env +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - bcftools=1.2=h02bfda8_4 + - bedtools=2.23.0=hdbcaa40_3 + - bioawk=1.0=h84994c4_4 + - bowtie2=2.2.6=py27_0 + - bwa=0.7.12=1 + - bzip2=1.0.8=h516909a_1 + - ca-certificates=2019.9.11=hecc5488_0 + - capnproto=0.6.1=hfc679d8_1 + - certifi=2019.9.11=py27_0 + - curl=7.61.0=h93b3f91_2 + - gatk=3.8=py27_0 + - gatk4=4.1.3.0=0 + - gsl=2.5=h294904e_1 + - java-jdk=8.0.92=1 + - joblib=0.14.0=py_0 + - krb5=1.14.6=0 + - libblas=3.8.0=13_openblas + - libcblas=3.8.0=13_openblas + - libgcc-ng=9.1.0=hdf63c60_0 + - libgfortran-ng=7.3.0=hdf63c60_0 + - libopenblas=0.3.7=h6e990d7_1 + - libssh2=1.8.0=h1ad7b7a_1003 + - libstdcxx-ng=9.1.0=hdf63c60_0 + - mash=2.2.1=h3d38be6_0 + - mummer=3.23=pl526_8 + - ncurses=5.9=10 + - openjdk=8.0.192=h14c3975_1003 + - openssl=1.0.2r=h14c3975_0 + - perl=5.26.2=h516909a_1006 + - picard=2.5.0=2 + - pilon=1.22=1 + - pip=19.2.3=py27_0 + - python=2.7.12=2 + - qualimap=2.2.2a=1 + - raxml=8.2.12=h14c3975_1 + - readline=6.2=0 + - samtools=1.2=2 + - setuptools=41.2.0=py27_0 + - snpeff=4.3.1t=2 + - sqlite=3.13.0=1 + - subprocess32=3.5.4=py27h516909a_0 + - tabix=0.2.6=ha92aebf_0 + - tk=8.5.19=2 + - trimmomatic=0.39=1 + - vcftools=0.1.16=he860b03_3 + - wheel=0.33.6=py27_0 + - 
xz=5.2.4=h14c3975_1001 + - zlib=1.2.11=h516909a_1006 +prefix: /home/apirani/.conda/envs/variantcalling_env + diff --git a/modules/__init__.pyc b/modules/__init__.pyc index 7007ff8..f2a3c03 100755 Binary files a/modules/__init__.pyc and b/modules/__init__.pyc differ diff --git a/modules/bedtools.py b/modules/bedtools.py index fc361e3..24b90cd 100755 --- a/modules/bedtools.py +++ b/modules/bedtools.py @@ -41,7 +41,7 @@ def bedgraph_coverage(out_sorted_bam, out_path, analysis, reference, logger, Con reference_first_part_split = reference_filename_base.split('.') first_part = reference_first_part_split[0] reference_dir = os.path.dirname(reference) - makewindows_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("bedtools", Config)['bedtools_bin'] + "/" + ConfigSectionMap("bedtools", Config)['version_for_coverage'] + ConfigSectionMap("bedtools", Config)['base_cmd'] + " makewindows -g %s -w 1000 > %s/%s.bed" % (reference_SIZE_file, reference_dir, first_part) + makewindows_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("bedtools", Config)['base_cmd'] + " makewindows -g %s -w 1000 > %s/%s.bed" % (reference_SIZE_file, reference_dir, first_part) keep_logging(makewindows_cmd, makewindows_cmd, logger, 'debug') try: call(makewindows_cmd, logger) @@ -49,7 +49,7 @@ def bedgraph_coverage(out_sorted_bam, out_path, analysis, reference, logger, Con keep_logging('Error in Bedtools Make Windows step. Exiting.', 'Error in Bedtools Make Windows step. 
Exiting.', logger, 'exception') sys.exit(1) reference_windows_file = "%s/%s.bed" % (reference_dir, first_part) - bedcoverage_command = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("bedtools", Config)['bedtools_bin'] + "/" + ConfigSectionMap("bedtools", Config)['version_for_coverage'] + ConfigSectionMap("bedtools", Config)['base_cmd'] + " coverage -abam %s -b %s > %s/%s.bedcov" % (out_sorted_bam, reference_windows_file, out_path, analysis) + bedcoverage_command = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("bedtools", Config)['base_cmd'] + " coverage -abam %s -b %s > %s/%s.bedcov" % (out_sorted_bam, reference_windows_file, out_path, analysis) keep_logging(bedcoverage_command, bedcoverage_command, logger, 'debug') try: call(bedcoverage_command, logger) diff --git a/modules/bedtools.pyc b/modules/bedtools.pyc index 2c22dfd..5136a04 100755 Binary files a/modules/bedtools.pyc and b/modules/bedtools.pyc differ diff --git a/modules/bioawk.pyc b/modules/bioawk.pyc index dd26263..579c4a3 100755 Binary files a/modules/bioawk.pyc and b/modules/bioawk.pyc differ diff --git a/modules/bowtie.pyc b/modules/bowtie.pyc index 1dba0a8..7cac71f 100755 Binary files a/modules/bowtie.pyc and b/modules/bowtie.pyc differ diff --git a/modules/bwa.pyc b/modules/bwa.pyc index b5ff129..d432657 100755 Binary files a/modules/bwa.pyc and b/modules/bwa.pyc differ diff --git a/modules/core_prep_sanity_checks.pyc b/modules/core_prep_sanity_checks.pyc index 15fefa6..4ee3ebf 100755 Binary files a/modules/core_prep_sanity_checks.pyc and b/modules/core_prep_sanity_checks.pyc differ diff --git a/modules/fasttree.pyc b/modules/fasttree.pyc index 8344adf..d80a3da 100755 Binary files a/modules/fasttree.pyc and b/modules/fasttree.pyc differ diff --git a/modules/gatk.py b/modules/gatk.py index 9ae95c3..dc78982 100755 --- a/modules/gatk.py +++ b/modules/gatk.py @@ -9,7 +9,7 @@ def gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, 
Avg_dp): if ConfigSectionMap("pipeline", Config)['variant_caller'] == "samtools": - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] filter_criteria = ConfigSectionMap("SNP_filters", Config)['filter_criteria'] print "Using variant filter parameters from: %s" % filter_criteria if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes": @@ -26,9 +26,9 @@ def gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Av AF_filter = "AF1 > %s" % float(ConfigSectionMap(filter_criteria, Config)['af']) gatk_filter2_parameter_expression = "%s && %s && %s && %s && %s && %s" % (FQ_filter, MQ_filter, QUAL_filter, DP_filter, FQ_filter2, AF_filter) if os.path.exists(final_raw_vcf): - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_gatk.vcf --variant %s --filter-expression \"%s\" --filter-name PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) else: - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s.gz --filterExpression \"%s\" --filterName PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_gatk.vcf --variant %s.gz --filter-expression \"%s\" --filter-name PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > 
%s/%s_filter2_final.vcf" % (out_path, analysis) keep_logging(gatk_filter2_command, gatk_filter2_command, logger, 'debug') keep_logging(filter_flag_command, filter_flag_command, logger, 'debug') @@ -41,8 +41,7 @@ def gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Av gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) return gatk_filter2_final_vcf elif ConfigSectionMap("pipeline", Config)['variant_caller'] == "gatkhaplotypecaller": - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] filter_criteria = ConfigSectionMap("SNP_filters", Config)['filter_criteria'] keep_logging("Using variant filter parameters from: %s" % filter_criteria, "Using variant filter parameters from: %s" % filter_criteria, logger, 'info') if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes": @@ -59,10 +58,10 @@ def gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Av gatk_filter2_parameter_expression = "%s && %s && %s && %s" % ( MQ_filter, QUAL_filter, DP_filter, AF_filter) if os.path.exists(final_raw_vcf): - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % ( + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_gatk.vcf --variant %s --filter-expression \"%s\" --filter-name PASS_filter2" % ( base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) else: - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s.gz --filterExpression \"%s\" --filterName PASS_filter2" % ( + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_gatk.vcf --variant %s.gz 
--filter-expression \"%s\" --filter-name PASS_filter2" % ( base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % ( out_path, analysis, out_path, analysis) @@ -80,7 +79,7 @@ def gatk_filter(final_raw_vcf, out_path, analysis, reference, logger, Config, Av def gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp): if ConfigSectionMap("pipeline", Config)['variant_caller'] == "samtools": - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] filter_criteria = "contamination_filters" if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes": keep_logging('The average depth filter is turned on.', 'The average depth filter is turned on.', logger, 'info') @@ -95,9 +94,9 @@ def gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference, logg AF_filter = "AF1 < %s" % float(ConfigSectionMap(filter_criteria, Config)['af']) gatk_filter2_parameter_expression = "%s && %s && %s && %s && %s" % (FQ_filter, MQ_filter, QUAL_filter, DP_filter, AF_filter) if os.path.exists(final_raw_vcf): - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_contamination.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_contamination.vcf --variant %s --filter-expression \"%s\" --filter-name PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) else: - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o 
%s/%s_filter2_contamination.vcf --variant %s.gz --filterExpression \"%s\" --filterName PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_contamination.vcf --variant %s.gz --filter-expression \"%s\" --filter-name PASS_filter2" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_contamination.vcf > %s/%s_filter2_final_contamination.vcf" % (out_path, analysis, out_path, analysis) keep_logging(gatk_filter2_command, gatk_filter2_command, logger, 'debug') keep_logging(filter_flag_command, filter_flag_command, logger, 'debug') @@ -176,8 +175,7 @@ def gatk_filter_indel(final_raw_vcf, out_path, analysis, reference, logger, Conf # gatk_filter2_final_vcf = "%s/%s_filter2_indel_final.vcf" % (out_path, analysis) # return gatk_filter2_final_vcf # elif ConfigSectionMap("pipeline", Config)['variant_caller'] == "gatkhaplotypecaller": - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] filter_criteria = ConfigSectionMap("SNP_filters", Config)['filter_criteria'] keep_logging("Using variant filter parameters from: %s" % filter_criteria, "Using variant filter parameters from: %s" % filter_criteria, logger, 'info') if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes": @@ -196,10 +194,10 @@ def gatk_filter_indel(final_raw_vcf, out_path, analysis, reference, logger, Conf gatk_filter2_parameter_expression = "%s && %s && %s && %s" % (MQ_filter, QUAL_filter, DP_filter, AF_filter) if os.path.exists(final_raw_vcf): - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_indel_gatk.vcf --variant %s 
--filterExpression \"%s\" --filterName PASS_filter2" % ( + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_indel_gatk.vcf --variant %s --filter-expression \"%s\" --filter-name PASS_filter2" % ( base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) else: - gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_indel_gatk.vcf --variant %s.gz --filterExpression \"%s\" --filterName PASS_filter2" % ( + gatk_filter2_command = "%s VariantFiltration -R %s -O %s/%s_filter2_indel_gatk.vcf --variant %s.gz --filter-expression \"%s\" --filter-name PASS_filter2" % ( base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_indel_gatk.vcf > %s/%s_filter2_indel_final.vcf" % ( @@ -217,8 +215,8 @@ def gatk_filter_indel(final_raw_vcf, out_path, analysis, reference, logger, Conf return gatk_filter2_final_vcf def gatk_DepthOfCoverage(out_sorted_bam, out_path, analysis_name, reference, logger, Config): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - cmd = "java -jar %s -T DepthOfCoverage -R %s -o %s/%s_depth_of_coverage -I %s --summaryCoverageThreshold 1 --summaryCoverageThreshold 5 --summaryCoverageThreshold 9 --summaryCoverageThreshold 10 --summaryCoverageThreshold 15 --summaryCoverageThreshold 20 --summaryCoverageThreshold 25 --ignoreDeletionSites" % (base_cmd, reference, out_path, analysis_name, out_sorted_bam) + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + cmd = "java -Xmx8G -jar %s/GenomeAnalysisTK.jar -T DepthOfCoverage -R %s -o %s/%s_depth_of_coverage -I %s --summaryCoverageThreshold 1 --summaryCoverageThreshold 5 --summaryCoverageThreshold 9 --summaryCoverageThreshold 10 --summaryCoverageThreshold 15 
--summaryCoverageThreshold 20 --summaryCoverageThreshold 25 --ignoreDeletionSites" % (os.path.dirname(os.path.dirname(os.path.abspath(__file__))), reference, out_path, analysis_name, out_sorted_bam) keep_logging(cmd, cmd, logger, 'debug') try: call(cmd, logger) @@ -230,8 +228,8 @@ def gatk_DepthOfCoverage(out_sorted_bam, out_path, analysis_name, reference, log return gatk_depth_of_coverage_file def gatk_vcf2fasta_filter2(only_snp_filter2_vcf_file, out_path, analysis, reference, logger, Config): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - vcf2fasta_filter2_cmd = "java -jar %s -R %s -T FastaAlternateReferenceMaker -o %s_filter2.fasta --variant %s" % (base_cmd, reference, only_snp_filter2_vcf_file, only_snp_filter2_vcf_file) + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + vcf2fasta_filter2_cmd = "%s FastaAlternateReferenceMaker -R %s -O %s_filter2.fasta --variant %s" % (base_cmd, reference, only_snp_filter2_vcf_file, only_snp_filter2_vcf_file) keep_logging(vcf2fasta_filter2_cmd, vcf2fasta_filter2_cmd, logger, 'debug') try: call(vcf2fasta_filter2_cmd, logger) @@ -256,9 +254,9 @@ def gatk_vcf2fasta_filter2(only_snp_filter2_vcf_file, out_path, analysis, refere return gatk_vcf2fasta_filter2_file def gatkhaplotypecaller(out_finalbam, out_path, reference, analysis, logger, Config): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] reference_filename = ConfigSectionMap(reference, Config)['ref_path'] + "/" + ConfigSectionMap(reference, Config)['ref_name'] - cmd = "java -jar %s %s -R %s -I %s -o %s/%s_aln_mpileup_raw.vcf" % (base_cmd, ConfigSectionMap("gatk", 
Config)['haplotype_parameters'], reference_filename, out_finalbam, out_path, analysis) + cmd = "%s %s -R %s -I %s -O %s/%s_aln_mpileup_raw.vcf" % (base_cmd, ConfigSectionMap("gatk", Config)['haplotype_parameters'], reference_filename, out_finalbam, out_path, analysis) keep_logging('Running Command: [%s]' % cmd, 'Running Command: [%s]' % cmd, logger, 'info') #os.system(cmd) call(cmd, logger) @@ -267,9 +265,9 @@ def gatkhaplotypecaller(out_finalbam, out_path, reference, analysis, logger, Con """ Unused methods """ def gatk_filter1(final_raw_vcf, out_path, analysis, reference): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] gatk_filter1_parameter_expression = ConfigSectionMap("gatk")['gatk_filter1_parameter_expression'] - gatk_filter1_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter1_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter1" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter1_parameter_expression) + gatk_filter1_command = "%s VariantFiltration -R %s -O %s/%s_filter1_gatk.vcf --variant %s --filter-expression \"%s\" --filter-name PASS_filter1" % (base_cmd, reference, out_path, analysis, final_raw_vcf, gatk_filter1_parameter_expression) keep_logging('Running Command: [%s]' % gatk_filter1_command, 'Running Command: [%s]' % gatk_filter1_command, logger, 'info') os.system(gatk_filter1_command) filter_flag_command = "grep '#\|PASS_filter1' %s/%s_filter1_gatk.vcf > %s/%s_filter1_final.vcf" % (out_path, analysis, out_path, analysis) @@ -278,15 +276,15 @@ def gatk_filter1(final_raw_vcf, out_path, analysis, reference): return gatk_filter1_final_vcf def indel_realign(out_marked_sort_bam_rename, reference, out_path, analysis): - base_cmd = ConfigSectionMap("bin_path", 
Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] #require fai index of reference #require seq dict of reference reference_filename = ConfigSectionMap(reference)['ref_path'] + "/" + ConfigSectionMap(reference)['ref_name'] ref_fai_index(reference_filename) picard_seqdict(reference_filename, reference) - cmd = "java -jar %s -T RealignerTargetCreator -R %s -o %s/%s_aln_sort_marked.bam.list -I %s " % (base_cmd, reference_filename, out_path, analysis, out_marked_sort_bam_rename) + cmd = "%s RealignerTargetCreator -R %s -O %s/%s_aln_sort_marked.bam.list -I %s " % (base_cmd, reference_filename, out_path, analysis, out_marked_sort_bam_rename) os.system(cmd) - cmd = "java -jar %s -I %s -R %s -T IndelRealigner -targetIntervals %s/%s_aln_sort_marked.bam.list -o %s/%s_aln_realigned.bam" % (base_cmd, out_marked_sort_bam_rename, reference_filename, out_path, analysis, out_path, analysis) + cmd = "%s IndelRealigner -I %s -R %s -targetIntervals %s/%s_aln_sort_marked.bam.list -O %s/%s_aln_realigned.bam" % (base_cmd, out_marked_sort_bam_rename, reference_filename, out_path, analysis, out_path, analysis) os.system(cmd) out_indel_realigned = "%s/%s_aln_realigned.bam" % (out_path, analysis) return out_indel_realigned @@ -301,8 +299,8 @@ def indel_realign(out_marked_sort_bam_rename, reference, out_path, analysis): # return final_raw_vcf def gatk_vcf2fasta_filter1(only_snp_filter1_vcf_file, out_path, analysis, reference): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - vcf2fasta_filter1_cmd = "java -jar %s -R %s -T FastaAlternateReferenceMaker -o %s_filter1.fasta --variant %s" % (base_cmd, reference, only_snp_filter1_vcf_file, only_snp_filter1_vcf_file) + base_cmd = 
ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + vcf2fasta_filter1_cmd = "%s FastaAlternateReferenceMaker -R %s -o %s_filter1.fasta --variant %s" % (base_cmd, reference, only_snp_filter1_vcf_file, only_snp_filter1_vcf_file) keep_logging('Running Command: [%s]' % vcf2fasta_filter1_cmd, 'Running Command: [%s]' % vcf2fasta_filter1_cmd, logger, 'info') os.system(vcf2fasta_filter1_cmd) if _platform == "darwin": diff --git a/modules/gatk.pyc b/modules/gatk.pyc index 9ec14e2..4fe68b6 100755 Binary files a/modules/gatk.pyc and b/modules/gatk.pyc differ diff --git a/modules/gubbins.pyc b/modules/gubbins.pyc index d8183d4..34dc5bf 100755 Binary files a/modules/gubbins.pyc and b/modules/gubbins.pyc differ diff --git a/modules/iqtree.pyc b/modules/iqtree.pyc index 37e70cd..b5e7e49 100755 Binary files a/modules/iqtree.pyc and b/modules/iqtree.pyc differ diff --git a/modules/log_modules.pyc b/modules/log_modules.pyc index 813b88f..f89c293 100755 Binary files a/modules/log_modules.pyc and b/modules/log_modules.pyc differ diff --git a/modules/logging_subprocess.pyc b/modules/logging_subprocess.pyc index eae0d33..924e74f 100755 Binary files a/modules/logging_subprocess.pyc and b/modules/logging_subprocess.pyc differ diff --git a/modules/organize_jobs.py b/modules/organize_jobs.py deleted file mode 100755 index f6d60f1..0000000 --- a/modules/organize_jobs.py +++ /dev/null @@ -1,16 +0,0 @@ -__author__ = 'alipirani' - -import sys -import os -import argparse -import errno -from datetime import datetime -import ConfigParser -from config_settings import ConfigSectionMap -if sys.version_info < (3, 2): - import subprocess32 as sp -else: - import subprocess as sp - - -def submit_job diff --git a/modules/phage_detection.pyc b/modules/phage_detection.pyc index a7712be..858d792 100755 Binary files a/modules/phage_detection.pyc and b/modules/phage_detection.pyc differ diff --git a/modules/picard.py b/modules/picard.py index 2c5dc26..a92ff70 
100755 --- a/modules/picard.py +++ b/modules/picard.py @@ -5,9 +5,10 @@ from config_settings import ConfigSectionMap def markduplicates(out_sorted_bam, out_path, analysis, files_to_delete, logger, Config): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("picard", Config)['picard_bin'] + "/" + ConfigSectionMap("picard", Config)['base_cmd'] + # Updated for Great Lakes and Conda Integration + base_cmd = ConfigSectionMap("picard", Config)['base_cmd'] keep_logging('Removing PCR duplicates using PICARD', 'Removing PCR duplicates using PICARD', logger, 'info') - cmd = "java -jar %s MarkDuplicates REMOVE_DUPLICATES=true INPUT=%s OUTPUT=%s/%s_aln_marked.bam METRICS_FILE=%s/%s_markduplicates_metrics CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT MAX_FILE_HANDLES=500" % (base_cmd, out_sorted_bam, out_path, analysis, out_path, analysis) + cmd = "%s MarkDuplicates REMOVE_DUPLICATES=true INPUT=%s OUTPUT=%s/%s_aln_marked.bam METRICS_FILE=%s/%s_markduplicates_metrics CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT" % (base_cmd, out_sorted_bam, out_path, analysis, out_path, analysis) keep_logging(cmd, cmd, logger, 'debug') try: call(cmd, logger) @@ -23,18 +24,20 @@ def markduplicates(out_sorted_bam, out_path, analysis, files_to_delete, logger, return out_marked_bam def picard_seqdict(reference_filename, reference, logger, Config): + # Updated for Great Lakes and Conda Integration dict_name = os.path.splitext(os.path.basename(reference_filename))[0] + ".dict" - cmd = "java -jar %s CreateSequenceDictionary REFERENCE=%s OUTPUT=%s/%s" % (base_cmd, reference_filename, ConfigSectionMap(reference, Config)['ref_path'],dict_name) + base_cmd = ConfigSectionMap("picard", Config)['base_cmd'] + cmd = "%s CreateSequenceDictionary REFERENCE=%s OUTPUT=%s/%s" % (base_cmd, reference_filename, ConfigSectionMap(reference, Config)['ref_path'],dict_name) os.system(cmd) def picardstats(out_sorted_bam, out_path, analysis, reference, logger, Config): + # Updated for 
Great Lakes and Conda Integration reference_filename = ConfigSectionMap(reference, Config)['ref_path'] + "/" + ConfigSectionMap(reference, Config)['ref_name'] - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("picard", Config)[ - 'picard_bin'] + "/" + ConfigSectionMap("picard", Config)['base_cmd'] + base_cmd = ConfigSectionMap("picard", Config)['base_cmd'] - cmd = "java -jar %s CollectWgsMetrics I=%s O=%s/%s_collect_wgs_metrics.txt R=%s" % ( + cmd = "%s CollectWgsMetrics I=%s O=%s/%s_collect_wgs_metrics.txt R=%s" % ( base_cmd, out_sorted_bam, out_path, analysis, reference_filename) keep_logging(cmd, cmd, logger, 'debug') try: @@ -44,7 +47,7 @@ def picardstats(out_sorted_bam, out_path, analysis, reference, logger, Config): 'Error in Picard CollectWgsMetrics step. Exiting.', logger, 'exception') sys.exit(1) - cmd = "java -jar %s CollectAlignmentSummaryMetrics I=%s O=%s/%s_collect_alignment_metrics.txt R=%s" % ( + cmd = "%s CollectAlignmentSummaryMetrics I=%s O=%s/%s_collect_alignment_metrics.txt R=%s" % ( base_cmd, out_sorted_bam, out_path, analysis, reference_filename) keep_logging(cmd, cmd, logger, 'debug') try: @@ -54,7 +57,7 @@ def picardstats(out_sorted_bam, out_path, analysis, reference, logger, Config): 'Error in Picard CollectWgsMetrics step. 
Exiting.', logger, 'exception') sys.exit(1) - cmd = "java -jar %s CollectGcBiasMetrics I=%s O=%s/%s_gc_bias_metrics.txt R=%s S=%s/%s_summary_metrics.txt CHART=%s/%s_gc_bias_metrics.pdf " % ( + cmd = "%s CollectGcBiasMetrics I=%s O=%s/%s_gc_bias_metrics.txt R=%s S=%s/%s_summary_metrics.txt CHART=%s/%s_gc_bias_metrics.pdf " % ( base_cmd, out_sorted_bam, out_path, analysis, reference_filename, out_path, analysis, out_path, analysis) keep_logging(cmd, cmd, logger, 'debug') try: diff --git a/modules/picard.pyc b/modules/picard.pyc index a976a63..1de471b 100755 Binary files a/modules/picard.pyc and b/modules/picard.pyc differ diff --git a/modules/qualimap.pyc b/modules/qualimap.pyc index f59277e..e8aa092 100755 Binary files a/modules/qualimap.pyc and b/modules/qualimap.pyc differ diff --git a/modules/raxml.pyc b/modules/raxml.pyc index 901725f..0c1b824 100755 Binary files a/modules/raxml.pyc and b/modules/raxml.pyc differ diff --git a/modules/remove_5_bp_snp_indel.py b/modules/remove_5_bp_snp_indel.py index 20640be..1f6de62 100755 --- a/modules/remove_5_bp_snp_indel.py +++ b/modules/remove_5_bp_snp_indel.py @@ -46,9 +46,8 @@ def remove_5_bp_snp_indel(raw_vcf_file, out_path, analysis, reference, logger, C print "GATK Haplotype caller: Removing SNPs proximate to Indel by 5bp" remove_snps_5_bp_snp_indel_file_name = raw_vcf_file + "_5bp_indel_removed.vcf" indel_file_name = raw_vcf_file + "_indel.vcf" - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % ( + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + cmd = "%s SelectVariants -R %s -V %s -select-type INDEL -O %s" % ( base_cmd, reference, raw_vcf_file, indel_file_name) call(cmd, logger) keep_logging('Running Command: [%s]' % cmd, 'Running Command: [%s]' % cmd, 
logger, 'info') @@ -105,9 +104,8 @@ def prepare_indel(raw_vcf_file, out_path, analysis, reference, logger, Config): elif ConfigSectionMap("pipeline", Config)['variant_caller'] == "gatkhaplotypecaller": print "GATK Haplotype caller: Extracting indels from raw vcf files" indel_file_name = raw_vcf_file + "_indel.vcf" - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % ( + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + cmd = "%s SelectVariants -R %s -V %s -select-type INDEL -O %s" % ( base_cmd, reference, raw_vcf_file, indel_file_name) call(cmd, logger) keep_logging('Running Command: [%s]' % cmd, 'Running Command: [%s]' % cmd, logger, 'info') @@ -123,9 +121,8 @@ def prepare_indel_gatk(out_finalbam, out_path, analysis, reference, logger, Conf else: print "GATK Haplotype caller: Extracting indels from raw vcf files" indel_file_name = final_raw_vcf + "_indel.vcf" - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % ( + base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] + cmd = "%s SelectVariants -R %s -V %s -select-type INDEL -O %s" % ( base_cmd, reference_filename, final_raw_vcf, indel_file_name) call(cmd, logger) keep_logging('Running Command: [%s]' % cmd, 'Running Command: [%s]' % cmd, logger, 'info') diff --git a/modules/remove_5_bp_snp_indel.pyc b/modules/remove_5_bp_snp_indel.pyc index 0733f00..bf59f67 100755 Binary files a/modules/remove_5_bp_snp_indel.pyc and b/modules/remove_5_bp_snp_indel.pyc differ diff --git a/modules/samtools.py 
b/modules/samtools.py index 8fa8578..300613c 100755 --- a/modules/samtools.py +++ b/modules/samtools.py @@ -24,7 +24,8 @@ def samtobam(out_sam, out_path, analysis, files_to_delete, logger, Config): def sort_bam(out_bam, out_path, analysis, logger, Config): base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("samtools", Config)['samtools_bin'] + "/" + ConfigSectionMap("samtools", Config)['base_cmd'] - cmd = "%s sort %s %s/%s_aln_sort" % (base_cmd, out_bam, out_path, analysis) + #cmd = "%s sort %s %s/%s_aln_sort" % (base_cmd, out_bam, out_path, analysis) + cmd = "%s sort %s -m 500M -@ 0 -o %s/%s_aln_sort.bam -T %s/%s_aln_sort_temp" % (base_cmd, out_bam, out_path, analysis, out_path, analysis) keep_logging('Sorting BAM file', 'Sorting BAM file', logger, 'info') keep_logging(cmd, cmd, logger, 'debug') try: diff --git a/modules/samtools.pyc b/modules/samtools.pyc index 26f9ed1..cf6a8eb 100755 Binary files a/modules/samtools.pyc and b/modules/samtools.pyc differ diff --git a/modules/stages.pyc b/modules/stages.pyc index 86fa285..6a55f8b 100755 Binary files a/modules/stages.pyc and b/modules/stages.pyc differ diff --git a/modules/tabix.pyc b/modules/tabix.pyc index b57b0c4..a253d39 100755 Binary files a/modules/tabix.pyc and b/modules/tabix.pyc differ diff --git a/modules/trim.py b/modules/trim.py index 6809ed3..0b5d7bd 100755 --- a/modules/trim.py +++ b/modules/trim.py @@ -15,15 +15,18 @@ def trim(input1, input2, out_path, crop, logger, Config): if input2 != "None": keep_logging('Pre-processing PE reads using Trimmomatic.', 'Pre-processing PE reads using Trimmomatic.', logger, 'info') - adapter_file = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "/" + ConfigSectionMap("Trimmomatic", Config)['adaptor_filepath'] - clean_filenames = out_path + ConfigSectionMap("Trimmomatic", Config)['f_p'] + " " + out_path + ConfigSectionMap("Trimmomatic", Config)['f_up'] + " " + out_path 
+ ConfigSectionMap("Trimmomatic", Config)['r_p'] + " " + out_path + ConfigSectionMap("Trimmomatic", Config)['r_up'] + #adapter_file = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "/" + ConfigSectionMap("Trimmomatic", Config)['adaptor_filepath'] + # Updated for Great Lakes and Conda Integration + adapter_file = ConfigSectionMap("Trimmomatic", Config)['adaptor_filepath'] + clean_filenames = out_path + ConfigSectionMap("Trimmomatic", Config)['f_p'] + " " + out_path + ConfigSectionMap("Trimmomatic", Config)['f_up'] + " " + out_path + ConfigSectionMap("Trimmomatic", Config)['r_p'] + " " + out_path + ConfigSectionMap("Trimmomatic", Config)['r_up'] # changing this parameter for KPC variant analysis for keeping both reads. date: 31 August illumina_string = 'ILLUMINACLIP:' + adapter_file + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['seed_mismatches'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['palindrome_clipthreshold'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['simple_clipthreshold'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['minadapterlength'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['keep_both_reads'] sliding_string = 'SLIDINGWINDOW:' + ConfigSectionMap("Trimmomatic", Config)['window_size'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['window_size_quality'] minlen_string = 'MINLEN:' + ConfigSectionMap("Trimmomatic", Config)['minlength'] headcrop_string = 'HEADCROP:' + ConfigSectionMap("Trimmomatic", Config)['headcrop_length'] if not crop: - cmdstring = "java -jar " + ConfigSectionMap("bin_path", Config)['binbase'] + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "trimmomatic-0.36.jar PE 
-phred33 " + input1 + " " + input2 + " " + clean_filenames + " " + illumina_string + " " + sliding_string + " " + minlen_string + " " + headcrop_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) + # Updated for Great Lakes and Conda Integration + cmdstring = "trimmomatic PE -phred33 " + input1 + " " + input2 + " " + clean_filenames + " " + illumina_string + " " + sliding_string + " " + minlen_string + " " + headcrop_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) keep_logging(cmdstring, cmdstring, logger, 'debug') try: call(cmdstring, logger) @@ -33,7 +36,7 @@ def trim(input1, input2, out_path, crop, logger, Config): keep_logging('End: Data Pre-processing', 'End: Data Pre-processing', logger, 'info') else: crop_string = 'CROP:' + crop - cmdstring = "java -jar " + ConfigSectionMap("bin_path", Config)['binbase'] + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "trimmomatic-0.36.jar PE " + input1 + " " + input2 + " " + clean_filenames + " " + crop_string + " " + illumina_string + " " + sliding_string + " " + minlen_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) + cmdstring = "trimmomatic PE " + input1 + " " + input2 + " " + clean_filenames + " " + crop_string + " " + illumina_string + " " + sliding_string + " " + minlen_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) try: call(cmdstring, logger) except sp.CalledProcessError: @@ -42,7 +45,9 @@ def trim(input1, input2, out_path, crop, logger, Config): keep_logging('End: Data Pre-processing', 'End: Data Pre-processing', logger, 'info') else: keep_logging('Pre-processing SE reads using Trimmomatic.', 'Pre-processing SE reads using Trimmomatic.', logger, 'info') - adapter_file = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "/" + ConfigSectionMap("Trimmomatic", 
Config)['adaptor_filepath'] + #adapter_file = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "/" + ConfigSectionMap("Trimmomatic", Config)['adaptor_filepath'] + # Updated for Great Lakes and Conda Integration + adapter_file = ConfigSectionMap("Trimmomatic", Config)['adaptor_filepath'] clean_filenames = out_path + ConfigSectionMap("Trimmomatic", Config)['f_p'] # changing this parameter for KPC variant analysis for keeping both reads. date: 31 August illumina_string = 'ILLUMINACLIP:' + adapter_file + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['seed_mismatches'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['palindrome_clipthreshold'] + ConfigSectionMap("Trimmomatic", Config)['colon'] + ConfigSectionMap("Trimmomatic", Config)['simple_clipthreshold'] @@ -50,7 +55,7 @@ def trim(input1, input2, out_path, crop, logger, Config): minlen_string = 'MINLEN:' + ConfigSectionMap("Trimmomatic", Config)['minlength'] headcrop_string = 'HEADCROP:' + ConfigSectionMap("Trimmomatic", Config)['headcrop_length'] if not crop: - cmdstring = "java -jar " + ConfigSectionMap("bin_path", Config)['binbase'] + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "trimmomatic-0.36.jar SE " + input1 + " " + clean_filenames + " " + illumina_string + " " + sliding_string + " " + minlen_string + " " + headcrop_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) + cmdstring = "trimmomatic SE " + input1 + " " + clean_filenames + " " + illumina_string + " " + sliding_string + " " + minlen_string + " " + headcrop_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) keep_logging(cmdstring, cmdstring, logger, 'debug') try: call(cmdstring, logger) @@ -61,7 +66,7 @@ def trim(input1, input2, out_path, crop, logger, Config): else: crop_string = 'CROP:' + crop - 
cmdstring = "java -jar " + ConfigSectionMap("bin_path", Config)['binbase'] + ConfigSectionMap("Trimmomatic", Config)['trimmomatic_bin'] + "trimmomatic-0.36.jar SE " + input1 + " " + clean_filenames + " " + crop_string + " " + illumina_string + " " + sliding_string + " " + minlen_string + + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) + cmdstring = "trimmomatic SE " + input1 + " " + clean_filenames + " " + crop_string + " " + illumina_string + " " + sliding_string + " " + minlen_string + " 2> %s/%s_trim_out.log" % (out_path, os.path.basename(os.path.dirname(out_path))) keep_logging(cmdstring, cmdstring, logger, 'debug') try: call(cmdstring, logger) diff --git a/modules/trim.pyc b/modules/trim.pyc index a7fff21..795f18c 100755 Binary files a/modules/trim.pyc and b/modules/trim.pyc differ diff --git a/modules/variant_diagnostics/__init__.pyc b/modules/variant_diagnostics/__init__.pyc index 23dc1d3..831df5a 100755 Binary files a/modules/variant_diagnostics/__init__.pyc and b/modules/variant_diagnostics/__init__.pyc differ diff --git a/modules/variant_diagnostics/config_settings.pyc b/modules/variant_diagnostics/config_settings.pyc index d11b601..9ae9f66 100755 Binary files a/modules/variant_diagnostics/config_settings.pyc and b/modules/variant_diagnostics/config_settings.pyc differ diff --git a/modules/variant_diagnostics/core_pipeline.py b/modules/variant_diagnostics/core_pipeline.py index c044c02..b1627b9 100755 --- a/modules/variant_diagnostics/core_pipeline.py +++ b/modules/variant_diagnostics/core_pipeline.py @@ -37,6 +37,7 @@ from pyfasta import Fasta from core_prep_sanity_checks import * from iqtree import iqtree +from memory_profiler import profile # Parse Command line Arguments @@ -49,8 +50,8 @@ help='Names of filter2 only SNP vcf files with name per line.') optional.add_argument('-jobrun', action='store', dest="jobrun", help='Running a job on Cluster, Running Parallel jobs, Run jobs/commands locally (default): cluster, 
local, parallel-local, parallel-single-cluster') -optional.add_argument('-cluster_type', action='store', dest="cluster_type", - help='Type of Cluster: torque, pbs, sgd') +optional.add_argument('-scheduler', action='store', dest="scheduler", + help='Type of Cluster: PBS, SLURM') optional.add_argument('-cluster_resources', action='store', dest="cluster_resources", help='Cluster Resources to use. for example nodes,core. Ex: 1,4') optional.add_argument('-numcores', action='store', dest="numcores", @@ -61,6 +62,8 @@ optional.add_argument('-outgroup', action='store', dest="outgroup", help='outgroup sample name') required.add_argument('-reference', action='store', dest="reference", help='Path to Reference Fasta file for consensus generation') +optional.add_argument('-gubbins_env', action='store', dest="gubbins_env", + help='Name of the Gubbins Raxml Iqtree environment to load for Phylogenetic analysis') required.add_argument('-steps', action='store', dest="steps", help='Analysis Steps to be performed. This should be in sequential order.' 
'Step 1: Run pbs jobs and process all pipeline generated vcf files to generate label files' @@ -96,6 +99,40 @@ def make_sure_path_exists(out_path): 'info') exit() + +## Great Lakes Integration Changes +def get_scheduler_directive(scheduler, Config): + """ Generate Cluster Directive lines for a scheduler provided with args.scheduler""" + # Scheduler Changes here; current changes + if scheduler and scheduler == "SLURM": + script_Directive = "#SBATCH" + job_name_flag = "--job-name=" + scheduler_directives = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n" \ + % (ConfigSectionMap("slurm", Config)['email'], + ConfigSectionMap("slurm", Config)['notification'], + ConfigSectionMap("slurm", Config)['partition'], + ConfigSectionMap("slurm", Config)['flux_account'], + ConfigSectionMap("slurm", Config)['resources']) + elif scheduler and scheduler == "PBS": + script_Directive = "#PBS" + job_name_flag = "-N" + scheduler_directives = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ + % (ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], + ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], + ConfigSectionMap("scheduler", Config)['flux_account']) + else: + script_Directive = "#SBATCH" + job_name_flag = "--job-name=" + scheduler_directives = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n" \ + % (ConfigSectionMap("slurm", Config)['email'], + ConfigSectionMap("slurm", Config)['notification'], + ConfigSectionMap("slurm", Config)['partition'], + ConfigSectionMap("slurm", Config)['flux_account'], + ConfigSectionMap("slurm", Config)['resources']) + return scheduler_directives, script_Directive, job_name_flag + def run_command(i): """Function to run each command and is run as a part 
of python Parallel mutiprocessing method. @@ -167,15 +204,15 @@ def create_positions_filestep(vcf_filenames): f.close() position_array_unique_excluding_outgroup = set(position_array_excluding_outgroup) position_array_sort_excluding_outgroup = sorted(position_array_unique_excluding_outgroup) - #print len(position_array_sort_excluding_outgroup) outgroup_specific_positions = [] f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'w+') for i in outgroup_position_array: if i not in position_array_sort_excluding_outgroup: f_outgroup.write(str(i) + '\n') outgroup_specific_positions.append(int(i)) - # outgroup_indel_specific_positions.append(int(i)) f_outgroup.close() + + # Print Checks print "No. of variant positions in outgroup: %s" % len(outgroup_position_array) print "No. of variant positions specific to outgroup: %s" % len(outgroup_specific_positions) @@ -187,7 +224,8 @@ def create_positions_filestep(vcf_filenames): # Changed variable to suit sorting: 25-07-2018 position_array.append(int(line)) f.close() - # Check why python sorting is not working + + keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') position_array_unique = set(position_array) position_array_sort = sorted(position_array_unique) @@ -208,16 +246,14 @@ def create_positions_filestep(vcf_filenames): else: """ Create position array containing unique positiones from positions file """ - position_array = [] for filess in filter2_only_snp_position_files_array: f = open(filess, 'r+') for line in f: line = line.strip() - # Changed variable to suit sorting: 25-07-2018 position_array.append(int(line)) f.close() - # Check why python sorting is not working + keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') position_array_unique = set(position_array) position_array_sort = sorted(position_array_unique) @@ -270,7 +306,6 @@ def 
create_indel_positions_filestep(vcf_filenames): lines = lines.strip() outgroup_position_indel_array.append(int(lines)) f1.close() - #print len(outgroup_position_indel_array) position_array_indel_excluding_outgroup = [] for filess in filter2_only_indel_position_files_array: @@ -289,6 +324,8 @@ def create_indel_positions_filestep(vcf_filenames): f_outgroup.write(str(i) + '\n') outgroup_indel_specific_positions.append(int(i)) f_outgroup.close() + + # Print Checks print "No. of indel variant positions in outgroup: %s" % len(outgroup_position_indel_array) print "No. of indel variant positions specific to outgroup: %s" % len(outgroup_indel_specific_positions) @@ -312,7 +349,6 @@ def create_indel_positions_filestep(vcf_filenames): if len(position_array_sort) == 0: keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. Check if vcf files are empty?', logger, 'info') exit() - return unique_indel_position_file @@ -341,7 +377,7 @@ def create_indel_positions_filestep(vcf_filenames): exit() return unique_indel_position_file -def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): +def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir, scheduler_directives, script_Directive, job_name_flag): """ This method takes the unique_position_file and list of final *_no_proximate_snp.vcf files and generates individual jobs/script. @@ -352,12 +388,13 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): :return: """ if jobrun == "parallel-cluster": + """ Deprecated """ """ Supports only PBS clusters for now. 
""" for i in vcf_filenames: job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\npython %s/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) job_file_name = "%s.pbs" % (i) f1=open(job_file_name, 'w+') f1.write(job_print_string) @@ -377,15 +414,21 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + command = "python %s/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title+'\n') + out.write(scheduler_directives+'\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command+'\n') + out.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: @@ -397,9 +440,17 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_array.append(lines) fpp.close() if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Great Lakes Integration here. 
+ if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) elif jobrun == "cluster": @@ -409,14 +460,21 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + command = "python %s/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + 
out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -429,9 +487,18 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_array.append(lines) fpp.close() if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Slurm Changes here. + if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores + results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) elif jobrun == "local": @@ -443,14 +510,22 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + command = "python %s/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), 
args.filter2_only_snp_vcf_dir, i, unique_position_file, + tmp_dir) job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -466,7 +541,7 @@ def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): fpp.close() call("bash %s" % command_file, logger) -def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): +def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir, scheduler_directives, script_Directive, job_name_flag): """ This method takes the unique_indel_position_file and list of final *_indel_final.vcf files and generates individual jobs/script. @@ -477,12 +552,13 @@ def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): :return: """ if jobrun == "parallel-cluster": + """ Deprecated """ """ Supports only PBS clusters for now. 
""" for i in vcf_filenames: job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\npython %s/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) job_file_name = "%s_indel.pbs" % (i) f1=open(job_file_name, 'w+') f1.write(job_print_string) @@ -503,15 +579,22 @@ def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_file = "%s/commands_indel_list.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug_gatk.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + command = "python %s/reason_job_indel_debug_gatk.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, + tmp_dir) job_file_name = "%s_indel.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: @@ -523,14 +606,19 @@ def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_array.append(lines) fpp.close() if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Great Lakes Integration here. 
+ if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - # elif jobrun == "cluster": - # command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - # os.system("bash %s" % command_file) elif jobrun == "local": """ Generate a Command list of each job and run it on local system one at a time @@ -540,15 +628,22 @@ def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + # Great Lakes Integration here for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) + command = "python %s/reason_job_indel_debug_gatk.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, unique_position_file, + tmp_dir) job_file_name = "%s_indel.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, 
os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -704,18 +799,6 @@ def generate_paste_command_outgroup(): outfile.close() call("bash %s/All_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) call("bash %s/temp_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) - - - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - call("%s" % sort_All_label_cmd, logger) call("%s" % paste_command_header, logger) @@ -767,10 +850,6 @@ def generate_indel_paste_command(): sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - #os.system(header_awk_cmd) - #os.system(sed_header) - #os.system(sed_header_2) - call("%s" % header_awk_cmd, logger) call("%s" % sed_header, logger) call("%s" % sed_header_2, logger) @@ -806,16 +885,6 @@ def generate_indel_paste_command(): call("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - call("%s" % sort_All_label_cmd, logger) 
call("%s" % paste_command_header, logger) @@ -910,17 +979,6 @@ def generate_indel_paste_command_outgroup(): call("bash %s/All_indel_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) call("bash %s/temp_indel_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') - - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - call("%s" % sort_All_label_cmd, logger) call("%s" % paste_command_header, logger) @@ -952,56 +1010,106 @@ def generate_indel_paste_command_outgroup(): call("%s" % remove_unwanted_text, logger) else: print "Skip generating seperate intermediate files for outgroup" - +@profile def generate_position_label_data_matrix(): - """ - Generate different list of Positions using the matrix All_label_final_sorted_header.txt. + """ + Generate different list of Positions using the matrix All_label_final_sorted_header.txt. - (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). + (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). - Filtered Position label matrix: - List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. + Filtered Position label matrix: + List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. 
- Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. + Only_ref_variant_positions_for_closely_matrix.txt : + Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. - :param: null - :return: null + :param: null + :return: null + + """ + def generate_position_label_data_matrix_All_label(): + position_label = OrderedDict() + f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') + f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f4 = open( + "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, + 'w+') + if args.outgroup: + with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix... \n', + 'Generating different list of Positions and heatmap data matrix... 
\n', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): + if int(value) not in outgroup_specific_positions: + print_string = "" + for i in position_label[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f3.write(STRR2) + if position_label[value].count('1TRUE') >= 2: + f4.write('1\n') + else: + f4.write('0\n') + else: + if int(value) not in outgroup_specific_positions: + strr = value + "\n" + f1.write(strr) + STRR3 = value + "\t" + str(position_label[value]) + "\n" + f2.write(STRR3) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) + + else: + with open("%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as 
csv_file: + keep_logging( + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, + logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position_label[row[0]] = row[1:] + keep_logging('Generating different list of Positions and heatmap data matrix... \n', + 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f2.write('\t' + print_string_header.strip() + '\n') + f3.write('\t' + print_string_header.strip() + '\n') + f4.write('\t' + print_string_header.strip() + '\n') + for value in position_label: + lll = ['0', '2', '3', '4', '5', '6', '7'] + ref_var = ['1', '1TRUE'] + if set(ref_var) & set(position_label[value]): + if set(lll) & set(position_label[value]): - """ - def generate_position_label_data_matrix_All_label(): - position_label = OrderedDict() - f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4 = open( - "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, - 'w+') - if args.outgroup: - with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = 
csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix... \n', - 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - if int(value) not in outgroup_specific_positions: print_string = "" for i in position_label[value]: print_string = print_string + "\t" + i @@ -1011,401 +1119,524 @@ def generate_position_label_data_matrix_All_label(): f4.write('1\n') else: f4.write('0\n') - else: - if int(value) not in outgroup_specific_positions: + else: + strr = value + "\n" f1.write(strr) STRR3 = value + "\t" + str(position_label[value]) + "\n" f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) - 
subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) + csv_file.close() + f1.close() + f2.close() + f3.close() + f4.close() + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], + shell=True) + + + def temp_generate_position_label_data_matrix_All_label(): + + """ + Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. + """ + temp_position_label = OrderedDict() + f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') + print_string_header = "\t" - else: - with open("%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix... 
\n', - 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') - print_string_header = "\t" + if args.outgroup: + for i in vcf_filenames: + if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: + print_string_header = print_string_header + os.path.basename(i) + "\t" + else: for i in vcf_filenames: print_string_header = print_string_header + os.path.basename(i) + "\t" - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/1TRUE/-1/g' 
%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - def temp_generate_position_label_data_matrix_All_label(): - - """ - Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. - """ - temp_position_label = OrderedDict() - f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - - if args.outgroup: - for i in vcf_filenames: - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: - print_string_header = print_string_header + os.path.basename(i) + "\t" - else: - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f33.write('\t' + print_string_header.strip() + '\n') + f33.write('\t' + print_string_header.strip() + '\n') - """ GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """ + """ GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """ - phage_positions = [] - repetitive_positions = [] - mask_positions = [] + phage_positions = [] + repetitive_positions = [] + mask_positions = [] - phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir - if os.path.isfile(phage_region_positions): - with open(phage_region_positions, 'rU') as fphage: - for line in fphage: - phage_positions.append(line.strip()) - fphage.close() - else: - raise IOError('%s/phage_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir) - exit() + phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir + if os.path.isfile(phage_region_positions): + with open(phage_region_positions, 'rU') as fphage: + for line in fphage: + phage_positions.append(line.strip()) + fphage.close() + else: + raise IOError('%s/phage_region_positions.txt does not exist.' 
% args.filter2_only_snp_vcf_dir) + exit() - """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" + """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" - f_open_temp_Only_filtered_positions_for_closely_matrix = open("%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_open_temp_Only_filtered_positions_for_closely_matrix = open("%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_open_temp_Only_filtered_positions_for_closely_matrix.write('\t' + print_string_header.strip() + '\n') + f_open_temp_Only_filtered_positions_for_closely_matrix.write('\t' + print_string_header.strip() + '\n') - keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') - lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] + keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') + lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 
'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] + ref_var = ['reference_allele', 'VARIANT'] - if args.outgroup: - print "here" - with open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - if int(row[0]) not in outgroup_specific_positions: + if args.outgroup: + print "here" + with open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): + if int(row[0]) not in outgroup_specific_positions: + + print_string = "" + for i in row[1:]: + print_string = print_string + "\t" + i + STRR2 = row[0] + print_string + "\n" + f33.write(STRR2) + + if str(row[0]) not in phage_positions: + print_string_2 = "" + for i in row[1:]: + print_string_2 = print_string_2 + "\t" + i + STRR3 = row[0] + print_string_2 + "\n" + f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3) + csv_file.close() + f33.close() + f_open_temp_Only_filtered_positions_for_closely_matrix.close() + else: + with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + if set(ref_var) & set(row[1:]): + if set(lll) & set(row[1:]): print_string = "" for i in row[1:]: print_string = print_string + "\t" + i STRR2 = row[0] + print_string + "\n" f33.write(STRR2) - if str(row[0]) not in 
phage_positions: print_string_2 = "" for i in row[1:]: print_string_2 = print_string_2 + "\t" + i STRR3 = row[0] + print_string_2 + "\n" f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3) - csv_file.close() - f33.close() - f_open_temp_Only_filtered_positions_for_closely_matrix.close() - - else: - with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - if str(row[0]) not in phage_positions: - print_string_2 = "" - for i in row[1:]: - print_string_2 = print_string_2 + "\t" + i - STRR3 = row[0] + print_string_2 + "\n" - f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3) - - - csv_file.close() - f33.close() - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - """ - temp_position_label_FQ = OrderedDict() - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_FQ[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + 
os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_FQ: - lll = ['LowFQ'] - if set(lll) & set(temp_position_label_FQ[value]): - - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - f44.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) 
- subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - lll = ['HighFQ_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - - """ - Perform Sed on temp files. Find a faster way to do this. 
- """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - def barplot_stats(): - keep_logging('\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', '\nRead each 
Sample columns and calculate the percentage of each label to generate barplot statistics.\n', logger, 'info') - """ - Read each Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - print "Exluding Phage regions from temp_Only_filtered_positions_for_closely_matrix.txt file. The results will be outputed to temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" - - - temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir - print temp_Only_filtered_positions_for_closely_matrix_exclude_phage - #c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - c_reader_2 = csv.reader( - open(temp_Only_filtered_positions_for_closely_matrix_exclude_phage, 'r'), delimiter='\t') - columns_2 = list(zip(*c_reader_2)) - print len(columns_2) - keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') - counts = 1 - if args.outgroup: - end = len(vcf_filenames) + 1 - end = end - 1 - else: - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") + csv_file.close() + f33.close() + f_open_temp_Only_filtered_positions_for_closely_matrix.close() + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ + 
""" + temp_position_label_FQ = OrderedDict() + f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') + with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - true_variant = columns_2[i].count('VARIANT') - unmapped_positions = columns_2[i].count('reference_unmapped_position') - reference_allele = columns_2[i].count('reference_allele') - Only_low_FQ = columns_2[i].count('LowFQ') - Only_DP = columns_2[i].count('HighFQ_DP') - Only_low_MQ = columns_2[i].count('HighFQ') - low_FQ_other_parameters = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP') - high_FQ_other_parameters = columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL') - other = low_FQ_other_parameters + high_FQ_other_parameters - - total = true_variant + unmapped_positions + 
reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ + for row in csv_reader: + temp_position_label_FQ[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + print_string_header.strip() + '\n') + for value in temp_position_label_FQ: + lll = ['LowFQ'] + if set(lll) & set(temp_position_label_FQ[value]): + + print_string = "" + for i in temp_position_label_FQ[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + f44.write(STRR2) + f44.close() + csv_file.close() + f44.close() + + """ + Perform Sed on temp files. Find a faster way to do this. + """ + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], 
shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) + + + """ + Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp + """ + temp_position_label_DP = OrderedDict() + f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') + with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: + keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + temp_position_label_DP[row[0]] = row[1:] + print_string_header = "\t" + for i in vcf_filenames: + print_string_header = print_string_header + os.path.basename(i) + "\t" + f44.write('\t' + print_string_header.strip() + '\n') + for value in temp_position_label_DP: + lll = ['HighFQ_DP'] + ref_var = ['reference_allele', 'VARIANT'] + if set(lll) & set(temp_position_label_FQ[value]): + + print_string = "" + for i in temp_position_label_FQ[value]: + print_string = print_string + "\t" + i + STRR2 = value + print_string + "\n" + 
f44.write(STRR2) + f44.close() + csv_file.close() - filename_count = i - 1 + """ + Perform Sed on temp files. Find a faster way to do this. + """ + subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) + + def 
barplot_stats(): + keep_logging('\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', '\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', logger, 'info') + """ + Read each Sample columns and calculate the percentage of each label to generate barplot statistics. + This will give a visual explanation of how many positions in each samples were filtered out because of different reason + """ + + print "Exluding Phage regions from temp_Only_filtered_positions_for_closely_matrix.txt file. The results will be outputed to temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" + + + + + + #temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir + temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir + print temp_Only_filtered_positions_for_closely_matrix_exclude_phage + #c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') + c_reader_2 = csv.reader( + open(temp_Only_filtered_positions_for_closely_matrix_exclude_phage, 'r'), delimiter='\t') + columns_2 = list(zip(*c_reader_2)) + print len(columns_2) + keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') + counts = 1 if args.outgroup: - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) - f_bar_count.write(bar_string) + end = len(vcf_filenames) + 1 + end = end - 1 else: - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - 
vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions, reference_allele, true_variant, - Only_low_FQ, Only_DP, Only_low_MQ, other) - #f_bar_count.write(bar_string) - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = float((columns_2[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns_2[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns_2[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_FQ_perc = float((columns_2[i].count('LowFQ') * 100) / total) - except ZeroDivisionError: - Only_low_FQ_perc = 0 - try: - Only_DP_perc = float((columns_2[i].count('HighFQ_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns_2[i].count('HighFQ') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_FQ_other_parameters_perc = float(((columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP')) * 100) / total) - except ZeroDivisionError: - low_FQ_other_parameters_perc = 0 - try: - high_FQ_other_parameters_perc = float(((columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + 
columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')) * 100) / total) - except ZeroDivisionError: - high_FQ_other_parameters_perc = 0 + end = len(vcf_filenames) + 1 + + f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") + f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") + + for i in xrange(1, end, 1): + """ Bar Count Statistics: Variant Position Count Statistics """ + true_variant = columns_2[i].count('VARIANT') + unmapped_positions = columns_2[i].count('reference_unmapped_position') + reference_allele = columns_2[i].count('reference_allele') + Only_low_FQ = columns_2[i].count('LowFQ') + Only_DP = columns_2[i].count('HighFQ_DP') + Only_low_MQ = columns_2[i].count('HighFQ') + low_FQ_other_parameters = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP') + high_FQ_other_parameters = columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL') + other = low_FQ_other_parameters + high_FQ_other_parameters + + total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + 
low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ + + filename_count = i - 1 + + if args.outgroup: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) + f_bar_count.write(bar_string) + else: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, true_variant, + Only_low_FQ, Only_DP, Only_low_MQ, other) + #f_bar_count.write(bar_string) + """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ + try: + true_variant_perc = float((columns_2[i].count('VARIANT') * 100) / total) + except ZeroDivisionError: + true_variant_perc = 0 + try: + unmapped_positions_perc = float((columns_2[i].count('reference_unmapped_position') * 100) / total) + except ZeroDivisionError: + unmapped_positions_perc = 0 + try: + reference_allele_perc = float((columns_2[i].count('reference_allele') * 100) / total) + except ZeroDivisionError: + reference_allele_perc = 0 + try: + Only_low_FQ_perc = float((columns_2[i].count('LowFQ') * 100) / total) + except ZeroDivisionError: + Only_low_FQ_perc = 0 + try: + Only_DP_perc = float((columns_2[i].count('HighFQ_DP') * 100) / total) + except ZeroDivisionError: + Only_DP_perc = 0 + try: + Only_low_MQ_perc = float((columns_2[i].count('HighFQ') * 100) / total) + except ZeroDivisionError: + Only_low_MQ_perc = 0 + try: + low_FQ_other_parameters_perc = float(((columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + 
columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP')) * 100) / total) + except ZeroDivisionError: + low_FQ_other_parameters_perc = 0 + try: + high_FQ_other_parameters_perc = float(((columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')) * 100) / total) + except ZeroDivisionError: + high_FQ_other_parameters_perc = 0 + + other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) + if args.outgroup: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) + else: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, true_variant_perc, + Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) + f_bar_count.write(bar_string) + f_bar_perc.write(bar_perc_string) + f_bar_count.close() + f_bar_perc.close() + bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + 
ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ("%s/matrices/plots" % data_matrix_dir, os.path.basename(os.path.normpath(args.results_dir))) + barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') + barplot_R_file.write(bargraph_R_script) + keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, logger, 'info') + + def barplot_additional_stats(): + keep_logging( + '\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', + '\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', + logger, 'info') + """ + Read each Sample columns and calculate the percentage of each label to generate barplot statistics. + This will give a visual explanation of how many positions in each samples were filtered out because of different reason + """ + + print "Exluding Phage regions from temp_Only_filtered_positions_for_closely_matrix.txt file. 
The results will be outputed to temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" + + # temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir + temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir + print temp_Only_filtered_positions_for_closely_matrix_exclude_phage + # c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') + c_reader_2 = csv.reader( + open(temp_Only_filtered_positions_for_closely_matrix_exclude_phage, 'r'), delimiter='\t') + columns_2 = list(zip(*c_reader_2)) + print len(columns_2) + keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') + counts = 1 - other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) if args.outgroup: - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) + end = len(vcf_filenames) + 1 + end = end - 1 else: - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + end = len(vcf_filenames) + 1 + + f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') + f_bar_count.write( + "Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\tOnly_QUAL\tQUAL_and_others\tlow_FQ_other_parameters\tDP_LowFQ_QUAL\n") + f_bar_perc.write( + 
"Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") + + for i in xrange(1, end, 1): + """ Bar Count Statistics: Variant Position Count Statistics """ + true_variant = columns_2[i].count('VARIANT') + unmapped_positions = columns_2[i].count('reference_unmapped_position') + reference_allele = columns_2[i].count('reference_allele') + Only_low_FQ = columns_2[i].count('LowFQ') + Only_DP = columns_2[i].count('HighFQ_DP') + Only_low_MQ = columns_2[i].count('HighFQ') + low_FQ_other_parameters = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count( + 'LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[ + i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count( + 'LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + \ + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP') + high_FQ_other_parameters = columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count( + 'HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[ + i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count( + 'HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count( + 'HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL') + other = low_FQ_other_parameters + high_FQ_other_parameters + Only_QUAL = columns_2[i].count('HighFQ_QUAL') + QUAL_and_others = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count( + 'LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + \ + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count( + 'HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count( + 'HighFQ_DP_QUAL') + + 
DP_LowFQ_QUAL = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count( + 'LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_DP') + + total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ + + filename_count = i - 1 + #print len(vcf_filenames_outgroup) + if args.outgroup: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, + true_variant, Only_low_FQ, Only_DP, Only_low_MQ, + other, Only_QUAL, QUAL_and_others, low_FQ_other_parameters, DP_LowFQ_QUAL) + #f_bar_count.write(bar_string) + print os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) + else: + bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions, reference_allele, + true_variant, + Only_low_FQ, Only_DP, Only_low_MQ, other, Only_QUAL, QUAL_and_others, low_FQ_other_parameters, DP_LowFQ_QUAL) + #f_bar_count.write(bar_string) + print os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) + + f_bar_count.write(bar_string) + """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ + try: + true_variant_perc = float((columns_2[i].count('VARIANT') * 100) / total) + except ZeroDivisionError: + true_variant_perc = 0 + try: + unmapped_positions_perc = float((columns_2[i].count('reference_unmapped_position') * 100) / total) + except ZeroDivisionError: + unmapped_positions_perc = 0 + try: + reference_allele_perc = float((columns_2[i].count('reference_allele') * 100) / total) + except 
ZeroDivisionError: + reference_allele_perc = 0 + try: + Only_low_FQ_perc = float((columns_2[i].count('LowFQ') * 100) / total) + except ZeroDivisionError: + Only_low_FQ_perc = 0 + try: + Only_DP_perc = float((columns_2[i].count('HighFQ_DP') * 100) / total) + except ZeroDivisionError: + Only_DP_perc = 0 + try: + Only_low_MQ_perc = float((columns_2[i].count('HighFQ') * 100) / total) + except ZeroDivisionError: + Only_low_MQ_perc = 0 + try: + low_FQ_other_parameters_perc = float(((columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[ + i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count( + 'LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count( + 'LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count( + 'LowFQ_DP')) * 100) / total) + except ZeroDivisionError: + low_FQ_other_parameters_perc = 0 + try: + high_FQ_other_parameters_perc = float(((columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[ + i].count('HighFQ_DP_proximate_SNP') + columns_2[ + i].count('HighFQ_proximate_SNP') + columns_2[i].count( + 'HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count( + 'HighFQ_QUAL')) * 100) / total) + except ZeroDivisionError: + high_FQ_other_parameters_perc = 0 + + other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) + if args.outgroup: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, - Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) - f_bar_count.write(bar_string) - f_bar_perc.write(bar_perc_string) - f_bar_count.close() - f_bar_perc.close() - bargraph_R_script = 
"library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ("%s/matrices/plots" % data_matrix_dir, os.path.basename(os.path.normpath(args.results_dir))) - barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') - barplot_R_file.write(bargraph_R_script) - keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, logger, 'info') + Only_low_FQ_perc, Only_DP_perc, + Only_low_MQ_perc, other_perc) + else: + bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( + vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), + unmapped_positions_perc, true_variant_perc, + Only_low_FQ_perc, Only_DP_perc, + Only_low_MQ_perc, other_perc) + #f_bar_count.write(bar_string) + f_bar_perc.write(bar_perc_string) + f_bar_count.close() + f_bar_perc.close() + bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- 
read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ( + "%s/matrices/plots" % data_matrix_dir, os.path.basename(os.path.normpath(args.results_dir))) + barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') + barplot_R_file.write(bargraph_R_script) + keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, + 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, + logger, 'info') + + + + # Commented out for debugging + """ Methods Steps""" + keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') + generate_position_label_data_matrix_All_label() + keep_logging('Running: Changing variables in data matrices to codes for faster processing...', + 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') + temp_generate_position_label_data_matrix_All_label() + keep_logging('Running: Generating Barplot statistics data matrices...', + 'Running: Generating 
Barplot statistics data matrices...', logger, 'info') + barplot_stats() + # barplot_additional_stats() - """ Methods Steps""" - keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') - generate_position_label_data_matrix_All_label() - keep_logging('Running: Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') - temp_generate_position_label_data_matrix_All_label() - keep_logging('Running: Generating Barplot statistics data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') - #barplot_stats() def generate_indel_position_label_data_matrix(): @@ -1816,34 +2047,16 @@ def barplot_indel_stats(): keep_logging('Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, logger, 'info') + """ Methods Steps""" keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') - # if args.outgroup: - # f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - # global outgroup_indel_specific_positions - # outgroup_indel_specific_positions = [] - # for i in f_outgroup: - # outgroup_indel_specific_positions.append(i) - # f_outgroup.close() - # - # f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - # global outgroup_specific_positions - # outgroup_specific_positions = [] - # for i in f_outgroup: - # outgroup_specific_positions.append(i) - # f_outgroup.close() - # else: - # global outgroup_specific_positions - # global outgroup_indel_specific_positions - # outgroup_indel_specific_positions = [] - # outgroup_specific_positions = [] generate_indel_position_label_data_matrix_All_label() keep_logging('Running: 
Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') temp_generate_indel_position_label_data_matrix_All_label() keep_logging('Running: Generating Barplot statistics data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') barplot_indel_stats() -def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter): +def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter, script_Directive, job_name_flag): """ Generate jobs/scripts that creates core consensus fasta file. @@ -1859,19 +2072,24 @@ def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filte """ Supports only PBS clusters for now. """ + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + command = "python %s/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) job_file_name = 
"%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') - #os.system("qsub %s" % i) call("qsub %s" % i, logger) @@ -1882,13 +2100,21 @@ def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filte command_array = [] command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') + + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + command = "python %s/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s 
-out_core %s -functional_filter %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, functional_filter) job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: @@ -1902,31 +2128,17 @@ def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filte if args.numcores: num_cores = int(num_cores) else: - num_cores = multiprocessing.cpu_count() + # Slurm Changes here. + if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - # elif jobrun == "cluster": - # command_array = [] - # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - # f3 = open(command_file, 'w+') - # for i in vcf_filenames: - # job_name = os.path.basename(i) - # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % 
(job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - # job_file_name = "%s_fasta.pbs" % (i) - # f1=open(job_file_name, 'w+') - # f1.write(job_print_string) - # f1.close() - # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - # pbs_scripts = glob.glob(pbs_dir) - # for i in pbs_scripts: - # f3.write("bash %s\n" % i) - # f3.close() - # with open(command_file, 'r') as fpp: - # for lines in fpp: - # lines = lines.strip() - # command_array.append(lines) - # fpp.close() - # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) else: """ Generate a Command list of each job and run it on local system one at a time @@ -1935,14 +2147,22 @@ def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filte command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) + 
command = "python %s/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, functional_filter) job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -1959,7 +2179,7 @@ def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filte #os.system("bash command_file") call("bash %s" % command_file, logger) -def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file): +def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file, script_Directive, job_name_flag): """ Generate jobs/scripts that creates core consensus fasta file. @@ -1975,13 +2195,19 @@ def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, c """ Supports only PBS clusters for now. 
""" + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -1998,13 +2224,20 @@ def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, c command_array = [] command_file = 
"%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, config_file) job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: @@ -2016,33 +2249,19 @@ def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, c command_array.append(lines) 
fpp.close() if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Slurm Changes here. + if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - # elif jobrun == "cluster": - # command_array = [] - # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - # f3 = open(command_file, 'w+') - # for i in vcf_filenames: - # job_name = os.path.basename(i) - # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - # job_file_name = "%s_fasta.pbs" % (i) - # f1=open(job_file_name, 'w+') - # f1.write(job_print_string) - # f1.close() - # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - # pbs_scripts = glob.glob(pbs_dir) - # for i in pbs_scripts: - # f3.write("bash %s\n" % i) - # f3.close() - # with open(command_file, 'r') as fpp: - # for lines in fpp: - # lines = lines.strip() - # command_array.append(lines) - # fpp.close() - # os.system("bash %s/command_file" % 
args.filter2_only_snp_vcf_dir) else: """ Generate a Command list of each job and run it on local system one at a time @@ -2051,14 +2270,20 @@ def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, c command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, config_file) job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() #os.system("mv %s/*.pbs 
%s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -2075,7 +2300,7 @@ def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, c #os.system("bash command_file") call("bash %s" % command_file, logger) -def create_job_DP(jobrun, vcf_filenames): +def create_job_DP(jobrun, vcf_filenames, script_Directive, job_name_flag): """ Based on type of jobrun; generate jobs and run accordingly. :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. @@ -2087,13 +2312,22 @@ def create_job_DP(jobrun, vcf_filenames): """ Supports only PBS clusters for now. """ + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, config_file) + command = "python %s/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i) job_file_name = "%s_DP.pbs" % (i) 
- f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -2111,14 +2345,22 @@ def create_job_DP(jobrun, vcf_filenames): command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') - + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, config_file) + command = "python %s/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i) job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + 
job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() + #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" pbs_scripts = glob.glob(pbs_dir) @@ -2134,9 +2376,17 @@ def create_job_DP(jobrun, vcf_filenames): fpp.close() print len(command_array) if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Slurm Changes here. + if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) # elif jobrun == "cluster": @@ -2163,13 +2413,22 @@ def create_job_DP(jobrun, vcf_filenames): """ command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir f3 = open(command_file, 'w+') + ### Great Lakes changes for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) + command = "python %s/extract_only_ref_variant_fasta_unique_positions.py 
-filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i, args.reference, + core_vcf_fasta_dir, config_file) + command = "python %s/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( + os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, i) job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(i)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/temp_jobs" % args.filter2_only_snp_vcf_dir + '\n') + out.write(command + '\n') + out.close() pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" pbs_scripts = glob.glob(pbs_dir) for i in pbs_scripts: @@ -2275,7 +2534,7 @@ def generate_vcf_files(): f1.write(print_string) f1.close() - #Turning off generating core fasta alignemnets. No longer used in pipeline + # Turning off generating core fasta alignemnets. 
No longer used in pipeline filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir keep_logging('Generating Consensus...', 'Generating Consensus...', logger, 'info') for file in filtered_out_vcf_files: @@ -2369,7 +2628,7 @@ def FQ_analysis(): #print grep_fq_field def DP_analysis(): - create_job_DP(args.jobrun, vcf_filenames) + create_job_DP(args.jobrun, vcf_filenames, script_Directive, job_name_flag) paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir for i in vcf_filenames: label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') @@ -2460,7 +2719,7 @@ def DP_analysis_barplot(): def extract_only_ref_variant_fasta(core_vcf_fasta_dir): if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": functional_filter = "yes" - create_job_fasta(args.jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter) + create_job_fasta(args.jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter, script_Directive, job_name_flag) def extract_only_ref_variant_fasta_from_reference(): if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and \ @@ -2518,6 +2777,20 @@ def extract_only_ref_variant_fasta_from_reference_allele_variant(): def prepare_snpEff_db(reference_basename): keep_logging('Preparing snpEff database requirements.', 'Preparing snpEff database requirements.', logger, 'info') reference_basename = (os.path.basename(args.reference)).split(".") + + ## Great Lakes Changes + proc = subprocess.Popen(["find $CONDA_PREFIX/share/ -name snpEff.config"], stdout=subprocess.PIPE, shell=True) + (out2, err2) = proc.communicate() + if out2: + snpeff_config = str(out2) + else: + print "Unable to find snpEff config file in conda Environment share directory" + exit() + + os.system("cp %s $CONDA_PREFIX/bin/" % snpeff_config) + + + if os.path.isfile("%s/%s/snpEff.config" % 
(ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])): #os.system("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir)) keep_logging("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), "cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), logger, 'debug') @@ -2560,10 +2833,21 @@ def prepare_snpEff_db(reference_basename): "Error: %s/%s.gff file doesn't exists. Make sure the GFF file has the same prefix as reference fasta file\nExiting..." % (os.path.dirname(args.reference), reference_basename[0]), logger, 'exception') exit() #keep_logging("java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') - keep_logging("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], 
args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') + # keep_logging("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') + ## Great Lakes Changes + keep_logging("%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), + "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % ( + ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], + ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, + ConfigSectionMap("bin_path", Config)['binbase'], 
ConfigSectionMap("snpeff", Config)['snpeff_bin']), + logger, 'debug') + + #call("java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) - call("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) + ## Great Lakes Changes + call("%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) keep_logging('Finished Preparing snpEff database requirements.', 'Finished Preparing snpEff database requirements.', logger, 'info') def variant_annotation(): @@ -2572,7 +2856,8 @@ def variant_annotation(): if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": if ConfigSectionMap("snpeff", Config)['db']: print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] - proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], + ## Great Lakes Changes + proc = subprocess.Popen(["%s databases | grep %s" % (ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", 
Config)['db'])], stdout=subprocess.PIPE, shell=True) (out2, err2) = proc.communicate() if out2: @@ -2592,18 +2877,32 @@ def variant_annotation(): annotate_final_vcf_cmd_array = [] for i in vcf_filenames: raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') - annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) + annotate_vcf_cmd = "%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) print annotate_vcf_cmd annotate_vcf_cmd_array.append(annotate_vcf_cmd) final_vcf = i - annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) + annotate_final_vcf_cmd = "%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, 
ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) if args.numcores: - num_cores = int(num_cores) - else: + num_cores = int(args.numcores) + elif args.jobrun == "local": num_cores = multiprocessing.cpu_count() + else: + # Slurm Changes here. + if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + else: + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + + #print annotate_vcf_cmd_array results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) @@ -2614,7 +2913,7 @@ def indel_annotation(): if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": if ConfigSectionMap("snpeff", Config)['db']: print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] - proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], + proc = subprocess.Popen(["%s databases | grep %s" % (ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], stdout=subprocess.PIPE, shell=True) (out2, err2) = proc.communicate() if out2: @@ -2639,13 +2938,25 @@ def indel_annotation(): (ConfigSectionMap("bin_path", Config)['binbase'], 
ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) annotate_vcf_cmd_array.append(annotate_vcf_cmd) final_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') - annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) + annotate_final_vcf_cmd = "%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ + (ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) if args.numcores: - num_cores = int(num_cores) + num_cores = int(args.numcores) else: - num_cores = multiprocessing.cpu_count() + # Slurm Changes here. 
+ if args.scheduler == "SLURM": + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + elif args.scheduler == "PBS": + num_cores = multiprocessing.cpu_count() + else: + proc = subprocess.Popen(["echo $SLURM_CPUS_PER_TASK"], stdout=subprocess.PIPE, shell=True) + (out, err) = proc.communicate() + num_cores = int(out.strip()) + + print "Number of cores: %s" % num_cores results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) @@ -2653,10 +2964,10 @@ def gatk_combine_variants(files_gatk, reference, out_path, merged_file_suffix, l base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] #files_gatk = "--variant " + ' --variant '.join(vcf_files_array) - keep_logging("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), "java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), logger, 'debug') + keep_logging("java -jar %s/GenomeAnalysisTK.jar -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), reference, files_gatk, out_path, merged_file_suffix), "java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (ConfigSectionMap("gatk", Config)['base_cmd'], reference, files_gatk, out_path, merged_file_suffix), logger, 'debug') merge_gatk_commands_file = "%s/gatk_merge.sh" % args.filter2_only_snp_vcf_dir with open(merge_gatk_commands_file, 'w+') as fopen: - fopen.write("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, 
out_path, merged_file_suffix) + '\n') + fopen.write("java -jar %s/GenomeAnalysisTK.jar -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), reference, files_gatk, out_path, merged_file_suffix) + '\n') fopen.close() # Commenting out calling gatk combine variants with a custom logging call method, problem with python subprocess, OSError: [Errno 7] Argument list too long os.system("bash %s" % merge_gatk_commands_file) @@ -2672,7 +2983,7 @@ def annotated_snp_matrix(): """Annotate all VCF file formats with SNPeff""" # Commented for debugging # variant_annotation() - # + # # indel_annotation() @@ -2696,6 +3007,8 @@ def annotated_snp_matrix(): locus_tag_to_gene_name = {} locus_tag_to_product = {} locus_tag_to_strand = {} + #locus_tag_to_uniprot = {} + #locus_tag_to_ec_number = {} keep_logging( 'Reading annotations from Reference genome genbank file: %s/%s.gbf' % (os.path.dirname(args.reference), reference_basename[0]), @@ -2726,6 +3039,11 @@ def annotated_snp_matrix(): last_element = len(record.features) - 1 last_locus_tag = record.features[last_element].qualifiers['locus_tag'][0] + # #Debugging prints + # print first_locus_tag + # print locus_tag_to_gene_name[first_locus_tag] + # print last_locus_tag + # print locus_tag_to_gene_name[last_locus_tag] """ End of Extract Annotation information from Genbank file @@ -2737,6 +3055,8 @@ def annotated_snp_matrix(): """ + + """ Start of Merging Step: - Merge Individual Annotated raw and filtered vcf files to generate a Final merged vcf file using Gatk combine variants method. 
@@ -2747,10 +3067,10 @@ def annotated_snp_matrix(): keep_logging('Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, 'Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, logger, 'info') #Commented for debugging - # files_for_tabix = glob.glob("%s/*.vcf_no_proximate_snp.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) - # tabix(files_for_tabix, "vcf", logger, Config) - # files_for_tabix = glob.glob("%s/*_filter2_indel_final.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) - # tabix(files_for_tabix, "vcf", logger, Config) + files_for_tabix = glob.glob("%s/*.vcf_no_proximate_snp.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) + files_for_tabix = glob.glob("%s/*_filter2_indel_final.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) files = ' '.join(vcf_filenames) @@ -2769,39 +3089,39 @@ def annotated_snp_matrix(): """ Merge with Gatk combine variants method """ # #Commented for debugging - # merged_file_suffix = "_no_proximate_snp.vcf" - # - # annotated_no_proximate_snp_file = "%s/annotated_no_proximate_snp_list.txt" % args.filter2_only_snp_vcf_dir - # annotated_no_proximate_snp_indel_file = "%s/annotated_no_proximate_snp_indel_list.txt" % args.filter2_only_snp_vcf_dir - # - # with open(annotated_no_proximate_snp_file, 'w+') as fopen: - # for i in vcf_filenames: - # fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz') + '\n') - # fopen.close() - # - # with open(annotated_no_proximate_snp_indel_file, 'w+') as fopen: - # for i in vcf_filenames: - # fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_ANN.vcf.gz') + '\n') - # fopen.close() - # - # #files_gatk = "--variant " + ' --variant '.join(vcf_filenames) - # files_gatk = "" - # for i in 
vcf_filenames: - # files_gatk = files_gatk + " --variant " + i - # final_gatk_snp_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz'), args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) - # - # # Test this merge and annotate this merged file - Testing Mode Right now. - # #merged_file_suffix = "_no_proximate_snp_1.vcf" - # #final_gatk_snp_merged_vcf_1 = gatk_combine_variants(files_gatk,args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) - # merged_file_suffix = "_indel.vcf" - # final_gatk_indel_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', - # '_filter2_indel_final.vcf_ANN.vcf.gz'), - # args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, - # logger, Config) - # - # """ Tabix index the combined GATK Final vcf file """ - # files_for_tabix = glob.glob("%s/Final_vcf_*.vcf" % args.filter2_only_snp_vcf_dir) - # tabix(files_for_tabix, "vcf", logger, Config) + merged_file_suffix = "_no_proximate_snp.vcf" + + annotated_no_proximate_snp_file = "%s/annotated_no_proximate_snp_list.txt" % args.filter2_only_snp_vcf_dir + annotated_no_proximate_snp_indel_file = "%s/annotated_no_proximate_snp_indel_list.txt" % args.filter2_only_snp_vcf_dir + + with open(annotated_no_proximate_snp_file, 'w+') as fopen: + for i in vcf_filenames: + fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz') + '\n') + fopen.close() + + with open(annotated_no_proximate_snp_indel_file, 'w+') as fopen: + for i in vcf_filenames: + fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_ANN.vcf.gz') + '\n') + fopen.close() + + #files_gatk = "--variant " + ' --variant '.join(vcf_filenames) + files_gatk = "" + for i in vcf_filenames: + files_gatk = files_gatk + " --variant " + i + 
final_gatk_snp_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz'), args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) + + # Test this merge and annotate this merged file - Testing Mode Right now. + #merged_file_suffix = "_no_proximate_snp_1.vcf" + #final_gatk_snp_merged_vcf_1 = gatk_combine_variants(files_gatk,args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) + merged_file_suffix = "_indel.vcf" + final_gatk_indel_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', + '_filter2_indel_final.vcf_ANN.vcf.gz'), + args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, + logger, Config) + + """ Tabix index the combined GATK Final vcf file """ + files_for_tabix = glob.glob("%s/Final_vcf_*.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) """ End of Merging Step. """ @@ -2826,6 +3146,8 @@ def annotated_snp_matrix(): for i in vcf_filenames: print_string_header = print_string_header + os.path.basename(i) + "\t" + + """ Generate an array of core positions. Read Only_ref_variant_positions_for_closely* to get final core variant positions into core_positions array""" core_positions = [] if ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": @@ -2851,6 +3173,8 @@ def annotated_snp_matrix(): """ End: Generate an array of core positions. """ + + """ Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" """ Read in functional class filter positions. 
""" functional_filter_pos_array = [] @@ -2899,6 +3223,8 @@ def annotated_snp_matrix(): """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" + + """ Read and parse final GATK merged vcf file cyvcf library; Generate a header string from the sample lis fo this merged vcf file""" final_merge_anno_file = VCF("%s/Final_vcf_gatk_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir) @@ -2913,6 +3239,8 @@ def annotated_snp_matrix(): """ End """ + + """ Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. """ paste_label_command = "paste %s/unique_positions_file " % args.filter2_only_snp_vcf_dir paste_indel_label_command = "paste %s/unique_indel_positions_file " % args.filter2_only_snp_vcf_dir @@ -2935,7 +3263,7 @@ def annotated_snp_matrix(): second_part = filename_base.replace("R1.fastq.gz", "R2.fastq.gz") first_part_split = filename_base.split('R1.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + #first_part = re.sub("_S.*_", "", first_part) # Changed on 03/15/2019 first_part = re.sub("_S.*", "", first_part) elif "1_combine.fastq.gz" in filename_base: @@ -3123,10 +3451,8 @@ def annotated_snp_matrix(): """ End: Generate mask_fq_mq_positions array """ - """ Main: Generate SNP Matrix """ - """ Open Matrix files to write strings """ fp_code = open("%s/SNP_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') fp_allele = open("%s/SNP_matrix_allele_outdated.csv" % args.filter2_only_snp_vcf_dir, 'w+') @@ -3198,6 +3524,8 @@ def annotated_snp_matrix(): else: code_string = code_string.replace('VARIANT', '3') + + # Annotation Bug fix 2 # Changing SNP type: Date 28/05/2019 if variants.POS in snp_var_ann_dict.keys(): @@ -3486,6 +3814,7 @@ def annotated_snp_matrix(): # Annotation Bug fix 6 # Changing Strandness string: Date 28/05/2019 # Each Locus ID with a strand information + strandness = " Strand Information: " if 
"-" in tag: tagsplit = tag.split('-') @@ -3617,7 +3946,7 @@ def annotated_snp_matrix(): count += 1 # Annotation Bug fix 8 - """ Mask Phage positions and LowFQ/MQ positions in SNP_matrix_allele_new.csv. This is the default matrix. """ + """ Mask Phage positions in SNP_matrix_allele_new.csv. This is the default matrix. """ if str(variants.POS) in functional_filter_pos_array: ntd_string_array = ntd_string.split('\t') #print ntd_string_array @@ -3625,8 +3954,8 @@ def annotated_snp_matrix(): for i in ntd_string_array[1:]: ntd_string = ntd_string + "\t" + "N" ntd_string_array = ntd_string.split('\t') + #print ntd_string_array - ## 2019-11-18; Was masking both FQ and MQ regions found in mask_fq_mq_positions. BUG FOUND. SHould be Commented; Never ever unComment this out. """ Generate a print_string for each of the matrix - SNP_matrix_allele_new.csv and SNP_matrix_allele_phage.csv """ @@ -3648,13 +3977,6 @@ def annotated_snp_matrix(): fp_allele_new.close() fp_allele_new_phage.close() - - - """ Generate Variant QC report """ - os.system("conda activate variant_QC_env") - os.system("Rscript /nfs/esnitkin/bin_group/pipeline/Github/scripts/render_variant_matrix_qc.R SNP_matrix_code.csv %s" % os.path.dirname(args.filter2_only_snp_vcf_dir)) - - ###################################### """ Indel matrix """ """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ @@ -3671,6 +3993,19 @@ def annotated_snp_matrix(): fp_code.write(header_print_string) fp_allele.write(header_print_string) + # """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" + # mask_fq_mq_positions = [] + # for key in position_indel_label.keys(): + # label_sep_array = position_indel_label[key].split(',') + # for i in label_sep_array: + # if "LowAF" in i: + # if key not in mask_fq_mq_positions: + # mask_fq_mq_positions.append(key) + # if i == "HighAF": + # if key not in mask_fq_mq_positions: + # 
mask_fq_mq_positions.append(key) + # + # print "Length of indel mask_fq_mq_positions array:%s" % len(mask_fq_mq_positions) """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" mask_fq_mq_positions = [] @@ -3904,6 +4239,9 @@ def annotated_snp_matrix(): ann_string = ann_string + '|'.join( [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" + # Debugging + if i_split[3] == "CD630_00290": + print ann_string # Changing SNP type: Date 28/05/2019 else: if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): @@ -4232,8 +4570,8 @@ def core_prep_snp(core_vcf_fasta_dir): """ Generate consensus fasta file with only reference and variant position bases """ extract_only_ref_variant_fasta(core_vcf_fasta_dir) - # """ Analyze the positions that were filtered out only due to insufficient depth""" - # DP_analysis() + """ Analyze the positions that were filtered out only due to insufficient depth""" + #DP_analysis() def core_prep_indel(core_vcf_fasta_dir): """ Generate SNP Filter Label Matrix """ @@ -4321,7 +4659,7 @@ def gubbins(gubbins_dir, input_fasta, jobrun, logger, Config): elif jobrun == "parallel-cluster": job_file_name = "%s/gubbins_%s.pbs" % (gubbins_dir, os.path.basename(input_fasta)) job_name = os.path.basename(job_file_name) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=12,mem=47000mb,walltime=50:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd %s\n%s\n%s" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], gubbins_dir, load_module, gubbins_cmd) + job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=12,mem=47000mb,walltime=250:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd 
%s\n%s\n%s" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], gubbins_dir, load_module, gubbins_cmd) f1=open(job_file_name, 'w+') f1.write(job_print_string) f1.close() @@ -4489,6 +4827,10 @@ def mask_fq_mq_positions_specific_to_outgroup(): fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % ( args.reference, base_vcftools_bin, vcf_filename_unmapped, outgroup) + # print bgzip_cmd + # print tabix_cmd + # print fasta_cmd + subprocess.call([bgzip_cmd], shell=True) subprocess.call([tabix_cmd], shell=True) subprocess.call([fasta_cmd], shell=True) @@ -4526,7 +4868,6 @@ def mask_fq_mq_positions_specific_to_outgroup(): print "Length of mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) - """ Pending inclusion @@ -4544,8 +4885,6 @@ def someOtherFunc(data, key): Pending inclusion """ - - if __name__ == '__main__': """ @@ -4621,6 +4960,8 @@ def someOtherFunc(data, key): log_file_handle = "%s/%s_%s.log.txt" % (args.filter2_only_snp_vcf_dir, log_unique_time, analysis_name_log) + scheduler_directives, script_Directive, job_name_flag = get_scheduler_directive(args.scheduler, Config) + # Start Variant Calling Core Pipeline steps based on steps argument supplied. if "1" in args.steps: """ @@ -4634,16 +4975,16 @@ def someOtherFunc(data, key): unique_position_file = create_positions_filestep(vcf_filenames) unique_indel_position_file = create_indel_positions_filestep(vcf_filenames) - # # bgzip and tabix all the vcf files in core_temp_dir. - # files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir) - # tabix(files_for_tabix, "vcf", logger, Config) + # bgzip and tabix all the vcf files in core_temp_dir. 
+ files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir) + tabix(files_for_tabix, "vcf", logger, Config) # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file. tmp_dir = "/tmp/temp_%s/" % log_unique_time - #create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir) + create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir, scheduler_directives, script_Directive, job_name_flag) - create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir) + create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir, scheduler_directives, script_Directive, job_name_flag) # If Phaster Summary file doesn't exist in reference genome folder if not os.path.isfile("%s/summary.txt" % os.path.dirname(args.reference)): @@ -4710,6 +5051,7 @@ def someOtherFunc(data, key): print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) + # Commented out for debugging # Run core steps. Generate SNP and data Matrix results. Extract core SNPS and consensus files. 
core_prep_indel(core_vcf_fasta_dir) @@ -4724,24 +5066,25 @@ def someOtherFunc(data, key): # Read new allele matrix and generate fasta; generate a seperate function keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') - create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) + create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file, script_Directive, job_name_flag) extract_only_ref_variant_fasta_from_reference_allele_variant() - mask_fq_mq_positions_specific_to_outgroup() + #mask_fq_mq_positions_specific_to_outgroup() call("cp %s %s/Logs/core/" % ( log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) if "3" in args.steps: """ - report step - """ + report step + """ # Get outgroup_Sample name outgroup = get_outgroup() - keep_logging('Step 3: Generate Reports and Results folder.', 'Step 3: Generate Reports and Results folder.', logger, 'info') + keep_logging('Step 3: Generate Reports and Results folder.', 'Step 3: Generate Reports and Results folder.', + logger, 'info') ## Temporary fix. 
A bug was introduced that is causing the pipeline to generate *vcf_no_proximate_snp.vcf_filter2_consensus.fa call("rm %s/*vcf_no_proximate_snp.vcf_filter2_consensus.fa" % args.filter2_only_snp_vcf_dir, logger) @@ -4772,27 +5115,35 @@ def someOtherFunc(data, key): make_sure_path_exists(consensus_var_dir) make_sure_path_exists(core_vcf_dir) make_sure_path_exists(consensus_allele_var_dir) - #make_sure_path_exists(consensus_ref_allele_var_dir) + # make_sure_path_exists(consensus_ref_allele_var_dir) make_sure_path_exists(consensus_ref_var_dir) make_sure_path_exists(consensus_ref_allele_unmapped_variant_dir) reference_base = os.path.basename(args.reference).split('.')[0] - # Move results to the results directory - move_data_matrix_results = "cp -r %s/unique_positions_file %s/unique_indel_positions_file %s/*.csv %s/*.txt %s/temp_* %s/All* %s/Only* %s/*.R %s/R_scripts/generate_diagnostics_plots.R %s/*.html %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, os.path.dirname(os.path.abspath(__file__)), args.filter2_only_snp_vcf_dir, data_matrix_dir) - #move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa %s/*_variants.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) - move_core_vcf_fasta_results = "mv %s/*_core.vcf.gz* %s/*_ANN* %s/*.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) + # Move results to the results directory + move_data_matrix_results = "cp -r %s/unique_positions_file %s/unique_indel_positions_file %s/*.csv %s/*.txt %s/temp_* %s/All* %s/Only* %s/*.R %s/R_scripts/generate_diagnostics_plots.R %s/*.html %s/" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, + 
args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, os.path.dirname(os.path.abspath(__file__)), + args.filter2_only_snp_vcf_dir, data_matrix_dir) + # move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa %s/*_variants.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) + move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz* %s/*_ANN* %s/*.fa %s/" % ( + args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) move_consensus_var_fasta_results = "mv %s/*_variants.fa %s/" % (core_vcf_fasta_dir, consensus_var_dir) move_consensus_ref_var_fasta_results = "mv %s/*.fa %s/" % (core_vcf_fasta_dir, consensus_ref_var_dir) - move_core_vcf = "mv %s/*_core.vcf.gz %s/*vcf_core.vcf.gz.tbi %s/" % (core_vcf_fasta_dir, core_vcf_fasta_dir, core_vcf_dir) - move_consensus_allele_var_fasta_results = "mv %s/*allele_variants.fa %s/" % (consensus_var_dir, consensus_allele_var_dir) + move_core_vcf = "mv %s/*_core.vcf.gz %s/*vcf_core.vcf.gz.tbi %s/" % ( + core_vcf_fasta_dir, core_vcf_fasta_dir, core_vcf_dir) + move_consensus_allele_var_fasta_results = "mv %s/*allele_variants.fa %s/" % ( + consensus_var_dir, consensus_allele_var_dir) remove_ref_allele = "rm %s/*_ref_allele_variants.fa" % consensus_allele_var_dir - #move_consensus_ref_allele_var_fasta_results = "mv %s/*_ref_allele_variants.fa %s/" % (consensus_allele_var_dir, consensus_ref_allele_var_dir) - move_consensus_ref_allele_unmapped_var_fasta_results = "mv %s/*_ref_allele_unmapped_variants.fa %s/" % (consensus_var_dir, consensus_ref_allele_unmapped_variant_dir) + # move_consensus_ref_allele_var_fasta_results = "mv %s/*_ref_allele_variants.fa %s/" % (consensus_allele_var_dir, consensus_ref_allele_var_dir) + move_consensus_ref_allele_unmapped_var_fasta_results = "mv %s/*_ref_allele_unmapped_variants.fa %s/" 
% ( + consensus_var_dir, consensus_ref_allele_unmapped_variant_dir) move_snpeff_results = "mv %s/*ANN* %s/" % (data_matrix_dir, data_matrix_snpeff_dir) move_snpeff_vcf_results = "mv %s/*ANN* %s/" % (core_vcf_fasta_dir, data_matrix_snpeff_dir) copy_reference = "cp %s %s/%s.fa" % (args.reference, consensus_ref_var_dir, reference_base) - #copy_reference_2 = "cp %s %s/%s.fa" % (args.reference, consensus_ref_allele_var_dir, reference_base) + # copy_reference_2 = "cp %s %s/%s.fa" % (args.reference, consensus_ref_allele_var_dir, reference_base) call("%s" % move_data_matrix_results, logger) call("%s" % move_core_vcf_fasta_results, logger) @@ -4801,13 +5152,14 @@ def someOtherFunc(data, key): call("%s" % move_core_vcf, logger) call("%s" % move_consensus_allele_var_fasta_results, logger) call("%s" % remove_ref_allele, logger) - #call("%s" % move_consensus_ref_allele_var_fasta_results, logger) + # call("%s" % move_consensus_ref_allele_var_fasta_results, logger) call("%s" % move_consensus_ref_allele_unmapped_var_fasta_results, logger) call("%s" % copy_reference, logger) - #call("%s" % copy_reference_2, logger) + # call("%s" % copy_reference_2, logger) call("%s" % move_snpeff_results, logger) call("%s" % move_snpeff_vcf_results, logger) - subprocess.call(["sed -i 's/title_here/%s/g' %s/generate_diagnostics_plots.R" % (os.path.basename(args.results_dir), data_matrix_dir)], shell=True) + subprocess.call(["sed -i 's/title_here/%s/g' %s/generate_diagnostics_plots.R" % ( + os.path.basename(args.results_dir), data_matrix_dir)], shell=True) # Sanity Check if the variant consensus files generated are of same length count = 0 @@ -4817,7 +5169,7 @@ def someOtherFunc(data, key): variant_consensus_files = glob.glob("%s/*_variants.fa" % core_vcf_fasta_dir) for f in variant_consensus_files: cmd2 = "%s/%s/bioawk -c fastx '{ print length($seq) }' < %s" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'], f) + ConfigSectionMap("bin_path", 
Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'], f) proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True) (out2, err2) = proc.communicate() @@ -4842,8 +5194,12 @@ def someOtherFunc(data, key): make_sure_path_exists(functional_ann_dir) make_sure_path_exists(logs_dir) call("mv *.log.txt %s" % logs_dir, logger) - call("mv summary.txt detail.txt Functional_class_filter_positions.txt inexact_repeat_region_positions.txt phage_region_positions.txt repeat_region_positions.txt %s" % functional_ann_dir, logger) - call("mv temp_* All* Only* SNP_matrix_* Indel* extract_DP_positions.txt header.txt unique_indel_positions_file unique_positions_file %s" % matrices_dir, logger) + call( + "mv summary.txt detail.txt Functional_class_filter_positions.txt inexact_repeat_region_positions.txt phage_region_positions.txt repeat_region_positions.txt %s" % functional_ann_dir, + logger) + call( + "mv temp_* All* Only* SNP_matrix_* Indel* extract_DP_positions.txt header.txt unique_indel_positions_file unique_positions_file %s" % matrices_dir, + logger) call("mv annotated_no_proximate_snp_* %s/snpEff_results/" % data_matrix_dir, logger) call("mv bargraph* generate_diagnostics_plots.R %s" % plots_dir, logger) call("cp %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt %s/" % (matrices_dir, plots_dir), logger) @@ -4860,35 +5216,49 @@ def someOtherFunc(data, key): tree_dir = args.results_dir + '/trees' make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) - + # make_sure_path_exists(tree_dir) - prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) + prepare_ref_var_consensus_input = 
"%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_noncore_plus_core_variants_aln.fa" % ( + args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), + reference_base) + # prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) + prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % ( args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - #prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) - prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_var_consensus_input) - prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % (args.results_dir, prepare_var_consensus_input) + prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % ( + args.results_dir, prepare_ref_var_consensus_input) + prepare_var_consensus_input_cmd = "cat 
%s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % ( + args.results_dir, prepare_var_consensus_input) prepare_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_allele_variant_positions/*_allele_variants.fa > %s" % ( - args.results_dir, prepare_allele_var_consensus_input) - #prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) - prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) - call("%s" % prepare_ref_var_consensus_input_cmd, logger) - call("%s" % prepare_var_consensus_input_cmd, logger) - call("%s" % prepare_allele_var_consensus_input_cmd, logger) - #call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger) + args.results_dir, prepare_allele_var_consensus_input) + # prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) + prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % ( + args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) + #call("%s" % prepare_ref_var_consensus_input_cmd, logger) + #call("%s" % prepare_var_consensus_input_cmd, logger) + #call("%s" % prepare_allele_var_consensus_input_cmd, logger) + # call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger) call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger) - # os.system(prepare_ref_var_consensus_input_cmd) - # os.system(prepare_var_consensus_input_cmd) + + # Clean up directories + call("mv %s/filtered_* %s/mask_fq_mq_* %s" % (data_matrix_dir, data_matrix_dir, matrices_dir), logger) + 
make_sure_path_exists(args.results_dir + '/qc_report') + print "%s" % os.path.dirname(os.path.dirname(args.results_dir)) + call("cp %s/*/*_stats_results/* %s/qc_report" % (os.path.dirname(os.path.dirname(args.results_dir)), args.results_dir), logger) + call("rm %s/snpEff_summary.html" % data_matrix_dir, logger) print_details = "Results for core pipeline can be found in: %s\n" \ - "Description of Results:\n" \ - "1. data_matrix folder contains all the data matrices and other temporary files generated during the core pipeline. bargraph_counts.txt and bargraph_percentage.txt: contains counts/percentage of unique positions filtered out due to different filter parameters for each sample. Run bargraph.R to plot bargraph statistics." \ - "2. core_snp_consensus contains all the core vcf and fasta files. *_core.vcf.gz: core vcf files, *.fa and *_variants.fa: core consensus fasta file and core consensus fasta with only variant positions." % (args.results_dir) + "Description of Results:\n" \ + "1. data_matrix folder contains all the data matrices and other temporary files generated during the core pipeline. bargraph_counts.txt and bargraph_percentage.txt: contains counts/percentage of unique positions filtered out due to different filter parameters for each sample. Run bargraph.R to plot bargraph statistics." \ + "2. core_snp_consensus contains all the core vcf and fasta files. *_core.vcf.gz: core vcf files, *.fa and *_variants.fa: core consensus fasta file and core consensus fasta with only variant positions." 
% ( + args.results_dir) keep_logging(print_details, print_details, logger, 'info') call("cp %s %s/Logs/report/" % ( @@ -4902,13 +5272,17 @@ def someOtherFunc(data, key): keep_logging('Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', 'Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', logger, 'info') + """ + Deactivate current conda environment + """ + #parse_phaster(args.reference) reference_base = os.path.basename(args.reference).split('.')[0] gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' + iqtree_results_dir = args.results_dir + '/gubbins/iqtree_results' make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) + make_sure_path_exists(iqtree_results_dir) prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) @@ -4932,6 +5306,9 @@ def someOtherFunc(data, key): if args.gubbins and args.gubbins == "yes": + if args.gubbins_env: + os.system("conda deactivate") + os.system("conda activate %s" % args.gubbins_env) os.chdir(gubbins_dir) if args.outgroup: # Get outgroup_Sample name @@ -4961,8 +5338,37 @@ def someOtherFunc(data, key): os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), logger, 'info') - call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), logger) + #call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), logger) #call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_var_consensus_input), logger) + + job_file_name = "%s" % (prepare_ref_allele_unmapped_consensus_input.replace('.fa', '.pbs')) + load_conda = "%s" % 
(prepare_ref_allele_unmapped_consensus_input.replace('.fa', '_conda.sh')) + print job_file_name + gubbins_command = "run_gubbins.py --prefix %s --threads 12 %s" % (os.path.basename(prepare_ref_allele_unmapped_consensus_input).replace('.fa', ''), prepare_ref_allele_unmapped_consensus_input) + iqtree_command = "iqtree -s %s/%s.filtered_polymorphic_sites.fasta -nt AUTO -bb 1000 -m MFP -pre %s/%s" % (os.path.dirname(prepare_ref_allele_unmapped_consensus_input), os.path.basename(prepare_ref_allele_unmapped_consensus_input).replace('.fa', ''), iqtree_results_dir, os.path.basename(prepare_ref_allele_unmapped_consensus_input.replace('.fa', ''))) + with open(job_file_name, 'w') as out: + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(job_file_name)) + out.write("#!/bin/sh" + '\n') + out.write(job_title + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s" % os.path.dirname(prepare_ref_allele_unmapped_consensus_input) + '\n') + #out.write("conda deactivate\n") + #out.write("conda activate variantcalling_env_gubbins_raxml_iqtree\n") + out.write(gubbins_command + '\n') + out.write(iqtree_command + '\n') + out.close() + + with open(load_conda, 'w') as out: + out.write('conda deactivate' + '\n') + out.write('conda activate variantcalling_env_gubbins_raxml_iqtree' + '\n') + out.close() + + keep_logging('Run following code on login terminal:\n', 'Run following code on login terminal:\n', logger, 'info') + keep_logging('conda deactivate', 'conda deactivate', logger, 'info') + keep_logging('conda activate variantcalling_env_gubbins_raxml_iqtree', 'conda activate variantcalling_env_gubbins_raxml_iqtree', logger, 'info') + keep_logging('sbatch %s' % job_file_name, 'sbatch %s' % job_file_name, logger, 'info') + + else: if args.outgroup: # Get outgroup_Sample name @@ -5037,11 +5443,11 @@ def someOtherFunc(data, key): # # Read new allele matrix and generate fasta; generate a seperate function keep_logging('Generating Fasta from Variant 
Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') - create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) + create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file, script_Directive, job_name_flag) - #extract_only_ref_variant_fasta_from_reference_allele_variant() + extract_only_ref_variant_fasta_from_reference_allele_variant() - #mask_fq_mq_positions_specific_to_outgroup() + mask_fq_mq_positions_specific_to_outgroup() call("cp %s %s/Logs/core/" % ( log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) diff --git a/modules/variant_diagnostics/core_pipeline_backup.py b/modules/variant_diagnostics/core_pipeline_backup.py deleted file mode 100755 index 0fc3f35..0000000 --- a/modules/variant_diagnostics/core_pipeline_backup.py +++ /dev/null @@ -1,2284 +0,0 @@ -from __future__ import division -import argparse -import re -import os -import csv -import subprocess -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -import pandas as pd -import errno -from pyfasta import Fasta -from datetime import datetime -import threading -import json -from cyvcf2 import VCF -import ConfigParser -from config_settings import ConfigSectionMap -from logging_subprocess import * -from log_modules import * -from Bio import SeqIO - -parser = argparse.ArgumentParser(description='Parsing filtered VCF files and investigating Variants to determine the reason why it was filtered out from the final list') -required = parser.add_argument_group('Required arguments') -optional = parser.add_argument_group('Optional arguments') -required.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') 
-required.add_argument('-filter2_only_snp_vcf_filenames', action='store', dest="filter2_only_snp_vcf_filenames", - help='Names of filter2 only SNP vcf files with name per line.') -optional.add_argument('-jobrun', action='store', dest="jobrun", - help='Running a job on Cluster, Running Parallel jobs, Run jobs/commands locally (default): cluster, local, parallel-local, parallel-single-cluster') -optional.add_argument('-cluster_type', action='store', dest="cluster_type", - help='Type of Cluster: torque, pbs, sgd') -optional.add_argument('-cluster_resources', action='store', dest="cluster_resources", - help='Cluster Resources to use. for example nodes,core. Ex: 1,4') -optional.add_argument('-numcores', action='store', dest="numcores", - help='Number of cores to use on local system for parallel-local parameter') -optional.add_argument('-remove_temp', action='store', dest="remove_temp", - help='Remove Temporary files generated during the run') -required.add_argument('-reference', action='store', dest="reference", - help='Path to Reference Fasta file for consensus generation') -required.add_argument('-steps', action='store', dest="steps", - help='Analysis Steps to be performed. This should be in sequential order.' 
- 'Step 1: Run pbs jobs and process all pipeline generated vcf files to generate label files' - 'Step 2: Analyze label files and generate matrix' - 'Step 3: DP/FQ Analysis') -required.add_argument('-results_dir', action='store', dest="results_dir", - help='Path to Core results directory') -required.add_argument('-config', action='store', dest="config", - help='Path to config file') -optional.add_argument('-debug_mode', action='store', dest="debug_mode", - help='yes/no for debug mode') -args = parser.parse_args() - - -def create_positions_filestep(vcf_filenames): - - """ - Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - - """ - - filter2_only_snp_position_files_array = [] - for file in vcf_filenames: - with open(file, 'rU') as csv_file: - file_name = temp_dir + "/" + os.path.basename(file) + "_positions" - addpositionfilenametoarray = file_name - filter2_only_snp_position_files_array.append(addpositionfilenametoarray) - f1 = open(file_name, 'w+') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - p_string = row[1] + "\n" - f1.write(p_string) - f1.close() - csv_file.close() - - """ Create position array containing unique positiones from positions file """ - position_array = [] - for filess in filter2_only_snp_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - position_array.append(line) - f.close() - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - print "\nThe number of unique variant positions: " + str(len(position_array_sort)) + "\n" - unique_position_file = "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir - f=open(unique_position_file, 'w+') - for i in position_array_sort: - f.write(i + "\n") - f.close() - if 
len(position_array_sort) == 0: - print "ERROR: No unique positions found. Check if vcf files are empty?" - exit() - return unique_position_file - - -def create_indel_positions_filestep(vcf_filenames): - - """ - Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - - """ - - filter2_only_indel_position_files_array = [] - for file in vcf_filenames: - indel_file = file.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') - with open(indel_file, 'rU') as csv_file: - file_name = temp_dir + "/" + os.path.basename(indel_file) + "_positions" - addpositionfilenametoarray = file_name - filter2_only_indel_position_files_array.append(addpositionfilenametoarray) - f1 = open(file_name, 'w+') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - p_string = row[1] + "\n" - f1.write(p_string) - f1.close() - csv_file.close() - - """ Create position array containing unique positiones from positions file """ - position_array = [] - for filess in filter2_only_indel_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - position_array.append(line) - f.close() - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - print "\nThe number of unique indel positions: " + str(len(position_array_sort)) + "\n" - unique_indel_position_file = "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir - f=open(unique_indel_position_file, 'w+') - for i in position_array_sort: - f.write(i + "\n") - f.close() - if len(position_array_sort) == 0: - print "ERROR: No unique positions found. Check if vcf files are empty?" 
- exit() - return unique_indel_position_file - - - -def make_sure_path_exists(out_path): - """ - Make sure the output folder exists or create at given path - :param out_path: - :return: - """ - try: - os.makedirs(out_path) - except OSError as exception: - if exception.errno != errno.EEXIST: - print "Errors in output folder path! please change the output path or analysis name\n" - exit() - -def run_command(i): - print "Running: %s" % i - os.system(i) - done = "done: %s" % i - return done - - -def create_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - Generate a 
Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "cluster": - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - os.system("bash %s" % command_file) - elif jobrun == "local": - """ - Generate a Command list of each job and run it on local system one at a time - """ - - command_array = [] - command_file = 
"%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - #print "Running local mode: bash %s" % command_file - os.system("bash %s" % command_file) - -def create_indel_job(jobrun, vcf_filenames, unique_position_file, tmp_dir): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. 
- """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s_indel.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], 
ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s_indel.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "cluster": - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - os.system("bash %s" % command_file) - elif jobrun == "local": - """ - Generate a Command list of each job and run it on local system one at a time - """ - - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/reason_job_indel_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -unique_position_file %s -tmp_dir %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, unique_position_file, tmp_dir) - job_file_name = "%s_indel.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() 
- #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf_indel.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - #print "Running local mode: bash %s" % command_file - os.system("bash %s" % command_file) - -def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - 
Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "cluster": - command_array = [] - command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l 
qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], 
ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - os.system("bash command_file") - -def create_job_DP(jobrun, vcf_filenames): - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - os.system("qsub %s" % i) - - - elif jobrun == "parallel-local": - """ - Generate a Command list of each job and run it in parallel on different cores available 
on local system - """ - command_array = [] - command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - print len(command_array) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "cluster": - """ Test pending """ - command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) - - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) - -def generate_paste_command(): - - """ Generate SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files.sh" - f4=open(paste_file, 'w+') - paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file 
= i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - paste_command = paste_command + " " + label_file - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - os.system(header_awk_cmd) - os.system(sed_header) - os.system(sed_header_2) - - temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - #print temp_paste_command - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_command) - outfile.close() - - with open('%s/temp_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(temp_paste_command) - outfile.close() - os.system("bash %s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir) - os.system("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir) - print "Finished pasting...DONE" - - """ - remove this lines - #subprocess.call(["%s" % 
paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - os.system(sort_All_label_cmd) - os.system(paste_command_header) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - 
subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir - os.system(remove_unwanted_text) - - -def generate_indel_paste_command(): - - """ Generate SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files.sh" - f4=open(paste_file, 'w+') - paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - 
label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') - paste_command = paste_command + " " + label_file - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - os.system(header_awk_cmd) - os.system(sed_header) - os.system(sed_header_2) - - temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_indel_label_final_raw" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw > %s/All_indel_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header.txt %s/All_indel_label_final_sorted.txt > %s/All_indel_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - #print temp_paste_command - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_indel_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_command) - outfile.close() - - with open('%s/temp_indel_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(temp_paste_command) - outfile.close() - os.system("bash %s/All_indel_label_final_raw.sh" % args.filter2_only_snp_vcf_dir) - os.system("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir) - print "Finished pasting...DONE" 
- - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - os.system(sort_All_label_cmd) - os.system(paste_command_header) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - 
subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir - os.system(remove_unwanted_text) - - -def generate_position_label_data_matrix(): - - """ - Generate different list of Positions from the 
**All_label_final_sorted_header.txt** SNP position label data matrix. - - Filtered Position label matrix: - Too bad! This positions where atleast one variant was observed in atleast one sample - This position didn't made it to the final Only_ref_variant_positions_for_closely_matrix list, - because it was either unmapped(non-core) in one or more of the samples or was filtered out one or more of the sample due to Variant Filtered Parameter - - Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. - Yeah! This ones made it to final vcf file and are core variants - (Core variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). - - """ - def generate_position_label_data_matrix_All_label(): - position_label = OrderedDict() - f1=open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2=open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3=open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4=open("%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading All label positions file: %s/All_label_final_sorted_header.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - print "Generating different list of Positions and heatmap data matrix... 
\n" - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - #f.write('\t' + print_string_header.strip() + '\n') - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - def temp_generate_position_label_data_matrix_All_label(): - - """ - Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
- """ - temp_position_label = OrderedDict() - f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f33.write('\t' + print_string_header.strip() + '\n') - print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] - with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - """ - temp_position_label_FQ = OrderedDict() - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = 
csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_FQ[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_FQ: - lll = ['LowFQ'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - f44.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - lll = ['HighFQ_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - - """ - Perform Sed on temp files. Find a faster way to do this. 
- """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - def barplot_stats(): - print "\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n" - """ - Read each 
Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - print "Finished reading columns..." - counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - true_variant = columns[i].count('VARIANT') - unmapped_positions = columns[i].count('reference_unmapped_position') - reference_allele = columns[i].count('reference_allele') - Only_low_FQ = columns[i].count('LowFQ') - Only_DP = columns[i].count('HighFQ_DP') - Only_low_MQ = columns[i].count('HighFQ') - low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP') - high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + 
columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') - other = low_FQ_other_parameters + high_FQ_other_parameters - total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ - filename_count = i - 1 - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) - except ZeroDivisionError: - Only_low_FQ_perc = 0 - try: - Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total) - except ZeroDivisionError: 
- low_FQ_other_parameters_perc = 0 - try: - high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total) - except ZeroDivisionError: - high_FQ_other_parameters_perc = 0 - - other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) - f_bar_perc.write(bar_perc_string) - f_bar_count.close() - f_bar_perc.close() - bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" - barplot_R_file = 
open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') - barplot_R_file.write(bargraph_R_script) - print "Run this R script to generate bargraph plot: %s" % barplot_R_file - """ Methods Steps""" - print "Running: Generating data matrices..." - generate_position_label_data_matrix_All_label() - print "Running: Changing variables in data matrices to codes for faster processing..." - temp_generate_position_label_data_matrix_All_label() - print "Running: Generating Barplot statistics data matrices..." - barplot_stats() - -def generate_indel_position_label_data_matrix(): - - """ - Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. - - Filtered Position label matrix: - Too bad! This positions where atleast one variant was observed in atleast one sample - This position didn't made it to the final Only_ref_variant_positions_for_closely_matrix list, - because it was either unmapped(non-core) in one or more of the samples or was filtered out one or more of the sample due to Variant Filtered Parameter - - Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. - Yeah! This ones made it to final vcf file and are core variants - (Core variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). 
- - """ - def generate_indel_position_label_data_matrix_All_label(): - position_label = OrderedDict() - f1=open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2=open("%s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3=open("%s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4=open("%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading All label positions file: %s/All_indel_label_final_sorted_header.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - print "Generating different list of Positions and heatmap data matrix... \n" - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - #f.write('\t' + print_string_header.strip() + '\n') - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - print "here" - if set(lll) & set(position_label[value]): - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call(["sed 
-i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - def temp_generate_indel_position_label_data_matrix_All_label(): - - """ - Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
- """ - temp_position_label = OrderedDict() - f33=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f33.write('\t' + print_string_header.strip() + '\n') - print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] - with open("%s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - """ - temp_position_label_FQ = OrderedDict() - f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt \n" % 
args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_FQ[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_FQ: - lll = ['LowFQ'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - f44.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 
's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 
's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - lll = ['HighFQ_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - 
f44.close() - csv_file.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - def barplot_indel_stats(): - print "\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n" - """ - Read each Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - c_reader = csv.reader(open('%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - print "Finished reading columns..." - counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/bargraph_indel_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_indel_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - true_variant = columns[i].count('VARIANT') - unmapped_positions = columns[i].count('reference_unmapped_position') - reference_allele = columns[i].count('reference_allele') - Only_low_FQ = columns[i].count('LowFQ') - Only_DP = columns[i].count('HighFQ_DP') - Only_low_MQ = columns[i].count('HighFQ') - low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP') 
- high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') - other = low_FQ_other_parameters + high_FQ_other_parameters - total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ - filename_count = i - 1 - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) - except ZeroDivisionError: - Only_low_FQ_perc = 0 - try: - Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') 
+ columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total) - except ZeroDivisionError: - low_FQ_other_parameters_perc = 0 - try: - high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total) - except ZeroDivisionError: - high_FQ_other_parameters_perc = 0 - - other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) - f_bar_perc.write(bar_perc_string) - f_bar_count.close() - f_bar_perc.close() - bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_indel_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = 
margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" - barplot_R_file = open("%s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir, 'w+') - barplot_R_file.write(bargraph_R_script) - print "Run this R script to generate bargraph plot: %s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir - - - """ Methods Steps""" - print "Running: Generating data matrices..." - generate_indel_position_label_data_matrix_All_label() - print "Running: Changing variables in data matrices to codes for faster processing..." - temp_generate_indel_position_label_data_matrix_All_label() - print "Running: Generating Barplot statistics data matrices..." - barplot_indel_stats() - - -def generate_vcf_files(): - base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin'] - filter2_files_array = [] - for i in vcf_filenames: - filter2_file = i.replace('_no_proximate_snp.vcf', '') - filter2_files_array.append(filter2_file) - ref_variant_position_array = [] - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - ref_variant_position_array.append(line) - ffp.close() - - filtered_out_vcf_files = [] - for i in filter2_files_array: - print_array =[] - with open(i) as file_open: - for line in file_open: - line = line.strip() - if line.startswith("#"): - print_array.append(line) - else: - split_array = re.split(r'\t+', line) - if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: - print_array.append(line) - file_open.close() - file_name = i + "_core.vcf" - print "Generating %s" % file_name - filtered_out_vcf_files.append(file_name) - f1 = open(file_name, 'w+') - for ios in print_array: - print_string = str(ios) + "\n" - 
f1.write(print_string) - f1.close() - - filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir - print "\nGenerating Consensus...\n" - for file in filtered_out_vcf_files: - f1 = open(filename, 'a+') - bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) - f1.write(bgzip_cmd) - subprocess.call([bgzip_cmd], shell=True) - tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) - f1.write(tabix_cmd) - subprocess.call([tabix_cmd], shell=True) - fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s.fa\n" % (args.reference, base_vcftools_bin, file, file.replace('_filter2_final.vcf_core.vcf', '')) - f1.write(fasta_cmd) - subprocess.call([fasta_cmd], shell=True) - base = os.path.basename(file) - header = base.replace('_filter2_final.vcf_core.vcf', '') - sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) - subprocess.call([sed_command], shell=True) - f1.write(sed_command) - print "The consensus commands are in : %s" % filename - sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % (args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin']) - os.system(sequence_lgth_cmd) - -def gatk_filter2(final_raw_vcf, out_path, analysis, reference): - gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" - gatk_filter2_command = "java -jar %s/%s/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("gatk", Config)['gatk_bin'], reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) - print "\n\nRunning Command: [%s]\n\n" % 
gatk_filter2_command - os.system(gatk_filter2_command) - filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % (out_path, analysis, out_path, analysis) - os.system(filter_flag_command) - gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) - return gatk_filter2_final_vcf - -def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): - all_position = [] - remove_proximate_position_array = [] - gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: - for line in csv_file: - if not line.startswith('#'): - line_array = line.split('\t') - all_position.append(line_array[1]) - for position in all_position: - position_index = all_position.index(position) - next_position_index = position_index + 1 - - if next_position_index < len(all_position): - diff = int(all_position[next_position_index]) - int(position) - if diff < 10: - #print position + " " + all_position[next_position_index] - if position not in remove_proximate_position_array and all_position[next_position_index] not in remove_proximate_position_array: - remove_proximate_position_array.append(int(position)) - remove_proximate_position_array.append(int(all_position[next_position_index])) - f1=open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: - for line in csv_file2: - if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
- line_array = line.split('\t') - if int(line_array[1]) not in remove_proximate_position_array: - print_string = line - f1.write(print_string) - else: - print_string = line - f1.write(print_string) - gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" - f2=open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') - for i in remove_proximate_position_array: - position_print_string = str(i) + "\n" - f2.write(position_print_string) - return gatk_filter2_final_vcf_file_no_proximate_snp - - -def FQ_analysis(): - for i in vcf_filenames: - filename_base = os.path.basename(i) - aln_mpileup_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') - analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') - #print aln_mpileup_vcf_file - grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file - proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) - (out, err) = proc.communicate() - out = out.strip() - reference_file = out.split(':') - gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) - #print gatk_filter2_final_vcf_file - gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, analysis, reference_file[1]) - grep_fq_field = "awk -F\'\\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s/%s_FQ_values" % (gatk_filter2_final_vcf_file_no_proximate_snp, os.path.dirname(i), analysis) - os.system(grep_fq_field) - #print grep_fq_field - - -def DP_analysis(): - create_job_DP(args.jobrun, vcf_filenames) - paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') - paste_command = paste_command + " " + 
label_file - - paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" - f2=open(paste_file, 'w+') - paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir - #os.system(paste_command) - f2.write(paste_command + '\n') - cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - #os.system(cat_header) - f2.write(cat_header + '\n') - sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir) - #os.system(sed_command) - f2.write(sed_command + '\n') - cmd = "bash %s" % paste_file - # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - -def DP_analysis_barplot(): - os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - print "Generating DP barplots data..." - c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - reference_position = columns[i].count('NA') - oneto5 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) < 5: - oneto5 += 1 - sixto10 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 5 and int(k) <= 10: - sixto10 += 1 - elevento14 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 11 and 
# --- Tail of a truncated per-sample bar-count function (its `def`/loop header is
# --- above this chunk; fragment reproduced as-is). It tallies how many positions
# --- fall in each depth bucket and writes count + percentage rows per sample.
# NOTE(review): the leading `if` of this condition is outside this view — confirm.
                    int(k) <= 14:
                        elevento14 += 1
        fifteenorabove = 0
        for k in list(columns[i][1:]):
            if k != "":
                if k != "NA":
                    if int(k) >= 15:
                        fifteenorabove += 1
        total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove
        filename_count = i - 1
        bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position, oneto5, sixto10, elevento14, fifteenorabove)
        f_bar_count.write(bar_string)

        """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """
        # NOTE(review): Python 2 integer division — `x * 100 / total` truncates
        # before the float() cast, so percentages are whole numbers. Confirm intended.
        try:
            reference_position_perc = float(reference_position * 100 / total)
        except ZeroDivisionError:
            reference_position_perc = 0
        try:
            oneto5_perc = float(oneto5 * 100 / total)
        except ZeroDivisionError:
            oneto5_perc = 0
        try:
            sixto10_perc = float(sixto10 * 100 / total)
        except ZeroDivisionError:
            sixto10_perc = 0
        try:
            elevento14_perc = float(elevento14 * 100 / total)
        except ZeroDivisionError:
            elevento14_perc = 0
        try:
            fifteenorabove_perc = float(fifteenorabove * 100 / total)
        except ZeroDivisionError:
            fifteenorabove_perc = 0
        bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc)
        f_bar_perc.write(bar_perc_string)


def extract_only_ref_variant_fasta(core_vcf_fasta_dir):
    # Thin dispatcher: submits the per-sample consensus-fasta jobs through the
    # cluster/local job helper defined elsewhere in this file.
    create_job_fasta(args.jobrun, vcf_filenames, core_vcf_fasta_dir)

def extract_only_ref_variant_fasta_from_reference():
    # Build a pseudo-fasta for the reference genome containing only the bases at
    # the core variant positions listed in Only_ref_variant_positions_for_closely.
    # One shell pipeline is spawned per position (grep/tr/cut) — slow but simple.
    ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines()
    fasta_string = ""
    #firstLine = ffp.pop(0)
    for lines in ffp:
        lines = lines.strip()
        # Strip the fasta header, flatten the sequence to one line, then cut the
        # single base at this 1-based offset.
        extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines)
        proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True)
        (out, err) = proc.communicate()
        out = out.strip()
        fasta_string = fasta_string + out
        if not out:
            # Extraction failure is reported but processing continues.
            print "Error extracting reference allele"

    # Collapse any whitespace that slipped into the concatenated bases.
    pattern = re.compile(r'\s+')
    fasta_string = re.sub(pattern, '', fasta_string)
    final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '')) + fasta_string + "\n"
    fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', ''))), 'w+')
    fp.write(final_fasta_string)
    fp.close()

def prepare_snpEff_db(reference_basename):
    # Stage a custom snpEff database for the reference genome:
    # copy snpEff.config into the working dir, create the data directories,
    # register the genome name in the config, copy the reference gff in as
    # genes.gff, then run `snpEff build -gff3`.
    keep_logging('Preparing snpEff database requirements.', 'Preparing snpEff database requirements.', logger, 'info')
    os.system("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir))
    make_sure_path_exists("%s/%s/data/%s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]))
    make_sure_path_exists("%s/%s/data/genomes/" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']))
    os.system("cp %s %s/%s/data/genomes/" % (args.reference, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']))
    with open("%s/snpEff.config" % args.filter2_only_snp_vcf_dir, "a") as conf_file:
        conf_file.write("\n\n##Building Custom Database###\n%s.genome\t: %s\n\n" % (reference_basename[0], reference_basename[0]))
    conf_file.close()
    #get the gff name from config file
    os.system("cp %s/%s.gff %s/%s/data/%s/genes.gff" % (os.path.dirname(args.reference), reference_basename[0], ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]))
    os.system("java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']))

    keep_logging('Finished Preparing snpEff database requirements.', 'Finished Preparing snpEff database requirements.', logger, 'info')

def variant_annotation():
    # Annotate the raw-mpileup and final SNP vcf of every sample with snpEff,
    # building the command list first and then running them in parallel.
    keep_logging('Annotating Variants using snpEff.', 'Annotating Variants using snpEff.', logger, 'info')
    reference_basename = (os.path.basename(args.reference)).split(".")
    print reference_basename[0]
    prepare_snpEff_db(reference_basename)
    annotate_vcf_cmd_array = []
    annotate_final_vcf_cmd_array = []
    for i in vcf_filenames:
        raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf')
        annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \
                           (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, reference_basename[0], raw_vcf, raw_vcf)
        annotate_vcf_cmd_array.append(annotate_vcf_cmd)
        final_vcf = i
        annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \
                                 (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, reference_basename[0], final_vcf, final_vcf)
    # (function continues on the next chunk line: append + parallel execution)
annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - print "\n\nhere\n\n" - print annotate_final_vcf_cmd_array - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) - results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) - -def indel_annotation(): - keep_logging('Annotating indels using snpEff.', 'Annotating indels using snpEff.', logger, 'info') - reference_basename = (os.path.basename(args.reference)).split(".") - print reference_basename[0] - prepare_snpEff_db(reference_basename) - annotate_vcf_cmd_array = [] - annotate_final_vcf_cmd_array = [] - for i in vcf_filenames: - raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') - annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, reference_basename[0], raw_vcf, raw_vcf) - annotate_vcf_cmd_array.append(annotate_vcf_cmd) - final_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') - annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", 
Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, reference_basename[0], final_vcf, final_vcf) - annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - print annotate_final_vcf_cmd_array - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) - results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) - -def annotated_snp_matrix(): - """ - :return: Read Genbank file and return a dictionary of Prokka ID mapped to Gene Name, Prokka ID mapped to Product Name - """ - reference_basename = (os.path.basename(args.reference)).split(".") - if os.path.isfile("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0])): - handle = open("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0]), 'rU') - else: - raise IOError('%s/%s.gbf does not exist.' % (os.path.dirname(args.reference), reference_basename[0])) - locus_tag_to_gene_name = {} - locus_tag_to_product = {} - #locus_tag_to_uniprot = {} - #locus_tag_to_ec_number = {} - - for record in SeqIO.parse(handle, 'genbank') : - for feature in record.features: - if 'locus_tag' in feature.qualifiers: - if 'gene' in feature.qualifiers: - locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['gene'][0]) - else: - locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" - if 'product' in feature.qualifiers: - locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['product'][0]) - else: - locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" - # elif 'uniprot' in feature.qualifiers: - # locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['product'][0]) - - - - """ Merge Annotated final vcf file """ - print 
    # --- interior of annotated_snp_matrix() (header on a previous chunk line) ---
    # Merge the per-sample snpEff-annotated vcfs into one multi-sample vcf for
    # SNPs and one for indels (bgzip + tabix each input, then bcftools merge).
    print "Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf" % args.filter2_only_snp_vcf_dir
    os.system("for i in %s/*.vcf_no_proximate_snp.vcf_ANN.vcf; do bgzip -c $i > $i.gz; done" % args.filter2_only_snp_vcf_dir)
    os.system("for i in %s/*.vcf_no_proximate_snp.vcf_ANN.vcf.gz; do tabix $i; done" % args.filter2_only_snp_vcf_dir)
    os.system("for i in %s/*_filter2_indel_final.vcf_ANN.vcf; do bgzip -c $i > $i.gz; done" % args.filter2_only_snp_vcf_dir)
    os.system("for i in %s/*_filter2_indel_final.vcf_ANN.vcf.gz; do tabix $i; done" % args.filter2_only_snp_vcf_dir)

    files = ' '.join(vcf_filenames)
    print files.replace("_filter2_final.vcf_no_proximate_snp.vcf", "_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz")

    os.system("bcftools merge -i ANN:join -m both -o %s/Final_vcf_no_proximate_snp.vcf -O v %s" % (args.filter2_only_snp_vcf_dir, files.replace("_filter2_final.vcf_no_proximate_snp.vcf", "_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz")))
    os.system("bcftools merge -i ANN:join -m both -o %s/Final_vcf_indel.vcf -O v %s" % (args.filter2_only_snp_vcf_dir, files.replace("_filter2_final.vcf_no_proximate_snp.vcf", "_filter2_indel_final.vcf_ANN.vcf.gz")))

    os.system("bgzip -c %s/Final_vcf_no_proximate_snp.vcf > %s/Final_vcf_no_proximate_snp.vcf.gz" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir))
    os.system("tabix %s/Final_vcf_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir)
    os.system("bgzip -c %s/Final_vcf_indel.vcf > %s/Final_vcf_indel.vcf.gz" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir))
    os.system("tabix %s/Final_vcf_indel.vcf.gz" % args.filter2_only_snp_vcf_dir)



    # position -> comma-joined per-sample filter labels (SNPs).
    position_label = OrderedDict()
    with open("%s/All_label_final_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
        print "Reading All label positions file: %s/All_label_final_sorted.txt \n" % args.filter2_only_snp_vcf_dir
        csv_reader = csv.reader(csv_file, delimiter='\t')
        for row in csv_reader:
            position_label[row[0]] = ','.join(row[1:])
    csv_file.close()

    # position -> comma-joined per-sample filter labels (indels); warn when a
    # position is already recorded as a SNP (indel label still wins here).
    position_indel_label = OrderedDict()
    with open("%s/All_indel_label_final_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
        print "Reading All label positions file: %s/All_indel_label_final_sorted.txt \n" % args.filter2_only_snp_vcf_dir
        csv_reader = csv.reader(csv_file, delimiter='\t')
        for row in csv_reader:
            if row[0] not in position_label.keys():
                position_indel_label[row[0]] = ','.join(row[1:])
            else:
                position_indel_label[row[0]] = ','.join(row[1:])
                print "Warning: position %s already present as a SNP" % row[0]
    csv_file.close()

    print_string_header = "\t"
    for i in vcf_filenames:
        print_string_header = print_string_header + os.path.basename(i) + "\t"

    # Core positions = variant positions that survived filtering in every
    # sample, for both SNPs and indels.
    core_positions = []
    with open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir) as fp:
        for line in fp:
            line = line.strip()
            core_positions.append(line)
        fp.close()
    with open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir) as fp:
        for line in fp:
            line = line.strip()
            core_positions.append(line)
        fp.close()


    header_print_string = "Type of SNP at POS > ALT; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos:::"
    final_merge_anno_file = VCF("%s/Final_vcf_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir)
    for sample in final_merge_anno_file.samples:
        header_print_string = header_print_string + "," + sample
    header_print_string = header_print_string + "\n"
    header_print_string = header_print_string.replace(':::,', ':::')
    fp_code = open("%s/SNP_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+')
    fp_allele = open("%s/SNP_matrix_allele.csv" % args.filter2_only_snp_vcf_dir, 'w+')
    fp_code.write(header_print_string)
    fp_allele.write(header_print_string)

    for variants in VCF("%s/Final_vcf_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir):
        print_string = ""

        # Map verbose filter labels to numeric codes: 0 = ref allele,
        # -1 = unmapped, 2 = filtered-out, 1 = core variant, 3 = non-core variant.
        # NOTE: the replacement ORDER matters — compound labels (e.g.
        # LowFQ_QUAL_DP_proximate_SNP) must be replaced before their prefixes
        # (LowFQ_QUAL, LowFQ), otherwise partial matches corrupt the codes.
        code_string = position_label[str(variants.POS)]
        code_string = code_string.replace('reference_allele', '0')
        code_string = code_string.replace('reference_unmapped_position', '-1')
        code_string = code_string.replace('LowFQ_QUAL_DP_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_DP_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_DP_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_QUAL_DP', '2')
        code_string = code_string.replace('LowFQ_DP_QUAL', '2')
        code_string = code_string.replace('LowFQ_QUAL', '2')
        code_string = code_string.replace('LowFQ_DP', '2')
        code_string = code_string.replace('HighFQ_QUAL_DP_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_DP_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_DP_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_QUAL_DP', '2')
        code_string = code_string.replace('HighFQ_DP_QUAL', '2')
        code_string = code_string.replace('HighFQ_QUAL', '2')
        code_string = code_string.replace('HighFQ_DP', '2')
        code_string = code_string.replace('LowFQ', '2')
        code_string = code_string.replace('HighFQ', '2')

        if str(variants.POS) in core_positions:
            code_string = code_string.replace('VARIANT', '1')
        else:
            code_string = code_string.replace('VARIANT', '3')


        if "protein_coding" in variants.INFO.get('ANN'):
            snp_type = "Coding SNP"
        else:
            snp_type = "Non-coding SNP"
        print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT))

        # Deduplicate the snpEff ANN entries and keep the fields of interest;
        # intergenic tags like "tagA-tagB" are expanded to gene/product names
        # for both flanking locus tags.
        ann_array = (variants.INFO.get('ANN')).split(',')
        ann_string = ";"
        for i in list(set(ann_array)):
            i_split = i.split('|')
            #ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";"
            tag = str(i_split[3]).replace('CHR_START-', '')
            tag = str(tag).replace('-CHR_END', '')
            if "-" in tag:
                #print tag
                extra_tags = ""
                tag_split = tag.split('-')
                for i in tag_split:
                    extra_tags = extra_tags + locus_tag_to_gene_name[i] + ","
                extra_tags_prot = ""
                for i in tag_split:
                    extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + ","
                ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags, extra_tags_prot]) + ";"
            else:
                extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag])
                # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";"
                ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";"

        print_string = print_string + ann_string

        # Normalize diploid-style genotype strings to single bases; missing
        # calls ('.') fall back to the reference base.
        gt_string = ""
        for gt in variants.gt_bases:
            gt = gt.replace('./.', '.')
            gt_string = gt_string + "," + gt
        gt_string = gt_string.replace('A/A', 'A')
        gt_string = gt_string.replace('G/G', 'G')
        gt_string = gt_string.replace('C/C', 'C')
        gt_string = gt_string.replace('T/T', 'T')
        gt_string = gt_string.replace('.', variants.REF)

        # #print print_string + gt_string + '\n'
        # fp_allele.write(print_string + gt_string + '\n')
        # #print print_string + "," + code_string + '\n'
        # fp_code.write(print_string + "," + code_string + '\n')

        final_allele_string = print_string + gt_string + '\n'
        final_code_string = print_string + "," + code_string + '\n'
        final_allele_string = final_allele_string.replace(',|', '|')
        final_allele_string = final_allele_string.replace(',;,', ':::')
        final_allele_string = final_allele_string.replace(';,', ':::')
        final_code_string = final_code_string.replace(',|', '|')
        final_code_string = final_code_string.replace(',;,', ':::')
        final_code_string = final_code_string.replace(';,', ':::')
        fp_allele.write(final_allele_string)
        fp_code.write(final_code_string)

    fp_code.close()
    fp_allele.close()



    ##Indel
    # Same matrix generation as above, but for the merged indel vcf.
    header_print_string = "Type of SNP at POS > ALT; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos:::"
    final_merge_anno_file = VCF("%s/Final_vcf_indel.vcf.gz" % args.filter2_only_snp_vcf_dir)
    for sample in final_merge_anno_file.samples:
        header_print_string = header_print_string + "," + sample
    header_print_string = header_print_string + "\n"
    header_print_string = header_print_string.replace(':::,', ':::')
    fp_code = open("%s/Indel_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+')
    fp_allele = open("%s/Indel_matrix_allele.csv" % args.filter2_only_snp_vcf_dir, 'w+')
    fp_code.write(header_print_string)
    fp_allele.write(header_print_string)

    for variants in VCF("%s/Final_vcf_indel.vcf.gz" % args.filter2_only_snp_vcf_dir):
        print_string = ""

        # Replacement order matters here too — see the SNP loop above.
        code_string = position_indel_label[str(variants.POS)]
        code_string = code_string.replace('reference_allele', '0')
        code_string = code_string.replace('reference_unmapped_position', '-1')
        code_string = code_string.replace('LowFQ_QUAL_DP_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_DP_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_DP_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_proximate_SNP', '2')
        code_string = code_string.replace('LowFQ_QUAL_DP', '2')
        code_string = code_string.replace('LowFQ_DP_QUAL', '2')
        code_string = code_string.replace('LowFQ_QUAL', '2')
        code_string = code_string.replace('LowFQ_DP', '2')
        code_string = code_string.replace('HighFQ_QUAL_DP_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_DP_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_QUAL_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_DP_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_proximate_SNP', '2')
        code_string = code_string.replace('HighFQ_QUAL_DP', '2')
        code_string = code_string.replace('HighFQ_DP_QUAL', '2')
        code_string = code_string.replace('HighFQ_QUAL', '2')
        code_string = code_string.replace('HighFQ_DP', '2')
        code_string = code_string.replace('LowFQ', '2')
        code_string = code_string.replace('HighFQ', '2')

        if str(variants.POS) in core_positions:
            code_string = code_string.replace('VARIANT', '1')
        else:
            code_string = code_string.replace('VARIANT', '3')

        if "protein_coding" in variants.INFO.get('ANN'):
            snp_type = "Coding SNP"
        else:
            snp_type = "Non-coding SNP"
        print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT))

        ann_array = (variants.INFO.get('ANN')).split(',')
        ann_string = ";"
        # for i in list(set(ann_array)):
        #     i_split = i.split('|')
        #     ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";"
        # print_string = print_string + ann_string
        for i in list(set(ann_array)):
            i_split = i.split('|')
            #ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";"
            tag = str(i_split[3]).replace('CHR_START-', '')
            tag = str(tag).replace('-CHR_END', '')
            # Indel-only normalization: snpEff joins multiple tags with '&'.
            tag = str(tag).replace('&', '-')
            if "-" in tag:
                #print tag
                extra_tags = ""
                tag_split = tag.split('-')
                for i in tag_split:
                    extra_tags = extra_tags + locus_tag_to_gene_name[i] + ","
                extra_tags_prot = ""
                for i in tag_split:
                    extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + ","
                ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags, extra_tags_prot]) + ";"
            else:
                # --- continuation of the indel annotation loop in annotated_snp_matrix() ---
                extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag])
                # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";"
                ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";"

        print_string = print_string + ann_string

        # Indel genotypes: keep only the first allele of "A/B"-style calls;
        # missing calls ('.') fall back to the reference allele.
        gt_string = ""
        for gt in variants.gt_bases:
            gt = gt.replace('./.', '.')
            if "/" in gt:
                gt_split = gt.split('/')
                gt = gt_split[0]
            gt_string = gt_string + "," + gt
        gt_string = gt_string.replace('.', variants.REF)
        final_allele_string = print_string + gt_string + '\n'
        final_code_string = print_string + "," + code_string + '\n'
        final_allele_string = final_allele_string.replace(',|', '|')
        final_allele_string = final_allele_string.replace(',;,', ':::')
        final_allele_string = final_allele_string.replace(';,', ':::')
        final_code_string = final_code_string.replace(',|', '|')
        # NOTE(review): this uses ';' where the SNP section uses ':::' — confirm
        # the asymmetry is intentional.
        final_code_string = final_code_string.replace(',;,', ';')
        final_code_string = final_code_string.replace(';,', ':::')
        fp_allele.write(final_allele_string)
        fp_code.write(final_code_string)
    fp_code.close()
    fp_allele.close()


def alignment_report(data_matrix_dir):
    # Assemble a per-sample alignment QC table (Report_alignment.txt) by
    # scraping each sample's samtools flagstat output, GATK depth-of-coverage
    # summary, unmapped-position bed and Picard MarkDuplicates metrics.
    print "\nGenerating Alignment report...\n"
    varcall_dir = os.path.dirname(os.path.abspath(args.results_dir))
    report_string = ""
    header = "Sample,QC-passed reads,Mapped reads,% mapped reads,mean depth,%_bases_above_5,%_bases_above_10,%_bases_above_15,unmapped_positions,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,unmapped reads,% unmapped reads"
    fp = open("%s/Report_alignment.txt" % (data_matrix_dir), 'w+')
    fp.write(header + '\n')
    for vcf in vcf_filenames:
        sample = os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
        #print sample
        report_string = sample + ","
        qc = (subprocess.check_output("grep \'QC-passed\' %s/%s/%s_alignment_stats | sed \'s/ + 0 in total (QC-passed reads + QC-failed reads)//g\'" % (varcall_dir, sample, sample), shell=True)).strip()
        mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk -F\' \' \'{print $1}\'" % (varcall_dir, sample, sample), shell=True)).strip()
        replace = "%:-nan%)"
        perc_mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk -F\' \' \'{print $5}\' | sed \'s/%s//g\' | sed \'s/(//g\'" % (varcall_dir, sample, sample, replace), shell=True)).strip()
        depth_of_coverage = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==2{print $3,$7,$8,$9}\' %s/%s/%s_depth_of_coverage.sample_summary" % (varcall_dir, sample, sample), shell=True)).strip()
        unmapped_positions = (subprocess.check_output("wc -l %s/%s/%s_unmapped.bed_positions | cut -d\' \' -f1" % (varcall_dir, sample, sample), shell=True)).strip()
        opt_dup = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==8{print $7,$8,$5}\' %s/%s/%s_markduplicates_metrics" % (varcall_dir, sample, sample), shell=True)).strip()
        perc_unmapped = str(100 - float(perc_mapped))
        myList = ','.join(map(str, (sample, qc, mapped, perc_mapped, depth_of_coverage, unmapped_positions, opt_dup, perc_unmapped)))
        #print myList
        fp.write(myList + '\n')
    fp.close()
    print "Alignment report can be found in %s/Report_alignment.txt" % data_matrix_dir



def variant_report(data_matrix_dir):
    # Assemble a per-sample variant summary table (Report_variants.txt) from the
    # core_temp_dir position lists and bargraph count/percentage files.
    print "\nGenerating Variants report...\n"
    varcall_dir = os.path.dirname(os.path.abspath(args.results_dir))
    report_string = ""
    header = "Sample,Total Unique Variants,core SNPs,unmapped_positions,reference_allele,true_variant,Only_low_FQ,Only_DP,Only_low_MQ,other,unmapped_positions_perc,true_variant_perc,Only_low_FQ_perc,Only_DP_perc,Only_low_MQ_perc,other_perc"
    fp = open("%s/Report_variants.txt" % (data_matrix_dir), 'w+')
    fp.write(header + '\n')

    for vcf in vcf_filenames:
    # (loop body continues on the next chunk line)
os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) - report_string = sample + "," - unmapped_positions = (subprocess.check_output("wc -l %s/core_temp_dir/unique_positions_file | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() - core_snps = (subprocess.check_output("wc -l %s/core_temp_dir/Only_ref_variant_positions_for_closely | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() - filtered_snp_count = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_counts.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() - filtered_snp_perc = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_percentage.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() - myList = ','.join(map(str, (sample, unmapped_positions, core_snps, filtered_snp_count, filtered_snp_perc))) - fp.write(myList + '\n') - fp.close() - print "Variant call report can be found in %s/Report_variants.txt" % data_matrix_dir - - -def fasttree(tree_dir, input_fasta, cluster): - keep_logging('Running Fasttree on input: %s' % input_fasta, 'Running Fasttree on input: %s' % input_fasta, logger, 'info') - fasttree_cmd = "%s/%s/%s -nt %s > %s/%s_FastTree.tree" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("fasttree", Config)['fasttree_bin'], ConfigSectionMap("fasttree", Config)['base_cmd'], input_fasta, tree_dir, (os.path.basename(input_fasta)).replace('.fa', '')) - keep_logging('%s' % fasttree_cmd, '%s' % fasttree_cmd, logger, 'info') - if cluster == "parallel-local" or cluster == "local": - os.system("cd %s" % tree_dir) - os.system(fasttree_cmd) - elif cluster == "parallel-cluster": - job_name = os.path.basename(tree_dir) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=4,mem=47000mb,walltime=76:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd %s\n%s" % (job_name, 
ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], tree_dir, fasttree_cmd) - job_file_name = "%s/fasttree_%s.pbs" % (tree_dir, os.path.basename(input_fasta)) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - os.system("qsub %s" % job_file_name) - -def raxml(tree_dir, input_fasta): - keep_logging('Running RAXML on input: %s' % input_fasta, 'Running RAXML on input: %s' % input_fasta, logger, 'info') - raxml_cmd = "%s/%s/%s %s -s %s -n %s_raxML" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("raxml", Config)['raxml_bin'], ConfigSectionMap("raxml", Config)['base_cmd'], ConfigSectionMap("raxml", Config)['parameters'], input_fasta, (os.path.basename(input_fasta)).replace('.fa', '')) - keep_logging('%s' % raxml_cmd, '%s' % raxml_cmd, logger, 'info') - if args.jobrun == "parallel-local" or args.jobrun == "local": - os.system("cd %s" % tree_dir) - os.system(raxml_cmd) - elif args.jobrun == "parallel-cluster": - job_name = os.path.basename(tree_dir) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=4,mem=47000mb,walltime=76:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd %s\n%s" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], tree_dir, raxml_cmd) - job_file_name = "%s/raxml_%s.pbs" % (tree_dir, os.path.basename(input_fasta)) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - os.system("qsub %s" % job_file_name) - - -def gubbins(gubbins_dir, input_fasta): - print "\nRunning Gubbins on input: %s\n" % input_fasta - os.system("cd %s" % ConfigSectionMap("gubbins", Config)['gubbins_bin']) - gubbins_cmd = "%s/%s --prefix %s/%s %s" % 
(ConfigSectionMap("gubbins", Config)['gubbins_bin'], ConfigSectionMap("gubbins", Config)['base_cmd'], gubbins_dir, (os.path.basename(input_fasta)).replace('.fa', ''), input_fasta) - print gubbins_cmd - os.system(gubbins_cmd) - -def core_prep_snp(core_vcf_fasta_dir): - - """ Run snpEff annotation step """ - variant_annotation() - - """ Generate SNP Filter Label Matrix """ - generate_paste_command() - - """ Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. """ - generate_position_label_data_matrix() - - """ Generate VCF files from final list of variants in Only_ref_variant_positions_for_closely; generate commands for consensus generation """ - generate_vcf_files() - - """ Generate consensus fasta file from core vcf files """ - extract_only_ref_variant_fasta_from_reference() - - """ Generate consensus fasta file with only reference and variant position bases """ - extract_only_ref_variant_fasta(core_vcf_fasta_dir) - - """ Analyze the positions that were filtered out only due to insufficient depth""" - DP_analysis() - -def core_prep_indel(core_vcf_fasta_dir): - - """ Run snpEff annotation step """ - indel_annotation() - - # """ Generate SNP Filter Label Matrix """ - generate_indel_paste_command() - - # """ Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. 
""" - generate_indel_position_label_data_matrix() - - - -""" -Pending inclusion -""" - -class FuncThread(threading.Thread): - def __init__(self, target, *args): - self._target = target - self._args = args - threading.Thread.__init__(self) - def run(self): - self._target(*self._args) - -def someOtherFunc(data, key): - print "someOtherFunc was called : data=%s; key=%s" % (str(data), str(key)) - -def run_phaster(reference_genome): - print "\nRunning Phaster on input reference genome: %s\n" % reference_genome - out_name = (os.path.basename(reference_genome)).split('.') - phaster_post_cmd = "wget --post-file=\"%s\" \"http://phaster.ca/phaster_api\" -O %s/%s" % (reference_genome, args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json") - - print "Running: %s\n" % phaster_post_cmd - #os.system(phaster_post_cmd) - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json")) as json_data: - data = json.load(json_data) - print "Status: %s\njob_id: %s\n" % (data["status"], data["job_id"]) - -def parse_phaster(reference_genome): - out_name = (os.path.basename(reference_genome)).split('.') - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json")) as json_data: - data = json.load(json_data) - phaster_get_cmd = "wget \"http://phaster.ca/phaster_api?acc=%s\" -O %s/%s" % (data["job_id"], args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_get.json") - print phaster_get_cmd - - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_get.json")) as json_get_data: - get_data = json.load(json_get_data) - print get_data["zip"] - phaster_zip_cmd = "wget \"http://%s\" -O %s/%s_phaster_get.zip" % (str(get_data["zip"]), args.filter2_only_snp_vcf_dir, str(out_name[0])) - phaster_unzip_cmd = "unzip %s/%s_phaster_get.zip" % (args.filter2_only_snp_vcf_dir, str(out_name[0])) - print phaster_zip_cmd - print phaster_unzip_cmd - # for key, value in get_data.items(): - # print 
get_data["zip"][0] -""" -Pending inclusion -""" - - - -#Main Steps -if __name__ == '__main__': - - """Start Timer""" - start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - start_time_2 = datetime.now() - log_unique_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - global logger - analysis_name_log = "step_" + str(args.steps) - logger = generate_logger(args.filter2_only_snp_vcf_dir, analysis_name_log, log_unique_time) - keep_logging('The Script started at: %s' % start_time, 'The Script started at: %s' % start_time, logger, 'info') - print_details = "This step will parse final vcf files(*_no_proximate_snp.vcf) generated at the end of Variant Calling Pipeline. At the end of this step, the following results will be generated and placed in output directory:\n\n" \ - "1. Final Core SNP Positions list(Variant positions that were not filtered out in any of the samples and passed all the filters)\n" \ - "2. SNP Positions that were filtered out with labels indicating the reason (Depth, FQ, MQ, Unmapped in one or other samples, Proximate SNPS, Quality of Variant) why they were filtered out.\n" \ - "3. Barplot Statistics about the filtered variants and their reason for getting filtered.\n" \ - "4. 
Final Consensus fasta file using only Core SNP Positions\n" - keep_logging('%s' % print_details, '%s' % print_details, logger, 'info') - - """ Create Temp Directory for storing unwanted temp files generated while running script """ - temp_dir = args.filter2_only_snp_vcf_dir + "/temp" - make_sure_path_exists(temp_dir) - - filter2_only_snp_vcf_filenames = args.filter2_only_snp_vcf_filenames - vcf_filenames = [] - with open(filter2_only_snp_vcf_filenames) as fp: - for line in fp: - line = line.strip() - line = args.filter2_only_snp_vcf_dir + line - vcf_filenames.append(line) - fp.close() - #print sorted(vcf_filenames) - - global config_file - if args.config: - config_file = args.config - else: - config_file = os.path.dirname(os.path.abspath(__file__)) + "/config" - global Config - Config = ConfigParser.ConfigParser() - Config.read(config_file) - keep_logging('Path to config file: %s' % config_file, 'Path to config file: %s' % config_file, logger, 'info') - - - - ### Start the core SNP pipeline steps - """ core_prep step """ - if "1" in args.steps: - keep_logging('Gathering SNP position information from each final *_no_proximate_snp.vcf file...', 'Gathering SNP position information from each final *_no_proximate_snp.vcf file...', logger, 'info') - - """ - Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - """ - unique_position_file = create_positions_filestep(vcf_filenames) - - unique_indel_position_file = create_indel_positions_filestep(vcf_filenames) - - tmp_dir = "/tmp/temp_%s/" %log_unique_time - - bgzip_cmd = "for i in %s/*.vcf; do bgzip -c $i > $i%s; done" % (args.filter2_only_snp_vcf_dir, ".gz") - tabix_cmd = "for i in %s/*.vcf.gz; do tabix -f $i; done" % (args.filter2_only_snp_vcf_dir) - os.system(bgzip_cmd) - os.system(tabix_cmd) - - """ Get the cluster option; create and run jobs based on 
given parameter """ - create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir) - - create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir) - """ Find ProPhage region in reference genome """ - #run_phaster(args.reference) - - - """ core step """ - if "2" in args.steps: - # #Adhoc - data_matrix_dir = args.results_dir + '/data_matrix' - core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' - make_sure_path_exists(data_matrix_dir) - make_sure_path_exists(core_vcf_fasta_dir) - - #core_prep_snp(core_vcf_fasta_dir) - - #core_prep_indel(core_vcf_fasta_dir) - - annotated_snp_matrix() - - keep_logging('Wait for individual cluster jobs to finish before running the third step', 'Wait for individual cluster jobs to finish before running the third step', logger, 'info') - - """ report step """ - if "3" in args.steps: - keep_logging('Step 3: Generate Reports and Results folder.', 'Step 3: Generate Reports and Results folder.', logger, 'info') - """ Generate DP barplots data """ - DP_analysis_barplot() - - """ Analyze the FQ values of all the unique variant """ - FQ_analysis() - - data_matrix_dir = args.results_dir + '/data_matrix' - core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' - consensus_var_dir = core_vcf_fasta_dir + '/consensus_variant_positions' - consensus_ref_var_dir = core_vcf_fasta_dir + '/consensus_ref_variant_positions' - - make_sure_path_exists(data_matrix_dir) - make_sure_path_exists(core_vcf_fasta_dir) - make_sure_path_exists(consensus_var_dir) - make_sure_path_exists(consensus_ref_var_dir) - - move_data_matrix_results = "cp -r %s/*.txt %s/temp* %s/All* %s/Only* %s/*.R %s/R_scripts/generate_diagnostics_plots.R %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, os.path.dirname(os.path.abspath(__file__)), data_matrix_dir) - #move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa 
%s/*_variants.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) - move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) - move_consensus_var_fasta_results = "mv %s/*_variants.fa %s/" % (core_vcf_fasta_dir, consensus_var_dir) - move_consensus_ref_var_fasta_results = "mv %s/*.fa %s/" % (core_vcf_fasta_dir, consensus_ref_var_dir) - - - os.system(move_data_matrix_results) - os.system(move_core_vcf_fasta_results) - os.system(move_consensus_var_fasta_results) - os.system(move_consensus_ref_var_fasta_results) - - subprocess.call(["sed -i 's/title_here/%s/g' %s/generate_diagnostics_plots.R" % (os.path.basename(args.results_dir), data_matrix_dir)], shell=True) - - # Check if the variant consensus files generated are of same length - count = 0 - for line in open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % data_matrix_dir).xreadlines(): - count += 1 - ref_variants = count - 1 - - variant_consensus_files = glob.glob("%s/*_variants.fa" % core_vcf_fasta_dir) - - for f in variant_consensus_files: - cmd2 = "%s/%s/bioawk -c fastx '{ print length($seq) }' < %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'], f) - proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True) - (out2, err2) = proc.communicate() - - try: - int(out2) != int(ref_variants) - except OSError as exception: - if exception.errno != errno.EEXIST: - print "Error generating variant consensus position file: %s\n" % f - keep_logging('Error generating variant consensus position file: %s' % f, 'Error generating variant consensus position file: %s' % f, logger, 'exception') - - """ Generate alignment report """ - alignment_report(data_matrix_dir) - - """ Generate core snps report """ - variant_report(data_matrix_dir) - - print_details = "Results for core pipeline can be found in: 
%s\n" \ - "Description of Results:\n" \ - "1. data_matrix folder contains all the data matrices and other temporary files generated during the core pipeline. bargraph_counts.txt and bargraph_percentage.txt: contains counts/percentage of unique positions filtered out due to different filter parameters for each sample. Run bargraph.R to plot bargraph statistics." \ - "2. core_snp_consensus contains all the core vcf and fasta files. *_core.vcf.gz: core vcf files, *.fa and *_variants.fa: core consensus fasta file and core consensus fasta with only variant positions." % (args.results_dir) - keep_logging(print_details, print_details, logger, 'info') - - """ tree step """ - if "4" in args.steps: - keep_logging('Step 4: Ongoing Testing.', 'Step 4: Ongoing Testing.', logger, 'info') - #parse_phaster(args.reference) - - gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' - - make_sure_path_exists(gubbins_dir) - make_sure_path_exists(tree_dir) - - prepare_ref_var_consensus_input = "%s/gubbins/%s_ref_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', '')) - prepare_var_consensus_input = "%s/gubbins/%s_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', '')) - - prepare_ref_var_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_var_consensus_input) - prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*.fa > %s" % (args.results_dir, prepare_var_consensus_input) - - - os.system(prepare_ref_var_consensus_input_cmd) - os.system(prepare_var_consensus_input_cmd) - - fasttree(tree_dir, prepare_ref_var_consensus_input, args.jobrun) - fasttree(tree_dir, prepare_var_consensus_input, args.jobrun) - - raxml(tree_dir, prepare_ref_var_consensus_input) - raxml(tree_dir, prepare_var_consensus_input) 
- - # Disabling Gubbins function due to installation issues - #gubbins(gubbins_dir, prepare_ref_var_consensus_input) - - time_taken = datetime.now() - start_time_2 - if args.remove_temp: - del_command = "rm -r %s" % temp_dir - os.system(del_command) - - - - - - - - - - - - - - - diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_label.py b/modules/variant_diagnostics/core_pipeline_core_prep_label.py deleted file mode 100644 index 470eed5..0000000 --- a/modules/variant_diagnostics/core_pipeline_core_prep_label.py +++ /dev/null @@ -1,293 +0,0 @@ -# System wide imports -from __future__ import division -import sys -import argparse -import re -import os -import csv -import subprocess -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -""" Hacky way to append. Instead Add this path to PYTHONPATH Variable """ -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -import errno -from datetime import datetime -import threading -import json -import ConfigParser -from config_settings import ConfigSectionMap -from logging_subprocess import * -from log_modules import * -from tabix import * -from Bio import SeqIO -from core_prep_sanity_checks import * -from PBS_generate_jobs import * -from core_pipeline_core_prep_main import * - -def core_prep_label(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, reference, log_unique_time, log_file_handle, logger, jobrun, Config): - # Create temporary Directory core_temp_dir/temp for storing temporary intermediate files. Check if core_temp_dir contains all the required files to run these pipeline. - global temp_dir - temp_dir = filter2_only_snp_vcf_dir + "/temp" - - # # Extract All the unique SNO and Indel position list from final filtered *_no_proximate_snp.vcf files. 
- unique_position_file = create_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger) - unique_indel_position_file = create_indel_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger) - - # bgzip and tabix all the vcf files in core_temp_dir. - files_for_tabix = glob.glob("%s/*.vcf" % filter2_only_snp_vcf_dir) - tabix(files_for_tabix, "vcf", logger, Config) - - # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file. - tmp_dir = "/tmp/temp_%s/" % log_unique_time - - create_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_position_file, tmp_dir, Config) - - create_indel_job(filter2_only_snp_vcf_dir, jobrun, vcf_filenames, unique_indel_position_file, tmp_dir, Config) - - # If Phaster Summary file doesn't exist in reference genome folder - if not os.path.isfile("%s/summary.txt" % os.path.dirname(reference)): - if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": - keep_logging('Functional class filter is set to yes. Preparing Functional class filters\n', - 'Functional class filter is set to yes. Preparing Functional class filters\n', logger, - 'info') - if ConfigSectionMap("functional_filters", Config)['find_phage_region'] == "yes": - # Submit Phaster jobs to find ProPhage region in reference genome. - run_phaster(reference, filter2_only_snp_vcf_dir, logger, Config) - - call( - "cp %s %s/Logs/core_prep/" % (log_file_handle, os.path.dirname(os.path.dirname(filter2_only_snp_vcf_dir))), - logger) - - -"""core_prep methods - - This block contains methods that are respnsible for running the first part of core_All step of the pipeline. 
- This methods generates all the necessary intermediate files required for the second part of core_All step. - Example of intermediate files: various diagnostics files/matrices where it decides why a variant was filtered out. - -""" - -def create_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger): - - """ - This method gathers SNP positions from each final *_no_proximate_snp.vcf file (these are the positions that passed variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files. Use these *_no_proximate_snp.vcf_position files to generate a list of unique_position_file - :param: list of final vcf filenames i.e *.vcf_no_proximate_snp.vcf . These files are the final output of variant calling step for each sample. - :return: unique_position_file - """ - - filter2_only_snp_position_files_array = [] - for file in vcf_filenames: - with open(file, 'rU') as csv_file: - file_name = temp_dir + "/" + os.path.basename(file) + "_positions" - addpositionfilenametoarray = file_name - filter2_only_snp_position_files_array.append(addpositionfilenametoarray) - f1 = open(file_name, 'w+') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - p_string = row[1] + "\n" - f1.write(p_string) - f1.close() - csv_file.close() - - """ Get Positions Specific to Outgroup Sample name """ - if outgroup is not None: - outgroup_position_file_name = temp_dir + "/" + outgroup_vcf_filename + "_positions" - outgroup_position_array = [] - f1 = open(outgroup_position_file_name, 'r+') - for lines in f1: - lines = lines.strip() - outgroup_position_array.append(int(lines)) - f1.close() - - - position_array_excluding_outgroup = [] - for filess in filter2_only_snp_position_files_array: - if outgroup not in filess: - f = open(filess, 'r+') - for line in f: - line = line.strip() - position_array_excluding_outgroup.append(int(line)) - f.close() - 
position_array_unique_excluding_outgroup = set(position_array_excluding_outgroup) - position_array_sort_excluding_outgroup = sorted(position_array_unique_excluding_outgroup) - #print len(position_array_sort_excluding_outgroup) - outgroup_specific_positions = [] - f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'w+') - for i in outgroup_position_array: - if i not in position_array_sort_excluding_outgroup: - f_outgroup.write(str(i) + '\n') - outgroup_specific_positions.append(int(i)) - # outgroup_indel_specific_positions.append(int(i)) - f_outgroup.close() - print "No. of variant positions in outgroup: %s" % len(outgroup_position_array) - print "No. of variant positions specific to outgroup: %s" % len(outgroup_specific_positions) - - position_array = [] - for filess in filter2_only_snp_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - # Changed variable to suit sorting: 25-07-2018 - position_array.append(int(line)) - f.close() - # Check why python sorting is not working - keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - keep_logging('\nThe number of unique variant positions:%s' % len(position_array_sort), '\nThe number of unique variant positions:%s' % len(position_array_sort), logger, 'info') - unique_position_file = "%s/unique_positions_file" % filter2_only_snp_vcf_dir - f=open(unique_position_file, 'w+') - for i in position_array_sort: - # Changed variable to suit sorting: 25-07-2018 - f.write(str(i) + "\n") - f.close() - - if len(position_array_sort) == 0: - keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. 
Check if vcf files are empty?', logger, 'info') - exit() - - return unique_position_file - - else: - - """ Create position array containing unique positiones from positions file """ - - position_array = [] - for filess in filter2_only_snp_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - # Changed variable to suit sorting: 25-07-2018 - position_array.append(int(line)) - f.close() - # Check why python sorting is not working - keep_logging('Sorting unique variant positions.\n', 'Sorting unique variant positions.\n', logger, 'info') - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - keep_logging('\nThe number of unique variant positions:%s' % len(position_array_sort), '\nThe number of unique variant positions:%s' % len(position_array_sort), logger, 'info') - unique_position_file = "%s/unique_positions_file" % filter2_only_snp_vcf_dir - f=open(unique_position_file, 'w+') - for i in position_array_sort: - # Changed variable to suit sorting: 25-07-2018 - f.write(str(i) + "\n") - f.close() - - if len(position_array_sort) == 0: - keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. Check if vcf files are empty?', logger, 'info') - exit() - return unique_position_file - -def create_indel_positions_filestep(vcf_filenames, filter2_only_snp_vcf_dir, outgroup, logger): - - """ - This function gathers Indel positions from each final *_indel_final.vcf (these are the positions that passed variant filter parameters - from variant calling pipeline) and write to *_indel_final.vcf files. Use these *_indel_final.vcf_position files to generate a list of unique_position_file - :param: list of final vcf filenames i.e *_indel_final.vcf . These files are the final output of variant calling step for each sample. 
- :return: unique_indel_position_file - """ - - filter2_only_indel_position_files_array = [] - for file in vcf_filenames: - indel_file = file.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') - with open(indel_file, 'rU') as csv_file: - file_name = temp_dir + "/" + os.path.basename(indel_file) + "_positions" - addpositionfilenametoarray = file_name - filter2_only_indel_position_files_array.append(addpositionfilenametoarray) - f1 = open(file_name, 'w+') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - p_string = row[1] + "\n" - f1.write(p_string) - f1.close() - csv_file.close() - - """ Get Positions Specific to Outgroup Sample name """ - if outgroup is not None: - outgroup_position_indel_file_name = temp_dir + "/" + outgroup_indel_vcf_filename + "_positions" - print outgroup_position_indel_file_name - outgroup_position_indel_array = [] - f1 = open(outgroup_position_indel_file_name, 'r+') - for lines in f1: - lines = lines.strip() - outgroup_position_indel_array.append(int(lines)) - f1.close() - #print len(outgroup_position_indel_array) - - position_array_indel_excluding_outgroup = [] - for filess in filter2_only_indel_position_files_array: - if outgroup not in filess: - f = open(filess, 'r+') - for line in f: - line = line.strip() - position_array_indel_excluding_outgroup.append(int(line)) - f.close() - position_array_indel_unique_excluding_outgroup = set(position_array_indel_excluding_outgroup) - position_array_sort_indel_excluding_outgroup = sorted(position_array_indel_unique_excluding_outgroup) - outgroup_indel_specific_positions = [] - f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % filter2_only_snp_vcf_dir, 'w+') - for i in outgroup_position_indel_array: - if i not in position_array_sort_indel_excluding_outgroup: - f_outgroup.write(str(i) + '\n') - outgroup_indel_specific_positions.append(int(i)) - f_outgroup.close() - print "No. 
of indel variant positions in outgroup: %s" % len(outgroup_position_indel_array) - print "No. of indel variant positions specific to outgroup: %s" % len(outgroup_indel_specific_positions) - - position_array = [] - for filess in filter2_only_indel_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - # Changed variable to suit sorting: 25-07-2018 - position_array.append(int(line)) - f.close() - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - keep_logging('\nThe number of unique indel positions:%s' % len(position_array_sort), '\nThe number of unique indel positions:%s' % len(position_array_sort), logger, 'info') - unique_indel_position_file = "%s/unique_indel_positions_file" % filter2_only_snp_vcf_dir - f=open(unique_indel_position_file, 'w+') - for i in position_array_sort: - # Changed variable to suit sorting: 25-07-2018 - f.write(str(i) + "\n") - f.close() - if len(position_array_sort) == 0: - keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. 
Check if vcf files are empty?', logger, 'info') - exit() - - return unique_indel_position_file - - - else: - - """ Create position array containing unique positiones from positions file """ - position_array = [] - for filess in filter2_only_indel_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - # Changed variable to suit sorting: 25-07-2018 - position_array.append(int(line)) - f.close() - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - keep_logging('\nThe number of unique indel positions:%s' % len(position_array_sort), '\nThe number of unique indel positions:%s' % len(position_array_sort), logger, 'info') - unique_indel_position_file = "%s/unique_indel_positions_file" % filter2_only_snp_vcf_dir - f=open(unique_indel_position_file, 'w+') - for i in position_array_sort: - # Changed variable to suit sorting: 25-07-2018 - f.write(str(i) + "\n") - f.close() - if len(position_array_sort) == 0: - keep_logging('ERROR: No unique positions found. Check if vcf files are empty?', 'ERROR: No unique positions found. Check if vcf files are empty?', logger, 'info') - exit() - return unique_indel_position_file \ No newline at end of file diff --git a/modules/variant_diagnostics/core_pipeline_core_prep_main.py b/modules/variant_diagnostics/core_pipeline_core_prep_main.py deleted file mode 100644 index 1a54436..0000000 --- a/modules/variant_diagnostics/core_pipeline_core_prep_main.py +++ /dev/null @@ -1,2439 +0,0 @@ -# System wide imports -from __future__ import division -import sys -import argparse -import re -import os -import csv -import subprocess -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -""" Hacky way to append. 
Instead Add this path to PYTHONPATH Variable """ -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -import errno -from datetime import datetime -import threading -import json -import ConfigParser -from config_settings import ConfigSectionMap -from modules.logging_subprocess import * -from modules.log_modules import * -from modules.tabix import * -from Bio import SeqIO -from modules.core_prep_sanity_checks import * -from PBS_generate_jobs import * - - -"""core methods - - This block contains methods that are respnsible for running the second part of core_All step of the pipeline. - It uses intermediate files generated during the first step, finds core SNPs and annotates variants using snpEff. - It will generate all types of SNP matrices that is required for downstream pathways / Association analysis. - Output: - - - -""" - - -def generate_paste_command(): - """ - This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
- :param: null - :return: null - """ - - """ Paste/Generate and sort SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files.sh" - f4 = open(paste_file, 'w+') - paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - paste_command = paste_command + " " + label_file - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % ( - args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - call("%s" % header_awk_cmd, logger) - call("%s" % sed_header, logger) - call("%s" % sed_header_2, logger) - - temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_command) - outfile.close() - - with open('%s/temp_label_final_raw.txt.sh' % 
args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(temp_paste_command) - outfile.close() - - call("bash %s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) - call("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) - call("%s" % sort_All_label_cmd, logger) - call("%s" % paste_command_header, logger) - - """ Assign numeric code to each variant filter reason""" - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - 
subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir - call("%s" % remove_unwanted_text, logger) - -def generate_paste_command_outgroup(): - """ - This Function will take all the *label file and generate/paste it column wise 
to generate a matrix. These matrix will be used in downstream analysis. - :param: null - :return: null - """ - - if args.outgroup: - """ Paste/Generate and sort SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files_outgroup.sh" - f4 = open(paste_file, 'w+') - paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - paste_command = paste_command + " " + label_file - - """Exclude outgroup sample name in header - - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - """ - - header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % ( - outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir - - call("%s" % header_awk_cmd, logger) - call("%s" % sed_header, logger) - call("%s" % sed_header_2, logger) - - temp_paste_command = paste_command + " > %s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw_outgroup > %s/All_label_final_sorted_outgroup.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header_outgroup.txt 
%s/All_label_final_sorted_outgroup.txt > %s/All_label_final_sorted_header_outgroup.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_command) - outfile.close() - - with open('%s/temp_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(temp_paste_command) - outfile.close() - call("bash %s/All_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) - call("bash %s/temp_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) - - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - - call("%s" % sort_All_label_cmd, logger) - call("%s" % paste_command_header, logger) - - """ Assign numeric code to each variant filter reason""" - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % 
args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header_outgroup.txt" 
% args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir - call("%s" % remove_unwanted_text, logger) - - else: - print "Skip generating seperate intermediate files for outgroup" - - -def generate_indel_paste_command(): - """ - This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
- :param: null - :return: null - """ - - """ Paste/Generate and sort SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files.sh" - f4 = open(paste_file, 'w+') - paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_indel_final.vcf_indel_positions_label') - paste_command = paste_command + " " + label_file - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % ( - args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - # os.system(header_awk_cmd) - # os.system(sed_header) - # os.system(sed_header_2) - - call("%s" % header_awk_cmd, logger) - call("%s" % sed_header, logger) - call("%s" % sed_header_2, logger) - - temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_indel_label_final_raw" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - - call("bash %s" % paste_file, logger) - - sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw > %s/All_indel_label_final_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header.txt %s/All_indel_label_final_sorted.txt > %s/All_indel_label_final_sorted_header.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_indel_final.vcf_indel_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) - - with 
open('%s/All_indel_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: - outfile2.write(paste_command) - outfile2.close() - - with open('%s/temp_indel_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: - outfile2.write(temp_paste_command) - outfile2.close() - - # Why is this not working? - call("bash %s/All_indel_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) - call("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) - keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') - - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - - call("%s" % sort_All_label_cmd, logger) - call("%s" % paste_command_header, logger) - - """ Assign numeric code to each variant filter reason""" - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - 
subprocess.call([ - "sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_proximate_SNP/7/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighAF_QUAL/3/g' 
%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call( - ["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir - call("%s" % remove_unwanted_text, logger) - - -def generate_indel_paste_command_outgroup(): - """ - This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. - :param: null - :return: null - """ - - if args.outgroup: - """ Paste/Generate and sort SNP Filter Label Matrix """ - # define a file name where the paste commands will be saved. 
- paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files_outgroup.sh" - f4 = open(paste_file, 'w+') - - # initiate paste command string - paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir - - # Generate paste command - for i in vcf_filenames: - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_indel_final.vcf_indel_positions_label') - paste_command = paste_command + " " + label_file - # Change header awk command to exclude outgroup - # header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % ( - outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir - - call("%s" % header_awk_cmd, logger) - call("%s" % sed_header, logger) - call("%s" % sed_header_2, logger) - - temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_indel_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - - call("bash %s" % paste_file, logger) - - sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw_outgroup > %s/All_indel_label_final_sorted_outgroup.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - paste_command_header = "cat %s/header_outgroup.txt %s/All_indel_label_final_sorted_outgroup.txt > %s/All_indel_label_final_sorted_header_outgroup.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - ls = [] - for i in 
vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_indel_final.vcf_indel_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_indel_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: - outfile2.write(paste_command) - outfile2.close() - - with open('%s/temp_indel_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: - outfile2.write(temp_paste_command) - outfile2.close() - - # Why is this not working? - call("bash %s/All_indel_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) - call("bash %s/temp_indel_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) - keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') - - """ - remove this lines - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - #os.system(paste_command) change - #os.system(temp_paste_command) change - """ - - call("%s" % sort_All_label_cmd, logger) - call("%s" % paste_command_header, logger) - - """ Assign numeric code to each variant filter reason""" - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - 
subprocess.call([ - "sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_proximate_SNP/7/g' 
%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir - call("%s" % remove_unwanted_text, logger) - else: - print "Skip generating seperate intermediate files for outgroup" - - -def generate_position_label_data_matrix(): - """ - Generate different list of Positions using the matrix All_label_final_sorted_header.txt. - - (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). - - Filtered Position label matrix: - List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. - - Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. 
- - :param: null - :return: null - - """ - - def generate_position_label_data_matrix_All_label(): - position_label = OrderedDict() - f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4 = open( - "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, - 'w+') - if args.outgroup: - with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix... \n', - 'Generating different list of Positions and heatmap data matrix... 
\n', logger, 'info') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - if int(value) not in outgroup_specific_positions: - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - if int(value) not in outgroup_specific_positions: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - else: - with open("%s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix... \n', - 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - def temp_generate_position_label_data_matrix_All_label(): - - """ - Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. - """ - temp_position_label = OrderedDict() - f33 = open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - - if args.outgroup: - for i in vcf_filenames: - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: - print_string_header = print_string_header + os.path.basename(i) + "\t" - else: - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - - f33.write('\t' + print_string_header.strip() + '\n') - keep_logging( - 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', - 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', - 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', - 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', - 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] - - if args.outgroup: - print "here" - with 
open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - if int(row[0]) not in outgroup_specific_positions: - - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - - else: - with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - """ - temp_position_label_FQ = OrderedDict() - f44 = open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_FQ[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + 
print_string_header.strip() + '\n') - for value in temp_position_label_FQ: - lll = ['LowFQ'] - if set(lll) & set(temp_position_label_FQ[value]): - - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - f44.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - f44 = open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - lll = ['HighFQ_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - - """ - Perform Sed on temp files. Find a faster way to do this. 
- """ - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - 
shell=True) - subprocess.call([ - "sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - def barplot_stats(): - keep_logging( - 
'\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', - '\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', - logger, 'info') - """ - Read each Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - c_reader = csv.reader( - open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), - delimiter='\t') - columns = list(zip(*c_reader)) - keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') - counts = 1 - - if args.outgroup: - end = len(vcf_filenames) + 1 - end = end - 1 - else: - end = len(vcf_filenames) + 1 - - f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write( - "Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write( - "Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") - - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - true_variant = columns[i].count('VARIANT') - unmapped_positions = columns[i].count('reference_unmapped_position') - reference_allele = columns[i].count('reference_allele') - Only_low_FQ = columns[i].count('LowFQ') - Only_DP = columns[i].count('HighFQ_DP') - Only_low_MQ = columns[i].count('HighFQ') - low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count( - 'LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count( - 'LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count( - 
'LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[ - i].count('LowFQ_DP') - high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count( - 'HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count( - 'HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count( - 'HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') - other = low_FQ_other_parameters + high_FQ_other_parameters - - total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ - - filename_count = i - 1 - - if args.outgroup: - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions, reference_allele, true_variant, - Only_low_FQ, Only_DP, Only_low_MQ, other) - f_bar_count.write(bar_string) - else: - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions, reference_allele, true_variant, - Only_low_FQ, Only_DP, Only_low_MQ, other) - # f_bar_count.write(bar_string) - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) - except ZeroDivisionError: 
- Only_low_FQ_perc = 0 - try: - Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[ - i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[ - i].count('LowFQ_DP_proximate_SNP') + columns[i].count( - 'LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + - columns[i].count('LowFQ_QUAL') + columns[i].count( - 'LowFQ_DP')) * 100) / total) - except ZeroDivisionError: - low_FQ_other_parameters_perc = 0 - try: - high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[ - i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[ - i].count('HighFQ_DP_proximate_SNP') + columns[i].count( - 'HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + - columns[i].count('HighFQ_QUAL')) * 100) / total) - except ZeroDivisionError: - high_FQ_other_parameters_perc = 0 - - other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) - if args.outgroup: - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions_perc, true_variant_perc, - Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, - other_perc) - else: - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions_perc, reference_allele_perc, - true_variant_perc, - Only_low_FQ_perc, Only_DP_perc, - Only_low_MQ_perc, other_perc) - 
f_bar_count.write(bar_string) - f_bar_perc.write(bar_perc_string) - f_bar_count.close() - f_bar_perc.close() - bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ( - args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) - barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') - barplot_R_file.write(bargraph_R_script) - keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, - 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, - logger, 'info') - - """ Methods Steps""" - keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') - generate_position_label_data_matrix_All_label() - keep_logging('Running: Changing variables in data matrices to codes for faster processing...', - 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') - 
temp_generate_position_label_data_matrix_All_label() - keep_logging('Running: Generating Barplot statistics data matrices...', - 'Running: Generating Barplot statistics data matrices...', logger, 'info') - barplot_stats() - - -def generate_indel_position_label_data_matrix(): - """ - Generate different list of Positions using the matrix All_label_final_sorted_header.txt. - - (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). - - Filtered Position label matrix: - List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. - - Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. - - :param: null - :return: null - - """ - - def generate_indel_position_label_data_matrix_All_label(): - position_label = OrderedDict() - print "Generating Only_ref_indel_positions_for_closely" - f1 = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2 = open("%s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3 = open("%s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4 = open( - "%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, - 'w+') - - if args.outgroup: - with open("%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = 
csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix...', - 'Generating different list of Positions and heatmap data matrix...', logger, 'info') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - # f.write('\t' + print_string_header.strip() + '\n') - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - if int(value) not in outgroup_indel_specific_positions: - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - if int(value) not in outgroup_indel_specific_positions: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - else: - with open("%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - keep_logging('Generating different list of Positions and heatmap data matrix...', - 'Generating different list of Positions and heatmap data matrix...', logger, 'info') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - # f.write('\t' + print_string_header.strip() + '\n') - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - csv_file.close() - f1.close() - f2.close() - f3.close() - f4.close() - subprocess.call([ - "sed -i 
's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - def temp_generate_indel_position_label_data_matrix_All_label(): - - """ - Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
- """ - temp_position_label = OrderedDict() - f33 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - if args.outgroup: - for i in vcf_filenames: - - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: - print_string_header = print_string_header + os.path.basename(i) + "\t" - else: - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - - f33.write('\t' + print_string_header.strip() + '\n') - keep_logging( - 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, - 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - lll = ['reference_unmapped_position', 'LowAF', 'LowAF_DP', 'LowAF_QUAL', 'LowAF_DP_QUAL', 'LowAF_QUAL_DP', - 'HighAF_DP', 'HighAF_QUAL', 'HighAF_DP_QUAL', 'HighAF_QUAL_DP', 'HighAF', 'LowAF_proximate_SNP', - 'LowAF_DP_proximate_SNP', 'LowAF_QUAL_proximate_SNP', 'LowAF_DP_QUAL_proximate_SNP', - 'LowAF_QUAL_DP_proximate_SNP', 'HighAF_DP_proximate_SNP', 'HighAF_QUAL_proximate_SNP', - 'HighAF_DP_QUAL_proximate_SNP', 'HighAF_QUAL_DP_proximate_SNP', 'HighAF_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] - - if args.outgroup: - with open("%s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - 
next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - if int(row[0]) not in outgroup_indel_specific_positions: - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - else: - with open("%s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - csv_file.close() - f33.close() - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of AF - """ - temp_position_label_AF = OrderedDict() - f44 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir, - 'w+') - with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, - 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_AF[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_AF: - lll = ['LowAF'] - if set(lll) & 
set(temp_position_label_AF[value]): - - print_string = "" - for i in temp_position_label_AF[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - f44.close() - - """ - Perform Sed on temp files. Find a faster way to do this. - """ - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ 
- "sed -i 's/HighAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - f44 = open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, - 'w+') - with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, - 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - lll = ['HighAF_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_AF[value]): - print_string = "" - for i in temp_position_label_AF[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - f44.close() - csv_file.close() - - """ - Perform Sed on temp files. Find a faster way to do this. 
- """ - subprocess.call([ - "sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_QUAL/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/HighAF_DP/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 's/LowAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - subprocess.call([ - "sed -i 
's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], - shell=True) - - def barplot_indel_stats(): - keep_logging( - 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', - 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', - logger, 'info') - """ - Read each Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - c_reader = csv.reader( - open('%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, - 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - print len(columns) - keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') - counts = 1 - - if args.outgroup: - end = len(vcf_filenames) + 1 - end = end - 1 - else: - end = len(vcf_filenames) + 1 - print end - - f_bar_count = open("%s/bargraph_indel_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_indel_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write( - "Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_AF\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write( - "Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_AF_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - print i - true_variant = columns[i].count('VARIANT') - unmapped_positions = columns[i].count('reference_unmapped_position') - reference_allele = columns[i].count('reference_allele') - Only_low_AF = columns[i].count('LowAF') - Only_DP = columns[i].count('HighAF_DP') - Only_low_MQ = columns[i].count('HighAF') - low_AF_other_parameters = 
columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count( - 'LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count( - 'LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count( - 'LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[ - i].count('LowAF_DP') - high_AF_other_parameters = columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count( - 'HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count( - 'HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count( - 'HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL') - other = low_AF_other_parameters + high_AF_other_parameters - total = true_variant + unmapped_positions + reference_allele + Only_low_AF + Only_DP + low_AF_other_parameters + high_AF_other_parameters + Only_low_MQ - filename_count = i - 1 - # bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_AF, Only_DP, Only_low_MQ, other) - if args.outgroup: - ### - - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions, reference_allele, true_variant, - Only_low_AF, Only_DP, Only_low_MQ, other) - else: - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( - vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions, reference_allele, true_variant, - Only_low_AF, Only_DP, Only_low_MQ, other) - - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = 
float((columns[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_AF_perc = float((columns[i].count('LowAF') * 100) / total) - except ZeroDivisionError: - Only_low_AF_perc = 0 - try: - Only_DP_perc = float((columns[i].count('HighAF_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns[i].count('HighAF') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_AF_other_parameters_perc = float(((columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[ - i].count('LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[ - i].count('LowAF_DP_proximate_SNP') + columns[i].count( - 'LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + - columns[i].count('LowAF_QUAL') + columns[i].count( - 'LowAF_DP')) * 100) / total) - except ZeroDivisionError: - low_AF_other_parameters_perc = 0 - try: - high_AF_other_parameters_perc = float(((columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[ - i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[ - i].count('HighAF_DP_proximate_SNP') + columns[i].count( - 'HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + - columns[i].count('HighAF_QUAL')) * 100) / total) - except ZeroDivisionError: - high_AF_other_parameters_perc = 0 - - other_perc = float(low_AF_other_parameters_perc + high_AF_other_parameters_perc) - if args.outgroup: - ### - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( - os.path.basename( - 
vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, - other_perc) - f_bar_perc.write(bar_perc_string) - else: - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( - os.path.basename( - vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, - other_perc) - f_bar_perc.write(bar_perc_string) - - f_bar_count.close() - f_bar_perc.close() - bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_indel_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot_indel.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % ( - args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) - barplot_R_file = open("%s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir, 'w+') - barplot_R_file.write(bargraph_R_script) - keep_logging('Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, - 
'Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, - logger, 'info') - - """ Methods Steps""" - keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') - # if args.outgroup: - # f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - # global outgroup_indel_specific_positions - # outgroup_indel_specific_positions = [] - # for i in f_outgroup: - # outgroup_indel_specific_positions.append(i) - # f_outgroup.close() - # - # f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - # global outgroup_specific_positions - # outgroup_specific_positions = [] - # for i in f_outgroup: - # outgroup_specific_positions.append(i) - # f_outgroup.close() - # else: - # global outgroup_specific_positions - # global outgroup_indel_specific_positions - # outgroup_indel_specific_positions = [] - # outgroup_specific_positions = [] - generate_indel_position_label_data_matrix_All_label() - keep_logging('Running: Changing variables in data matrices to codes for faster processing...', - 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') - temp_generate_indel_position_label_data_matrix_All_label() - keep_logging('Running: Generating Barplot statistics data matrices...', - 'Running: Generating Barplot statistics data matrices...', logger, 'info') - barplot_indel_stats() - - -def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter): - """ Generate jobs/scripts that creates core consensus fasta file. - - This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. - Input for Fasttree, Beast and pairwise variant analysis. 
- - :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. - :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. - :return: - :raises: - """ - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) - job_file_name = "%s_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') - # os.system("qsub %s" % i) - call("qsub %s" % i, logger) - - - elif jobrun == "parallel-local" or jobrun == "cluster": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system 
- """ - command_array = [] - command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) - job_file_name = "%s_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - # elif jobrun == "cluster": - # command_array = [] - # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - # f3 = open(command_file, 'w+') - # for i in vcf_filenames: - # job_name = os.path.basename(i) - # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - # job_file_name = "%s_fasta.pbs" % (i) - # f1=open(job_file_name, 'w+') - # f1.write(job_print_string) - # f1.close() - # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - # pbs_scripts = glob.glob(pbs_dir) - # for i in pbs_scripts: - # f3.write("bash %s\n" % i) - # f3.close() - # with open(command_file, 'r') as fpp: - # for lines in fpp: - # lines = lines.strip() - # command_array.append(lines) - # fpp.close() - # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], 
ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) - job_file_name = "%s_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - # os.system("bash command_file") - call("bash %s" % command_file, logger) - - -def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file): - """ Generate jobs/scripts that creates core consensus fasta file. - - This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. - Input for Fasttree, Beast and pairwise variant analysis. - - :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. - :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. - :return: - :raises: - """ - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. 
- """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) - job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') - # os.system("qsub %s" % i) - call("qsub %s" % i, logger) - - - elif jobrun == "parallel-local" or jobrun == "cluster": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) - job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - # elif jobrun == "cluster": - # command_array = [] - # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir - # f3 = open(command_file, 'w+') - # for i in vcf_filenames: - # job_name = os.path.basename(i) - # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], 
ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) - # job_file_name = "%s_fasta.pbs" % (i) - # f1=open(job_file_name, 'w+') - # f1.write(job_print_string) - # f1.close() - # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - # pbs_scripts = glob.glob(pbs_dir) - # for i in pbs_scripts: - # f3.write("bash %s\n" % i) - # f3.close() - # with open(command_file, 'r') as fpp: - # for lines in fpp: - # lines = lines.strip() - # command_array.append(lines) - # fpp.close() - # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % ( - job_name, ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], - ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], - args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) - job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % 
(args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - # os.system("bash command_file") - call("bash %s" % command_file, logger) - - -def create_job_DP(jobrun, vcf_filenames): - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. - :param vcf_filenames: - :return: - """ - - if jobrun == "parallel-cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( - job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') - # os.system("qsub %s" % i) - call("qsub %s" % i, logger) - - - elif jobrun == "parallel-local" 
or jobrun == "cluster": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( - job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - # os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - print len(command_array) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - # elif jobrun == "cluster": - # """ Test pending """ - # command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - # f3 = open(command_file, 'w+') - # for i in vcf_filenames: - # job_name = os.path.basename(i) - # job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd 
%s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - # job_file_name = "%s_DP.pbs" % (i) - # f1=open(job_file_name, 'w+') - # f1.write(job_print_string) - # f1.close() - # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - # pbs_scripts = glob.glob(pbs_dir) - # for i in pbs_scripts: - # f3.write("bash %s\n" % i) - # f3.close() - # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) - - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % ( - job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1 = open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) - call("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir, logger) - - -def generate_vcf_files(): - if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": - keep_logging( - 
'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, - 'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, - logger, - 'info') - # phage_positions = [] - # phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir - # with open(phage_region_positions, 'rU') as fp: - # for line in fp: - # phage_positions.append(line.strip()) - # fp.close() - - functional_filter_pos_array = [] - with open(functional_class_filter_positions, 'rU') as f_functional: - for line_func in f_functional: - functional_filter_pos_array.append(line_func.strip()) - - ref_variant_position_array = [] - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - if line not in functional_filter_pos_array: - ref_variant_position_array.append(line) - ffp.close() - - # Adding core indel support: 2018-07-24 - ref_indel_variant_position_array = [] - ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - if line not in functional_filter_pos_array: - ref_indel_variant_position_array.append(line) - ffp.close() - - else: - functional_filter_pos_array = [] - ref_variant_position_array = [] - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - ref_variant_position_array.append(line) - ffp.close() - - # Adding core indel support: 2018-07-24 - ref_indel_variant_position_array = [] - ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - if line not in functional_filter_pos_array: - ref_indel_variant_position_array.append(line) - ffp.close() - - print "No. of core SNPs: %s" % len(ref_variant_position_array) - print "No. 
of core INDELs: %s" % len(ref_indel_variant_position_array) - - f_file = open( - "%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, - 'w+') - for pos in ref_variant_position_array: - f_file.write(pos + '\n') - f_file.close() - - # Adding core indel support: 2018-07-24 - f_file = open( - "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, - 'w+') - for pos in ref_indel_variant_position_array: - f_file.write(pos + '\n') - f_file.close() - - base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)[ - 'vcftools_bin'] - filter2_files_array = [] - for i in vcf_filenames: - filter2_file = i.replace('_no_proximate_snp.vcf', '') - filter2_files_array.append(filter2_file) - - filtered_out_vcf_files = [] - for i in filter2_files_array: - print_array = [] - with open(i) as file_open: - for line in file_open: - line = line.strip() - if line.startswith("#"): - print_array.append(line) - else: - split_array = re.split(r'\t+', line) - if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: - print_array.append(line) - file_open.close() - file_name = i + "_core.vcf" - keep_logging('Generating %s' % file_name, 'Generating %s' % file_name, logger, 'info') - filtered_out_vcf_files.append(file_name) - f1 = open(file_name, 'w+') - for ios in print_array: - print_string = str(ios) + "\n" - f1.write(print_string) - f1.close() - - filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir - keep_logging('Generating Consensus...', 'Generating Consensus...', logger, 'info') - for file in filtered_out_vcf_files: - f1 = open(filename, 'a+') - bgzip_cmd = "%s/%s/bgzip -f %s\n" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) - f1.write(bgzip_cmd) - subprocess.call([bgzip_cmd], shell=True) - tabix_cmd = "%s/%s/tabix -f 
-p vcf %s.gz\n" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) - f1.write(tabix_cmd) - subprocess.call([tabix_cmd], shell=True) - fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s.fa\n" % ( - args.reference, base_vcftools_bin, file, file.replace('_filter2_final.vcf_core.vcf', '')) - f1.write(fasta_cmd) - subprocess.call([fasta_cmd], shell=True) - base = os.path.basename(file) - header = base.replace('_filter2_final.vcf_core.vcf', '') - sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) - subprocess.call([sed_command], shell=True) - f1.write(sed_command) - keep_logging('The consensus commands are in : %s' % filename, 'The consensus commands are in : %s' % filename, - logger, 'info') - sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % ( - args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("bioawk", Config)['bioawk_bin']) - # os.system(sequence_lgth_cmd) - call("%s" % sequence_lgth_cmd, logger) - - -def gatk_filter2(final_raw_vcf, out_path, analysis, reference): - gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" - gatk_filter2_command = "java -jar %s/%s/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("gatk", Config)['gatk_bin'], reference, out_path, - analysis, final_raw_vcf, gatk_filter2_parameter_expression) - keep_logging('Running Command: [%s]' % gatk_filter2_command, 'Running Command: [%s]' % gatk_filter2_command, logger, - 'info') - # os.system(gatk_filter2_command) - call("%s" % gatk_filter2_command, logger) - filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % ( - out_path, analysis, out_path, analysis) - 
call("%s" % filter_flag_command, logger) - gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) - return gatk_filter2_final_vcf - - -def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): - all_position = [] - remove_proximate_position_array = [] - gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: - for line in csv_file: - if not line.startswith('#'): - line_array = line.split('\t') - all_position.append(line_array[1]) - for position in all_position: - position_index = all_position.index(position) - next_position_index = position_index + 1 - - if next_position_index < len(all_position): - diff = int(all_position[next_position_index]) - int(position) - if diff < 10: - # print position + " " + all_position[next_position_index] - if position not in remove_proximate_position_array and all_position[ - next_position_index] not in remove_proximate_position_array: - remove_proximate_position_array.append(int(position)) - remove_proximate_position_array.append(int(all_position[next_position_index])) - f1 = open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: - for line in csv_file2: - if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
- line_array = line.split('\t') - if int(line_array[1]) not in remove_proximate_position_array: - print_string = line - f1.write(print_string) - else: - print_string = line - f1.write(print_string) - gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" - f2 = open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') - for i in remove_proximate_position_array: - position_print_string = str(i) + "\n" - f2.write(position_print_string) - return gatk_filter2_final_vcf_file_no_proximate_snp - - -def FQ_analysis(): - for i in vcf_filenames: - filename_base = os.path.basename(i) - aln_mpileup_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') - analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') - # print aln_mpileup_vcf_file - grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file - proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) - (out, err) = proc.communicate() - out = out.strip() - reference_file = out.split(':') - # Change it to multiprocessing - gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) - # print gatk_filter2_final_vcf_file - gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, - analysis, reference_file[1]) - grep_fq_field = "awk -F\'\\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s/%s_FQ_values" % ( - gatk_filter2_final_vcf_file_no_proximate_snp, os.path.dirname(i), analysis) - # os.system(grep_fq_field) - call("%s" % grep_fq_field, logger) - # print grep_fq_field - - -def DP_analysis(): - create_job_DP(args.jobrun, vcf_filenames) - paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = 
i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') - paste_command = paste_command + " " + label_file - - paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" - f2 = open(paste_file, 'w+') - paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir - # os.system(paste_command) - f2.write(paste_command + '\n') - cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - # os.system(cat_header) - f2.write(cat_header + '\n') - sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % ( - args.filter2_only_snp_vcf_dir) - # os.system(sed_command) - f2.write(sed_command + '\n') - cmd = "bash %s" % paste_file - # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - - -def DP_analysis_barplot(): - # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - call("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir, logger) - keep_logging('Generating DP barplots data...', 'Generating DP barplots data...', logger, 'info') - c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - reference_position = columns[i].count('NA') - oneto5 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if 
int(k) < 5: - oneto5 += 1 - sixto10 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 5 and int(k) <= 10: - sixto10 += 1 - elevento14 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 11 and int(k) <= 14: - elevento14 += 1 - fifteenorabove = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 15: - fifteenorabove += 1 - total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove - filename_count = i - 1 - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( - os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - reference_position, oneto5, sixto10, elevento14, fifteenorabove) - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - reference_position_perc = float(reference_position * 100 / total) - except ZeroDivisionError: - reference_position_perc = 0 - try: - oneto5_perc = float(oneto5 * 100 / total) - except ZeroDivisionError: - oneto5_perc = 0 - try: - sixto10_perc = float(sixto10 * 100 / total) - except ZeroDivisionError: - sixto10_perc = 0 - try: - elevento14_perc = float(elevento14 * 100 / total) - except ZeroDivisionError: - elevento14_perc = 0 - try: - fifteenorabove_perc = float(fifteenorabove * 100 / total) - except ZeroDivisionError: - fifteenorabove_perc = 0 - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % ( - os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), - reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc) - f_bar_perc.write(bar_perc_string) diff --git a/modules/variant_diagnostics/core_pipeline_modular.py b/modules/variant_diagnostics/core_pipeline_modular.py deleted file mode 100755 index 59fc42b..0000000 --- a/modules/variant_diagnostics/core_pipeline_modular.py +++ /dev/null @@ -1,4658 +0,0 @@ -# System wide imports -from __future__ 
import division -import sys -import argparse -import re -import os -import csv -import subprocess -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -""" Hacky way to append. Instead Add this path to PYTHONPATH Variable """ -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -#import pandas as pd -import errno -from pyfasta import Fasta -from datetime import datetime -import threading -import json -from cyvcf2 import VCF -import ConfigParser -from config_settings import ConfigSectionMap -from logging_subprocess import * -from log_modules import * -from tabix import * -from Bio import SeqIO -from phage_detection import * -from find_repeats import * -from mask_regions import * -from fasttree import fasttree -from gubbins import * -from raxml import raxml -from pyfasta import Fasta -from core_prep_sanity_checks import * -from iqtree import iqtree -from core_pipeline_core_prep_label import core_prep_label - -# Parse Command line Arguments -parser = argparse.ArgumentParser(description='Parsing filtered VCF files and investigating Variants to determine the reason why it was filtered out from the final list') -required = parser.add_argument_group('Required arguments') -optional = parser.add_argument_group('Optional arguments') -required.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') -required.add_argument('-filter2_only_snp_vcf_filenames', action='store', dest="filter2_only_snp_vcf_filenames", - help='Names of filter2 only SNP vcf files with name per line.') -optional.add_argument('-jobrun', action='store', dest="jobrun", - help='Running a job on Cluster, Running Parallel jobs, Run jobs/commands locally (default): cluster, local, parallel-local, parallel-single-cluster') 
-optional.add_argument('-cluster_type', action='store', dest="cluster_type", - help='Type of Cluster: torque, pbs, sgd') -optional.add_argument('-cluster_resources', action='store', dest="cluster_resources", - help='Cluster Resources to use. for example nodes,core. Ex: 1,4') -optional.add_argument('-numcores', action='store', dest="numcores", - help='Number of cores to use on local system for parallel-local parameter') -optional.add_argument('-remove_temp', action='store', dest="remove_temp", - help='Remove Temporary files generated during the run') -optional.add_argument('-gubbins', action='store', dest="gubbins", help='yes/no for running gubbins') -optional.add_argument('-outgroup', action='store', dest="outgroup", help='outgroup sample name') -required.add_argument('-reference', action='store', dest="reference", - help='Path to Reference Fasta file for consensus generation') -required.add_argument('-steps', action='store', dest="steps", - help='Analysis Steps to be performed. This should be in sequential order.' - 'Step 1: Run pbs jobs and process all pipeline generated vcf files to generate label files' - 'Step 2: Analyze label files and generate matrix' - 'Step 3: DP/FQ Analysis') -required.add_argument('-results_dir', action='store', dest="results_dir", - help='Path to Core results directory') -required.add_argument('-config', action='store', dest="config", - help='Path to config file') -# optional.add_argument('-db', action='store', dest="snpeff_db", -# help='snpEff prebuilt reference database to use for variant annotations. The database will be downloaded in /data/ folder under snpEff install directory. 
Make sure if you are providing the name of pre-built snpEff reference database then the build option of snpeff section in config section is set to \"no\"') -optional.add_argument('-debug_mode', action='store', dest="debug_mode", - help='yes/no for debug mode') -args = parser.parse_args() - -""" Generic Methods """ -def make_sure_path_exists(out_path): - """This function checks if the args out_path exists and generates an empty directory if it doesn't. - - :param: - out_path: Directory path to check or create a new directory. - - :return: null/exception - - """ - - try: - os.makedirs(out_path) - except OSError as exception: - if exception.errno != errno.EEXIST: - keep_logging('\nErrors in output folder path! please change the output path or analysis name\n', - '\nErrors in output folder path! please change the output path or analysis name\n', logger, - 'info') - exit() - -def run_command(i): - """Function to run each command and is run as a part of python Parallel mutiprocessing method. - - :param: - i: command variable to run - - :return: - done: string variable with completion status of command. - """ - - call("%s" % i, logger) - # A subprocess exception is raised if the command finish abnormally. - # An exception is raised in call method. - # If none of the exceptions are raised, return done status. - done = "Completed: %s" % i - return done - -# """core methods -# -# This block contains methods that are respnsible for running the second part of core_All step of the pipeline. -# It uses intermediate files generated during the first step, finds core SNPs and annotates variants using snpEff. -# It will generate all types of SNP matrices that is required for downstream pathways / Association analysis. -# Output: -# - -# -# """ -# -# def generate_paste_command(): -# """ -# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
-# :param: null -# :return: null -# """ -# -# """ Paste/Generate and sort SNP Filter Label Matrix """ -# paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files.sh" -# f4=open(paste_file, 'w+') -# paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') -# paste_command = paste_command + " " + label_file -# header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# -# call("%s" % header_awk_cmd, logger) -# call("%s" % sed_header, logger) -# call("%s" % sed_header_2, logger) -# -# temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir -# paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir -# f4.write(paste_command) -# f4.close() -# sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# -# ls = [] -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') -# ls.append(label_file) -# ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) -# -# with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: -# outfile.write(paste_command) -# outfile.close() -# -# with open('%s/temp_label_final_raw.txt.sh' % 
args.filter2_only_snp_vcf_dir, 'w') as outfile: -# outfile.write(temp_paste_command) -# outfile.close() -# -# call("bash %s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) -# call("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) -# call("%s" % sort_All_label_cmd, logger) -# call("%s" % paste_command_header, logger) -# -# """ Assign numeric code to each variant filter reason""" -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 
's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir -# call("%s" % remove_unwanted_text, logger) -# -# def generate_paste_command_outgroup(): -# """ -# This Function will take all the *label file and generate/paste it column wise to generate a matrix. 
These matrix will be used in downstream analysis. -# :param: null -# :return: null -# """ -# -# if args.outgroup: -# """ Paste/Generate and sort SNP Filter Label Matrix """ -# paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files_outgroup.sh" -# f4=open(paste_file, 'w+') -# paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir -# for i in vcf_filenames: -# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') -# paste_command = paste_command + " " + label_file -# -# -# """Exclude outgroup sample name in header -# -# header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# -# """ -# -# header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % (outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# -# call("%s" % header_awk_cmd, logger) -# call("%s" % sed_header, logger) -# call("%s" % sed_header_2, logger) -# -# temp_paste_command = paste_command + " > %s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir -# paste_command = paste_command + " > %s/All_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir -# f4.write(paste_command) -# f4.close() -# sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw_outgroup > %s/All_label_final_sorted_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# paste_command_header = "cat %s/header_outgroup.txt 
%s/All_label_final_sorted_outgroup.txt > %s/All_label_final_sorted_header_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# -# ls = [] -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') -# ls.append(label_file) -# ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) -# -# with open('%s/All_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: -# outfile.write(paste_command) -# outfile.close() -# -# with open('%s/temp_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: -# outfile.write(temp_paste_command) -# outfile.close() -# call("bash %s/All_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) -# call("bash %s/temp_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) -# -# -# """ -# remove this lines -# #subprocess.call(["%s" % paste_command], shell=True) -# #subprocess.call(["%s" % temp_paste_command], shell=True) -# #subprocess.check_call('%s' % paste_command) -# #subprocess.check_call('%s' % temp_paste_command) -# #os.system(paste_command) change -# #os.system(temp_paste_command) change -# """ -# -# call("%s" % sort_All_label_cmd, logger) -# call("%s" % paste_command_header, logger) -# -# """ Assign numeric code to each variant filter reason""" -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' 
%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header_outgroup.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# call("%s" % remove_unwanted_text, logger) -# -# else: -# print "Skip generating seperate intermediate files for outgroup" -# -# def generate_indel_paste_command(): -# """ -# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. 
-# :param: null -# :return: null -# """ -# -# """ Paste/Generate and sort SNP Filter Label Matrix """ -# paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files.sh" -# f4=open(paste_file, 'w+') -# paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') -# paste_command = paste_command + " " + label_file -# header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir -# -# #os.system(header_awk_cmd) -# #os.system(sed_header) -# #os.system(sed_header_2) -# -# call("%s" % header_awk_cmd, logger) -# call("%s" % sed_header, logger) -# call("%s" % sed_header_2, logger) -# -# -# -# temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir -# paste_command = paste_command + " > %s/All_indel_label_final_raw" % args.filter2_only_snp_vcf_dir -# f4.write(paste_command) -# f4.close() -# -# call("bash %s" % paste_file, logger) -# -# sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw > %s/All_indel_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# paste_command_header = "cat %s/header.txt %s/All_indel_label_final_sorted.txt > %s/All_indel_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# -# ls = [] -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') -# ls.append(label_file) -# ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) -# -# with 
open('%s/All_indel_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: -# outfile2.write(paste_command) -# outfile2.close() -# -# with open('%s/temp_indel_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: -# outfile2.write(temp_paste_command) -# outfile2.close() -# -# # Why is this not working? -# call("bash %s/All_indel_label_final_raw.sh" % args.filter2_only_snp_vcf_dir, logger) -# call("bash %s/temp_indel_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir, logger) -# keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') -# -# """ -# remove this lines -# #subprocess.call(["%s" % paste_command], shell=True) -# #subprocess.call(["%s" % temp_paste_command], shell=True) -# #subprocess.check_call('%s' % paste_command) -# #subprocess.check_call('%s' % temp_paste_command) -# #os.system(paste_command) change -# #os.system(temp_paste_command) change -# """ -# -# call("%s" % sort_All_label_cmd, logger) -# call("%s" % paste_command_header, logger) -# -# """ Assign numeric code to each variant filter reason""" -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# 
subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_proximate_SNP/7/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir -# call("%s" % remove_unwanted_text, logger) -# -# def generate_indel_paste_command_outgroup(): -# """ -# This Function will take all the *label file and generate/paste it column wise to generate a matrix. These matrix will be used in downstream analysis. -# :param: null -# :return: null -# """ -# -# if args.outgroup: -# """ Paste/Generate and sort SNP Filter Label Matrix """ -# # define a file name where the paste commands will be saved. 
-# paste_file = args.filter2_only_snp_vcf_dir + "/paste_indel_label_files_outgroup.sh" -# f4=open(paste_file, 'w+') -# -# # initiate paste command string -# paste_command = "paste %s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir -# -# -# # Generate paste command -# for i in vcf_filenames: -# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') -# paste_command = paste_command + " " + label_file -# # Change header awk command to exclude outgroup -# #header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# header_awk_cmd = "grep -v \'%s\' %s | awk \'{ORS=\"\t\";}{print $1}\' > %s/header_outgroup.txt" % (outgroup, args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) -# sed_header = "sed -i \'s/^/\t/\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# sed_header_2 = "sed -i -e \'$a\\' %s/header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# -# -# -# call("%s" % header_awk_cmd, logger) -# call("%s" % sed_header, logger) -# call("%s" % sed_header_2, logger) -# -# -# -# temp_paste_command = paste_command + " > %s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir -# paste_command = paste_command + " > %s/All_indel_label_final_raw_outgroup" % args.filter2_only_snp_vcf_dir -# f4.write(paste_command) -# f4.close() -# -# call("bash %s" % paste_file, logger) -# -# sort_All_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_raw_outgroup > %s/All_indel_label_final_sorted_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# paste_command_header = "cat %s/header_outgroup.txt %s/All_indel_label_final_sorted_outgroup.txt > %s/All_indel_label_final_sorted_header_outgroup.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# -# 
ls = [] -# for i in vcf_filenames: -# label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_indel_positions_label') -# ls.append(label_file) -# ls.insert(0, "%s/unique_indel_positions_file" % args.filter2_only_snp_vcf_dir) -# -# with open('%s/All_indel_label_final_raw_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: -# outfile2.write(paste_command) -# outfile2.close() -# -# with open('%s/temp_indel_label_final_raw_outgroup.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile2: -# outfile2.write(temp_paste_command) -# outfile2.close() -# -# # Why is this not working? -# call("bash %s/All_indel_label_final_raw_outgroup.sh" % args.filter2_only_snp_vcf_dir, logger) -# call("bash %s/temp_indel_label_final_raw_outgroup.txt.sh" % args.filter2_only_snp_vcf_dir, logger) -# keep_logging('Finished pasting...DONE', 'Finished pasting...DONE', logger, 'info') -# -# """ -# remove this lines -# #subprocess.call(["%s" % paste_command], shell=True) -# #subprocess.call(["%s" % temp_paste_command], shell=True) -# #subprocess.check_call('%s' % paste_command) -# #subprocess.check_call('%s' % temp_paste_command) -# #os.system(paste_command) change -# #os.system(temp_paste_command) change -# """ -# -# call("%s" % sort_All_label_cmd, logger) -# call("%s" % paste_command_header, logger) -# -# """ Assign numeric code to each variant filter reason""" -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_proximate_SNP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP/2/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_proximate_SNP/7/g' 
%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF/5/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF/6/g' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir -# call("%s" % remove_unwanted_text, logger) -# else: -# print "Skip generating seperate intermediate files for outgroup" -# -# def generate_position_label_data_matrix(): -# -# """ -# Generate different list of Positions using the matrix All_label_final_sorted_header.txt. -# -# (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). -# -# Filtered Position label matrix: -# List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. -# -# Only_ref_variant_positions_for_closely_matrix.txt : -# Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. 
-# -# :param: null -# :return: null -# -# """ -# def generate_position_label_data_matrix_All_label(): -# position_label = OrderedDict() -# f1 = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') -# f2 = open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f3 = open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f4 = open( -# "%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, -# 'w+') -# if args.outgroup: -# with open("%s/All_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging( -# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, -# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, -# logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# position_label[row[0]] = row[1:] -# keep_logging('Generating different list of Positions and heatmap data matrix... \n', -# 'Generating different list of Positions and heatmap data matrix... 
\n', logger, 'info') -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# f2.write('\t' + print_string_header.strip() + '\n') -# f3.write('\t' + print_string_header.strip() + '\n') -# f4.write('\t' + print_string_header.strip() + '\n') -# for value in position_label: -# lll = ['0', '2', '3', '4', '5', '6', '7'] -# ref_var = ['1', '1TRUE'] -# if set(ref_var) & set(position_label[value]): -# if set(lll) & set(position_label[value]): -# if int(value) not in outgroup_specific_positions: -# print_string = "" -# for i in position_label[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f3.write(STRR2) -# if position_label[value].count('1TRUE') >= 2: -# f4.write('1\n') -# else: -# f4.write('0\n') -# else: -# if int(value) not in outgroup_specific_positions: -# strr = value + "\n" -# f1.write(strr) -# STRR3 = value + "\t" + str(position_label[value]) + "\n" -# f2.write(STRR3) -# csv_file.close() -# f1.close() -# f2.close() -# f3.close() -# f4.close() -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# -# else: -# with open("%s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging( -# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, -# 'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir, -# logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# position_label[row[0]] = row[1:] -# keep_logging('Generating different list of Positions and heatmap data matrix... \n', -# 'Generating different list of Positions and heatmap data matrix... \n', logger, 'info') -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# f2.write('\t' + print_string_header.strip() + '\n') -# f3.write('\t' + print_string_header.strip() + '\n') -# f4.write('\t' + print_string_header.strip() + '\n') -# for value in position_label: -# lll = ['0', '2', '3', '4', '5', '6', '7'] -# ref_var = ['1', '1TRUE'] -# if set(ref_var) & set(position_label[value]): -# if set(lll) & set(position_label[value]): -# -# print_string = "" -# for i in position_label[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f3.write(STRR2) -# if position_label[value].count('1TRUE') >= 2: -# f4.write('1\n') -# else: -# f4.write('0\n') -# else: -# -# strr = value + "\n" -# f1.write(strr) -# STRR3 = value + "\t" + str(position_label[value]) + "\n" -# f2.write(STRR3) -# csv_file.close() -# f1.close() -# f2.close() -# f3.close() -# f4.close() -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 
's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# -# def temp_generate_position_label_data_matrix_All_label(): -# -# """ -# Read temp_label_final_raw.txt SNP position label data matrix for generating barplot statistics. -# """ -# temp_position_label = OrderedDict() -# f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# print_string_header = "\t" -# -# if args.outgroup: -# for i in vcf_filenames: -# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# else: -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# -# f33.write('\t' + print_string_header.strip() + '\n') -# keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') -# lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] -# ref_var = ['reference_allele', 'VARIANT'] -# -# if 
args.outgroup: -# print "here" -# with open("%s/temp_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# if set(ref_var) & set(row[1:]): -# if set(lll) & set(row[1:]): -# if int(row[0]) not in outgroup_specific_positions: -# -# print_string = "" -# for i in row[1:]: -# print_string = print_string + "\t" + i -# STRR2 = row[0] + print_string + "\n" -# f33.write(STRR2) -# csv_file.close() -# f33.close() -# -# else: -# with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# if set(ref_var) & set(row[1:]): -# if set(lll) & set(row[1:]): -# -# print_string = "" -# for i in row[1:]: -# print_string = print_string + "\t" + i -# STRR2 = row[0] + print_string + "\n" -# f33.write(STRR2) -# csv_file.close() -# f33.close() -# """ -# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ -# """ -# temp_position_label_FQ = OrderedDict() -# f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# -# for row in csv_reader: -# temp_position_label_FQ[row[0]] = row[1:] -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = 
print_string_header + os.path.basename(i) + "\t" -# f44.write('\t' + print_string_header.strip() + '\n') -# for value in temp_position_label_FQ: -# lll = ['LowFQ'] -# if set(lll) & set(temp_position_label_FQ[value]): -# -# print_string = "" -# for i in temp_position_label_FQ[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f44.write(STRR2) -# f44.close() -# csv_file.close() -# f44.close() -# -# """ -# Perform Sed on temp files. Find a faster way to do this. -# """ -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# 
subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# -# -# """ -# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp -# """ -# temp_position_label_DP = OrderedDict() -# f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# temp_position_label_DP[row[0]] = row[1:] -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# f44.write('\t' + print_string_header.strip() + '\n') -# for value in temp_position_label_DP: -# lll = ['HighFQ_DP'] -# ref_var = ['reference_allele', 'VARIANT'] -# if set(lll) & set(temp_position_label_FQ[value]): -# -# print_string = "" -# for i in temp_position_label_FQ[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f44.write(STRR2) -# f44.close() -# csv_file.close() -# -# """ -# Perform Sed on temp files. Find a faster way to do this. 
-# """ -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ_DP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# -# def barplot_stats(): -# keep_logging('\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', 
'\nRead each Sample columns and calculate the percentage of each label to generate barplot statistics.\n', logger, 'info') -# """ -# Read each Sample columns and calculate the percentage of each label to generate barplot statistics. -# This will give a visual explanation of how many positions in each samples were filtered out because of different reason -# """ -# -# c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') -# columns = list(zip(*c_reader)) -# keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') -# counts = 1 -# -# if args.outgroup: -# end = len(vcf_filenames) + 1 -# end = end - 1 -# else: -# end = len(vcf_filenames) + 1 -# -# f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") -# f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") -# -# for i in xrange(1, end, 1): -# """ Bar Count Statistics: Variant Position Count Statistics """ -# true_variant = columns[i].count('VARIANT') -# unmapped_positions = columns[i].count('reference_unmapped_position') -# reference_allele = columns[i].count('reference_allele') -# Only_low_FQ = columns[i].count('LowFQ') -# Only_DP = columns[i].count('HighFQ_DP') -# Only_low_MQ = columns[i].count('HighFQ') -# low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + 
columns[i].count('LowFQ_DP') -# high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') -# other = low_FQ_other_parameters + high_FQ_other_parameters -# -# total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ -# -# filename_count = i - 1 -# -# if args.outgroup: -# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) -# f_bar_count.write(bar_string) -# else: -# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( -# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions, reference_allele, true_variant, -# Only_low_FQ, Only_DP, Only_low_MQ, other) -# #f_bar_count.write(bar_string) -# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ -# try: -# true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) -# except ZeroDivisionError: -# true_variant_perc = 0 -# try: -# unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) -# except ZeroDivisionError: -# unmapped_positions_perc = 0 -# try: -# reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) -# except ZeroDivisionError: -# reference_allele_perc = 0 -# try: -# Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) -# except ZeroDivisionError: -# Only_low_FQ_perc = 0 -# try: -# Only_DP_perc = 
float((columns[i].count('HighFQ_DP') * 100) / total) -# except ZeroDivisionError: -# Only_DP_perc = 0 -# try: -# Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) -# except ZeroDivisionError: -# Only_low_MQ_perc = 0 -# try: -# low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total) -# except ZeroDivisionError: -# low_FQ_other_parameters_perc = 0 -# try: -# high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total) -# except ZeroDivisionError: -# high_FQ_other_parameters_perc = 0 -# -# other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) -# if args.outgroup: -# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) -# else: -# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( -# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions_perc, reference_allele_perc, true_variant_perc, -# Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) -# f_bar_count.write(bar_string) -# f_bar_perc.write(bar_perc_string) -# f_bar_count.close() 
-# f_bar_perc.close() -# bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % (args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) -# barplot_R_file = open("%s/bargraph.R" % args.filter2_only_snp_vcf_dir, 'w+') -# barplot_R_file.write(bargraph_R_script) -# keep_logging('Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph.R' % args.filter2_only_snp_vcf_dir, logger, 'info') -# -# """ Methods Steps""" -# keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') -# generate_position_label_data_matrix_All_label() -# keep_logging('Running: Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') -# temp_generate_position_label_data_matrix_All_label() -# keep_logging('Running: Generating Barplot statistics 
data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') -# barplot_stats() -# -# def generate_indel_position_label_data_matrix(): -# -# """ -# Generate different list of Positions using the matrix All_label_final_sorted_header.txt. -# -# (Defining Core Variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). -# -# Filtered Position label matrix: -# List of non-core positions. These positions didn't make it to the final core list because it was filtered out in one of the samples. -# -# Only_ref_variant_positions_for_closely_matrix.txt : -# Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. -# -# :param: null -# :return: null -# -# """ -# def generate_indel_position_label_data_matrix_All_label(): -# position_label = OrderedDict() -# print "Generating Only_ref_indel_positions_for_closely" -# f1=open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') -# f2=open("%s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f3=open("%s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f4=open("%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# -# if args.outgroup: -# with open("%s/All_indel_label_final_sorted_header_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging( -# 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, -# 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, -# logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# 
position_label[row[0]] = row[1:] -# keep_logging('Generating different list of Positions and heatmap data matrix...', -# 'Generating different list of Positions and heatmap data matrix...', logger, 'info') -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# # f.write('\t' + print_string_header.strip() + '\n') -# f2.write('\t' + print_string_header.strip() + '\n') -# f3.write('\t' + print_string_header.strip() + '\n') -# f4.write('\t' + print_string_header.strip() + '\n') -# for value in position_label: -# lll = ['0', '2', '3', '4', '5', '6', '7'] -# ref_var = ['1', '1TRUE'] -# if set(ref_var) & set(position_label[value]): -# if set(lll) & set(position_label[value]): -# if int(value) not in outgroup_indel_specific_positions: -# print_string = "" -# for i in position_label[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f3.write(STRR2) -# if position_label[value].count('1TRUE') >= 2: -# f4.write('1\n') -# else: -# f4.write('0\n') -# else: -# if int(value) not in outgroup_indel_specific_positions: -# strr = value + "\n" -# f1.write(strr) -# STRR3 = value + "\t" + str(position_label[value]) + "\n" -# f2.write(STRR3) -# csv_file.close() -# f1.close() -# f2.close() -# f3.close() -# f4.close() -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' 
%s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], -# shell=True) -# else: -# with open("%s/All_indel_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging('Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, 'Reading All label positions file: %s/All_indel_label_final_sorted_header.txt' % args.filter2_only_snp_vcf_dir, logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# position_label[row[0]] = row[1:] -# keep_logging('Generating different list of Positions and heatmap data matrix...', 'Generating different list of Positions and heatmap data matrix...', logger, 'info') -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# #f.write('\t' + print_string_header.strip() + '\n') -# f2.write('\t' + print_string_header.strip() + '\n') -# f3.write('\t' + print_string_header.strip() + '\n') -# f4.write('\t' + print_string_header.strip() + '\n') -# for value in position_label: -# -# lll = ['0', '2', '3', '4', '5', '6', '7'] -# ref_var = ['1', '1TRUE'] -# if set(ref_var) & set(position_label[value]): -# if set(lll) & set(position_label[value]): -# print_string = "" -# for i in position_label[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f3.write(STRR2) -# if position_label[value].count('1TRUE') >= 2: -# f4.write('1\n') -# else: -# f4.write('0\n') -# else: -# strr = value + "\n" -# f1.write(strr) -# STRR3 = value + "\t" + str(position_label[value]) + "\n" -# f2.write(STRR3) -# csv_file.close() -# f1.close() -# f2.close() -# f3.close() -# f4.close() -# 
subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_indel_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# -# def temp_generate_indel_position_label_data_matrix_All_label(): -# -# """ -# Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
-# """ -# temp_position_label = OrderedDict() -# f33=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# print_string_header = "\t" -# if args.outgroup: -# for i in vcf_filenames: -# -# if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in i: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# else: -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# -# f33.write('\t' + print_string_header.strip() + '\n') -# keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt' % args.filter2_only_snp_vcf_dir, logger, 'info') -# # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] -# lll = ['reference_unmapped_position', 'LowAF', 'LowAF_DP', 'LowAF_QUAL', 'LowAF_DP_QUAL', 'LowAF_QUAL_DP', -# 'HighAF_DP', 'HighAF_QUAL', 'HighAF_DP_QUAL', 'HighAF_QUAL_DP', 'HighAF', 'LowAF_proximate_SNP', -# 'LowAF_DP_proximate_SNP', 'LowAF_QUAL_proximate_SNP', 'LowAF_DP_QUAL_proximate_SNP', -# 'LowAF_QUAL_DP_proximate_SNP', 'HighAF_DP_proximate_SNP', 'HighAF_QUAL_proximate_SNP', -# 'HighAF_DP_QUAL_proximate_SNP', 'HighAF_QUAL_DP_proximate_SNP', 'HighAF_proximate_SNP', '_proximate_SNP'] -# ref_var = ['reference_allele', 'VARIANT'] -# -# if args.outgroup: -# with open("%s/temp_indel_label_final_raw_outgroup.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: -# csv_reader = csv.reader(csv_file, 
delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# if set(ref_var) & set(row[1:]): -# if set(lll) & set(row[1:]): -# if int(row[0]) not in outgroup_indel_specific_positions: -# print_string = "" -# for i in row[1:]: -# print_string = print_string + "\t" + i -# STRR2 = row[0] + print_string + "\n" -# f33.write(STRR2) -# csv_file.close() -# f33.close() -# else: -# with open("%s/temp_indel_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# if set(ref_var) & set(row[1:]): -# if set(lll) & set(row[1:]): -# -# print_string = "" -# for i in row[1:]: -# print_string = print_string + "\t" + i -# STRR2 = row[0] + print_string + "\n" -# f33.write(STRR2) -# csv_file.close() -# f33.close() -# """ -# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of AF -# """ -# temp_position_label_AF = OrderedDict() -# f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging('Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_indel_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# -# for row in csv_reader: -# temp_position_label_AF[row[0]] = row[1:] -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# f44.write('\t' + print_string_header.strip() + '\n') -# for value in temp_position_label_AF: 
-# lll = ['LowAF'] -# if set(lll) & set(temp_position_label_AF[value]): -# -# print_string = "" -# for i in temp_position_label_AF[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f44.write(STRR2) -# f44.close() -# csv_file.close() -# f44.close() -# -# """ -# Perform Sed on temp files. Find a faster way to do this. -# """ -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP/4/g' 
%s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_AF.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# -# -# """ -# Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp -# """ -# temp_position_label_DP = OrderedDict() -# f44=open("%s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# with open("%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: -# keep_logging('Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, 'Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_indel_positions_for_closely_matrix.txt ' % args.filter2_only_snp_vcf_dir, logger, 'info') -# csv_reader = csv.reader(csv_file, delimiter='\t') -# next(csv_reader, None) -# for row in csv_reader: -# temp_position_label_DP[row[0]] = row[1:] -# print_string_header = "\t" -# for i in vcf_filenames: -# print_string_header = print_string_header + os.path.basename(i) + "\t" -# f44.write('\t' + print_string_header.strip() + '\n') -# for value in temp_position_label_DP: -# lll = ['HighAF_DP'] -# ref_var = ['reference_allele', 'VARIANT'] -# if set(lll) & set(temp_position_label_AF[value]): -# print_string = "" -# for i in temp_position_label_AF[value]: -# print_string = print_string + "\t" + i -# STRR2 = value + print_string + "\n" -# f44.write(STRR2) -# f44.close() -# csv_file.close() -# -# """ -# Perform Sed on temp files. Find a faster way to do this. 
-# """ -# subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_proximate_SNP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL_DP/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_QUAL/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF_DP/3/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/LowAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) -# subprocess.call(["sed -i 's/HighAF/4/g' %s/temp_Only_filtered_indel_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], 
shell=True) -# -# -# def barplot_indel_stats(): -# keep_logging('Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', 'Read each Sample columns and calculate the percentage of each label to generate barplot statistics.', logger, 'info') -# """ -# Read each Sample columns and calculate the percentage of each label to generate barplot statistics. -# This will give a visual explanation of how many positions in each samples were filtered out because of different reason -# """ -# -# c_reader = csv.reader( -# open('%s/temp_Only_filtered_indel_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, -# 'r'), delimiter='\t') -# columns = list(zip(*c_reader)) -# print len(columns) -# keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info') -# counts = 1 -# -# if args.outgroup: -# end = len(vcf_filenames) + 1 -# end = end - 1 -# else: -# end = len(vcf_filenames) + 1 -# print end -# -# f_bar_count = open("%s/bargraph_indel_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_perc = open("%s/bargraph_indel_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_AF\tOnly_DP\tOnly_low_MQ\tother\n") -# f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_AF_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") -# for i in xrange(1, end, 1): -# """ Bar Count Statistics: Variant Position Count Statistics """ -# print i -# true_variant = columns[i].count('VARIANT') -# unmapped_positions = columns[i].count('reference_unmapped_position') -# reference_allele = columns[i].count('reference_allele') -# Only_low_AF = columns[i].count('LowAF') -# Only_DP = columns[i].count('HighAF_DP') -# Only_low_MQ = columns[i].count('HighAF') -# low_AF_other_parameters = columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count('LowAF_DP_QUAL_proximate_SNP') + 
columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count('LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[i].count('LowAF_DP') -# high_AF_other_parameters = columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count('HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL') -# other = low_AF_other_parameters + high_AF_other_parameters -# total = true_variant + unmapped_positions + reference_allele + Only_low_AF + Only_DP + low_AF_other_parameters + high_AF_other_parameters + Only_low_MQ -# filename_count = i - 1 -# # bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_AF, Only_DP, Only_low_MQ, other) -# if args.outgroup: -# ### -# -# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( -# vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions, reference_allele, true_variant, -# Only_low_AF, Only_DP, Only_low_MQ, other) -# else: -# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename( -# vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions, reference_allele, true_variant, -# Only_low_AF, Only_DP, Only_low_MQ, other) -# -# f_bar_count.write(bar_string) -# -# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ -# try: -# true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) -# except ZeroDivisionError: -# true_variant_perc = 0 -# try: -# 
unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) -# except ZeroDivisionError: -# unmapped_positions_perc = 0 -# try: -# reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) -# except ZeroDivisionError: -# reference_allele_perc = 0 -# try: -# Only_low_AF_perc = float((columns[i].count('LowAF') * 100) / total) -# except ZeroDivisionError: -# Only_low_AF_perc = 0 -# try: -# Only_DP_perc = float((columns[i].count('HighAF_DP') * 100) / total) -# except ZeroDivisionError: -# Only_DP_perc = 0 -# try: -# Only_low_MQ_perc = float((columns[i].count('HighAF') * 100) / total) -# except ZeroDivisionError: -# Only_low_MQ_perc = 0 -# try: -# low_AF_other_parameters_perc = float(((columns[i].count('LowAF_QUAL_DP_proximate_SNP') + columns[i].count('LowAF_DP_QUAL_proximate_SNP') + columns[i].count('LowAF_QUAL_proximate_SNP') + columns[i].count('LowAF_DP_proximate_SNP') + columns[i].count('LowAF_proximate_SNP') + columns[i].count('LowAF_QUAL_DP') + columns[i].count('LowAF_DP_QUAL') + columns[i].count('LowAF_QUAL') + columns[i].count('LowAF_DP')) * 100) / total) -# except ZeroDivisionError: -# low_AF_other_parameters_perc = 0 -# try: -# high_AF_other_parameters_perc = float(((columns[i].count('HighAF_QUAL_DP_proximate_SNP') + columns[i].count('HighAF_DP_QUAL_proximate_SNP') + columns[i].count('HighAF_QUAL_proximate_SNP') + columns[i].count('HighAF_DP_proximate_SNP') + columns[i].count('HighAF_proximate_SNP') + columns[i].count('HighAF_QUAL_DP') + columns[i].count('HighAF_DP_QUAL') + columns[i].count('HighAF_QUAL')) * 100) / total) -# except ZeroDivisionError: -# high_AF_other_parameters_perc = 0 -# -# other_perc = float(low_AF_other_parameters_perc + high_AF_other_parameters_perc) -# if args.outgroup: -# ### -# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( -# os.path.basename(vcf_filenames_outgroup[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions_perc, 
true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, -# other_perc) -# f_bar_perc.write(bar_perc_string) -# else: -# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( -# os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), -# unmapped_positions_perc, true_variant_perc, Only_low_AF_perc, Only_DP_perc, Only_low_MQ_perc, -# other_perc) -# f_bar_perc.write(bar_perc_string) -# -# f_bar_count.close() -# f_bar_perc.close() -# bargraph_R_script = "library(ggplot2)\nlibrary(reshape)\nx1 <- read.table(\"bargraph_indel_percentage.txt\", header=TRUE)\nx1$Sample <- reorder(x1$Sample, rowSums(x1[-1]))\nmdf1=melt(x1,id.vars=\"Sample\")\npdf(\"%s/%s_barplot_indel.pdf\", width = 30, height = 30)\nggplot(mdf1, aes(Sample, value, fill=variable)) + geom_bar(stat=\"identity\") + ylab(\"Percentage of Filtered Positions\") + xlab(\"Samples\") + theme(text = element_text(size=9)) + scale_fill_manual(name=\"Reason for filtered out positions\", values=c(\"#08306b\", \"black\", \"orange\", \"darkgrey\", \"#fdd0a2\", \"#7f2704\")) + ggtitle(\"Title Here\") + ylim(0, 100) + theme(text = element_text(size=10), panel.background = element_rect(fill = 'white', colour = 'white'), plot.title = element_text(size=20, face=\"bold\", margin = margin(10, 0, 10, 0)), axis.ticks.y = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_text(colour = \"black\", face= \"bold.italic\", angle = 90)) + theme(legend.position = c(0.6, 0.7), legend.direction = \"horizontal\")\ndev.off()" % (args.filter2_only_snp_vcf_dir, os.path.basename(os.path.normpath(args.results_dir))) -# barplot_R_file = open("%s/bargraph_indel.R" % args.filter2_only_snp_vcf_dir, 'w+') -# barplot_R_file.write(bargraph_R_script) -# keep_logging('Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, 'Run this R script to generate bargraph plot: %s/bargraph_indel.R' % args.filter2_only_snp_vcf_dir, logger, 'info') 
-# -# -# """ Methods Steps""" -# keep_logging('Running: Generating data matrices...', 'Running: Generating data matrices...', logger, 'info') -# # if args.outgroup: -# # f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') -# # global outgroup_indel_specific_positions -# # outgroup_indel_specific_positions = [] -# # for i in f_outgroup: -# # outgroup_indel_specific_positions.append(i) -# # f_outgroup.close() -# # -# # f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') -# # global outgroup_specific_positions -# # outgroup_specific_positions = [] -# # for i in f_outgroup: -# # outgroup_specific_positions.append(i) -# # f_outgroup.close() -# # else: -# # global outgroup_specific_positions -# # global outgroup_indel_specific_positions -# # outgroup_indel_specific_positions = [] -# # outgroup_specific_positions = [] -# generate_indel_position_label_data_matrix_All_label() -# keep_logging('Running: Changing variables in data matrices to codes for faster processing...', 'Running: Changing variables in data matrices to codes for faster processing...', logger, 'info') -# temp_generate_indel_position_label_data_matrix_All_label() -# keep_logging('Running: Generating Barplot statistics data matrices...', 'Running: Generating Barplot statistics data matrices...', logger, 'info') -# barplot_indel_stats() -# -# def create_job_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter): -# -# """ Generate jobs/scripts that creates core consensus fasta file. -# -# This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. -# Input for Fasttree, Beast and pairwise variant analysis. -# -# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. 
-# :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. -# :return: -# :raises: -# """ -# if jobrun == "parallel-cluster": -# """ -# Supports only PBS clusters for now. -# """ -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) -# job_file_name = "%s_fasta.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') -# #os.system("qsub %s" % i) -# call("qsub %s" % i, logger) -# -# -# elif jobrun == "parallel-local" or jobrun == "cluster": -# """ -# Generate a Command list of each job and run it in parallel on different cores available on local system -# """ -# command_array = [] -# command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS 
-N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) -# job_file_name = "%s_fasta.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# with open(command_file, 'r') as fpp: -# for lines in fpp: -# lines = lines.strip() -# command_array.append(lines) -# fpp.close() -# if args.numcores: -# num_cores = int(num_cores) -# else: -# num_cores = multiprocessing.cpu_count() -# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) -# -# # elif jobrun == "cluster": -# # command_array = [] -# # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir -# # f3 = open(command_file, 'w+') -# # for i in vcf_filenames: -# # job_name = os.path.basename(i) -# # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" 
% (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) -# # job_file_name = "%s_fasta.pbs" % (i) -# # f1=open(job_file_name, 'w+') -# # f1.write(job_print_string) -# # f1.close() -# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# # pbs_scripts = glob.glob(pbs_dir) -# # for i in pbs_scripts: -# # f3.write("bash %s\n" % i) -# # f3.close() -# # with open(command_file, 'r') as fpp: -# # for lines in fpp: -# # lines = lines.strip() -# # command_array.append(lines) -# # fpp.close() -# # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) -# else: -# """ -# Generate a Command list of each job and run it on local system one at a time -# """ -# command_array = [] -# command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# -# -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -functional_filter %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, functional_filter) -# job_file_name = "%s_fasta.pbs" % (i) -# f1=open(job_file_name, 
'w+') -# f1.write(job_print_string) -# f1.close() -# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# -# -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# with open(command_file, 'r') as fpp: -# for lines in fpp: -# lines = lines.strip() -# command_array.append(lines) -# fpp.close() -# #os.system("bash command_file") -# call("bash %s" % command_file, logger) -# -# def create_job_allele_variant_fasta(jobrun, vcf_filenames, core_vcf_fasta_dir, config_file): -# -# """ Generate jobs/scripts that creates core consensus fasta file. -# -# This function will generate and run scripts/jobs to create core consensus fasta file of only core variant positions. -# Input for Fasttree, Beast and pairwise variant analysis. -# -# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. -# :param vcf_filenames: list of final vcf filenames i.e *_no_proximate_snp.vcf. These files are the final output of variant calling step for each sample. -# :return: -# :raises: -# """ -# if jobrun == "parallel-cluster": -# """ -# Supports only PBS clusters for now. 
-# """ -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) -# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') -# #os.system("qsub %s" % i) -# call("qsub %s" % i, logger) -# -# -# elif jobrun == "parallel-local" or jobrun == "cluster": -# """ -# Generate a Command list of each job and run it in parallel on different cores available on local system -# """ -# command_array = [] -# command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python 
/nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) -# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# with open(command_file, 'r') as fpp: -# for lines in fpp: -# lines = lines.strip() -# command_array.append(lines) -# fpp.close() -# if args.numcores: -# num_cores = int(num_cores) -# else: -# num_cores = multiprocessing.cpu_count() -# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) -# -# # elif jobrun == "cluster": -# # command_array = [] -# # command_file = "%s/commands_list_fasta.sh" % args.filter2_only_snp_vcf_dir -# # f3 = open(command_file, 'w+') -# # for i in vcf_filenames: -# # job_name = os.path.basename(i) -# # job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", 
Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'],args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir) -# # job_file_name = "%s_fasta.pbs" % (i) -# # f1=open(job_file_name, 'w+') -# # f1.write(job_print_string) -# # f1.close() -# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" -# # pbs_scripts = glob.glob(pbs_dir) -# # for i in pbs_scripts: -# # f3.write("bash %s\n" % i) -# # f3.close() -# # with open(command_file, 'r') as fpp: -# # for lines in fpp: -# # lines = lines.strip() -# # command_array.append(lines) -# # fpp.close() -# # os.system("bash %s/command_file" % args.filter2_only_snp_vcf_dir) -# else: -# """ -# Generate a Command list of each job and run it on local system one at a time -# """ -# command_array = [] -# command_file = "%s/commands_list_ref_allele_variants_fasta.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# -# -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s_fasta\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s -out_core %s -config %s\n" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], args.filter2_only_snp_vcf_dir, i, args.reference, core_vcf_fasta_dir, config_file) -# job_file_name = "%s_ref_allele_variants_fasta.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# 
#os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_ref_allele_variants_fasta.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# -# -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# with open(command_file, 'r') as fpp: -# for lines in fpp: -# lines = lines.strip() -# command_array.append(lines) -# fpp.close() -# #os.system("bash command_file") -# call("bash %s" % command_file, logger) -# -# def create_job_DP(jobrun, vcf_filenames): -# """ -# Based on type of jobrun; generate jobs and run accordingly. -# :param jobrun: Based on this value all the job/scripts will run on "cluster": either on single cluster, "parallel-local": run in parallel on local system, "local": run on local system, "parallel-cluster": submit parallel jobs on cluster. -# :param vcf_filenames: -# :return: -# """ -# -# if jobrun == "parallel-cluster": -# """ -# Supports only PBS clusters for now. -# """ -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) -# job_file_name = "%s_DP.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# keep_logging('Running: qsub %s' % i, 'Running: qsub %s' % i, logger, 'info') -# #os.system("qsub %s" % i) 
-# call("qsub %s" % i, logger) -# -# -# elif jobrun == "parallel-local" or jobrun == "cluster" : -# """ -# Generate a Command list of each job and run it in parallel on different cores available on local system -# """ -# command_array = [] -# command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# -# -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) -# job_file_name = "%s_DP.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# -# -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# with open(command_file, 'r') as fpp: -# for lines in fpp: -# lines = lines.strip() -# command_array.append(lines) -# fpp.close() -# print len(command_array) -# if args.numcores: -# num_cores = int(num_cores) -# else: -# num_cores = multiprocessing.cpu_count() -# results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) -# -# # elif jobrun == "cluster": -# # """ Test pending """ -# # command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir -# # f3 = open(command_file, 'w+') -# # for i in vcf_filenames: -# # job_name = os.path.basename(i) -# # job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l 
nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) -# # job_file_name = "%s_DP.pbs" % (i) -# # f1=open(job_file_name, 'w+') -# # f1.write(job_print_string) -# # f1.close() -# # pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" -# # pbs_scripts = glob.glob(pbs_dir) -# # for i in pbs_scripts: -# # f3.write("bash %s\n" % i) -# # f3.close() -# # os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) -# -# else: -# """ -# Generate a Command list of each job and run it on local system one at a time -# """ -# command_file = "%s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir -# f3 = open(command_file, 'w+') -# for i in vcf_filenames: -# job_name = os.path.basename(i) -# job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/nfs/esnitkin/bin_group/anaconda2/bin/python /nfs/esnitkin/bin_group/pipeline/Github/variant_calling_pipeline_dev/modules/variant_diagnostics/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) -# job_file_name = "%s_DP.pbs" % (i) -# f1=open(job_file_name, 'w+') -# f1.write(job_print_string) -# f1.close() -# pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" -# pbs_scripts = glob.glob(pbs_dir) -# for i in pbs_scripts: -# f3.write("bash %s\n" % i) -# f3.close() -# #os.system("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir) -# call("bash %s/commands_list_DP.sh" % args.filter2_only_snp_vcf_dir, logger) -# -# def 
generate_vcf_files(): -# if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": -# keep_logging('Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, 'Removing Variants falling in Functional filters positions file: %s\n' % functional_class_filter_positions, logger, -# 'info') -# # phage_positions = [] -# # phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir -# # with open(phage_region_positions, 'rU') as fp: -# # for line in fp: -# # phage_positions.append(line.strip()) -# # fp.close() -# -# -# functional_filter_pos_array = [] -# with open(functional_class_filter_positions, 'rU') as f_functional: -# for line_func in f_functional: -# functional_filter_pos_array.append(line_func.strip()) -# -# ref_variant_position_array = [] -# ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') -# for line in ffp: -# line = line.strip() -# if line not in functional_filter_pos_array: -# ref_variant_position_array.append(line) -# ffp.close() -# -# # Adding core indel support: 2018-07-24 -# ref_indel_variant_position_array = [] -# ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') -# for line in ffp: -# line = line.strip() -# if line not in functional_filter_pos_array: -# ref_indel_variant_position_array.append(line) -# ffp.close() -# -# else: -# functional_filter_pos_array = [] -# ref_variant_position_array = [] -# ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') -# for line in ffp: -# line = line.strip() -# ref_variant_position_array.append(line) -# ffp.close() -# -# # Adding core indel support: 2018-07-24 -# ref_indel_variant_position_array = [] -# ffp = open("%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') -# for line in ffp: -# line = line.strip() -# if line not in functional_filter_pos_array: -# 
ref_indel_variant_position_array.append(line) -# ffp.close() -# -# print "No. of core SNPs: %s" % len(ref_variant_position_array) -# print "No. of core INDELs: %s" % len(ref_indel_variant_position_array) -# -# f_file = open("%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, 'w+') -# for pos in ref_variant_position_array: -# f_file.write(pos + '\n') -# f_file.close() -# -# # Adding core indel support: 2018-07-24 -# f_file = open( -# "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir, -# 'w+') -# for pos in ref_indel_variant_position_array: -# f_file.write(pos + '\n') -# f_file.close() -# -# base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin'] -# filter2_files_array = [] -# for i in vcf_filenames: -# filter2_file = i.replace('_no_proximate_snp.vcf', '') -# filter2_files_array.append(filter2_file) -# -# -# filtered_out_vcf_files = [] -# for i in filter2_files_array: -# print_array =[] -# with open(i) as file_open: -# for line in file_open: -# line = line.strip() -# if line.startswith("#"): -# print_array.append(line) -# else: -# split_array = re.split(r'\t+', line) -# if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: -# print_array.append(line) -# file_open.close() -# file_name = i + "_core.vcf" -# keep_logging('Generating %s' % file_name, 'Generating %s' % file_name, logger, 'info') -# filtered_out_vcf_files.append(file_name) -# f1 = open(file_name, 'w+') -# for ios in print_array: -# print_string = str(ios) + "\n" -# f1.write(print_string) -# f1.close() -# -# filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir -# keep_logging('Generating Consensus...', 'Generating Consensus...', logger, 'info') -# for file in filtered_out_vcf_files: -# f1 = open(filename, 'a+') -# bgzip_cmd = "%s/%s/bgzip -f %s\n" % 
(ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) -# f1.write(bgzip_cmd) -# subprocess.call([bgzip_cmd], shell=True) -# tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], file) -# f1.write(tabix_cmd) -# subprocess.call([tabix_cmd], shell=True) -# fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s.fa\n" % (args.reference, base_vcftools_bin, file, file.replace('_filter2_final.vcf_core.vcf', '')) -# f1.write(fasta_cmd) -# subprocess.call([fasta_cmd], shell=True) -# base = os.path.basename(file) -# header = base.replace('_filter2_final.vcf_core.vcf', '') -# sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) -# subprocess.call([sed_command], shell=True) -# f1.write(sed_command) -# keep_logging('The consensus commands are in : %s' % filename, 'The consensus commands are in : %s' % filename, logger, 'info') -# sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % (args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin']) -# #os.system(sequence_lgth_cmd) -# call("%s" % sequence_lgth_cmd, logger) -# -# def gatk_filter2(final_raw_vcf, out_path, analysis, reference): -# gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" -# gatk_filter2_command = "java -jar %s/%s/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("gatk", Config)['gatk_bin'], reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) -# keep_logging('Running Command: [%s]' % gatk_filter2_command, 'Running Command: [%s]' % gatk_filter2_command, logger, 'info') -# 
#os.system(gatk_filter2_command) -# call("%s" % gatk_filter2_command, logger) -# filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > %s/%s_filter2_final.vcf" % (out_path, analysis, out_path, analysis) -# call("%s" % filter_flag_command, logger) -# gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) -# return gatk_filter2_final_vcf -# -# def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): -# all_position = [] -# remove_proximate_position_array = [] -# gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" -# with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: -# for line in csv_file: -# if not line.startswith('#'): -# line_array = line.split('\t') -# all_position.append(line_array[1]) -# for position in all_position: -# position_index = all_position.index(position) -# next_position_index = position_index + 1 -# -# if next_position_index < len(all_position): -# diff = int(all_position[next_position_index]) - int(position) -# if diff < 10: -# #print position + " " + all_position[next_position_index] -# if position not in remove_proximate_position_array and all_position[next_position_index] not in remove_proximate_position_array: -# remove_proximate_position_array.append(int(position)) -# remove_proximate_position_array.append(int(all_position[next_position_index])) -# f1=open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') -# with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: -# for line in csv_file2: -# if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
-# line_array = line.split('\t') -# if int(line_array[1]) not in remove_proximate_position_array: -# print_string = line -# f1.write(print_string) -# else: -# print_string = line -# f1.write(print_string) -# gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" -# f2=open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') -# for i in remove_proximate_position_array: -# position_print_string = str(i) + "\n" -# f2.write(position_print_string) -# return gatk_filter2_final_vcf_file_no_proximate_snp -# -# def FQ_analysis(): -# for i in vcf_filenames: -# filename_base = os.path.basename(i) -# aln_mpileup_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') -# analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') -# #print aln_mpileup_vcf_file -# grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file -# proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) -# (out, err) = proc.communicate() -# out = out.strip() -# reference_file = out.split(':') -# # Change it to multiprocessing -# gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) -# #print gatk_filter2_final_vcf_file -# gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, analysis, reference_file[1]) -# grep_fq_field = "awk -F\'\\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s/%s_FQ_values" % (gatk_filter2_final_vcf_file_no_proximate_snp, os.path.dirname(i), analysis) -# #os.system(grep_fq_field) -# call("%s" % grep_fq_field, logger) -# #print grep_fq_field -# -# def DP_analysis(): -# create_job_DP(args.jobrun, vcf_filenames) -# paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir -# for i in vcf_filenames: -# label_file = 
i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') -# paste_command = paste_command + " " + label_file -# -# paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" -# f2=open(paste_file, 'w+') -# paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir -# #os.system(paste_command) -# f2.write(paste_command + '\n') -# cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) -# #os.system(cat_header) -# f2.write(cat_header + '\n') -# sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir) -# #os.system(sed_command) -# f2.write(sed_command + '\n') -# cmd = "bash %s" % paste_file -# # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) -# -# def DP_analysis_barplot(): -# #os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) -# call("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir, logger) -# keep_logging('Generating DP barplots data...', 'Generating DP barplots data...', logger, 'info') -# c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') -# columns = list(zip(*c_reader)) -# counts = 1 -# end = len(vcf_filenames) + 1 -# f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') -# f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") -# f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") -# for i in xrange(1, end, 1): -# """ Bar Count Statistics: Variant Position Count Statistics """ -# reference_position = columns[i].count('NA') -# oneto5 = 0 -# for k in list(columns[i][1:]): -# if k != "": 
-# if k != "NA": -# if int(k) < 5: -# oneto5 += 1 -# sixto10 = 0 -# for k in list(columns[i][1:]): -# if k != "": -# if k != "NA": -# if int(k) >= 5 and int(k) <= 10: -# sixto10 += 1 -# elevento14 = 0 -# for k in list(columns[i][1:]): -# if k != "": -# if k != "NA": -# if int(k) >= 11 and int(k) <= 14: -# elevento14 += 1 -# fifteenorabove = 0 -# for k in list(columns[i][1:]): -# if k != "": -# if k != "NA": -# if int(k) >= 15: -# fifteenorabove += 1 -# total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove -# filename_count = i - 1 -# bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position, oneto5, sixto10, elevento14, fifteenorabove) -# f_bar_count.write(bar_string) -# -# """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ -# try: -# reference_position_perc = float(reference_position * 100 / total) -# except ZeroDivisionError: -# reference_position_perc = 0 -# try: -# oneto5_perc = float(oneto5 * 100 / total) -# except ZeroDivisionError: -# oneto5_perc = 0 -# try: -# sixto10_perc = float(sixto10 * 100 / total) -# except ZeroDivisionError: -# sixto10_perc = 0 -# try: -# elevento14_perc = float(elevento14 * 100 / total) -# except ZeroDivisionError: -# elevento14_perc = 0 -# try: -# fifteenorabove_perc = float(fifteenorabove * 100 / total) -# except ZeroDivisionError: -# fifteenorabove_perc = 0 -# bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc) -# f_bar_perc.write(bar_perc_string) - -def extract_only_ref_variant_fasta(core_vcf_fasta_dir): - if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": - 
functional_filter = "yes" - create_job_fasta(args.jobrun, vcf_filenames, core_vcf_fasta_dir, functional_filter) - -def extract_only_ref_variant_fasta_from_reference(): - if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes" and \ - ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": - ffp = open("%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir).readlines() - else: - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines() - fasta_string = "" - #firstLine = ffp.pop(0) - for lines in ffp: - lines = lines.strip() - extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines) - proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) - (out, err) = proc.communicate() - out = out.strip() - fasta_string = fasta_string + out - if not out: - print lines - keep_logging('Error extracting reference allele', 'Error extracting reference allele', logger, 'info') - exit() - - pattern = re.compile(r'\s+') - fasta_string = re.sub(pattern, '', fasta_string) - final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '').replace('.fa', '')) + fasta_string + "\n" - fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', '').replace('.fa', ''))), 'w+') - fp.write(final_fasta_string) - fp.close() - -def extract_only_ref_variant_fasta_from_reference_allele_variant(): - ffp = open("%s/unique_positions_file" % args.filter2_only_snp_vcf_dir).readlines() - #unique_positions_array = [] - - fasta_string = "" - #firstLine = ffp.pop(0) - for lines in ffp: - lines = lines.strip() - #unique_positions_array.append(lines) - extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines) - proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) - (out, err) = 
proc.communicate() - out = out.strip() - fasta_string = fasta_string + out - if not out: - print lines - keep_logging('Error extracting reference allele', 'Error extracting reference allele', logger, 'info') - exit() - - pattern = re.compile(r'\s+') - fasta_string = re.sub(pattern, '', fasta_string) - final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '').replace('.fa', '')) + fasta_string + "\n" - fp = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', '').replace('.fa', ''))), 'w+') - fp.write(final_fasta_string) - fp.close() - -def prepare_snpEff_db(reference_basename): - keep_logging('Preparing snpEff database requirements.', 'Preparing snpEff database requirements.', logger, 'info') - reference_basename = (os.path.basename(args.reference)).split(".") - if os.path.isfile("%s/%s/snpEff.config" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])): - #os.system("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir)) - keep_logging("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), "cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), logger, 'debug') - call("cp %s/%s/snpEff.config %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], args.filter2_only_snp_vcf_dir), logger) - else: - keep_logging("Error: %s/%s/snpEff.config doesn't exists.\nExiting..." % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']),"Error: %s/%s/snpEff.config doesn't exists.\nExiting..." 
% (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'exception') - exit() - make_sure_path_exists("%s/%s/data/%s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0])) - make_sure_path_exists("%s/%s/data/genomes/" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])) - #os.system("cp %s %s/%s/data/genomes/" % (args.reference, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'])) - keep_logging("cp %s %s/%s/data/genomes/%s.fa" % (args.reference, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), "cp %s %s/%s/data/genomes/" % (args.reference, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') - call("cp %s %s/%s/data/genomes/%s.fa" % (args.reference, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), logger) - with open("%s/snpEff.config" % args.filter2_only_snp_vcf_dir, "a") as conf_file: - conf_file.write("\n\n##Building Custom Database###\n%s.genome\t: %s\n\n" % (reference_basename[0], reference_basename[0])) - conf_file.close() - #get the gff name from config file - if os.path.isfile("%s/%s.gff" % (os.path.dirname(args.reference), reference_basename[0])): - keep_logging("cp %s/%s.gff %s/%s/data/%s/genes.gff" % ( - os.path.dirname(args.reference), reference_basename[0], ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), - "cp %s/%s.gff %s/%s/data/%s/genes.gff" % (os.path.dirname(args.reference), reference_basename[0], - ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], - reference_basename[0]), logger, 'debug') - 
keep_logging("cp %s/%s.gb* %s/%s/data/%s/genes.gbk" % ( - os.path.dirname(args.reference), reference_basename[0], ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), - "cp %s/%s.gff %s/%s/data/%s/genes.gff" % (os.path.dirname(args.reference), reference_basename[0], - ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], - reference_basename[0]), logger, 'debug') - call("cp %s/%s.gff %s/%s/data/%s/genes.gff" % ( - os.path.dirname(args.reference), reference_basename[0], ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), logger) - call("cp %s/%s.gb* %s/%s/data/%s/genes.gbk" % ( - os.path.dirname(args.reference), reference_basename[0], ConfigSectionMap("bin_path", Config)['binbase'], - ConfigSectionMap("snpeff", Config)['snpeff_bin'], reference_basename[0]), logger) - else: - keep_logging("Error: %s/%s.gff file doesn't exists. Make sure the GFF file has the same prefix as reference fasta file\nExiting..." % (os.path.dirname(args.reference), reference_basename[0]), - "Error: %s/%s.gff file doesn't exists. Make sure the GFF file has the same prefix as reference fasta file\nExiting..." 
% (os.path.dirname(args.reference), reference_basename[0]), logger, 'exception') - exit() - #keep_logging("java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') - keep_logging("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), "java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger, 'debug') - - #call("java -jar %s/%s/%s build -gff3 -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, 
ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) - call("java -jar %s/%s/%s build -genbank -v %s -c %s/snpEff.config -dataDir %s/%s/data" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], reference_basename[0], args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin']), logger) - keep_logging('Finished Preparing snpEff database requirements.', 'Finished Preparing snpEff database requirements.', logger, 'info') - -def variant_annotation(): - keep_logging('Annotating Variants using snpEff.', 'Annotating Variants using snpEff.', logger, 'info') - - if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": - if ConfigSectionMap("snpeff", Config)['db']: - print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] - proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], - stdout=subprocess.PIPE, shell=True) - (out2, err2) = proc.communicate() - if out2: - snpeffdb = ConfigSectionMap("snpeff", Config)['db'] - else: - print "The database name %s provided was not found. 
Check the name and try again" % ConfigSectionMap("snpeff", Config)['db'] - exit() - else: - print "snpEff db section is not set in config file" - exit() - else: - reference_basename = (os.path.basename(args.reference)).split(".") - snpeffdb = reference_basename[0] - prepare_snpEff_db(reference_basename) - - annotate_vcf_cmd_array = [] - annotate_final_vcf_cmd_array = [] - for i in vcf_filenames: - raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') - annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) - print annotate_vcf_cmd - annotate_vcf_cmd_array.append(annotate_vcf_cmd) - final_vcf = i - annotate_final_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) - annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - #print annotate_vcf_cmd_array - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) - results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in 
annotate_final_vcf_cmd_array) - -def indel_annotation(): - keep_logging('Annotating indels using snpEff.', 'Annotating indels using snpEff.', logger, 'info') - - if ConfigSectionMap("snpeff", Config)['prebuild'] == "yes": - if ConfigSectionMap("snpeff", Config)['db']: - print "Using pre-built snpEff database: %s" % ConfigSectionMap("snpeff", Config)['db'] - proc = subprocess.Popen(["java -jar %s/%s/%s databases | grep %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], ConfigSectionMap("snpeff", Config)['db'])], - stdout=subprocess.PIPE, shell=True) - (out2, err2) = proc.communicate() - if out2: - snpeffdb = ConfigSectionMap("snpeff", Config)['db'] - else: - print "The database name %s provided was not found. Check the name and try again" % ConfigSectionMap("snpeff", Config)['db'] - exit() - else: - print "snpEff db section is not set in config file" - exit() - else: - reference_basename = (os.path.basename(args.reference)).split(".") - snpeffdb = reference_basename[0] - prepare_snpEff_db(reference_basename) - - - annotate_vcf_cmd_array = [] - annotate_final_vcf_cmd_array = [] - for i in vcf_filenames: - raw_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf') - annotate_vcf_cmd = "java -Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], raw_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, raw_vcf, raw_vcf) - annotate_vcf_cmd_array.append(annotate_vcf_cmd) - final_vcf = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf') - annotate_final_vcf_cmd = "java 
-Xmx4g -jar %s/%s/%s -csvStats %s_ANN.csv -dataDir %s/%s/data/ %s -c %s/snpEff.config %s %s > %s_ANN.vcf" % \ - (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['base_cmd'], final_vcf, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("snpeff", Config)['snpeff_bin'], ConfigSectionMap("snpeff", Config)['snpeff_parameters'], args.filter2_only_snp_vcf_dir, snpeffdb, final_vcf, final_vcf) - annotate_final_vcf_cmd_array.append(annotate_final_vcf_cmd) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_vcf_cmd_array) - results_2 = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in annotate_final_vcf_cmd_array) - -def gatk_combine_variants(files_gatk, reference, out_path, merged_file_suffix, logger, Config): - base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("gatk", Config)[ - 'gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd'] - #files_gatk = "--variant " + ' --variant '.join(vcf_files_array) - keep_logging("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), "java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix), logger, 'debug') - merge_gatk_commands_file = "%s/gatk_merge.sh" % args.filter2_only_snp_vcf_dir - with open(merge_gatk_commands_file, 'w+') as fopen: - fopen.write("java -jar %s -T CombineVariants -R %s %s -o %s/Final_vcf_gatk%s" % (base_cmd, reference, files_gatk, out_path, merged_file_suffix) + '\n') - fopen.close() - # Commenting out calling gatk combine variants with a custom logging call method, problem with python subprocess, OSError: [Errno 7] Argument list too long - os.system("bash %s" % 
merge_gatk_commands_file) - return "%s/Final_vcf_gatk%s" % (out_path, merged_file_suffix) - -def annotated_snp_matrix(): - """ - :return: Annotate core vcf files generated at core_prep steps. - Read Genbank file and return a dictionary of Prokka ID mapped to Gene Name, Prokka ID mapped to Product Name. - This dictionary will then be used to insert annotation into SNP/Indel matrix - """ - - """Annotate all VCF file formats with SNPeff""" - # Commented for debugging - variant_annotation() - - indel_annotation() - - - """ Start of Extract Annotation information from Genbank file - - Extract Annotation information from Genbank file - - - Check if Reference genome Genbank file exists. - - Initiate dictionaries that maps locus tag to gene name and product. This information will be used for annotating SNP/Indel Matrix - - Read the locus tag and gene annotations into a dictionary that maps locus tags to gene name/product name - - """ - - reference_basename = (os.path.basename(args.reference)).split(".") - if os.path.isfile("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0])): - handle = open("%s/%s.gbf" % (os.path.dirname(args.reference), reference_basename[0]), 'rU') - else: - raise IOError('%s/%s.gbf does not exist.' 
% (os.path.dirname(args.reference), reference_basename[0])) - exit() - - locus_tag_to_gene_name = {} - locus_tag_to_product = {} - locus_tag_to_strand = {} - #locus_tag_to_uniprot = {} - #locus_tag_to_ec_number = {} - - keep_logging( - 'Reading annotations from Reference genome genbank file: %s/%s.gbf' % (os.path.dirname(args.reference), reference_basename[0]), - 'Reading annotations from Reference genome genbank file: %s/%s.gbf' % (os.path.dirname(args.reference), reference_basename[0]), - logger, 'info') - for record in SeqIO.parse(handle, 'genbank') : - for feature in record.features: - location = str(feature.location) - strand = location.split('(')[1].replace(')', '') - if 'locus_tag' in feature.qualifiers: - locus_tag_to_strand[str(feature.qualifiers['locus_tag'][0])] = strand - if 'gene' in feature.qualifiers: - locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['gene'][0]) - else: - locus_tag_to_gene_name[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" - if 'product' in feature.qualifiers: - locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = str(feature.qualifiers['product'][0]) - else: - locus_tag_to_product[str(feature.qualifiers['locus_tag'][0])] = "null or hypothetical protein" - else: - keep_logging( - 'Error: locus_tag specifications for the below feature doesnt exists. Please check the format of genbank file\n%s' % str(feature), - 'Error: locus_tag specifications for the below feature doesnt exists. 
Please check the format of genbank file\n%s' % str(feature), - logger, 'exception') - - # Annotation Bug fix 1 - first_locus_tag = record.features[1].qualifiers['locus_tag'][0] - last_element = len(record.features) - 1 - last_locus_tag = record.features[last_element].qualifiers['locus_tag'][0] - - # #Debugging prints - # print first_locus_tag - # print locus_tag_to_gene_name[first_locus_tag] - # print last_locus_tag - # print locus_tag_to_gene_name[last_locus_tag] - - """ End of Extract Annotation information from Genbank file - - Extract Annotation information from Genbank file - - - Check if Reference genome Genbank file exists. - - Initiate dictionaries that maps locus tag to gene name and product. This information will be used for annotating SNP/Indel Matrix - - Read the locus tag and gene annotations into a dictionary that maps locus tags to gene name/product name - - """ - - - - """ Start of Merging Step: - - - Merge Individual Annotated raw and filtered vcf files to generate a Final merged vcf file using Gatk combine variants method. - - Parse this merged Final_vcf* file and generate a SNP/Indel matrix - - """ - - keep_logging('Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, 'Merging Final Annotated VCF files into %s/Final_vcf_no_proximate_snp.vcf using bcftools' % args.filter2_only_snp_vcf_dir, logger, 'info') - - #Commented for debugging - files_for_tabix = glob.glob("%s/*.vcf_no_proximate_snp.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) - tabix(files_for_tabix, "vcf", logger, Config) - files_for_tabix = glob.glob("%s/*_filter2_indel_final.vcf_ANN.vcf" % args.filter2_only_snp_vcf_dir) - tabix(files_for_tabix, "vcf", logger, Config) - - files = ' '.join(vcf_filenames) - - - """ bcftools merging is deprecated. 
Replaced with GATK combinevariants """ - merge_commands_file = "%s/bcftools_merge.sh" % args.filter2_only_snp_vcf_dir - - with open(merge_commands_file, 'w+') as fopen: - fopen.write("%s/%s/bcftools merge -i ANN:join -m both -o %s/Final_vcf_no_proximate_snp.vcf -O v %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], args.filter2_only_snp_vcf_dir, files.replace("_filter2_final.vcf_no_proximate_snp.vcf", "_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz")) + '\n') - fopen.write("%s/%s/bcftools merge -i ANN:join -m both -o %s/Final_vcf_indel.vcf -O v %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], args.filter2_only_snp_vcf_dir,files.replace("_filter2_final.vcf_no_proximate_snp.vcf","_filter2_indel_final.vcf_ANN.vcf.gz")) + '\n') - - fopen.close() - - os.system("bash %s" % merge_commands_file) - - - """ Merge with Gatk combine variants method """ - # #Commented for debugging - merged_file_suffix = "_no_proximate_snp.vcf" - - annotated_no_proximate_snp_file = "%s/annotated_no_proximate_snp_list.txt" % args.filter2_only_snp_vcf_dir - annotated_no_proximate_snp_indel_file = "%s/annotated_no_proximate_snp_indel_list.txt" % args.filter2_only_snp_vcf_dir - - with open(annotated_no_proximate_snp_file, 'w+') as fopen: - for i in vcf_filenames: - fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz') + '\n') - fopen.close() - - with open(annotated_no_proximate_snp_indel_file, 'w+') as fopen: - for i in vcf_filenames: - fopen.write(i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_indel_final.vcf_ANN.vcf.gz') + '\n') - fopen.close() - - #files_gatk = "--variant " + ' --variant '.join(vcf_filenames) - files_gatk = "" - for i in vcf_filenames: - files_gatk = files_gatk + " --variant " + i - final_gatk_snp_merged_vcf = 
gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_ANN.vcf.gz'), args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) - - # Test this merge and annotate this merged file - Testing Mode Right now. - #merged_file_suffix = "_no_proximate_snp_1.vcf" - #final_gatk_snp_merged_vcf_1 = gatk_combine_variants(files_gatk,args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, logger, Config) - merged_file_suffix = "_indel.vcf" - final_gatk_indel_merged_vcf = gatk_combine_variants(files_gatk.replace('_filter2_final.vcf_no_proximate_snp.vcf', - '_filter2_indel_final.vcf_ANN.vcf.gz'), - args.reference, args.filter2_only_snp_vcf_dir, merged_file_suffix, - logger, Config) - - """ Tabix index the combined GATK Final vcf file """ - files_for_tabix = glob.glob("%s/Final_vcf_*.vcf" % args.filter2_only_snp_vcf_dir) - tabix(files_for_tabix, "vcf", logger, Config) - - - """ End of Merging Step. """ - - - """ Extract ANN information from bcftools Final vcf file. (There is a reason why i am using bcftools merged file to extract ANN information) """ - snp_var_ann_dict = {} - indel_var_ann_dict = {} - - for variants in VCF("%s/Final_vcf_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir): - snp_var_ann_dict[variants.POS] = variants.INFO.get('ANN') - - for variants in VCF("%s/Final_vcf_indel.vcf.gz" % args.filter2_only_snp_vcf_dir): - indel_var_ann_dict[variants.POS] = variants.INFO.get('ANN') - - """ End of Extract ANN information from bcftools Final vcf file""" - - - - """ This step is no longer required: Remove this after testing. print_string_header will be the column names of SNP matrix. Column names = Sample names""" - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - - - - """ Generate an array of core positions. 
Read Only_ref_variant_positions_for_closely* to get final core variant positions into core_positions array""" - core_positions = [] - if ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": - core_positions_file = "%s/Only_ref_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir - else: - core_positions_file = "%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir - with open(core_positions_file) as fp: - for line in fp: - line = line.strip() - core_positions.append(line) - fp.close() - - indel_core_positions = [] - if ConfigSectionMap("functional_filters", Config)['apply_to_calls'] == "yes": - core_positions_file = "%s/Only_ref_indel_variant_positions_for_closely_without_functional_filtered_positions" % args.filter2_only_snp_vcf_dir - else: - core_positions_file = "%s/Only_ref_indel_positions_for_closely" % args.filter2_only_snp_vcf_dir - with open(core_positions_file) as fp: - for line in fp: - line = line.strip() - indel_core_positions.append(line) - fp.close() - - """ End: Generate an array of core positions. """ - - - - """ Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" - """ Read in functional class filter positions. 
""" - functional_filter_pos_array = [] - with open(functional_class_filter_positions, 'rU') as f_functional: - for line_func in f_functional: - functional_filter_pos_array.append(line_func.strip()) - - """ GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """ - phage_positions = [] - repetitive_positions = [] - mask_positions = [] - if ConfigSectionMap("functional_filters", Config)['apply_functional_filters'] == "yes": - if ConfigSectionMap("functional_filters", Config)['find_phage_region'] == "yes": - phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir - if os.path.isfile(phage_region_positions): - with open(phage_region_positions, 'rU') as fphage: - for line in fphage: - phage_positions.append(line.strip()) - fphage.close() - else: - raise IOError('%s/phage_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir) - exit() - # GET REPETITIVE REGIONS - if ConfigSectionMap("functional_filters", Config)['find_repetitive_region'] == "yes": - repetitive_positions_file = "%s/repeat_region_positions.txt" % args.filter2_only_snp_vcf_dir - if os.path.isfile(repetitive_positions_file): - with open(repetitive_positions_file, 'rU') as frep: - for line in frep: - repetitive_positions.append(line.strip()) - frep.close() - else: - raise IOError('%s/repeat_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir) - exit() - # GET MASK REGIONS - if ConfigSectionMap("functional_filters", Config)['mask_region'] == "yes": - mask_positions_file = "%s/mask_positions.txt" % args.filter2_only_snp_vcf_dir - if os.path.isfile(mask_positions_file): - with open(mask_positions_file, 'rU') as fmask: - for line in fmask: - mask_positions.append(line.strip()) - fmask.close() - else: - raise IOError('%s/mask_positions.txt does not exist.' 
% args.filter2_only_snp_vcf_dir) - exit() - - """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files""" - - - - - """ Read and parse final GATK merged vcf file cyvcf library; Generate a header string from the sample lis fo this merged vcf file""" - - final_merge_anno_file = VCF("%s/Final_vcf_gatk_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir) - - """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ - header_print_string = "Type of SNP at POS > ALT functional=PHAGE_REPEAT_MASK locus_tag=locus_id strand=strand; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos|gene_symbol|product" - for sample in final_merge_anno_file.samples: - # header_print_string = header_print_string + "," + sample - header_print_string = header_print_string + "\t" + sample - header_print_string = header_print_string + "\n" - - """ End """ - - - - - """ Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. 
""" - paste_label_command = "paste %s/unique_positions_file " % args.filter2_only_snp_vcf_dir - paste_indel_label_command = "paste %s/unique_indel_positions_file " % args.filter2_only_snp_vcf_dir - paste_label_command_exclude_outgroup = "paste %s/unique_positions_file " % args.filter2_only_snp_vcf_dir - paste_indel_label_command_exclude_outgroup = "paste %s/unique_indel_positions_file " % args.filter2_only_snp_vcf_dir - - for filename_base in final_merge_anno_file.samples: - if "R1_001_final.fastq.gz" in filename_base: - second_part = filename_base.replace("R1_001_final.fastq.gz", "R2_001_final.fastq.gz") - first_part_split = filename_base.split('R1_001_final.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - elif "_R1.fastq.gz" in filename_base: - second_part = filename_base.replace("_R1.fastq.gz", "_R2.fastq.gz") - first_part_split = filename_base.split('_R1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - # Changed on 03/15/2019 - elif "R1.fastq.gz" in filename_base: - second_part = filename_base.replace("R1.fastq.gz", "R2.fastq.gz") - first_part_split = filename_base.split('R1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - # Changed on 03/15/2019 - first_part = re.sub("_S.*", "", first_part) - elif "1_combine.fastq.gz" in filename_base: - second_part = filename_base.replace("1_combine.fastq.gz", "2_combine.fastq.gz") - first_part_split = filename_base.split('1_combine.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - elif "1_sequence.fastq.gz" in filename_base: - second_part = filename_base.replace("1_sequence.fastq.gz", "2_sequence.fastq.gz") - first_part_split = filename_base.split('1_sequence.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", 
first_part) - elif "_forward.fastq.gz" in filename_base: - second_part = filename_base.replace("_forward.fastq.gz", "_reverse.fastq.gz") - first_part_split = filename_base.split('_forward.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - elif "R1_001.fastq.gz" in filename_base: - second_part = filename_base.replace("R1_001.fastq.gz", "R2_001.fastq.gz") - first_part_split = filename_base.split('R1_001.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - elif "_1.fastq.gz" in filename_base: - second_part = filename_base.replace("_1.fastq.gz", "_2.fastq.gz") - first_part_split = filename_base.split('_1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - elif ".1.fastq.gz" in filename_base: - second_part = filename_base.replace(".1.fastq.gz", ".2.fastq.gz") - first_part_split = filename_base.split('.1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - sample_label_file = "%s/%s_filter2_final.vcf_no_proximate_snp.vcf_positions_label" % ( - args.filter2_only_snp_vcf_dir, first_part) - sample_indel_label_file = "%s/%s_filter2_indel_final.vcf_indel_positions_label" % ( - args.filter2_only_snp_vcf_dir, first_part) - paste_label_command = paste_label_command + sample_label_file + " " - paste_indel_label_command = paste_indel_label_command + sample_indel_label_file + " " - if args.outgroup: - if outgroup not in sample_label_file: - paste_label_command_exclude_outgroup = paste_label_command_exclude_outgroup + sample_label_file + " " - paste_indel_label_command_exclude_outgroup = paste_indel_label_command_exclude_outgroup + sample_indel_label_file + " " - - paste_label_command = paste_label_command + " > %s/All_label_final_ordered.txt" % args.filter2_only_snp_vcf_dir - paste_indel_label_command = paste_indel_label_command + 
" > %s/All_indel_label_final_ordered.txt" % args.filter2_only_snp_vcf_dir - sort_ordered_label_cmd = "sort -n -k1,1 %s/All_label_final_ordered.txt > %s/All_label_final_ordered_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - sort_ordered_indel_label_cmd = "sort -n -k1,1 %s/All_indel_label_final_ordered.txt > %s/All_indel_label_final_ordered_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - if args.outgroup: - paste_label_command_exclude_outgroup = paste_label_command_exclude_outgroup + " > %s/All_label_final_ordered_exclude_outgroup.txt" % args.filter2_only_snp_vcf_dir - paste_indel_label_command_exclude_outgroup = paste_indel_label_command_exclude_outgroup + " > %s/All_indel_label_final_ordered_exclude_outgroup.txt" % args.filter2_only_snp_vcf_dir - sort_ordered_label_cmd_exclude_outgroup = "sort -n -k1,1 %s/All_label_final_ordered_exclude_outgroup.txt > %s/All_label_final_ordered_exclude_outgroup_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - sort_ordered_indel_label_cmd_exclude_outgroup = "sort -n -k1,1 %s/All_indel_label_final_ordered_exclude_outgroup.txt > %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % ( - args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - - - with open('%s/All_label_final_ordered.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_label_command + '\n') - outfile.write(sort_ordered_label_cmd + '\n') - outfile.write(paste_indel_label_command + '\n') - outfile.write(sort_ordered_indel_label_cmd + '\n') - outfile.close() - - os.system("bash %s/All_label_final_ordered.sh" % args.filter2_only_snp_vcf_dir) - - if args.outgroup: - # Just in case if os.system past commands doesn't work - with open('%s/All_label_final_ordered_exclude_outgroup.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_label_command_exclude_outgroup + '\n') - 
outfile.write(sort_ordered_label_cmd_exclude_outgroup + '\n') - outfile.write(paste_indel_label_command_exclude_outgroup + '\n') - outfile.write(sort_ordered_indel_label_cmd_exclude_outgroup + '\n') - outfile.close() - - # Changed: Uncomment this - os.system("bash %s/All_label_final_ordered_exclude_outgroup.sh" % args.filter2_only_snp_vcf_dir) - - """ End: Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. """ - - - - - - - - """ Generate a position_label and position_indel_label dictionary that will contain information about each unique variant position that passed variant filters in any sample and reasons for being filtered out in any sample """ - position_label = OrderedDict() - with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging('Reading All label positions file: %s/All_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label[row[0]] = ','.join(row[1:]) - csv_file.close() - - # #Commented for debugging - position_indel_label = OrderedDict() - with open("%s/All_indel_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_ordered_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - if row[0] not in position_label.keys(): - position_indel_label[row[0]] = ','.join(row[1:]) - else: - position_indel_label[row[0]] = ','.join(row[1:]) - keep_logging('Warning: position %s already present as a SNP' % row[0], - 'Warning: position %s 
already present as a SNP' % row[0], logger, 'info') - csv_file.close() - - """ End: Generate a position_label and position_indel_label dictionary """ - - - - - - """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ """ - mask_fq_mq_positions = [] - mask_fq_mq_positions_outgroup_specific = [] - if args.outgroup: - position_label_exclude_outgroup = OrderedDict() - with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - csv_file.close() - - #Commented for debugging - position_indel_label_exclude_outgroup = OrderedDict() - with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - if row[0] not in position_label_exclude_outgroup.keys(): - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - else: - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - keep_logging('Warning: position %s already present as a SNP' % row[0], - 'Warning: position %s already present as a SNP' % row[0], logger, 'info') - csv_file.close() - - for key in position_label_exclude_outgroup.keys(): - 
label_sep_array = position_label_exclude_outgroup[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - else: - for key in position_label.keys(): - label_sep_array = position_label[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - - fp = open("%s/mask_fq_mq_positions.txt" % (args.filter2_only_snp_vcf_dir), 'w+') - for i in mask_fq_mq_positions: - fp.write(i + '\n') - fp.close() - - fp = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'w+') - for i in mask_fq_mq_positions_outgroup_specific: - fp.write(i + '\n') - fp.close() - - print "Length of mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) - print "Length of mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) - - """ End: Generate mask_fq_mq_positions array """ - - - - - - - - - - - """ Main: Generate SNP Matrix """ - - - """ Open Matrix files to write strings """ - fp_code = open("%s/SNP_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_allele = open("%s/SNP_matrix_allele_outdated.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_allele_new = open("%s/SNP_matrix_allele_new.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_allele_new_phage = open("%s/SNP_matrix_allele_unmasked.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_code.write(header_print_string) - 
fp_allele.write(header_print_string) - fp_allele_new.write(header_print_string) - fp_allele_new_phage.write(header_print_string) - - """ Parse variant positions from the loaded cyvcf VCF object and generate the matrix row information """ - for variants in VCF("%s/Final_vcf_gatk_no_proximate_snp.vcf.gz" % args.filter2_only_snp_vcf_dir): - # Initiate print_string variable to add matrix row information. - # print_string generator no. 1 - print_string = "" - - # Initiate and assign Functional Field filter string => PHAGE/REPEAT/MASK/NULL - functional_field = "" - if str(variants.POS) in phage_positions: - functional_field = functional_field + "PHAGE_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in repetitive_positions: - functional_field = functional_field + "REPEATS_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in mask_positions: - functional_field = functional_field + "MASK" - else: - functional_field = functional_field + "NULL" - - # Initiate variant code string where the code means: - # REF allele = 0, core = 1, Filtered = 2, unmapped = -1, True but non-core = 3 - # This will be used as row information for SNP_matrix_code file - - code_string = position_label[str(variants.POS)] - code_string = code_string.replace('reference_allele', '0') - code_string = code_string.replace('reference_unmapped_position', '-1') - # Changing LowFQ code from 2 to -3 - # Changing HighFQ but LowMQ code from 2 to -4 - code_string = code_string.replace('LowFQ_QUAL_DP_proximate_SNP', '-3') - code_string = code_string.replace('LowFQ_DP_QUAL_proximate_SNP', '-3') - code_string = code_string.replace('LowFQ_QUAL_proximate_SNP', '-3') - code_string = code_string.replace('LowFQ_DP_proximate_SNP', '-3') - code_string = code_string.replace('LowFQ_proximate_SNP', '-3') - code_string = code_string.replace('LowFQ_QUAL_DP', '-3') - code_string = code_string.replace('LowFQ_DP_QUAL', '-3') - code_string = 
code_string.replace('LowFQ_QUAL', '-3') - code_string = code_string.replace('LowFQ_DP', '-3') - code_string = code_string.replace('HighFQ_QUAL_DP_proximate_SNP', '2') - code_string = code_string.replace('HighFQ_DP_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighFQ_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighFQ_DP_proximate_SNP', '2') - code_string = code_string.replace('HighFQ_proximate_SNP', '2') - code_string = code_string.replace('HighFQ_QUAL_DP', '2') - code_string = code_string.replace('HighFQ_DP_QUAL', '2') - code_string = code_string.replace('HighFQ_QUAL', '2') - code_string = code_string.replace('HighFQ_DP', '2') - code_string = code_string.replace('LowFQ', '-3') - code_string = code_string.replace('HighFQ', '-4') - - - if str(variants.POS) in core_positions: - code_string = code_string.replace('VARIANT', '1') - # Adding functional class status code to SNP matrix: 2018-07-24 - elif str(variants.POS) in functional_filter_pos_array: - # Changing Functional class filter code to -2 from 2: 2018-12-04 - code_string = code_string.replace('VARIANT', '-2') - else: - code_string = code_string.replace('VARIANT', '3') - - # Remove this commented section: Deprecated - # Changing SNP type: Date 28/05/2019 - # Assign type of snp: coding / non-coding - # if variants.INFO.get('ANN'): - # if "protein_coding" in variants.INFO.get('ANN'): - # snp_type = "Coding SNP" - # else: - # snp_type = "Non-coding SNP" - # else: - # if len(variants.ALT) > 1 and snp_var_ann_dict[variants.POS]: - # #print variants.ALT - # #print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) - # #print variants.POS - # #print set(snp_var_ann_dict[variants.POS]) - # if "protein_coding" in set(snp_var_ann_dict[variants.POS].split(',')): - # snp_type = "Coding SNP" - # else: - # snp_type = "Non-coding SNP" - # else: - # snp_type = "Non-coding SNP" - # Remove this commented section: Deprecated - - # Annotation Bug fix 2 - # Changing SNP type: Date 28/05/2019 
- if variants.POS in snp_var_ann_dict.keys(): - if snp_var_ann_dict[variants.POS] is not None: - if "protein_coding" in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" not in set(snp_var_ann_dict[variants.POS].split('|')): - snp_type = "Coding SNP" - elif "protein_coding" in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(snp_var_ann_dict[variants.POS].split('|')): - snp_type = "Coding and Non-coding SNP" - elif "protein_coding" not in set(snp_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(snp_var_ann_dict[variants.POS].split('|')): - snp_type = "Non-Coding SNP" - elif "protein_coding" not in set(snp_var_ann_dict[variants.POS].split('|')) and "intragenic_variant" in set(snp_var_ann_dict[variants.POS].split('|')): - snp_type = "Non-Coding SNP" - else: - print set((snp_var_ann_dict[variants.POS].split('|'))) - snp_type = "No_protein_coding/intergenic_region_field_in_ANN SNP" - #print snp_type - else: - keep_logging('Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, 'Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, logger, 'info') - print set((snp_var_ann_dict[variants.POS].split('|'))) - snp_type = "Not Found in Annotated VCF file" - - #print snp_type - - # print_string generator no. 2 - print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT)) + " functional=%s" % functional_field - - # Annotation Bug fix 3 - # Get ANN field from variant INFO column and save it as an array. 
Split and Go through each elements, add bells and whistles - if variants.INFO.get('ANN'): - - ann_array = (variants.INFO.get('ANN')).split(',') - - # Generate tag string before generating ann_string - if len(ann_array) > 1: - # print variants.INFO.get('ANN') - # print list(set(ann_array)) - tag_list = [] - - for i_again in set(snp_var_ann_dict[variants.POS].split(',')): - i_split_again = i_again.split('|') - - - - - if "-" not in i_split_again[4]: - if i_split_again[4] not in tag_list: - tag_list.append(i_split_again[4]) - - else: - split_tags = i_split_again[4].split('-') - for splittagsindividual in split_tags: - if splittagsindividual not in tag_list: - tag_list.append(splittagsindividual) - - if len(tag_list) == 1: - tag = tag_list[0] - - elif len(tag_list) == 2: - tag = str(tag_list[0]) + "-" + str(tag_list[1]) - - elif len(tag_list) > 2: - print tag_list - tag = tag.replace('CHR_START-', '') - tag = tag.replace('-CHR_END', '') - else: - for i in list(set(ann_array)): - i_split = i.split('|') - tag = str(i_split[4]).replace('CHR_START-', '') - tag = str(tag).replace('-CHR_END', '') - - - ann_string = ";" - for i in list(set(ann_array)): - i_split = i.split('|') - #ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";" - - # MOve this tag before this for loop because of multiple tags associated. 
- # tag = str(i_split[4]).replace('CHR_START-', '') - # tag = str(tag).replace('-CHR_END', '') - - if "-" in tag: - #print tag - extra_tags = "" - tag_split = tag.split('-') - for i in tag_split: - if i in locus_tag_to_gene_name.keys(): - extra_tags = extra_tags + locus_tag_to_gene_name[i] + "," - else: - extra_tags = extra_tags + "None" + "," - extra_tags_prot = "" - for i in tag_split: - if i in locus_tag_to_product.keys(): - extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + "," - else: - extra_tags_prot = extra_tags_prot + "None" + "," - ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags, extra_tags_prot]) + ";" - # Changing SNP type: Date 28/05/2019 - elif tag == "": - print "ERROR: Issues with this locus tag. Check this tag in genbank file" - print list(set(ann_array)) - # Adding this so that Ann string is not empty: 30/05/2019 - if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): - extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - else: - print "tag key not found: %s" % tag - extra_tags = "NULL" + "|" + "NULL" - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - # Added 2019-31-05 - if "ERROR_OUT_OF_CHROMOSOME_RANGE" in i: - ann_string = ann_string + '|'.join( - [i_split[0], "intergenic_region", i_split[2], "ERROR_OUT_OF_CHROMOSOME_RANGE", i_split[9], i_split[10], i_split[11], - i_split[13], extra_tags]) + ";" - else: - ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - # Debugging - if i_split[3] == "CD630_00290": - print ann_string - # Changing SNP type: Date 28/05/2019 - else: - if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): - extra_tags = 
str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - else: - print "tag key not found: %s" % tag - extra_tags = "NULL" + "|" + "NULL" - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - - # Annotation Bug fix 4 - # Changing SNP type: Date 28/05/2019 - # Working/Testing - else: - if len(variants.ALT) > 1 and snp_var_ann_dict[variants.POS]: - #print variants.ALT - #print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) - - ann_string = ";%s" % ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) - # Get Tag here; Multiple tag names. - tag_list = [] - - - for i in set(snp_var_ann_dict[variants.POS].split(',')): - i_split = i.split('|') - if i_split[4] not in tag_list: - tag_list.append(i_split[4]) - if len(tag_list) > 1: - tag = str(tag_list[0]) + "-" + str(tag_list[1]) - else: - tag = tag_list[0] - - # if len(set(snp_var_ann_dict[variants.POS].split(','))) > 2: - # print tag - # print set(snp_var_ann_dict[variants.POS].split(',')) - - else: - ann_string = ";None" - - # Annotation Bug fix 5 - # Changing SNP type: Date 28/05/2019 - ann_string = ann_string.replace('ERROR_OUT_OF_CHROMOSOME_RANGE', '%s-%s' % (locus_tag_to_gene_name[last_locus_tag], locus_tag_to_gene_name[first_locus_tag])) - ann_string = ann_string.replace('CHR_END', '%s' % locus_tag_to_gene_name[first_locus_tag]) - - # SNP Matrix Bug - # No changes here: 28/05/2019 - ann_string_split = ann_string.split(';') - #print len(ann_string_split) - if len(ann_string_split) == 3: - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10: - ann_string = ann_string - 
elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10: - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10: - - - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + "|" + 
first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string - - - if len(ann_string_split) > 3: - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - third_allele_ann_string_split = ann_string_split[3].split('|') - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) == 10: - ann_string = ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) == 10: - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] - 
new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + str(ann_string_split[3]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10 and len(third_allele_ann_string_split) == 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + str(ann_string_split[3]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len(third_allele_ann_string_split) > 10: - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] - else: - prod = third_allele_ann_string_split[14] + "|" + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + third_allele_ann_string_split[1] + "|" + third_allele_ann_string_split[2] + "|" + \ - 
third_allele_ann_string_split[4] + "|" + third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + third_allele_ann_string_split[11] + "|" + \ - third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + str(ann_string_split[2]) + new_third_allele_ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10 and len(third_allele_ann_string_split) > 10: - #print ann_string - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + "|" + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + first_allele_ann_string_split[1] + "|" + first_allele_ann_string_split[2] + "|" + first_allele_ann_string_split[4] + "|" + first_allele_ann_string_split[9] + "|" + first_allele_ann_string_split[10] + "|" + first_allele_ann_string_split[11] + "|" + first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + "|" + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + second_allele_ann_string_split[1] + "|" + second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] 
- else: - prod = third_allele_ann_string_split[14] + "|" + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + third_allele_ann_string_split[1] + "|" + third_allele_ann_string_split[2] + "|" + \ - third_allele_ann_string_split[4] + "|" + third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + third_allele_ann_string_split[11] + "|" + \ - third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string + new_third_allele_ann_string - - - # print_string generator no. 3 - - # Annotation Bug fix 6 - # Changing Strandness string: Date 28/05/2019 - # Each Locus ID with a strand information - strandness = " Strand Information: " - if "-" in tag: - tagsplit = tag.split('-') - for i in tagsplit: - if i in locus_tag_to_strand.keys(): - if "," in locus_tag_to_strand[i]: - locus_tag_to_strand_split = locus_tag_to_strand[i].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[i] - strandness = strandness + i + "=" + strand + "/" - else: - if i == "" or i == "None": - strandness = strandness + "NULL=" + "No Strand Information found" + "/" - else: - strandness = strandness + i + "=" + "No Strand Information found" + "/" - else: - if tag in locus_tag_to_strand.keys(): - # strandness = strandness + locus_tag_to_strand[tag] - if "," in locus_tag_to_strand[tag]: - locus_tag_to_strand_split = locus_tag_to_strand[tag].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[tag] - strandness = strandness + tag + "=" + strand - else: - if tag == "" or tag == "None": - strandness = strandness + "NULL=" + "No Strand Information found" - else: - strandness = strandness + tag + "=" + "No Strand Information found" - - # Annotation Bug fix 7 - # Changing tag equals NULL: Date 30/05/2019 - if tag == "" or tag == "None": - tag = "NULL" - - print_string = 
print_string + " locus_tag=" + tag + strandness + ann_string - print_string_phage = print_string - - - - """ Go over each genotype for a variant and generate a gt_string variable """ - gt_string = "" - for gt in variants.gt_bases: - gt = gt.replace('./.', '.') - gt_string = gt_string + "," + gt - gt_string = gt_string.replace('A/A', 'A') - gt_string = gt_string.replace('G/G', 'G') - gt_string = gt_string.replace('C/C', 'C') - gt_string = gt_string.replace('T/T', 'T') - gt_string = gt_string.replace('.', variants.REF) - - - # print_string generator no. 4 - # Replace various seperators that were used in old matrix. Clean up this block of code - final_allele_string = print_string + gt_string.replace(',', '\t') + '\n' - # Replace code at Phage Positions with -2 - if str(variants.POS) in functional_filter_pos_array: - code_string_array = code_string.split(',') - for (i, item) in enumerate(code_string_array): - if item == "0": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "1": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "2": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "3": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "4": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "-1": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "-2": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "-3": - code_string_array[i] = "-2" - for (i, item) in enumerate(code_string_array): - if item == "-4": - code_string_array[i] = "-2" - code_string = ','.join(code_string_array) - - final_code_string = print_string + "\t" + code_string.replace(',', '\t') + '\n' - final_allele_string = final_allele_string.replace(',|', '|') - - final_allele_string = final_allele_string.replace(',;,', 
':::') - final_allele_string = final_allele_string.replace(';,', ':::') - final_code_string = final_code_string.replace(',|', '|') - - - final_code_string = final_code_string.replace(',;,', ':::') - final_code_string = final_code_string.replace(';,', ':::') - final_code_string = final_code_string.replace(';\t\t', ';\t') - final_code_string = final_code_string.replace('\t\t', '\t') - final_allele_string = final_allele_string.replace('\t\t', '\t') - fp_allele.write(final_allele_string) - fp_code.write(final_code_string) - - - - ntd_string = "" - ntd_string_phage = "" - count = 0 - code_string_array = code_string.split(',') - gt_string_array = gt_string[1:].split(',') - - - for i in gt_string_array: - if str(code_string_array[count]) == "0" or str(code_string_array[count]) == "1" or str(code_string_array[count]) == "3": - ntd_string = ntd_string + "\t" + str(i) - ntd_string_phage = ntd_string_phage + "\t" + str(i) - if code_string_array[count] == "-1": - ntd_string = ntd_string + "\t" + "-" - ntd_string_phage = ntd_string_phage + "\t" + "-" - # Changing Functional class filter code to -2 from 2 and replacing variant allele with N: 2018-12-04 - if str(code_string_array[count]) == "2" or str(code_string_array[count]) == "-2" or str(code_string_array[count]) == "-3" or str(code_string_array[count]) == "-4": - - ntd_string = ntd_string + "\t" + "N" - if str(code_string_array[count]) == "2": - ntd_string_phage = ntd_string_phage + "\t" + "N" - if str(code_string_array[count]) == "-2": - ntd_string_phage = ntd_string_phage + "\t" + str(i) - count += 1 - - # Annotation Bug fix 8 - """ Mask Phage positions and LowFQ/MQ positions in SNP_matrix_allele_new.csv. This is the default matrix. 
""" - if str(variants.POS) in functional_filter_pos_array: - ntd_string_array = ntd_string.split('\t') - #print ntd_string_array - ntd_string = "" - for i in ntd_string_array[1:]: - ntd_string = ntd_string + "\t" + "N" - ntd_string_array = ntd_string.split('\t') - #print ntd_string_array - - - if str(variants.POS) in mask_fq_mq_positions: - ntd_string_array = ntd_string.split('\t') - #print ntd_string_array - ntd_string = "" - for i in ntd_string_array[1:]: - ntd_string = ntd_string + "\t" + "N" - ntd_string_array = ntd_string.split('\t') - #print ntd_string_array - - - """ Generate a print_string for each of the matrix - SNP_matrix_allele_new.csv and SNP_matrix_allele_phage.csv """ - print_string = print_string + ntd_string + "\n" - - print_string_phage = print_string_phage + ntd_string_phage + "\n" - - """ This is a hardcoded solution. Find the root cause of these strings getting into the print_strint variable """ - print_string.replace(',;,', '\t') - print_string.replace(';,', '\t') - print_string_phage.replace(',;,', '\t') - print_string_phage.replace(';,', '\t') - - fp_allele_new.write(print_string) - fp_allele_new_phage.write(print_string_phage) - - fp_code.close() - fp_allele.close() - fp_allele_new.close() - fp_allele_new_phage.close() - -###################################### - """ Indel matrix """ - """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ - header_print_string = "Type of SNP at POS > ALT functional=PHAGE_REPEAT_MASK locus_tag=locus_id strand=strand; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos|gene_symbol|product" - final_merge_anno_file = VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir) - for sample in final_merge_anno_file.samples: - # header_print_string = header_print_string + "," + sample - header_print_string = header_print_string + "\t" + sample - header_print_string = header_print_string + "\n" - #header_print_string = header_print_string.replace(':::,', 
':::') - #header_print_string = header_print_string.replace(':::,', '\t') - fp_code = open("%s/Indel_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_allele = open("%s/Indel_matrix_allele.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_code.write(header_print_string) - fp_allele.write(header_print_string) - - # """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" - # mask_fq_mq_positions = [] - # for key in position_indel_label.keys(): - # label_sep_array = position_indel_label[key].split(',') - # for i in label_sep_array: - # if "LowAF" in i: - # if key not in mask_fq_mq_positions: - # mask_fq_mq_positions.append(key) - # if i == "HighAF": - # if key not in mask_fq_mq_positions: - # mask_fq_mq_positions.append(key) - # - # print "Length of indel mask_fq_mq_positions array:%s" % len(mask_fq_mq_positions) - - """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" - mask_fq_mq_positions = [] - mask_fq_mq_positions_outgroup_specific = [] - - if args.outgroup: - position_label_exclude_outgroup = OrderedDict() - with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - csv_file.close() - - position_indel_label_exclude_outgroup = OrderedDict() - with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: 
%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - if row[0] not in position_label_exclude_outgroup.keys(): - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - else: - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - keep_logging('Warning: position %s already present as a SNP' % row[0], - 'Warning: position %s already present as a SNP' % row[0], logger, 'info') - csv_file.close() - for key in position_label_exclude_outgroup.keys(): - label_sep_array = position_label_exclude_outgroup[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - else: - for key in position_label.keys(): - label_sep_array = position_label[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - - - - print "Length of Indel mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) - print "Length of Indel mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) - - - - - - - - for variants in VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir): - print_string = "" - - functional_field = "" - 
if str(variants.POS) in phage_positions: - functional_field = functional_field + "PHAGE_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in repetitive_positions: - functional_field = functional_field + "REPEATS_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in mask_positions: - functional_field = functional_field + "MASK" - else: - functional_field = functional_field + "NULL" - - code_string = position_indel_label[str(variants.POS)] - code_string = code_string.replace('reference_allele', '0') - code_string = code_string.replace('reference_unmapped_position', '-1') - code_string = code_string.replace('LowAF_QUAL_DP_proximate_SNP', '2') - code_string = code_string.replace('LowAF_DP_QUAL_proximate_SNP', '2') - code_string = code_string.replace('LowAF_QUAL_proximate_SNP', '2') - code_string = code_string.replace('LowAF_DP_proximate_SNP', '2') - code_string = code_string.replace('LowAF_proximate_SNP', '2') - code_string = code_string.replace('LowAF_QUAL_DP', '2') - code_string = code_string.replace('LowAF_DP_QUAL', '2') - code_string = code_string.replace('LowAF_QUAL', '2') - code_string = code_string.replace('LowAF_DP', '2') - code_string = code_string.replace('HighAF_QUAL_DP_proximate_SNP', '2') - code_string = code_string.replace('HighAF_DP_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighAF_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighAF_DP_proximate_SNP', '2') - code_string = code_string.replace('HighAF_proximate_SNP', '2') - code_string = code_string.replace('HighAF_QUAL_DP', '2') - code_string = code_string.replace('HighAF_DP_QUAL', '2') - code_string = code_string.replace('HighAF_QUAL', '2') - code_string = code_string.replace('HighAF_DP', '2') - code_string = code_string.replace('LowAF', '-3') - code_string = code_string.replace('HighAF', '-4') - - if str(variants.POS) in indel_core_positions: - code_string = code_string.replace('VARIANT', '1') - # Adding 
functional class status code to SNP matrix: 2018-07-24 - elif str(variants.POS) in functional_filter_pos_array: - # Changing Functional class filter code to -2 from 2: 2018-12-04 - code_string = code_string.replace('VARIANT', '-2') - else: - code_string = code_string.replace('VARIANT', '3') - - - - - # Changing SNP type: Date 28/05/2019 - # Assign type of snp: coding / non-coding - if variants.POS in indel_var_ann_dict.keys(): - if indel_var_ann_dict[variants.POS] is not None: - if "protein_coding" in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" not in set(indel_var_ann_dict[variants.POS].split('|')): - snp_type = "Coding Indel" - elif "protein_coding" in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(indel_var_ann_dict[variants.POS].split('|')): - snp_type = "Coding and Non-coding Indel" - elif "protein_coding" not in set(indel_var_ann_dict[variants.POS].split('|')) and "intergenic_region" in set(indel_var_ann_dict[variants.POS].split('|')): - snp_type = "Non-Coding Indel" - elif "protein_coding" not in set(indel_var_ann_dict[variants.POS].split('|')) and "intragenic_variant" in set(indel_var_ann_dict[variants.POS].split('|')): - snp_type = "Non-Coding Indel" - else: - print set((indel_var_ann_dict[variants.POS].split('|'))) - snp_type = "No_protein_coding/intergenic_region_field_in_ANN SNP" - #print snp_type - else: - keep_logging('Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, 'Warning: position %s not found in snp_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, logger, 'info') - print set((indel_var_ann_dict[variants.POS].split('|'))) - snp_type = "Not Found in Annotated VCF file" - - - - - print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT)) + " functional=%s" % functional_field - - # Get ANN field from variant INFO column and save it as an array. 
Split and Go through each elements, add bells and whistles - if variants.INFO.get('ANN'): - - ann_array = (variants.INFO.get('ANN')).split(',') - - # Generate tag string before generating ann_string - if len(ann_array) > 1: - # print variants.INFO.get('ANN') - # print list(set(ann_array)) - tag_list = [] - - for i_again in set(indel_var_ann_dict[variants.POS].split(',')): - i_split_again = i_again.split('|') - - if "-" not in i_split_again[4]: - if i_split_again[4] not in tag_list: - tag_list.append(i_split_again[4]) - - else: - split_tags = i_split_again[4].split('-') - for splittagsindividual in split_tags: - if splittagsindividual not in tag_list: - tag_list.append(splittagsindividual) - - if len(tag_list) == 1: - tag = tag_list[0] - - elif len(tag_list) == 2: - tag = str(tag_list[0]) + "-" + str(tag_list[1]) - - elif len(tag_list) > 2: - print tag_list - tag = tag.replace('CHR_START-', '') - tag = tag.replace('-CHR_END', '') - else: - for i in list(set(ann_array)): - i_split = i.split('|') - tag = str(i_split[4]).replace('CHR_START-', '') - tag = str(tag).replace('-CHR_END', '') - - ann_string = ";" - for i in list(set(ann_array)): - i_split = i.split('|') - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";" - - # MOve this tag before this for loop because of multiple tags associated. 
- # tag = str(i_split[4]).replace('CHR_START-', '') - # tag = str(tag).replace('-CHR_END', '') - - if "-" in tag: - # print tag - extra_tags = "" - tag_split = tag.split('-') - for i in tag_split: - if i in locus_tag_to_gene_name.keys(): - extra_tags = extra_tags + locus_tag_to_gene_name[i] + "," - else: - extra_tags = extra_tags + "None" + "," - extra_tags_prot = "" - for i in tag_split: - if i in locus_tag_to_product.keys(): - extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + "," - else: - extra_tags_prot = extra_tags_prot + "None" + "," - ann_string = ann_string + '|'.join( - [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], - i_split[13], extra_tags, extra_tags_prot]) + ";" - # Changing SNP type: Date 28/05/2019 - elif tag == "": - print "ERROR: Issues with this locus tag. Check this tag in genbank file" - print list(set(ann_array)) - # Adding this so that Ann string is not empty: 30/05/2019 - if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): - extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - else: - print "tag key not found: %s" % tag - extra_tags = "NULL" + "|" + "NULL" - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - # Added 2019-31-05 - if "ERROR_OUT_OF_CHROMOSOME_RANGE" in i: - ann_string = ann_string + '|'.join( - [i_split[0], "intergenic_region", i_split[2], "ERROR_OUT_OF_CHROMOSOME_RANGE", i_split[9], - i_split[10], i_split[11], - i_split[13], extra_tags]) + ";" - else: - ann_string = ann_string + '|'.join( - [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], - i_split[13], extra_tags]) + ";" - # Debugging - if i_split[3] == "CD630_00290": - print ann_string - # Changing SNP type: Date 28/05/2019 - else: - if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): - extra_tags 
= str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - else: - print "tag key not found: %s" % tag - extra_tags = "NULL" + "|" + "NULL" - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - ann_string = ann_string + '|'.join( - [i_split[0], i_split[1], i_split[2], i_split[3], i_split[9], i_split[10], i_split[11], - i_split[13], extra_tags]) + ";" - - - # Changing SNP type: Date 28/05/2019 - # Working/Testing - else: - if len(variants.ALT) > 1 and indel_var_ann_dict[variants.POS]: - # print variants.ALT - # print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) - - ann_string = ";%s" % ';'.join(set(indel_var_ann_dict[variants.POS].split(','))) - # Get Tag here; Multiple tag names. - tag_list = [] - - for i in set(indel_var_ann_dict[variants.POS].split(',')): - i_split = i.split('|') - if i_split[4] not in tag_list: - tag_list.append(i_split[4]) - if len(tag_list) > 1: - tag = str(tag_list[0]) + "-" + str(tag_list[1]) - else: - tag = tag_list[0] - - # if len(set(snp_var_ann_dict[variants.POS].split(','))) > 2: - # print tag - # print set(snp_var_ann_dict[variants.POS].split(',')) - - else: - ann_string = ";None" - - - # Changing SNP type: Date 28/05/2019 - ann_string = ann_string.replace('ERROR_OUT_OF_CHROMOSOME_RANGE', '%s-%s' % (locus_tag_to_gene_name[last_locus_tag], locus_tag_to_gene_name[first_locus_tag])) - ann_string = ann_string.replace('CHR_END', '%s' % locus_tag_to_gene_name[first_locus_tag]) - - - # SNP Matrix Bug - ann_string_split = ann_string.split(';') - if len(ann_string_split) == 3: - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10: - ann_string = ann_string - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10: - 
if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10: - - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + 
first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string - - - if len(ann_string_split) > 3: - - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - third_allele_ann_string_split = ann_string_split[3].split('|') - - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) == 10: - ann_string = ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) == 10: - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + 
first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + str(ann_string_split[3]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10 and len( - third_allele_ann_string_split) == 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + str(ann_string_split[3]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) > 10: - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] - else: - prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ - third_allele_ann_string_split[1] + "|" + \ - third_allele_ann_string_split[2] + "|" + \ - 
third_allele_ann_string_split[4] + "|" + \ - third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + \ - third_allele_ann_string_split[11] + "|" + \ - third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + str(ann_string_split[2]) + new_third_allele_ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10 and len( - third_allele_ann_string_split) > 10: - # print ann_string - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = 
third_allele_ann_string_split[3] + third_allele_ann_string_split[15] - else: - prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ - third_allele_ann_string_split[1] + "|" + \ - third_allele_ann_string_split[2] + "|" + \ - third_allele_ann_string_split[4] + "|" + \ - third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + \ - third_allele_ann_string_split[11] + "|" + \ - third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string + new_third_allele_ann_string - - # print ann_string - - # # JUST FOR THE SAKE OF DEBUGGING - # ann_string_split = ann_string.split(';') - # for i in ann_string_split: - # if len(i.split('|')) != 10 and len(i.split('|')) != 1: - # print ann_string - - # Changing Strandness string: Date 28/05/2019 - # Each Locus ID with a strand information - strandness = " Strand Information: " - if "-" in tag: - tagsplit = tag.split('-') - for i in tagsplit: - if i in locus_tag_to_strand.keys(): - if "," in locus_tag_to_strand[i]: - locus_tag_to_strand_split = locus_tag_to_strand[i].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[i] - strandness = strandness + i + "=" + strand + "/" - else: - if i == "" or i == "None": - strandness = strandness + "NULL=" + "No Strand Information found" + "/" - else: - strandness = strandness + i + "=" + "No Strand Information found" + "/" - else: - if tag in locus_tag_to_strand.keys(): - # strandness = strandness + locus_tag_to_strand[tag] - if "," in locus_tag_to_strand[tag]: - locus_tag_to_strand_split = locus_tag_to_strand[tag].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[tag] - strandness = strandness + tag + "=" + strand - else: - if tag == "" or tag == "None": - strandness = strandness + "NULL=" + "No Strand Information 
found" - else: - strandness = strandness + tag + "=" + "No Strand Information found" - - - # Changing tag equals NULL: Date 30/05/2019 - if tag == "" or tag == "None": - tag = "NULL" - - print_string = print_string + " locus_tag=" + tag + strandness + ann_string - - gt_string = "" - for gt in variants.gt_bases: - gt = gt.replace('./.', '.') - if "/" in gt: - gt_split = gt.split('/') - gt = gt_split[1] - gt_string = gt_string + "," + gt - gt_string = gt_string.replace('.', variants.REF) - - """Replacing Phage/Functional filter position code""" - if str(variants.POS) in functional_filter_pos_array: - code_string_array = code_string.split(',') - code_string = "" - for i in code_string_array: - code_string = code_string + "," + "-2" - - final_allele_string = print_string + gt_string.replace(',', '\t') + '\n' - final_code_string = print_string + "\t" + code_string.replace(',', '\t') + '\n' - final_allele_string = final_allele_string.replace(',|', '|') - # final_allele_string = final_allele_string.replace(',;,', ':::') - # final_allele_string = final_allele_string.replace(';,', ':::') - final_allele_string = final_allele_string.replace(',;,', ':::') - final_allele_string = final_allele_string.replace(';,', ':::') - final_code_string = final_code_string.replace(',|', '|') - # final_code_string = final_code_string.replace(',;,', ':::') - # final_code_string = final_code_string.replace(';,', ':::') - final_code_string = final_code_string.replace(',;,', ':::') - final_code_string = final_code_string.replace(';,', ':::') - final_code_string = final_code_string.replace('\t\t', '\t') - final_allele_string = final_allele_string.replace('\t\t', '\t') - fp_allele.write(final_allele_string) - fp_code.write(final_code_string) - fp_code.close() - fp_allele.close() - -def core_prep_snp(core_vcf_fasta_dir): - """ Generate SNP Filter Label Matrix """ - generate_paste_command() - - generate_paste_command_outgroup() - - """ Generate different list of Positions from the 
**All_label_final_sorted_header.txt** SNP position label data matrix. """ - generate_position_label_data_matrix() - - """ Generate VCF files from final list of variants in Only_ref_variant_positions_for_closely; generate commands for consensus generation """ - generate_vcf_files() - - """ Generate consensus fasta file from core vcf files """ - extract_only_ref_variant_fasta_from_reference() - - """ Generate consensus fasta file with only reference and variant position bases """ - extract_only_ref_variant_fasta(core_vcf_fasta_dir) - - # """ Analyze the positions that were filtered out only due to insufficient depth""" - # DP_analysis() - -def core_prep_indel(core_vcf_fasta_dir): - """ Generate SNP Filter Label Matrix """ - generate_indel_paste_command() - - generate_indel_paste_command_outgroup() - - """ Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. """ - generate_indel_position_label_data_matrix() - -""" report methods """ -def alignment_report(data_matrix_dir): - keep_logging('Generating Alignment report...', 'Generating Alignment report...', logger, 'info') - varcall_dir = os.path.dirname(args.results_dir) - print varcall_dir - report_string = "" - header = "Sample,QC-passed reads,Mapped reads,% mapped reads,mean depth,%_bases_above_5,%_bases_above_10,%_bases_above_15,unmapped_positions,READ_PAIR_DUPLICATES,READ_PAIR_OPTICAL_DUPLICATES,unmapped reads,% unmapped reads" - fp = open("%s/Report_alignment.txt" % (data_matrix_dir), 'w+') - fp.write(header + '\n') - for vcf in vcf_filenames: - sample = os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) - #print sample - report_string = sample + "," - qc = (subprocess.check_output("grep \'QC-passed\' %s/%s/%s_alignment_stats | sed \'s/ + 0 in total (QC-passed reads + QC-failed reads)//g\'" % (varcall_dir, sample, sample), shell=True)).strip() - mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk 
-F\' \' \'{print $1}\'" % (varcall_dir, sample, sample), shell=True)).strip() - replace = "%:-nan%)" - perc_mapped = (subprocess.check_output("grep \'mapped (\' %s/%s/%s_alignment_stats | awk -F\' \' \'{print $5}\' | sed \'s/%s//g\' | sed \'s/(//g\'" % (varcall_dir, sample, sample, replace), shell=True)).strip() - depth_of_coverage = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==2{print $3,$7,$8,$9}\' %s/%s/%s_depth_of_coverage.sample_summary" % (varcall_dir, sample, sample), shell=True)).strip() - unmapped_positions = (subprocess.check_output("wc -l %s/%s/%s_unmapped.bed_positions | cut -d\' \' -f1" % (varcall_dir, sample, sample), shell=True)).strip() - opt_dup = (subprocess.check_output("awk -F\'\\t\' \'{OFS=\",\"};FNR==8{print $7,$8,$5}\' %s/%s/%s_markduplicates_metrics" % (varcall_dir, sample, sample), shell=True)).strip() - perc_unmapped = str(100 - float(perc_mapped)) - myList = ','.join(map(str, (sample, qc, mapped, perc_mapped, depth_of_coverage, unmapped_positions, opt_dup, perc_unmapped))) - #print myList - fp.write(myList + '\n') - fp.close() - keep_logging('Alignment report can be found in %s/Report_alignment.txt' % data_matrix_dir, 'Alignment report can be found in %s/Report_alignment.txt' % data_matrix_dir, logger, 'info') - -def variant_report(data_matrix_dir): - keep_logging('Generating Variants report...', 'Generating Variants report...', logger, 'info') - varcall_dir = os.path.dirname(os.path.abspath(args.results_dir)) - report_string = "" - header = "Sample,Total Unique Variants,core SNPs,unmapped_positions,reference_allele,true_variant,Only_low_FQ,Only_DP,Only_low_MQ,other,unmapped_positions_perc,true_variant_perc,Only_low_FQ_perc,Only_DP_perc,Only_low_MQ_perc,other_perc" - fp = open("%s/Report_variants.txt" % (data_matrix_dir), 'w+') - fp.write(header + '\n') - - for vcf in vcf_filenames: - sample = os.path.basename(vcf.replace('_filter2_final.vcf_no_proximate_snp.vcf', '')) - report_string = sample + "," - unmapped_positions = 
(subprocess.check_output("wc -l %s/core_temp_dir/unique_positions_file | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() - core_snps = (subprocess.check_output("wc -l %s/core_temp_dir/Only_ref_variant_positions_for_closely | cut -d\' \' -f1" % (varcall_dir), shell=True)).strip() - filtered_snp_count = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_counts.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() - filtered_snp_perc = (subprocess.check_output("grep -w \'^%s\' %s/core_temp_dir/bargraph_percentage.txt | awk -F\'\\t\' \'{OFS=\",\"};{print $2,$3,$4,$5,$6,$7}\'" % (sample, varcall_dir), shell=True)).strip() - myList = ','.join(map(str, (sample, unmapped_positions, core_snps, filtered_snp_count, filtered_snp_perc))) - fp.write(myList + '\n') - fp.close() - keep_logging('Variant call report can be found in %s/Report_variants.txt' % data_matrix_dir, 'Variant call report can be found in %s/Report_variants.txt' % data_matrix_dir, logger, 'info') - -def gubbins(gubbins_dir, input_fasta, jobrun, logger, Config): - keep_logging('\nRunning Gubbins on input: %s\n' % input_fasta, '\nRunning Gubbins on input: %s\n' % input_fasta, - logger, - 'info') - - - call("module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins", logger) - #os.system("module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins") - #gubbins_cmd = "%s/%s --prefix %s/%s %s" % ( - # ConfigSectionMap("gubbins", Config)['gubbins_bin'], ConfigSectionMap("gubbins", Config)['base_cmd'], gubbins_dir, - # (os.path.basename(input_fasta)).replace('.fa', ''), input_fasta) - - load_module = "module load bioperl python-anaconda2/201607 biopython dendropy reportlab fasttree RAxML fastml/gub gubbins" - gubbins_cmd = "%s --threads 6 --prefix %s/%s %s" % ( - ConfigSectionMap("gubbins", Config)['base_cmd'], gubbins_dir, - 
(os.path.basename(input_fasta)).replace('.fa', ''), input_fasta) - keep_logging('\nRunning Gubbins on: %s' % input_fasta, '\nRunning Gubbins: %s\n' % input_fasta, - logger, - 'info') - - keep_logging('Running: %s' % gubbins_cmd, '%s' % gubbins_cmd, logger, 'info') - if jobrun == "parallel-local" or jobrun == "local": - call("cd %s" % gubbins_dir, logger) - call(gubbins_cmd, logger) - elif jobrun == "cluster": - call("cd %s" % gubbins_dir, logger) - call(gubbins_cmd, logger) - elif jobrun == "parallel-cluster": - job_file_name = "%s/gubbins_%s.pbs" % (gubbins_dir, os.path.basename(input_fasta)) - job_name = os.path.basename(job_file_name) - job_print_string = "#PBS -N %s\n#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=12,mem=47000mb,walltime=250:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\ncd %s\n%s\n%s" % (job_name, ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account'], gubbins_dir, load_module, gubbins_cmd) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("qsub %s" % job_file_name) - call("qsub %s" % job_file_name, logger) - -def get_outgroup(): - """ - Prepare Outgroup Sample name from the argument. 
- """ - if args.outgroup: - if "R1_001_final.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('R1_001_final.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "_R1.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('_R1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "R1.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('R1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) - outgroup = re.sub("_S.*", "", first_part) - - elif "1_combine.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('1_combine.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "1_sequence.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('1_sequence.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "_forward.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('_forward.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "R1_001.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('R1_001.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif "_1.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('_1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - elif ".1.fastq.gz" in args.outgroup: - first_part_split = args.outgroup.split('.1.fastq.gz') - first_part = first_part_split[0].replace('_L001', '') - outgroup = re.sub("_S.*_", "", first_part) - - keep_logging( - 'Using %s as Outgroup Sample Name' % outgroup, - 'Using 
%s as Outgroup Sample Name' % outgroup, - logger, 'info') - - return outgroup - else: - keep_logging('Outgroup Sample Name not provided\n', 'Outgroup Sample Name not provided\n', logger, 'info') - outgroup = "" - -def mask_fq_mq_positions_specific_to_outgroup(): - """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" - mask_fq_mq_positions = [] - mask_fq_mq_positions_outgroup_specific = [] - if args.outgroup: - position_label_exclude_outgroup = OrderedDict() - with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - csv_file.close() - - position_indel_label_exclude_outgroup = OrderedDict() - with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - if row[0] not in position_label_exclude_outgroup.keys(): - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - else: - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - keep_logging('Warning: position %s already present as a SNP' % row[0], - 'Warning: position %s already present as a SNP' % row[0], logger, 
'info') - csv_file.close() - for key in position_label_exclude_outgroup.keys(): - label_sep_array = position_label_exclude_outgroup[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - - fp = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'w+') - for i in mask_fq_mq_positions_outgroup_specific: - fp.write(i + '\n') - fp.close() - print "Length of mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) - - outgroup = get_outgroup() - fqmqpositionsspecifictooutgroup = [] - - fopen = open("%s/mask_fq_mq_positions_outgroup_specific.txt" % (args.filter2_only_snp_vcf_dir), 'r+') - for i in fopen: - i = i.strip() - fqmqpositionsspecifictooutgroup.append(i) - fopen.close() - - print "Length of low MQ/FQ positions specific to outgroup: %s" % len(fqmqpositionsspecifictooutgroup) - - vcf_filename_unmapped = "%s/%s_ref_allele_unmapped_masked.vcf" % (args.filter2_only_snp_vcf_dir, outgroup) - - fp = open("%s/%s_ref_allele_unmapped_masked.vcf" % (args.filter2_only_snp_vcf_dir, outgroup), 'w+') - - vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % outgroup - fp.write(vcf_header) - - for variants in VCF("%s/%s_ref_allele_unmapped.vcf.gz" % (args.filter2_only_snp_vcf_dir, outgroup)): - print_string = "" - if str(variants.POS) in fqmqpositionsspecifictooutgroup: - print_string_array = [str(variants.CHROM), str(variants.POS), '.', str(variants.REF), 'N', '221.999', - '.', '.', '.', 
'.', '.'] - - - else: - print_string_array = [str(variants.CHROM), str(variants.POS), '.', str(variants.REF), - str(variants.ALT[0]), '221.999', '.', '.', '.', '.', '.'] - print_string = '\t'.join(print_string_array) - fp.write(print_string + '\n') - fp.close() - base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + \ - ConfigSectionMap("vcftools", Config)[ - 'vcftools_bin'] - bgzip_cmd = "%s/%s/bgzip -f %s\n" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], - vcf_filename_unmapped) - - tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], - vcf_filename_unmapped) - - fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % ( - args.reference, base_vcftools_bin, vcf_filename_unmapped, outgroup) - - # print bgzip_cmd - # print tabix_cmd - # print fasta_cmd - - subprocess.call([bgzip_cmd], shell=True) - subprocess.call([tabix_cmd], shell=True) - subprocess.call([fasta_cmd], shell=True) - sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (outgroup, outgroup) - subprocess.call([sed_command], shell=True) - # print sed_command - - - else: - for key in position_label.keys(): - label_sep_array = position_label[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - - fp = open("%s/mask_fq_mq_positions.txt" % (args.filter2_only_snp_vcf_dir), 'w+') - for i in mask_fq_mq_positions: - fp.write(i + '\n') - fp.close() - - print "Length of mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) - -""" -Pending inclusion - -class FuncThread(threading.Thread): - def __init__(self, target, *args): - self._target = target - self._args = args - threading.Thread.__init__(self) - def 
run(self): - self._target(*self._args) - -def someOtherFunc(data, key): - print "someOtherFunc was called : data=%s; key=%s" % (str(data), str(key)) - -Pending inclusion -""" - - - -if __name__ == '__main__': - - """ - Main Function for Variant Calling Core Pipeline - :param: - :return: - - This function runs "core_prep" step to generate intermediate files required for extracting core variants at "core" step. - Using these core variants, a "report" step will generate the final reports and output results of the pipeline as well as runs "tree" step to generate fasttree and raxml results - using the core variants consensus in Date_Time_core_results folder. - Steps: - 1. core_prep - 2. core - 3. report - 4. tree - """ - - # Start Timer to use it for generating folder names and Log prefixes. - start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - start_time_2 = datetime.now() - log_unique_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - global logger - analysis_name_log = "step_" + str(args.steps) - logger = generate_logger(args.filter2_only_snp_vcf_dir, analysis_name_log, log_unique_time) - keep_logging('\nThe Script started at: %s' % start_time, '\nThe Script started at: %s' % start_time, logger, 'info') - print_details = "This step will parse final vcf files(*_no_proximate_snp.vcf) generated at the end of Variant Calling Pipeline. At the end of this step, the following results will be generated and placed in output directory:\n\n" \ - "1. Final Core SNP Positions list(Variant positions that were not filtered out in any of the samples and passed all the filters)\n" \ - "2. SNP Positions that were filtered out with labels indicating the reason (Depth, FQ, MQ, Unmapped in one or other samples, Proximate SNPS, Quality of Variant) why they were filtered out.\n" \ - "3. Barplot Statistics about the filtered variants and their reason for getting filtered.\n" \ - "4. 
Final Consensus fasta file using only Core SNP Positions\n" - keep_logging('%s' % print_details, '%s' % print_details, logger, 'info') - - # Create temporary Directory core_temp_dir/temp for storing temporary intermediate files. Check if core_temp_dir contains all the required files to run these pipeline. - global temp_dir - temp_dir = args.filter2_only_snp_vcf_dir + "/temp" - - # Read Config file into Config object that will be used to extract configuration settings set up in config file. - global config_file - if args.config: - config_file = args.config - else: - config_file = os.path.dirname(os.path.abspath(__file__)) + "/config" - global Config - Config = ConfigParser.ConfigParser() - Config.read(config_file) - keep_logging('Path to config file: %s' % config_file, 'Path to config file: %s' % config_file, logger, 'info') - - make_sure_path_exists(temp_dir) - - # Get outgroup_Sample name - outgroup = get_outgroup() - outgroup_vcf_filename = str(outgroup) + "_filter2_final.vcf_no_proximate_snp.vcf" - outgroup_indel_vcf_filename = str(outgroup) + "_filter2_indel_final.vcf" - - # Read filenames. Core variants and final results will be extracted considering only these files. 
- filter2_only_snp_vcf_filenames = args.filter2_only_snp_vcf_filenames - vcf_filenames_temp = [] - vcf_filenames_temp_outgroup = [] - - with open(filter2_only_snp_vcf_filenames) as fp: - for line in fp: - line = line.strip() - line = args.filter2_only_snp_vcf_dir + line - vcf_filenames_temp.append(line) - if args.outgroup: - if "%s_filter2_final.vcf_no_proximate_snp.vcf" % outgroup not in line: - vcf_filenames_temp_outgroup.append(line) - fp.close() - vcf_filenames = sorted(vcf_filenames_temp) - vcf_filenames_outgroup = sorted(vcf_filenames_temp_outgroup) - - make_sure_files_exists(vcf_filenames, Config, logger) - - log_file_handle = "%s/%s_%s.log.txt" % (args.filter2_only_snp_vcf_dir, log_unique_time, analysis_name_log) - - # Start Variant Calling Core Pipeline steps based on steps argument supplied. - if "1" in args.steps: - """ - core_prep step - """ - - # Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - keep_logging('Gathering SNP position information from each final *_no_proximate_snp.vcf file...', 'Gathering SNP position information from each final *_no_proximate_snp.vcf file...', logger, 'info') - - core_prep_label(vcf_filenames, args.filter2_only_snp_vcf_dir, args.outgroup, args.reference, log_unique_time, log_file_handle, logger, args.jobrun, Config) - - if "2" in args.steps: - """ - core step - """ - - # Set variables; check if the output from core_prep steps (*label files) exists and was completed without any errors. 
- snp_unique_positions_file = args.filter2_only_snp_vcf_dir + "/unique_positions_file" - indel_unique_positions_file = args.filter2_only_snp_vcf_dir + "/unique_indel_positions_file" - uniq_snp_positions = sum(1 for line in open('%s' % snp_unique_positions_file)) - uniq_indel_positions = sum(1 for line in open('%s' % indel_unique_positions_file)) - if not os.path.isfile(snp_unique_positions_file) and not os.path.isfile(indel_unique_positions_file): - keep_logging('Error finding unique_positions_file/unique_indel_positions_file. Please rerun core_prep step.','Error finding unique_positions_file/unique_indel_positions_file. Please rerun core_prep step.', logger,'exception') - exit() - - make_sure_label_files_exists(vcf_filenames, uniq_snp_positions, uniq_indel_positions, Config, logger) - - # Set up Report and results directories to transfer the final results. - data_matrix_dir = args.results_dir + '/data_matrix' - core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' - make_sure_path_exists(data_matrix_dir) - make_sure_path_exists(core_vcf_fasta_dir) - - functional_class_filter_positions = "%s/Functional_class_filter_positions.txt" % args.filter2_only_snp_vcf_dir - - global outgroup_specific_positions - global outgroup_indel_specific_positions - - # Get outgroup specific variant positions - if args.outgroup: - f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - - outgroup_indel_specific_positions = [] - for i in f_outgroup: - i = i.strip() - outgroup_indel_specific_positions.append(int(i)) - f_outgroup.close() - - f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - - outgroup_specific_positions = [] - for i in f_outgroup: - i = i.strip() - outgroup_specific_positions.append(int(i)) - f_outgroup.close() - - print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) - print "No. 
of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) - else: - - outgroup_indel_specific_positions = [] - outgroup_specific_positions = [] - print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) - print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) - - # Run core steps. Generate SNP and data Matrix results. Extract core SNPS and consensus files. - core_prep_indel(core_vcf_fasta_dir) - - core_prep_snp(core_vcf_fasta_dir) - - # Moving this up before core_prep_snp; for some weird reason, it is failing to generate Only_ref_indel - #core_prep_indel(core_vcf_fasta_dir) - - # Annotate core variants. Generate SNP and Indel matrix. - annotated_snp_matrix() - - # Read new allele matrix and generate fasta; generate a seperate function - keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') - - create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) - - #extract_only_ref_variant_fasta_from_reference_allele_variant() - - mask_fq_mq_positions_specific_to_outgroup() - - call("cp %s %s/Logs/core/" % ( - log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) - - if "3" in args.steps: - """ - report step - """ - - # Get outgroup_Sample name - outgroup = get_outgroup() - - keep_logging('Step 3: Generate Reports and Results folder.', 'Step 3: Generate Reports and Results folder.', logger, 'info') - - ## Temporary fix. 
A bug was introduced that is causing the pipeline to generate *vcf_no_proximate_snp.vcf_filter2_consensus.fa - call("rm %s/*vcf_no_proximate_snp.vcf_filter2_consensus.fa" % args.filter2_only_snp_vcf_dir, logger) - - # Generate DP barplots data and Analyze the FQ values of all the unique variant - # DP_analysis_barplot() - # FQ_analysis() - - # Set up Report and results directories to transfer the final results. - # Set up Report and results directories to transfer the final results. - data_matrix_dir = args.results_dir + '/data_matrix' - core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' - make_sure_path_exists(args.results_dir) - make_sure_path_exists(data_matrix_dir) - make_sure_path_exists(core_vcf_fasta_dir) - data_matrix_dir = args.results_dir + '/data_matrix' - data_matrix_snpeff_dir = data_matrix_dir + '/snpEff_results' - core_vcf_fasta_dir = args.results_dir + '/core_snp_consensus' - consensus_var_dir = core_vcf_fasta_dir + '/consensus_variant_positions' - core_vcf_dir = core_vcf_fasta_dir + '/core_vcf' - consensus_allele_var_dir = core_vcf_fasta_dir + '/consensus_allele_variant_positions' - consensus_ref_allele_var_dir = core_vcf_fasta_dir + '/consensus_ref_allele_variant_positions' - consensus_ref_var_dir = core_vcf_fasta_dir + '/consensus_ref_variant_positions' - consensus_ref_allele_unmapped_variant_dir = core_vcf_fasta_dir + '/consensus_ref_allele_unmapped_variant' - make_sure_path_exists(data_matrix_dir) - make_sure_path_exists(data_matrix_snpeff_dir) - make_sure_path_exists(core_vcf_fasta_dir) - make_sure_path_exists(consensus_var_dir) - make_sure_path_exists(core_vcf_dir) - make_sure_path_exists(consensus_allele_var_dir) - #make_sure_path_exists(consensus_ref_allele_var_dir) - make_sure_path_exists(consensus_ref_var_dir) - make_sure_path_exists(consensus_ref_allele_unmapped_variant_dir) - reference_base = os.path.basename(args.reference).split('.')[0] - # Move results to the results directory - move_data_matrix_results = "cp -r 
%s/unique_positions_file %s/unique_indel_positions_file %s/*.csv %s/*.txt %s/temp_* %s/All* %s/Only* %s/*.R %s/R_scripts/generate_diagnostics_plots.R %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, os.path.dirname(os.path.abspath(__file__)), data_matrix_dir) - #move_core_vcf_fasta_results = "cp %s/*_core.vcf.gz %s/*.fa %s/*_variants.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) - move_core_vcf_fasta_results = "mv %s/*_core.vcf.gz* %s/*_ANN* %s/*.fa %s/" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, core_vcf_fasta_dir) - - - move_consensus_var_fasta_results = "mv %s/*_variants.fa %s/" % (core_vcf_fasta_dir, consensus_var_dir) - move_consensus_ref_var_fasta_results = "mv %s/*.fa %s/" % (core_vcf_fasta_dir, consensus_ref_var_dir) - move_core_vcf = "mv %s/*_core.vcf.gz %s/*vcf_core.vcf.gz.tbi %s/" % (core_vcf_fasta_dir, core_vcf_fasta_dir, core_vcf_dir) - move_consensus_allele_var_fasta_results = "mv %s/*allele_variants.fa %s/" % (consensus_var_dir, consensus_allele_var_dir) - remove_ref_allele = "rm %s/*_ref_allele_variants.fa" % consensus_allele_var_dir - #move_consensus_ref_allele_var_fasta_results = "mv %s/*_ref_allele_variants.fa %s/" % (consensus_allele_var_dir, consensus_ref_allele_var_dir) - move_consensus_ref_allele_unmapped_var_fasta_results = "mv %s/*_ref_allele_unmapped_variants.fa %s/" % (consensus_var_dir, consensus_ref_allele_unmapped_variant_dir) - move_snpeff_results = "mv %s/*ANN* %s/" % (data_matrix_dir, data_matrix_snpeff_dir) - move_snpeff_vcf_results = "mv %s/*ANN* %s/" % (core_vcf_fasta_dir, data_matrix_snpeff_dir) - copy_reference = "cp %s %s/%s.fa" % (args.reference, consensus_ref_var_dir, reference_base) - 
#copy_reference_2 = "cp %s %s/%s.fa" % (args.reference, consensus_ref_allele_var_dir, reference_base) - - call("%s" % move_data_matrix_results, logger) - call("%s" % move_core_vcf_fasta_results, logger) - call("%s" % move_consensus_var_fasta_results, logger) - call("%s" % move_consensus_ref_var_fasta_results, logger) - call("%s" % move_core_vcf, logger) - call("%s" % move_consensus_allele_var_fasta_results, logger) - call("%s" % remove_ref_allele, logger) - #call("%s" % move_consensus_ref_allele_var_fasta_results, logger) - call("%s" % move_consensus_ref_allele_unmapped_var_fasta_results, logger) - call("%s" % copy_reference, logger) - #call("%s" % copy_reference_2, logger) - call("%s" % move_snpeff_results, logger) - call("%s" % move_snpeff_vcf_results, logger) - subprocess.call(["sed -i 's/title_here/%s/g' %s/generate_diagnostics_plots.R" % (os.path.basename(args.results_dir), data_matrix_dir)], shell=True) - - # Sanity Check if the variant consensus files generated are of same length - count = 0 - for line in open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % data_matrix_dir).xreadlines(): - count += 1 - ref_variants = count - 1 - variant_consensus_files = glob.glob("%s/*_variants.fa" % core_vcf_fasta_dir) - for f in variant_consensus_files: - cmd2 = "%s/%s/bioawk -c fastx '{ print length($seq) }' < %s" % ( - ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'], f) - proc = subprocess.Popen([cmd2], stdout=subprocess.PIPE, shell=True) - (out2, err2) = proc.communicate() - - try: - int(out2) != int(ref_variants) - except OSError as exception: - if exception.errno != errno.EEXIST: - keep_logging('Error generating variant consensus position file: %s' % f, - 'Error generating variant consensus position file: %s' % f, logger, 'info') - keep_logging('Error generating variant consensus position file: %s' % f, - 'Error generating variant consensus position file: %s' % f, logger, 'exception') - exit() - - # Move and 
organize data_matrix_dir directory - os.chdir(data_matrix_dir) - plots_dir = "%s/plots" % data_matrix_dir - matrices_dir = "%s/matrices" % data_matrix_dir - functional_ann_dir = "%s/Functional_annotation_results" % data_matrix_dir - logs_dir = "%s/logs" % data_matrix_dir - make_sure_path_exists(plots_dir) - make_sure_path_exists(matrices_dir) - make_sure_path_exists(functional_ann_dir) - make_sure_path_exists(logs_dir) - call("mv *.log.txt %s" % logs_dir, logger) - call("mv summary.txt detail.txt Functional_class_filter_positions.txt inexact_repeat_region_positions.txt phage_region_positions.txt repeat_region_positions.txt %s" % functional_ann_dir, logger) - call("mv temp_* All* Only* SNP_matrix_* Indel* extract_DP_positions.txt header.txt unique_indel_positions_file unique_positions_file %s" % matrices_dir, logger) - call("mv annotated_no_proximate_snp_* %s/snpEff_results/" % data_matrix_dir, logger) - call("mv bargraph* generate_diagnostics_plots.R %s" % plots_dir, logger) - call("cp %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt %s/" % (matrices_dir, plots_dir), logger) - - # """ Generate alignment report """ - # alignment_report(data_matrix_dir) - # - # """ Generate core snps report """ - # variant_report(data_matrix_dir) - - """ Generating Gubbins MFA files""" - reference_base = os.path.basename(args.reference).split('.')[0] - gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' - - make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) - - - prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_allele_var_consensus_input = 
"%s/gubbins/%s_%s_noncore_plus_core_variants_aln.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - #prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) - prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - - prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_var_consensus_input) - prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % (args.results_dir, prepare_var_consensus_input) - prepare_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_allele_variant_positions/*_allele_variants.fa > %s" % ( - args.results_dir, prepare_allele_var_consensus_input) - #prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) - prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) - call("%s" % prepare_ref_var_consensus_input_cmd, logger) - call("%s" % prepare_var_consensus_input_cmd, logger) - call("%s" % prepare_allele_var_consensus_input_cmd, logger) - #call("%s" % prepare_ref_allele_var_consensus_input_cmd, logger) - call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger) - # os.system(prepare_ref_var_consensus_input_cmd) - # os.system(prepare_var_consensus_input_cmd) - - print_details = "Results for core pipeline can 
be found in: %s\n" \ - "Description of Results:\n" \ - "1. data_matrix folder contains all the data matrices and other temporary files generated during the core pipeline. bargraph_counts.txt and bargraph_percentage.txt: contains counts/percentage of unique positions filtered out due to different filter parameters for each sample. Run bargraph.R to plot bargraph statistics." \ - "2. core_snp_consensus contains all the core vcf and fasta files. *_core.vcf.gz: core vcf files, *.fa and *_variants.fa: core consensus fasta file and core consensus fasta with only variant positions." % (args.results_dir) - keep_logging(print_details, print_details, logger, 'info') - - call("cp %s %s/Logs/report/" % ( - log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) - - if "4" in args.steps: - """ - Gubbins/Raxml step - """ - - - keep_logging('Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', 'Step 4: Run Gubbins on core alignments and generate iqtree/RaxML trees.', logger, 'info') - - #parse_phaster(args.reference) - reference_base = os.path.basename(args.reference).split('.')[0] - gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' - - make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) - - - prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_ref_allele.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_var_consensus_input = "%s/gubbins/%s_%s_core_var_aln.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_noncore_plus_core_variants_aln.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - #prepare_ref_allele_var_consensus_input = 
"%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''),reference_base) - prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_genome_aln_w_alt_allele_unmapped.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - - prepare_ref_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_var_consensus_input) - prepare_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_variant_positions/*_variants.fa > %s" % (args.results_dir, prepare_var_consensus_input) - prepare_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_allele_variant_positions/*_allele_variants.fa > %s" % ( - args.results_dir, prepare_allele_var_consensus_input) - #prepare_ref_allele_var_consensus_input_cmd = "cat %s/core_snp_consensus/consensus_ref_allele_variant_positions/*.fa > %s" % (args.results_dir, prepare_ref_allele_var_consensus_input) - prepare_ref_allele_unmapped_consensus_input_cmd = "cat %s %s/core_snp_consensus/consensus_ref_allele_unmapped_variant/*.fa > %s" % (args.reference, args.results_dir, prepare_ref_allele_unmapped_consensus_input) - call("%s" % prepare_ref_var_consensus_input_cmd, logger) - call("%s" % prepare_var_consensus_input_cmd, logger) - call("%s" % prepare_allele_var_consensus_input_cmd, logger) - call("%s" % prepare_ref_allele_unmapped_consensus_input_cmd, logger) - - - if args.gubbins and args.gubbins == "yes": - os.chdir(gubbins_dir) - if args.outgroup: - # Get outgroup_Sample name - outgroup = get_outgroup() - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), - '%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % (os.path.dirname(os.path.abspath(__file__)), 
prepare_ref_var_consensus_input, outgroup), logger, 'info') - call("%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), logger) - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), - '%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), - logger, 'info') - call("%s/scripts/gubbins_iqtree_raxml.sh %s 1 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), logger) - # call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_var_consensus_input), logger) - else: - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), - '%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), - logger, 'info') - call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input), logger) - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), - '%s/scripts/gubbins_iqtree_raxml.sh %s 1' % ( - os.path.dirname(os.path.abspath(__file__)), - prepare_ref_allele_unmapped_consensus_input), - logger, 'info') - call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input), logger) - #call("%s/scripts/gubbins_iqtree_raxml.sh %s 1" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_var_consensus_input), logger) - else: - if args.outgroup: 
- # Get outgroup_Sample name - outgroup = get_outgroup() - keep_logging('The gubbins argument is set to No.', 'The gubbins argument is set to No.', logger, 'info') - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), - '%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup), - logger, 'info') - print "%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input, outgroup) - keep_logging('%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), - '%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'' % ( - os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup), - logger, 'info') - print "%s/scripts/gubbins_iqtree_raxml.sh %s 0 esnitkin_flux \'%s\'" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input, outgroup) - else: - keep_logging('The gubbins argument is set to No.', 'The gubbins argument is set to No.', logger, 'info') - print "%s/scripts/gubbins_iqtree_raxml.sh %s 0" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_var_consensus_input) - print "%s/scripts/gubbins_iqtree_raxml.sh %s 0" % (os.path.dirname(os.path.abspath(__file__)), prepare_ref_allele_unmapped_consensus_input) - - call("cp %s %s/Logs/tree/" % ( - log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) - - """ The below steps are for debugging purpose only.""" - if "5" in args.steps: - """ - Debugging Purposes only: Run only SNP matrix annotation step - """ - - keep_logging('Step 5: Running SNP matrix annotation step.', 'Step 5: Running SNP matrix annotation 
step.', logger, 'info') - - functional_class_filter_positions = "%s/Functional_class_filter_positions.txt" % args.filter2_only_snp_vcf_dir - - global outgroup_specific_positions - global outgroup_indel_specific_positions - - # Get outgroup specific variant positions - if args.outgroup: - f_outgroup = open("%s/outgroup_indel_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - - outgroup_indel_specific_positions = [] - for i in f_outgroup: - i = i.strip() - outgroup_indel_specific_positions.append(int(i)) - f_outgroup.close() - - f_outgroup = open("%s/outgroup_specific_positions.txt" % args.filter2_only_snp_vcf_dir, 'r+') - - outgroup_specific_positions = [] - for i in f_outgroup: - i = i.strip() - outgroup_specific_positions.append(int(i)) - f_outgroup.close() - - print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) - print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) - else: - - outgroup_indel_specific_positions = [] - outgroup_specific_positions = [] - print "No. of outgroup specific variant positions: %s" % len(outgroup_specific_positions) - print "No. of outgroup specific Indel variant positions: %s" % len(outgroup_indel_specific_positions) - - # Annotate core variants. Generate SNP and Indel matrix. 
- annotated_snp_matrix() - - # # Read new allele matrix and generate fasta; generate a seperate function - keep_logging('Generating Fasta from Variant Alleles...\n', 'Generating Fasta from Variant Alleles...\n', logger, 'info') - - create_job_allele_variant_fasta(args.jobrun, vcf_filenames, args.filter2_only_snp_vcf_dir, config_file) - - extract_only_ref_variant_fasta_from_reference_allele_variant() - - mask_fq_mq_positions_specific_to_outgroup() - - call("cp %s %s/Logs/core/" % ( - log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) - - if "6" in args.steps: - """ - Debugging Purposes only: Run only Gubbins - """ - reference_base = os.path.basename(args.reference).split('.')[0] - gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' - - make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) - - - prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_ref_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_var_consensus_input = "%s/gubbins/%s_%s_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_allele_var_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_ref_allele_unmapped_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - - if args.gubbins and args.gubbins == "yes": - 
gubbins(gubbins_dir, prepare_ref_var_consensus_input, args.jobrun, logger, Config) - #gubbins(gubbins_dir, prepare_ref_allele_var_consensus_input, logger, Config) - gubbins(gubbins_dir, prepare_ref_allele_unmapped_consensus_input,args.jobrun, logger, Config) - call("cp %s %s/Logs/tree/" % ( - log_file_handle, os.path.dirname(os.path.dirname(args.filter2_only_snp_vcf_dir))), logger) - - if "7" in args.steps: - """ - Debugging Purposes only: Run iqtree - """ - reference_base = os.path.basename(args.reference).split('.')[0] - gubbins_dir = args.results_dir + '/gubbins' - tree_dir = args.results_dir + '/trees' - - make_sure_path_exists(gubbins_dir) - #make_sure_path_exists(tree_dir) - - - prepare_ref_var_consensus_input = "%s/gubbins/%s_%s_ref_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_var_consensus_input = "%s/gubbins/%s_%s_var_consensus.fa" % (args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), reference_base) - prepare_allele_var_consensus_input = "%s/gubbins/%s_%s_allele_var_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - prepare_ref_allele_var_consensus_input = "%s/gubbins/%s_%s_ref_allele_var_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - prepare_ref_allele_unmapped_consensus_input = "%s/gubbins/%s_%s_ref_allele_unmapped_consensus.fa" % ( - args.results_dir, (os.path.basename(os.path.normpath(args.results_dir))).replace('_core_results', ''), - reference_base) - iqtree(tree_dir, prepare_ref_allele_var_consensus_input, args.jobrun, logger, Config) - iqtree(tree_dir, prepare_ref_var_consensus_input, args.jobrun, logger, Config) - iqtree(tree_dir, prepare_var_consensus_input, args.jobrun, logger, Config) - iqtree(tree_dir, 
prepare_ref_allele_unmapped_consensus_input, args.jobrun, logger, Config) - - time_taken = datetime.now() - start_time_2 - if args.remove_temp: - del_command = "rm -r %s" % temp_dir - os.system(del_command) - - - - diff --git a/modules/variant_diagnostics/explore_allele_frequency.py b/modules/variant_diagnostics/explore_allele_frequency.py new file mode 100644 index 0000000..1c12cbb --- /dev/null +++ b/modules/variant_diagnostics/explore_allele_frequency.py @@ -0,0 +1,51 @@ +from __future__ import division +import sys +import argparse +import re +import os +import csv +import subprocess +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +""" Hacky way to append. Instead Add this path to PYTHONPATH Variable """ +from collections import OrderedDict +from collections import defaultdict +from joblib import Parallel, delayed +import multiprocessing +import thread +import glob +import readline +#import pandas as pd +import errno +from datetime import datetime +import threading +import json +from cyvcf2 import VCF +import ConfigParser +from config_settings import ConfigSectionMap +from logging_subprocess import * +from log_modules import * +from tabix import * +from memory_profiler import profile + + +parser = argparse.ArgumentParser(description='Creating Label files individual jobs') +parser.add_argument('-vcf', action='store', dest="vcf", + help='VCF file to extract allele frequency from.') +args = parser.parse_args() + +def extract_allele_frequency(): + filter_passed_final_vcf = VCF((args.vcf).replace('_aln_mpileup_raw.vcf', '_filter2_final.vcf_no_proximate_snp.vcf')) + print filter_passed_final_vcf.POSITION + for variants in VCF("%s" % args.vcf): + #grep -w "2872079" MRSA_CO_HA_426__aln_mpileup_raw.vcf | cut -f8 | cut -d';' -f11 | sed 's/DP4=//g' | awk -F',' '{print ($3+$4)/(($1+$2) + ($3+$4))}' + DP4_value_list = str(variants.INFO.get('DP4')).replace('(', '').replace(')', '').split(',') + #print DP4_value_list + DP4_value_list = map(int, 
DP4_value_list) + numerator = DP4_value_list[2] + DP4_value_list[3] + deno = DP4_value_list[0] + DP4_value_list[1] + DP4_value_list[2] + DP4_value_list[3] + allele_frequency = float(numerator / deno) + print "%s, %s, %s" % (variants.POS, variants.INFO.get('DP'), allele_frequency) + filter_passed_final_vcf = (args.vcf).replace('_aln_mpileup_raw.vcf', '_filter2_final.vcf_no_proximate_snp.vcf') + + +extract_allele_frequency() \ No newline at end of file diff --git a/modules/variant_diagnostics/extract_only_ref_variant_fasta_alternate.py b/modules/variant_diagnostics/extract_only_ref_variant_fasta_alternate.py deleted file mode 100755 index 1750a09..0000000 --- a/modules/variant_diagnostics/extract_only_ref_variant_fasta_alternate.py +++ /dev/null @@ -1,135 +0,0 @@ -from __future__ import division -import argparse -import re -import os -import csv -import subprocess -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -import pandas as pd -import errno -from pyfasta import Fasta -from datetime import datetime -import threading -from cyvcf2 import VCF -import ConfigParser -from config_settings import ConfigSectionMap -# from logging_subprocess import * -# from log_modules import * - - -parser = argparse.ArgumentParser(description='Extract Only reference and variant positions and generate a fasta file out of it.') -required = parser.add_argument_group('Required arguments') -optional = parser.add_argument_group('Optional arguments') -required.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') -required.add_argument('-filter2_only_snp_vcf_filename', action='store', dest="filter2_only_snp_vcf_filename", - help='Name of filter2 only SNP vcf file') -required.add_argument('-reference', action='store', dest="reference", - help='Path to 
Reference Fasta File') -required.add_argument('-out_core', action='store', dest="out_core", - help='Path to core results directory') -required.add_argument('-config', action='store', dest="config", - help='Path to core results directory') - - -args = parser.parse_args() - -if args.config: - config_file = args.config -else: - config_file = os.path.dirname(os.path.abspath(__file__)) + "/config" -global Config -Config = ConfigParser.ConfigParser() -Config.read(config_file) - - -def extract_only_ref_variant_fasta_alternate_2(): - #print "here" - - # Get reference genome ID - get_reference = Fasta(args.reference) - if len(get_reference.keys()) == 1: - ref_id = get_reference.keys() - - - c_reader = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - c_reader_2 = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - ncol = len(next(c_reader_2)) - - - unique_position_array = [] - for i in columns[0][1:]: - replace_string = i.split(' ') - unique_position_array.append(int(replace_string[3])) - #print unique_position_array - - counts = 1 - end = ncol - for i in xrange(1, end, 1): - print_string = "" - ref_print_string = "" - sample_name = str(columns[i][0]) - sample_name_re = re.sub('_R1.fastq.gz', '', sample_name) - sample_name_re = re.sub('_*1*.fastq.gz', '', sample_name_re) - - if sample_name_re == os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''): - vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re - print_string = print_string + ">%s\n" % sample_name_re - ref_print_string = ref_print_string + ">%s\n" % sample_name_re - variant_allele = ''.join(columns[i][1:]) - print_string = print_string + str(variant_allele) + "\n" - allele_variant_fasta = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, 
sample_name_re), 'w+') - allele_ref_variant_fasta = open("%s/%s_ref_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+') - allele_ref_variant_vcf = open("%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+') - allele_ref_variant_vcf.write(vcf_header) - allele_variant_fasta.write(print_string) - allele_variant_fasta.close() - variant_allele_array = [] - variant_allele_array.append(columns[i][1:]) - get_sample_reference = Fasta("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re)) - if len(get_sample_reference.keys()) == 1: - sample_ref_id = get_sample_reference.keys() - for positions in unique_position_array: - pos_index = unique_position_array.index(positions) - allele_var = str(variant_allele_array[0][pos_index]) - ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(positions), 'stop': int(positions)})) - generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\n" % (ref_id[0].split(' ')[0], positions, ref_allele, allele_var) - allele_ref_variant_vcf.write(generate_vcf_string) - allele_ref_variant_vcf.close() - filename = "%s/consensus_ref_allele_variant.sh" % args.filter2_only_snp_vcf_dir - - vcf_filename = "%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re) - f1 = open(filename, 'a+') - bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename) - f1.write(bgzip_cmd) - subprocess.call([bgzip_cmd], shell=True) - tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename) - f1.write(tabix_cmd) - subprocess.call([tabix_cmd], shell=True) - base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin'] - fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > 
%s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re) - f1.write(fasta_cmd) - subprocess.call([fasta_cmd], shell=True) - - sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re) - subprocess.call([sed_command], shell=True) - f1.write(sed_command) - - sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % (args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin']) - #os.system(sequence_lgth_cmd) - #call("%s" % sequence_lgth_cmd, logger) - - else: - print "Sample name %s does not match with column name %s" % (os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''), sample_name_re) - - - - -extract_only_ref_variant_fasta_alternate_2() \ No newline at end of file diff --git a/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py b/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py index eea4e7e..20bbf4a 100755 --- a/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py +++ b/modules/variant_diagnostics/extract_only_ref_variant_fasta_unique_positions.py @@ -200,7 +200,7 @@ def extract_only_ref_variant_fasta_unique_positions_with_unmapped(): #print grab_vcf_filename sample_name_re = columns[i][0][:grab_vcf_filename] - print sample_name_re + #print sample_name_re # Replaced this with a more stable check #sample_name = str(columns[i][0]) diff --git a/modules/variant_diagnostics/extract_ref_variant_position.py b/modules/variant_diagnostics/extract_ref_variant_position.py deleted file mode 100755 index c4b8158..0000000 --- a/modules/variant_diagnostics/extract_ref_variant_position.py +++ /dev/null @@ -1,79 +0,0 @@ -__author__ = 'alipirani' - -import argparse -import re -import os -import csv -import subprocess -from collections import 
OrderedDict -from collections import defaultdict - - -parser = argparse.ArgumentParser(description='Parsing All position with label file and investigating positions to determine the reason why it was filtered out from the final list') -#All raw only snp pileup files should be store in the same directory where filter2 only snp vcf files are. -parser.add_argument('-positions_file_dir', action='store', dest="positions_file_dir", help='Directory where all the filter2 only SNP vcf files are saved.') -parser.add_argument('-label_filename', action='store', dest="label_filename", help='Names of All_label_final_raw file created after running paste.sh script.') -parser.add_argument('-unique_positions', action='store', dest="unique_positions", help='Names of unique_positions_file') -args = parser.parse_args() - -# def generate_label_report(): -# MyValues = [] -# cmd = "ls %s" % args.positions_file_dir -# proc = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True) -# (out, err) = proc.communicate() -# with open(All_position_file, 'rU') as csv_file: -# csv_reader = csv.reader(csv_file, delimiter='\t') -# for row in csv_reader: -# for i in range(1, len(row[1:])): -# total_variant_positions = row[1].count("1") -# print total_variant_positions -# #MyValues.append(row[1]) -# #print MyValues -# generate_label_report() - -def generate_sed_command(): - sed_file = args.positions_file_dir + "/sed_reason.sh" - f4=open(sed_file, 'w') - sed_command = "sed -i 's/reference_unmapped_position/0/g' All_label_final_raw\nsed -i 's/reference_allele/1/g' All_label_final_raw\nsed -i 's/VARIANT/1/g' All_label_final_raw\nsed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' All_label_final_raw\nsed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' All_label_final_raw\nsed -i 's/LowFQ_QUAL_proximate_SNP/2/g' All_label_final_raw\nsed -i 's/LowFQ_DP_proximate_SNP/2/g' All_label_final_raw\nsed -i 's/LowFQ_proximate_SNP/2/g' All_label_final_raw\nsed -i 's/LowFQ_QUAL_DP/2/g' All_label_final_raw\nsed -i 
's/LowFQ_DP_QUAL/2/g' All_label_final_raw\nsed -i 's/LowFQ_QUAL/2/g' All_label_final_raw\nsed -i 's/LowFQ_DP/2/g' All_label_final_raw\nsed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' All_label_final_raw\nsed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' All_label_final_raw\nsed -i 's/HighFQ_QUAL_proximate_SNP/4/g' All_label_final_raw\nsed -i 's/HighFQ_DP_proximate_SNP/4/g' All_label_final_raw\nsed -i 's/HighFQ_proximate_SNP/7/g' All_label_final_raw\nsed -i 's/HighFQ_QUAL_DP/3/g' All_label_final_raw\nsed -i 's/HighFQ_DP_QUAL/3/g' All_label_final_raw\nsed -i 's/HighFQ_QUAL/3/g' All_label_final_raw\nsed -i 's/HighFQ_DP/3/g' All_label_final_raw\nsed -i 's/LowFQ/5/g' All_label_final_raw\nsed -i 's/HighFQ/6/g' All_label_final_raw" - print sed_command - f4.write(sed_command) - os.system(sed_command) -generate_sed_command() - - -All_position_file = args.label_filename -position_label = OrderedDict() -with open(All_position_file, 'rU') as csv_file: - print "reading position file" - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label[row[0]] = row[1:] - -##### Filter out those position array that only contain Reference allele and True Variant -##### This is for the sake of generating heatmap so that we can reduce nonrelevant data from heatmap -def generate_heatmap_position(): - print "generate heatmap matrix" - f1=open("Only_ref_variant_positions_for_closely", 'w+') - f2=open("Only_ref_variant_positions_for_closely_matrix", 'w+') - f3=open("Only_filtered_positions_for_closely_matrix", 'w+') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1'] - if set(ref_var) & set(position_label[value]): - if set(lll) & set(position_label[value]): - #print "bakwaas" - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f3.write(STRR3) - else: - strr = value + "\n" - f1.write(strr) - STRR2 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR2) -generate_heatmap_position() - - - - - - - diff --git 
a/modules/variant_diagnostics/find_repeats.py b/modules/variant_diagnostics/find_repeats.py index 6774119..5bd8578 100755 --- a/modules/variant_diagnostics/find_repeats.py +++ b/modules/variant_diagnostics/find_repeats.py @@ -52,16 +52,18 @@ def nucmer_repeat(reference, outdir, logger, Config): #Find Tandem repeats using Nucmer tandem_repeats = [] - with open("%s_tandem_repeats_file" % prefix) as fp: - for i in xrange(5): - fp.next() - for line in fp: - line = line.strip() - line_split = line.split() - end_coords = int(line_split[0]) + int(line_split[1]) - tandem_repeats.extend(list(range(int(line_split[0]), end_coords))) - keep_logging('No. of Tandem repeat matches positions: %s' % len(set(sorted(tandem_repeats))), - 'No. of Tandem repeat matches positions: %s' % len(set(sorted(tandem_repeats))), logger, 'info') + num_lines = sum(1 for line in open("%s_tandem_repeats_file" % prefix)) + if int(num_lines) > 5: + with open("%s_tandem_repeats_file" % prefix) as fp: + for i in xrange(5): + fp.next() + for line in fp: + line = line.strip() + line_split = line.split() + end_coords = int(line_split[0]) + int(line_split[1]) + tandem_repeats.extend(list(range(int(line_split[0]), end_coords))) + keep_logging('No. of Tandem repeat matches positions: %s' % len(set(sorted(tandem_repeats))), + 'No. 
of Tandem repeat matches positions: %s' % len(set(sorted(tandem_repeats))), logger, 'info') # Not including inexact repeats filter #All_repeats = sorted(set(inexact_repeat_positions + tandem_repeats)) diff --git a/modules/variant_diagnostics/find_repeats.pyc b/modules/variant_diagnostics/find_repeats.pyc index 7c3da0a..530fd66 100755 Binary files a/modules/variant_diagnostics/find_repeats.pyc and b/modules/variant_diagnostics/find_repeats.pyc differ diff --git a/modules/variant_diagnostics/indel_matrix b/modules/variant_diagnostics/indel_matrix deleted file mode 100644 index 6e7fad7..0000000 --- a/modules/variant_diagnostics/indel_matrix +++ /dev/null @@ -1,505 +0,0 @@ - """ Indel matrix """ - """ Prepare SNP/Indel Matrix print strings and add matrix row information subsequently """ - header_print_string = "Type of SNP at POS > ALT functional=PHAGE_REPEAT_MASK locus_tag=locus_id strand=strand; ALT|Effect|Impact|GeneID|Nrchange|Aachange|Nrgenepos|AAgenepos|gene_symbol|product" - final_merge_anno_file = VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir) - for sample in final_merge_anno_file.samples: - # header_print_string = header_print_string + "," + sample - header_print_string = header_print_string + "\t" + sample - header_print_string = header_print_string + "\n" - #header_print_string = header_print_string.replace(':::,', ':::') - #header_print_string = header_print_string.replace(':::,', '\t') - fp_code = open("%s/Indel_matrix_code.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_allele = open("%s/Indel_matrix_allele.csv" % args.filter2_only_snp_vcf_dir, 'w+') - fp_code.write(header_print_string) - fp_allele.write(header_print_string) - - # """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" - # mask_fq_mq_positions = [] - # for key in position_indel_label.keys(): - # label_sep_array = position_indel_label[key].split(',') - # for i in label_sep_array: - # if "LowAF" in i: - # if 
key not in mask_fq_mq_positions: - # mask_fq_mq_positions.append(key) - # if i == "HighAF": - # if key not in mask_fq_mq_positions: - # mask_fq_mq_positions.append(key) - # - # print "Length of indel mask_fq_mq_positions array:%s" % len(mask_fq_mq_positions) - - """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ""" - mask_fq_mq_positions = [] - mask_fq_mq_positions_outgroup_specific = [] - - if args.outgroup: - position_label_exclude_outgroup = OrderedDict() - with open("%s/All_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - csv_file.close() - - position_indel_label_exclude_outgroup = OrderedDict() - with open("%s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt" % args.filter2_only_snp_vcf_dir, - 'rU') as csv_file: - keep_logging( - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - 'Reading All label positions file: %s/All_indel_label_final_ordered_exclude_outgroup_sorted.txt' % args.filter2_only_snp_vcf_dir, - logger, 'info') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - if row[0] not in position_label_exclude_outgroup.keys(): - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - else: - position_indel_label_exclude_outgroup[row[0]] = ','.join(row[1:]) - keep_logging('Warning: position %s already present as a SNP' % row[0], - 'Warning: position %s already present as a SNP' % row[0], logger, 
'info') - csv_file.close() - for key in position_label_exclude_outgroup.keys(): - label_sep_array = position_label_exclude_outgroup[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - if int(key) not in outgroup_specific_positions: - mask_fq_mq_positions.append(key) - elif int(key) in outgroup_specific_positions: - mask_fq_mq_positions_outgroup_specific.append(key) - else: - for key in position_label.keys(): - label_sep_array = position_label[key].split(',') - for i in label_sep_array: - if "LowFQ" in str(i): - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - if i == "HighFQ": - if key not in mask_fq_mq_positions: - mask_fq_mq_positions.append(key) - - - - print "Length of Indel mask_fq_mq_positions:%s" % len(mask_fq_mq_positions) - print "Length of Indel mask_fq_mq_positions specific to outgroup:%s" % len(mask_fq_mq_positions_outgroup_specific) - - - - - - - - for variants in VCF("%s/Final_vcf_gatk_indel.vcf.gz" % args.filter2_only_snp_vcf_dir): - print_string = "" - - functional_field = "" - if str(variants.POS) in phage_positions: - functional_field = functional_field + "PHAGE_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in repetitive_positions: - functional_field = functional_field + "REPEATS_" - else: - functional_field = functional_field + "NULL_" - if str(variants.POS) in mask_positions: - functional_field = functional_field + "MASK" - else: - functional_field = functional_field + "NULL" - - code_string = position_indel_label[str(variants.POS)] - code_string = code_string.replace('reference_allele', '0') - code_string = code_string.replace('reference_unmapped_position', '-1') - code_string = 
code_string.replace('LowAF_QUAL_DP_proximate_SNP', '2') - code_string = code_string.replace('LowAF_DP_QUAL_proximate_SNP', '2') - code_string = code_string.replace('LowAF_QUAL_proximate_SNP', '2') - code_string = code_string.replace('LowAF_DP_proximate_SNP', '2') - code_string = code_string.replace('LowAF_proximate_SNP', '2') - code_string = code_string.replace('LowAF_QUAL_DP', '2') - code_string = code_string.replace('LowAF_DP_QUAL', '2') - code_string = code_string.replace('LowAF_QUAL', '2') - code_string = code_string.replace('LowAF_DP', '2') - code_string = code_string.replace('HighAF_QUAL_DP_proximate_SNP', '2') - code_string = code_string.replace('HighAF_DP_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighAF_QUAL_proximate_SNP', '2') - code_string = code_string.replace('HighAF_DP_proximate_SNP', '2') - code_string = code_string.replace('HighAF_proximate_SNP', '2') - code_string = code_string.replace('HighAF_QUAL_DP', '2') - code_string = code_string.replace('HighAF_DP_QUAL', '2') - code_string = code_string.replace('HighAF_QUAL', '2') - code_string = code_string.replace('HighAF_DP', '2') - code_string = code_string.replace('LowAF', '-3') - code_string = code_string.replace('HighAF', '-4') - - if str(variants.POS) in indel_core_positions: - code_string = code_string.replace('VARIANT', '1') - # Adding functional class status code to SNP matrix: 2018-07-24 - elif str(variants.POS) in functional_filter_pos_array: - # Changing Functional class filter code to -2 from 2: 2018-12-04 - code_string = code_string.replace('VARIANT', '-2') - else: - code_string = code_string.replace('VARIANT', '3') - - - - - Changing SNP type: Date 28/05/2019 - Assign type of snp: coding / non-coding - if variants.INFO.get('ANN'): - if "protein_coding" in variants.INFO.get('ANN'): - snp_type = "Coding INDEL" - else: - snp_type = "Non-coding INDEL" - else: - if len(variants.ALT) > 1: - #print variants.ALT - #print ';'.join(set(snp_var_ann_dict[variants.POS].split(','))) - 
if "protein_coding" in set(indel_var_ann_dict[variants.POS].split(',')): - snp_type = "Coding INDEL" - else: - snp_type = "Non-coding INDEL" - else: - snp_type = "Non-coding INDEL" - - if variants.POS in indel_var_ann_dict.keys(): - if "protein_coding" in indel_var_ann_dict[variants.POS]: - snp_type = "Coding Indel" - else: - snp_type = "Non-coding Indel" - - else: - keep_logging( - 'Warning: position %s not found in indel_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, - 'Warning: position %s not found in indel_var_ann_dict dictionary. Assigning Not found as SNP type.' % variants.POS, - logger, 'info') - snp_type = "Not Found in Annotated VCF file" - - print_string = print_string + snp_type + " at %s > " % str(variants.POS) + str(",".join(variants.ALT)) + " functional=%s" % functional_field - - if variants.INFO.get('ANN'): - ann_array = (variants.INFO.get('ANN')).split(',') - ann_string = ";" - for i in list(set(ann_array)): - i_split = i.split('|') - #ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13]]) + ";" - tag = str(i_split[4]).replace('CHR_START-', '') - tag = str(tag).replace('-CHR_END', '') - tag = str(tag).replace('&', '-') - #print tag - if "-" in tag: - #print tag - extra_tags = "" - tag_split = tag.split('-') - for i in tag_split: - if i in locus_tag_to_gene_name.keys(): - extra_tags = extra_tags + locus_tag_to_gene_name[i] + "," - else: - extra_tags = extra_tags + "None" + "," - extra_tags_prot = "" - for i in tag_split: - if i in locus_tag_to_product.keys(): - extra_tags_prot = extra_tags_prot + locus_tag_to_product[i] + "," - else: - extra_tags_prot = extra_tags_prot + "None" + "," - ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags, extra_tags_prot]) + ";" - else: - if tag in locus_tag_to_gene_name.keys() and tag in locus_tag_to_product.keys(): - 
extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - else: - print "tag key not found: %s" % tag - extra_tags = "NULL" + "|" + "NULL" - #extra_tags = str(locus_tag_to_gene_name[tag]) + "|" + str(locus_tag_to_product[tag]) - # ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - ann_string = ann_string + '|'.join([i_split[0],i_split[1],i_split[2],i_split[3],i_split[9], i_split[10], i_split[11], i_split[13], extra_tags]) + ";" - else: - - if len(variants.ALT) > 1: - #print variants.ALT - #print ';'.join(set(indel_var_ann_dict[variants.POS].split(','))) - ann_string = ";%s" % ';'.join(set(indel_var_ann_dict[variants.POS].split(','))) - else: - ann_string = ";None" - - - ann_string = ann_string.replace('ERROR_OUT_OF_CHROMOSOME_RANGE', '%s-%s' % (locus_tag_to_gene_name[last_locus_tag], locus_tag_to_gene_name[first_locus_tag])) - ann_string = ann_string.replace('CHR_END', '%s' % locus_tag_to_gene_name[first_locus_tag]) - - - # SNP Matrix Bug - ann_string_split = ann_string.split(';') - if len(ann_string_split) == 3: - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10: - ann_string = ann_string - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10: - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - 
first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10: - - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": 
- prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string - - - if len(ann_string_split) > 3: - - first_allele_ann_string_split = ann_string_split[1].split('|') - second_allele_ann_string_split = ann_string_split[2].split('|') - third_allele_ann_string_split = ann_string_split[3].split('|') - - if len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) == 10: - ann_string = ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) == 10: - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + str(ann_string_split[2]) + str(ann_string_split[3]) - - 
elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) > 10 and len( - third_allele_ann_string_split) == 10: - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + new_second_allele_ann_string + str(ann_string_split[3]) - - elif len(first_allele_ann_string_split) == 10 and len(second_allele_ann_string_split) == 10 and len( - third_allele_ann_string_split) > 10: - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] - else: - prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ - third_allele_ann_string_split[1] + "|" + \ - third_allele_ann_string_split[2] + "|" + \ - third_allele_ann_string_split[4] + "|" + \ - third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + \ - third_allele_ann_string_split[11] + "|" + \ - third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = str(ann_string_split[1]) + str(ann_string_split[2]) + new_third_allele_ann_string - - elif len(first_allele_ann_string_split) > 10 and len(second_allele_ann_string_split) > 10 and len( - third_allele_ann_string_split) > 10: - # print 
ann_string - if first_allele_ann_string_split[14] == "" and first_allele_ann_string_split[15] == "": - prod = first_allele_ann_string_split[3] + first_allele_ann_string_split[15] - else: - prod = first_allele_ann_string_split[14] + first_allele_ann_string_split[15] - new_first_allele_ann_string = ";" + first_allele_ann_string_split[0] + "|" + \ - first_allele_ann_string_split[1] + "|" + \ - first_allele_ann_string_split[2] + "|" + \ - first_allele_ann_string_split[4] + "|" + \ - first_allele_ann_string_split[9] + "|" + \ - first_allele_ann_string_split[10] + "|" + \ - first_allele_ann_string_split[11] + "|" + \ - first_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - if second_allele_ann_string_split[14] == "" and second_allele_ann_string_split[15] == "": - prod = second_allele_ann_string_split[3] + second_allele_ann_string_split[15] - else: - prod = second_allele_ann_string_split[14] + second_allele_ann_string_split[15] - new_second_allele_ann_string = second_allele_ann_string_split[0] + "|" + \ - second_allele_ann_string_split[1] + "|" + \ - second_allele_ann_string_split[2] + "|" + \ - second_allele_ann_string_split[4] + "|" + \ - second_allele_ann_string_split[9] + "|" + \ - second_allele_ann_string_split[10] + "|" + \ - second_allele_ann_string_split[11] + "|" + \ - second_allele_ann_string_split[ - 13] + "|" + prod + "|" + prod + ";" - - if third_allele_ann_string_split[14] == "" and third_allele_ann_string_split[15] == "": - prod = third_allele_ann_string_split[3] + third_allele_ann_string_split[15] - else: - prod = third_allele_ann_string_split[14] + third_allele_ann_string_split[15] - new_third_allele_ann_string = third_allele_ann_string_split[0] + "|" + \ - third_allele_ann_string_split[1] + "|" + \ - third_allele_ann_string_split[2] + "|" + \ - third_allele_ann_string_split[4] + "|" + \ - third_allele_ann_string_split[9] + "|" + \ - third_allele_ann_string_split[10] + "|" + \ - third_allele_ann_string_split[11] + "|" + \ - 
third_allele_ann_string_split[13] + "|" + prod + "|" + prod + ";" - - ann_string = new_first_allele_ann_string + new_second_allele_ann_string + new_third_allele_ann_string - - # print ann_string - - # # JUST FOR THE SAKE OF DEBUGGING - # ann_string_split = ann_string.split(';') - # for i in ann_string_split: - # if len(i.split('|')) != 10 and len(i.split('|')) != 1: - # print ann_string - - # Changing Strandness string: Date 28/05/2019 - # Each Locus ID with a strand information - strandness = " Strand Information: " - if "-" in tag: - tagsplit = tag.split('-') - for i in tagsplit: - if i in locus_tag_to_strand.keys(): - if "," in locus_tag_to_strand[i]: - locus_tag_to_strand_split = locus_tag_to_strand[i].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[i] - strandness = strandness + i + "=" + strand + "/" - else: - if i == "" or i == "None": - strandness = strandness + "NULL=" + "No Strand Information found" + "/" - else: - strandness = strandness + i + "=" + "No Strand Information found" + "/" - else: - if tag in locus_tag_to_strand.keys(): - #strandness = strandness + locus_tag_to_strand[tag] - if "," in locus_tag_to_strand[tag]: - locus_tag_to_strand_split = locus_tag_to_strand[tag].split(',') - strand = locus_tag_to_strand_split[0] - else: - strand = locus_tag_to_strand[tag] - strandness = strandness + tag + "=" + strand - else: - if tag == "" or tag == "None": - strandness = strandness + "NULL=" + "No Strand Information found" - else: - strandness = strandness + tag + "=" + "No Strand Information found" - - # Debugging - if "CD630_00290" in ann_string: - print strandness - - # Adding tag equals NULL: 30/05/2019 - if tag == "" or tag == "None": - tag = "NULL" - - print_string = print_string + " locus_tag=" + tag + strandness + ann_string - - - - - - - gt_string = "" - for gt in variants.gt_bases: - gt = gt.replace('./.', '.') - if "/" in gt: - gt_split = gt.split('/') - gt = gt_split[1] - gt_string = gt_string + "," + 
gt - gt_string = gt_string.replace('.', variants.REF) - - """Replacing Phage/Functional filter position code""" - if str(variants.POS) in functional_filter_pos_array: - code_string_array = code_string.split(',') - code_string = "" - for i in code_string_array: - code_string = code_string + "," + "-2" - - final_allele_string = print_string + gt_string.replace(',', '\t') + '\n' - final_code_string = print_string + "\t" + code_string.replace(',', '\t') + '\n' - final_allele_string = final_allele_string.replace(',|', '|') - # final_allele_string = final_allele_string.replace(',;,', ':::') - # final_allele_string = final_allele_string.replace(';,', ':::') - final_allele_string = final_allele_string.replace(',;,', ':::') - final_allele_string = final_allele_string.replace(';,', ':::') - final_code_string = final_code_string.replace(',|', '|') - # final_code_string = final_code_string.replace(',;,', ':::') - # final_code_string = final_code_string.replace(';,', ':::') - final_code_string = final_code_string.replace(',;,', ':::') - final_code_string = final_code_string.replace(';,', ':::') - final_code_string = final_code_string.replace('\t\t', '\t') - final_allele_string = final_allele_string.replace('\t\t', '\t') - fp_allele.write(final_allele_string) - fp_code.write(final_code_string) - fp_code.close() - fp_allele.close() \ No newline at end of file diff --git a/modules/variant_diagnostics/log_modules.pyc b/modules/variant_diagnostics/log_modules.pyc index 8119e93..4c4f73f 100755 Binary files a/modules/variant_diagnostics/log_modules.pyc and b/modules/variant_diagnostics/log_modules.pyc differ diff --git a/modules/variant_diagnostics/logging_subprocess.pyc b/modules/variant_diagnostics/logging_subprocess.pyc index 73e5e20..3250997 100755 Binary files a/modules/variant_diagnostics/logging_subprocess.pyc and b/modules/variant_diagnostics/logging_subprocess.pyc differ diff --git a/modules/variant_diagnostics/mask_regions.pyc b/modules/variant_diagnostics/mask_regions.pyc 
index e6144e2..c257ed1 100755 Binary files a/modules/variant_diagnostics/mask_regions.pyc and b/modules/variant_diagnostics/mask_regions.pyc differ diff --git a/modules/variant_diagnostics/matrix_sanity_checks.py b/modules/variant_diagnostics/matrix_sanity_checks.py new file mode 100644 index 0000000..99117ee --- /dev/null +++ b/modules/variant_diagnostics/matrix_sanity_checks.py @@ -0,0 +1,66 @@ +from __future__ import division +import sys +import argparse +import re +import os +import csv +import subprocess +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +""" Hacky way to append. Instead Add this path to PYTHONPATH Variable """ +from collections import OrderedDict +from collections import defaultdict +from joblib import Parallel, delayed +import multiprocessing +import thread +import glob +import readline +#import pandas as pd +import errno +from datetime import datetime +import threading +import json +from cyvcf2 import VCF +import ConfigParser +from config_settings import ConfigSectionMap +from logging_subprocess import * +from log_modules import * +from tabix import * +from memory_profiler import profile + + +parser = argparse.ArgumentParser(description='Sanity Check SNP matrix file') +parser.add_argument('-matrix', action='store', dest="matrix", + help='SNP allele Matrix to perform sanity checks.') +parser.add_argument('-functional_annotation', action='store', dest="functional_annotation", + help='Functional Annotation Positions file.') +args = parser.parse_args() + +functional_annotation_positions = [] +with open(args.functional_annotation) as fp: + for line in fp: + line = line.strip() + functional_annotation_positions.append(int(line)) + fp.close() + + +f_handle=open("matrix_sanity_check.log.txt", 'w+') + +print "Parsing Matrix..." 
+N_string = ["N"] +count = 0 +with open("%s" % args.matrix, 'rU') as csv_file: + csv_reader = csv.reader(csv_file, delimiter='\t') + next(csv_reader, None) + for row in csv_reader: + position = (row[0]).split(' ') + if set(row[1:]) == set(N_string): + #print "%s\t%s" % (set(row[1:]), set(N_string)) + if int(position[3]) in functional_annotation_positions: + f_handle.write("Functional Position %s Masked in all samples\n" % int(position[3])) + else: + count = count + 1 + print "Error - Wrong position masked - %s\n" % int(position[3]) + f_handle.write("Error - Wrong position masked - %s\n" % int(position[3])) + +print "\nNo. of wrongly masked variants %s" % count +exit() \ No newline at end of file diff --git a/modules/variant_diagnostics/parse_vcf_for_reason_individual_jobs.py b/modules/variant_diagnostics/parse_vcf_for_reason_individual_jobs.py deleted file mode 100755 index f0b978c..0000000 --- a/modules/variant_diagnostics/parse_vcf_for_reason_individual_jobs.py +++ /dev/null @@ -1,1565 +0,0 @@ -from __future__ import division -import argparse -import re -import os -import csv -import subprocess -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -import thread -import glob -import readline -import pandas as pd -import errno -from pyfasta import Fasta -from datetime import datetime -import threading -import json - -parser = argparse.ArgumentParser(description='Parsing filtered VCF files and investigating Variants to determine the reason why it was filtered out from the final list') -required = parser.add_argument_group('Required arguments') -optional = parser.add_argument_group('Optional arguments') -required.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') -required.add_argument('-filter2_only_snp_vcf_filenames', action='store', dest="filter2_only_snp_vcf_filenames", - 
help='Names of filter2 only SNP vcf files with name per line.') -optional.add_argument('-jobrun', action='store', dest="jobrun", - help='Running a job on Cluster, Running Parallel jobs, Run jobs/commands locally (default): cluster, local, parallel-local, parallel-single-cluster') -optional.add_argument('-cluster_type', action='store', dest="cluster_type", - help='Type of Cluster: torque, pbs, sgd') -optional.add_argument('-cluster_resources', action='store', dest="cluster_resources", - help='Cluster Resources to use. for example nodes,core. Ex: 1,4') -optional.add_argument('-numcores', action='store', dest="numcores", - help='Number of cores to use on local system for parallel-local parameter') -optional.add_argument('-remove_temp', action='store', dest="remove_temp", - help='Remove Temporary files generated during the run') -required.add_argument('-reference', action='store', dest="reference", - help='Path to Reference Fasta file for consensus generation') -required.add_argument('-steps', action='store', dest="steps", - help='Analysis Steps to be performed. This should be in sequential order.' 
- 'Step 1: Run pbs jobs and process all pipeline generated vcf files to generate label files' - 'Step 2: Analyze label files and generate matrix' - 'Step 3: DP/FQ Analysis') -args = parser.parse_args() - - -def create_positions_filestep(vcf_filenames): - - """ - Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - - """ - - filter2_only_snp_position_files_array = [] - for file in vcf_filenames: - with open(file, 'rU') as csv_file: - file_name = temp_dir + "/" + os.path.basename(file) + "_positions" - addpositionfilenametoarray = file_name - filter2_only_snp_position_files_array.append(addpositionfilenametoarray) - f1 = open(file_name, 'w+') - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - p_string = row[1] + "\n" - f1.write(p_string) - f1.close() - csv_file.close() - print "End of creating '_positions' file step\n" - - """ Create position array containing unique positiones from positions file """ - position_array = [] - for filess in filter2_only_snp_position_files_array: - f = open(filess, 'r+') - for line in f: - line = line.strip() - position_array.append(line) - f.close() - position_array_unique = set(position_array) - position_array_sort = sorted(position_array_unique) - print "\nThe number of unique variant positions:\n" + str(len(position_array_sort)) + "\n" - unique_position_file = "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir - f=open(unique_position_file, 'w+') - for i in position_array_sort: - f.write(i + "\n") - f.close() - if len(position_array_sort) == 0: - print "ERROR: No unique positions found. Check if vcf files are empty?" 
- exit() - - - - # """ Create position array containing all the final SNP positions from all the final vcf files""" - # position_array = [] - # for file in vcf_filenames: - # with open(file, 'rU') as csv_file: - # csv_reader = csv.reader(csv_file, delimiter='\t') - # for row in csv_reader: - # position = row[0] - # if not position.startswith('#'): - # if row[1] not in position_array: - # position_array(row[1]) - # csv_file.close() - # - # - # position_array_unique = set(position_array) - # position_array_sort = sorted(position_array_unique) - # print "\nThe number of unique variant positions:\n" + str(len(position_array_sort)) + "\n" - # unique_position_file = "%s/temp/unique_positions_file" % args.filter2_only_snp_vcf_dir - # f=open(unique_position_file, 'w+') - # for i in position_array_sort: - # f.write(i + "\n") - # f.close() - -####################END: Create position array containing unique positiones from positions file####################### - - -def make_sure_path_exists(out_path): - """ - Make sure the output folder exists or create at given path - :param out_path: - :return: - """ - try: - os.makedirs(out_path) - except OSError as exception: - if exception.errno != errno.EEXIST: - print "Errors in output folder path! please change the output path or analysis name\n" - exit() - -def run_command(i): - print "Running: %s" % i - os.system(i) - done = "done: %s" % i - return done - - -def create_job(jobrun, vcf_filenames): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "cluster": - """ - Supports only PBS clusters for now. 
- """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=4,pmem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/reason_job.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - #os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=4,pmem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/reason_job.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*vcf.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - 
f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - print len(command_array) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "parallel-single-cluster": - print " " - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - os.system("bash %s" % command_file) - -def create_job_fasta(jobrun, vcf_filenames): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "cluster": - """ - Supports only PBS clusters for now. - """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/extract_only_ref_variant_fasta.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, args.reference) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - #os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - 
command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s_fasta\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/reason_job.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s -reference %s\n" % (job_name, args.filter2_only_snp_vcf_dir, i, args.reference) - job_file_name = "%s_fasta.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_fasta.pbs" - pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - print len(command_array) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "parallel-single-cluster": - print " " - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - os.system("bash %s" % command_file) - -def create_job_DP(jobrun, vcf_filenames): - - """ - Based on type of jobrun; generate jobs and run accordingly. - :param jobrun: - :param vcf_filenames: - :return: - """ - if jobrun == "cluster": - """ - Supports only PBS clusters for now. 
- """ - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - pbs_scripts = glob.glob(pbs_dir) - for i in pbs_scripts: - print "Running: qsub %s" % i - #os.system("qsub %s" % i) - - elif jobrun == "parallel-local": - """ - Generate a Command list of each job and run it in parallel on different cores available on local system - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - f3 = open(command_file, 'w+') - - - for i in vcf_filenames: - job_name = os.path.basename(i) - job_print_string = "#PBS -N %s\n#PBS -M apirani@med.umich.edu\n#PBS -m a\n#PBS -V\n#PBS -l nodes=1:ppn=1,mem=4000mb,walltime=76:00:00\n#PBS -q fluxod\n#PBS -A esnitkin_fluxod\n#PBS -l qos=flux\n\ncd %s\n/home/apirani/anaconda/bin/python /nfs/esnitkin/bin_group/scripts/Scripts_v2.0/variants_position_analysis/DP_analysis.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_file %s\n" % (job_name, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, i) - job_file_name = "%s_DP.pbs" % (i) - f1=open(job_file_name, 'w+') - f1.write(job_print_string) - f1.close() - #os.system("mv %s/*.pbs %s/temp" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir)) - pbs_dir = args.filter2_only_snp_vcf_dir + "/*_DP.pbs" - 
pbs_scripts = glob.glob(pbs_dir) - - - for i in pbs_scripts: - f3.write("bash %s\n" % i) - f3.close() - with open(command_file, 'r') as fpp: - for lines in fpp: - lines = lines.strip() - command_array.append(lines) - fpp.close() - print len(command_array) - if args.numcores: - num_cores = int(num_cores) - else: - num_cores = multiprocessing.cpu_count() - results = Parallel(n_jobs=num_cores)(delayed(run_command)(command) for command in command_array) - - elif jobrun == "parallel-single-cluster": - print " " - else: - """ - Generate a Command list of each job and run it on local system one at a time - """ - command_array = [] - command_file = "%s/commands_list.sh" % args.filter2_only_snp_vcf_dir - os.system("bash %s" % command_file) - -def generate_paste_command(): - - """ Generate SNP Filter Label Matrix """ - paste_file = args.filter2_only_snp_vcf_dir + "/paste_label_files.sh" - f4=open(paste_file, 'w+') - paste_command = "paste %s/unique_positions_file" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - paste_command = paste_command + " " + label_file - header_awk_cmd = "awk \'{ORS=\"\t\";}{print $1}\' %s > %s/header.txt" % (args.filter2_only_snp_vcf_filenames, args.filter2_only_snp_vcf_dir) - sed_header = "sed -i \'s/^/\t/\' %s/header.txt" % args.filter2_only_snp_vcf_dir - sed_header_2 = "sed -i -e \'$a\\' %s/header.txt" % args.filter2_only_snp_vcf_dir - - os.system(header_awk_cmd) - os.system(sed_header) - os.system(sed_header_2) - - temp_paste_command = paste_command + " > %s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir - paste_command = paste_command + " > %s/All_label_final_raw" % args.filter2_only_snp_vcf_dir - f4.write(paste_command) - f4.close() - sort_All_label_cmd = "sort -n -k1,1 %s/All_label_final_raw > %s/All_label_final_sorted.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - 
paste_command_header = "cat %s/header.txt %s/All_label_final_sorted.txt > %s/All_label_final_sorted_header.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - #print temp_paste_command - - ls = [] - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_no_proximate_snp.vcf_positions_label') - ls.append(label_file) - ls.insert(0, "%s/unique_positions_file" % args.filter2_only_snp_vcf_dir) - - with open('%s/All_label_final_raw.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(paste_command) - outfile.close() - - with open('%s/temp_label_final_raw.txt.sh' % args.filter2_only_snp_vcf_dir, 'w') as outfile: - outfile.write(temp_paste_command) - outfile.close() - os.system("bash %s/All_label_final_raw.sh" % args.filter2_only_snp_vcf_dir) - os.system("bash %s/temp_label_final_raw.txt.sh" % args.filter2_only_snp_vcf_dir) - #subprocess.call(["%s" % paste_command], shell=True) - #subprocess.call(["%s" % temp_paste_command], shell=True) - #subprocess.check_call('%s' % paste_command) - #subprocess.check_call('%s' % temp_paste_command) - - print "Finished pasting...DONE" - #os.system(paste_command) change - #os.system(temp_paste_command) change - os.system(sort_All_label_cmd) - os.system(paste_command_header) - - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/1TRUE/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/2/g' 
%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/2/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/7/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/3/g' %s/All_label_final_sorted_header.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/5/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/6/g' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - remove_unwanted_text = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir - os.system(remove_unwanted_text) - - -def generate_position_label_data_matrix(): - - """ - Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. - - Filtered Position label matrix: - Too bad! This positions where atleast one variant was observed in atleast one sample - This position didn't made it to the final Only_ref_variant_positions_for_closely_matrix list, - because it was either unmapped(non-core) in one or more of the samples or was filtered out one or more of the sample due to Variant Filtered Parameter - - Only_ref_variant_positions_for_closely_matrix.txt : - Those Positions where the variant was either reference allele or a variant that passed all the variant filter parameters. - Yeah! This ones made it to final vcf file and are core variants - (Core variant Position: Variant Position which was not filtered out in any of the other samples due to variant filter parameter and also this position was present in all the samples(not unmapped)). 
- - """ - def generate_position_label_data_matrix_All_label(): - position_label = OrderedDict() - #changed 20 september - f1=open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - f2=open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f3=open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f4=open("%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - with open("%s/All_label_final_sorted_header.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading All label positions file: %s/All_label_final_sorted_header.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - position_label[row[0]] = row[1:] - print "Generating different list of Positions and heatmap data matrix... \n" - #changed 20 september - # f1=open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'w+') - # f2=open("%s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # f3=open("%s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # f4=open("%s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f1.write('\t' + print_string_header.strip() + '\n') - f2.write('\t' + print_string_header.strip() + '\n') - f3.write('\t' + print_string_header.strip() + '\n') - f4.write('\t' + print_string_header.strip() + '\n') - for value in position_label: - lll = ['0', '2', '3', '4', '5', '6', '7'] - ref_var = ['1', '1TRUE'] - if set(ref_var) & set(position_label[value]): - if set(lll) & 
set(position_label[value]): - print_string = "" - for i in position_label[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f3.write(STRR2) - if position_label[value].count('1TRUE') >= 2: - f4.write('1\n') - else: - f4.write('0\n') - else: - strr = value + "\n" - f1.write(strr) - STRR3 = value + "\t" + str(position_label[value]) + "\n" - f2.write(STRR3) - #changed 20 september - # f1.close() - # f2.close() - # f3.close() - # f4.close() - #changed 20 september - csv_file.close() - #changed 20 september - f1.close() - f2.close() - f3.close() - f4.close() - #changed 20 september - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_ref_variant_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/Only_filtered_positions_for_closely_matrix_TRUE_variants_filtered_out.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/1TRUE/-1/g' %s/Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - def temp_generate_position_label_data_matrix_All_label(): - - """ - Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
- """ - temp_position_label = OrderedDict() - #changed 20 september - f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - print_string_header = "\t" - for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f33.write('\t' + print_string_header.strip() + '\n') - print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - ref_var = ['reference_allele', 'VARIANT'] - #changed 20 september - # changed 20 september newline changed from 'rU' to newline='' - with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file: - # print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - #temp_position_label[row[0]] = row[1:] - if set(ref_var) & set(row[1:]): - if set(lll) & set(row[1:]): - print_string = "" - for i in row[1:]: - print_string = print_string + "\t" + i - STRR2 = row[0] + print_string + "\n" - f33.write(STRR2) - #changed 20 september - # f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # print_string_header = "\t" - # for i in vcf_filenames: - # print_string_header = print_string_header + os.path.basename(i) + "\t" - # f33.write('\t' + print_string_header.strip() + '\n') - #changed 20 september - - # for 
value in temp_position_label: - # #changed 20 september - # # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # # ref_var = ['reference_allele', 'VARIANT'] - # #changed 20 september - # if set(ref_var) & set(temp_position_label[value]): - # if set(lll) & set(temp_position_label[value]): - # print_string = "" - # for i in temp_position_label[value]: - # print_string = print_string + "\t" + i - # STRR2 = value + print_string + "\n" - # f33.write(STRR2) - #changed 20 september - #f33.close() - #changed 20 september - csv_file.close() - #changed 20 september - f33.close() - #changed 20 september - """ - # Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - # """ - temp_position_label_FQ = OrderedDict() - #changed 20 september - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - - for row in csv_reader: - temp_position_label_FQ[row[0]] = row[1:] - #changed 20 september - #f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - print_string_header = "\t" - 
for i in vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_FQ: - #lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - lll = ['LowFQ'] - #ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - #changed 20 september - f44.close() - #changed 20 september - csv_file.close() - #changed 20 september - f44.close() - #changed 20 september - # Perform Sed - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # """ - # Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - # """ - # temp_position_label_DP = OrderedDict() - # #changed 20 september - # f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # #changed 20 september - # with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - # print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - # csv_reader = csv.reader(csv_file, delimiter='\t') - # next(csv_reader, None) - # for row in csv_reader: - # temp_position_label_DP[row[0]] = row[1:] - # #changed 20 september - # # f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # #changed 20 september - # print_string_header = "\t" - # for i in vcf_filenames: - # print_string_header = 
print_string_header + os.path.basename(i) + "\t" - # f44.write('\t' + print_string_header.strip() + '\n') - # for value in temp_position_label_DP: - # #lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # lll = ['HighFQ_DP'] - # ref_var = ['reference_allele', 'VARIANT'] - # if set(lll) & set(temp_position_label_FQ[value]): - # print_string = "" - # for i in temp_position_label_FQ[value]: - # print_string = print_string + "\t" + i - # STRR2 = value + print_string + "\n" - # f44.write(STRR2) - # - # #changed 20 september - # f44.close() - # #changed 20 septemberfile.close() - # #changed 20 september - # #changed 20 september - # - # #Perform Sed - # subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - """ - Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - """ - temp_position_label_DP = OrderedDict() - #changed 20 september - f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - csv_reader = csv.reader(csv_file, delimiter='\t') - next(csv_reader, None) - for row in csv_reader: - temp_position_label_DP[row[0]] = row[1:] - #changed 20 september - # f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - #changed 20 september - print_string_header = "\t" - for i in 
vcf_filenames: - print_string_header = print_string_header + os.path.basename(i) + "\t" - f44.write('\t' + print_string_header.strip() + '\n') - for value in temp_position_label_DP: - #lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - lll = ['HighFQ_DP'] - ref_var = ['reference_allele', 'VARIANT'] - if set(lll) & set(temp_position_label_FQ[value]): - print_string = "" - for i in temp_position_label_FQ[value]: - print_string = print_string + "\t" + i - STRR2 = value + print_string + "\n" - f44.write(STRR2) - #changed 20 september - f44.close() - #changed 20 september - csv_file.close() - #changed 20 september - #f44.close() - #changed 20 september - - #Perform Sed - subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' 
%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % 
args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - - - - - - - - - - - def barplot_stats(): - print "Read each Sample columns and calculate the percentage of each label to generate barplot statistics." - """ - Read each Sample columns and calculate the percentage of each label to generate barplot statistics. - This will give a visual explanation of how many positions in each samples were filtered out because of different reason - """ - - c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - print "Finished reading columns..." 
- counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\tunmapped_positions\treference_allele\ttrue_variant\tOnly_low_FQ\tOnly_DP\tOnly_low_MQ\tother\n") - f_bar_perc.write("Sample\tunmapped_positions_perc\ttrue_variant_perc\tOnly_low_FQ_perc\tOnly_DP_perc\tOnly_low_MQ_perc\tother_perc\n") - for i in xrange(1, end, 1): - """ Bar Count Statistics: Variant Position Count Statistics """ - true_variant = columns[i].count('VARIANT') - unmapped_positions = columns[i].count('reference_unmapped_position') - reference_allele = columns[i].count('reference_allele') - Only_low_FQ = columns[i].count('LowFQ') - Only_DP = columns[i].count('HighFQ_DP') - Only_low_MQ = columns[i].count('HighFQ') - low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP') - high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL') - other = low_FQ_other_parameters + high_FQ_other_parameters - total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ - filename_count = i - 1 - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % 
(os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions, reference_allele, true_variant, Only_low_FQ, Only_DP, Only_low_MQ, other) - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - true_variant_perc = float((columns[i].count('VARIANT') * 100) / total) - except ZeroDivisionError: - true_variant_perc = 0 - try: - unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total) - except ZeroDivisionError: - unmapped_positions_perc = 0 - try: - reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total) - except ZeroDivisionError: - reference_allele_perc = 0 - try: - Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total) - except ZeroDivisionError: - Only_low_FQ_perc = 0 - try: - Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total) - except ZeroDivisionError: - Only_DP_perc = 0 - try: - Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total) - except ZeroDivisionError: - Only_low_MQ_perc = 0 - try: - low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')) * 100) / total) - except ZeroDivisionError: - low_FQ_other_parameters_perc = 0 - try: - high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + 
columns[i].count('HighFQ_QUAL')) * 100) / total) - except ZeroDivisionError: - high_FQ_other_parameters_perc = 0 - - other_perc = float(low_FQ_other_parameters_perc + high_FQ_other_parameters_perc) - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), unmapped_positions_perc, true_variant_perc, Only_low_FQ_perc, Only_DP_perc, Only_low_MQ_perc, other_perc) - f_bar_perc.write(bar_perc_string) - f_bar_count.close() - f_bar_perc.close() - """ Methods Steps""" - print "Running: generate_position_label_data_matrix_All_label" - generate_position_label_data_matrix_All_label() - print "temp_generate_position_label_data_matrix_All_label" - temp_generate_position_label_data_matrix_All_label() - #print "barplot_stats" - barplot_stats() - -def generate_vcf_files(): - #print ref_variant_position_array - filter2_files_array = [] - for i in vcf_filenames: - filter2_file = i.replace('_no_proximate_snp.vcf', '') - filter2_files_array.append(filter2_file) - - ref_variant_position_array = [] - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir, 'r+') - for line in ffp: - line = line.strip() - ref_variant_position_array.append(line) - ffp.close() - - filtered_out_vcf_files = [] - #print filter2_files_array - for i in filter2_files_array: - #print_array = i - print_array =[] - fasta_string = "" - with open(i) as file_open: - for line in file_open: - line = line.strip() - if line.startswith("#"): - print_array.append(line) - else: - #line.split(' ') - split_array = re.split(r'\t+', line) - if split_array[1] in ref_variant_position_array and 'INDEL' not in split_array[7]: - print_array.append(line) - #extract variant - # else: - # extract_base = "tr -d '\n' < %s | cut -b%s" % (args.reference, split_array[1]) - # proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) - file_open.close() - file_name = i + "_core.vcf" - print 
"Generating %s" % file_name - filtered_out_vcf_files.append(file_name) - f1 = open(file_name, 'w+') - for ios in print_array: - print_string = str(ios) + "\n" - f1.write(print_string) - f1.close() - - filename = "%s/consensus.sh" % args.filter2_only_snp_vcf_dir - print "\nGenerating Consensus...\n" - for file in filtered_out_vcf_files: - f1 = open(filename, 'a+') - bgzip_cmd = "bgzip -f %s\n" % file - f1.write(bgzip_cmd) - subprocess.call([bgzip_cmd], shell=True) - tabix_cmd = "tabix -f -p vcf %s.gz\n" % file - f1.write(tabix_cmd) - subprocess.call([tabix_cmd], shell=True) - fasta_cmd = "cat %s | /home/apirani/bin/vcftools_0.1.12b/bin/vcf-consensus %s.gz > %s.fa\n" % (args.reference, file, file.replace('_filter2_final.vcf_core.vcf', '')) - f1.write(fasta_cmd) - subprocess.call([fasta_cmd], shell=True) - base = os.path.basename(file) - header = base.replace('_filter2_final.vcf_core.vcf', '') - sed_command = "sed -i 's/>.*/>%s/g' %s.fa\n" % (header, file.replace('_filter2_final.vcf_core.vcf', '')) - subprocess.call([sed_command], shell=True) - f1.write(sed_command) - print "The consensus commands are in : %s" % filename - sequence_lgth_cmd = "for i in %s/*.fa; do bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % args.filter2_only_snp_vcf_dir - os.system(sequence_lgth_cmd) - - - #os.system("bash %s" % filename) - #subprocess.call(["bash %s" % filename], shell=True) - -def gatk_filter2(final_raw_vcf, out_path, analysis, reference): - gatk_filter2_parameter_expression = "MQ > 50 && QUAL > 100 && DP > 9" - gatk_filter2_command = "java -jar ~/bin/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar -T VariantFiltration -R %s -o %s/%s_filter2_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (reference, out_path, analysis, final_raw_vcf, gatk_filter2_parameter_expression) - print "\n\nRunning Command: [%s]\n\n" % gatk_filter2_command - os.system(gatk_filter2_command) - filter_flag_command = "grep '#\|PASS_filter2' %s/%s_filter2_gatk.vcf > 
%s/%s_filter2_final.vcf" % (out_path, analysis, out_path, analysis) - os.system(filter_flag_command) - gatk_filter2_final_vcf = "%s/%s_filter2_final.vcf" % (out_path, analysis) - return gatk_filter2_final_vcf - - - -def remove_proximate_snps(gatk_filter2_final_vcf_file, out_path, analysis, reference): - all_position = [] - remove_proximate_position_array = [] - gatk_filter2_final_vcf_file_no_proximate_snp = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf" - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file: - for line in csv_file: - if not line.startswith('#'): - line_array = line.split('\t') - all_position.append(line_array[1]) - for position in all_position: - position_index = all_position.index(position) - next_position_index = position_index + 1 - - if next_position_index < len(all_position): - diff = int(all_position[next_position_index]) - int(position) - if diff < 10: - #print position + " " + all_position[next_position_index] - if position not in remove_proximate_position_array and all_position[next_position_index] not in remove_proximate_position_array: - remove_proximate_position_array.append(int(position)) - remove_proximate_position_array.append(int(all_position[next_position_index])) - #print remove_proximate_position_array - f1=open(gatk_filter2_final_vcf_file_no_proximate_snp, 'w+') - with open(gatk_filter2_final_vcf_file, 'rU') as csv_file2: - for line in csv_file2: - if line.startswith('gi') or line.startswith('MRSA_8058'): ##change this! 
- line_array = line.split('\t') - if int(line_array[1]) not in remove_proximate_position_array: - #print line_array[1] - #print line_array[1] - print_string = line - f1.write(print_string) - else: - print_string = line - f1.write(print_string) - gatk_filter2_final_vcf_file_no_proximate_snp_positions = gatk_filter2_final_vcf_file + "_no_proximate_snp.vcf_positions_array" - f2=open(gatk_filter2_final_vcf_file_no_proximate_snp_positions, 'w+') - for i in remove_proximate_position_array: - position_print_string = str(i) + "\n" - f2.write(position_print_string) - return gatk_filter2_final_vcf_file_no_proximate_snp - - -def FQ_analysis(): - for i in vcf_filenames: - filename_base = os.path.basename(i) - aln_mpileup_vcf_file = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') - analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') - grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file - proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) - (out, err) = proc.communicate() - out = out.strip() - reference_file = out.split(':') - gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) - gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, analysis, reference_file[1]) - grep_fq_field = "awk -F\'\t\' \'{print $8}\' %s | grep -o \'FQ=.*\' | sed \'s/FQ=//g\' | awk -F\';\' \'{print $1}\' > %s_FQ_values" % (gatk_filter2_final_vcf_file_no_proximate_snp, analysis) - os.system(grep_fq_field) - -def DP_analysis(): - create_job_DP(args.jobrun, vcf_filenames) - paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir - for i in vcf_filenames: - label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') - paste_command = paste_command + " " + label_file - - paste_file = args.filter2_only_snp_vcf_dir + 
"/paste_DP_files.sh" - f2=open(paste_file, 'w+') - paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir - #os.system(paste_command) - f2.write(paste_command + '\n') - cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - #os.system(cat_header) - f2.write(cat_header + '\n') - sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir) - #os.system(sed_command) - f2.write(sed_command + '\n') - cmd = "bash %s" % paste_file - os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - # print "Analyzing positions that were filtered out due to Depth..." - # extract_DP_positions = "awk -F\'\\t\' \'{print $1}\' temp_Only_filtered_positions_for_closely_matrix_DP.txt | sed \'/^$/d\' > extract_DP_positions.txt" - # os.system(extract_DP_positions) - # - # for i in vcf_filenames: - # filename_base = os.path.basename(i) - # aln_mpileup_vcf_file = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_aln_mpileup_raw.vcf_5bp_indel_removed.vcf') - # analysis = filename_base.replace('_filter2_final.vcf_no_proximate_snp.vcf', '') - # grep_reference_file = "grep \'^##reference\' %s" % aln_mpileup_vcf_file - # proc = subprocess.Popen([grep_reference_file], stdout=subprocess.PIPE, shell=True) - # (out, err) = proc.communicate() - # out = out.strip() - # reference_file = out.split(':') - # #gatk_filter2_final_vcf_file = gatk_filter2(aln_mpileup_vcf_file, temp_dir, analysis, reference_file[1]) - # #gatk_filter2_final_vcf_file_no_proximate_snp = remove_proximate_snps(gatk_filter2_final_vcf_file, temp_dir, analysis, reference_file[1]) - # DP_values_file = "%s/%s_DP_values" % (args.filter2_only_snp_vcf_dir, analysis) - # f2=open(DP_values_file, 'w+') - # - # - # with 
open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_filess: - # csv_readerr = csv.reader(csv_filess, delimiter='\t') - # next(csv_readerr, None) - # for rows in csv_readerr: - # #print rows - # #grep_dp_field = "grep -wP \'^\S+\s+%s\s+\b\' %s | awk -F\'\\t\' \'{print $8}\' | grep -o \'DP=.*\' | sed \'s/DP=//g\' | awk -F\';\' \'{print $1}\'" % (rows[0], aln_mpileup_vcf_file) - # grep_dp_field = "grep -w \'%s\' %s" % (rows[0], aln_mpileup_vcf_file) - # awk_dp_field = "awk -F\'\t\' \'$2 == %s\' %s | awk -F\'\t\' \'{print $8}\' | awk -F\';\' \'{print $1}\' | sed \'s/DP=//g\'" % (rows[0], aln_mpileup_vcf_file) - # #print grep_dp_field - # #proc = subprocess.Popen([grep_dp_field], stdout=subprocess.PIPE, shell=True) - # #(out2, err2) = proc.communicate() - # #out_split = out.split('\n') - # #out = out.strip() - # proc = subprocess.Popen([awk_dp_field], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # (out2, err2) = proc.communicate() - # #print out2.strip() - # if out2: - # #print out2.strip() - # if "INDEL" in out2: - # #print awk_dp_field - # out2 == "NA" - # f2.write(out2.strip() + '\n') - # # if len(out_split) > 1: - # # print out_split[0] - # # # for i in out: - # # # print i - # # line_split = out.split('\t') - # # #print line_split - # # if line_split[1] == rows[0]: - # # DP_field = line_split[7].split(';') - # # DP_value = DP_field[0].replace('DP=', '') - # #print out - # else: - # f2.write("NA\n") - # #print "NA" - # - # paste_command = "paste %s/extract_DP_positions.txt" % args.filter2_only_snp_vcf_dir - # for i in vcf_filenames: - # label_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_DP_values') - # paste_command = paste_command + " " + label_file - # - # paste_file = args.filter2_only_snp_vcf_dir + "/paste_DP_files.sh" - # f2=open(paste_file, 'w+') - # paste_command = paste_command + " > %s/filtered_DP_values_temp.txt" % args.filter2_only_snp_vcf_dir - # 
#os.system(paste_command) - # f2.write(paste_command + '\n') - # cat_header = "cat %s/header.txt %s/filtered_DP_values_temp.txt > %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir, args.filter2_only_snp_vcf_dir) - # #os.system(cat_header) - # f2.write(cat_header + '\n') - # sed_command = "sed -i \'s/_filter2_final.vcf_no_proximate_snp.vcf//g\' %s/filtered_DP_values.txt" % (args.filter2_only_snp_vcf_dir) - # #os.system(sed_command) - # f2.write(sed_command + '\n') - # cmd = "bash %s" % paste_file - # os.system("bash %s/paste_DP_files.sh" % args.filter2_only_snp_vcf_dir) - # - # #os.system(cmd) change - # #subprocess.call(["%s" % cmd], shell=True) - # #subprocess.check_call('%s' % cmd) -def DP_analysis_barplot(): - print "Generating DP barplots data..." - c_reader = csv.reader(open('%s/filtered_DP_values.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t') - columns = list(zip(*c_reader)) - counts = 1 - end = len(vcf_filenames) + 1 - f_bar_count = open("%s/DP_bargraph_counts.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_perc = open("%s/DP_bargraph_percentage.txt" % args.filter2_only_snp_vcf_dir, 'w+') - f_bar_count.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - f_bar_perc.write("Sample\treference_position\toneto5\tsixto10\televento14\tfifteenorabove\n") - for i in xrange(1, end, 1): - # if i == 73: - # print list(columns[i][1:]) - """ Bar Count Statistics: Variant Position Count Statistics """ - reference_position = columns[i].count('NA') - oneto5 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) < 5: - oneto5 += 1 - sixto10 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 5 and int(k) <= 10: - sixto10 += 1 - elevento14 = 0 - for k in list(columns[i][1:]): - if k != "": - if k != "NA": - if int(k) >= 11 and int(k) <= 14: - elevento14 += 1 - fifteenorabove = 0 - for k in list(columns[i][1:]): - if k != "": - if k != 
"NA": - if int(k) >= 15: - fifteenorabove += 1 - total = reference_position + oneto5 + sixto10 + elevento14 + fifteenorabove - filename_count = i - 1 - bar_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position, oneto5, sixto10, elevento14, fifteenorabove) - f_bar_count.write(bar_string) - - """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """ - try: - reference_position_perc = float(reference_position * 100 / total) - except ZeroDivisionError: - reference_position_perc = 0 - try: - oneto5_perc = float(oneto5 * 100 / total) - except ZeroDivisionError: - oneto5_perc = 0 - try: - sixto10_perc = float(sixto10 * 100 / total) - except ZeroDivisionError: - sixto10_perc = 0 - try: - elevento14_perc = float(elevento14 * 100 / total) - except ZeroDivisionError: - elevento14_perc = 0 - try: - fifteenorabove_perc = float(fifteenorabove * 100 / total) - except ZeroDivisionError: - fifteenorabove_perc = 0 - bar_perc_string = "%s\t%s\t%s\t%s\t%s\t%s\n" % (os.path.basename(vcf_filenames[filename_count].replace('_filter2_final.vcf_no_proximate_snp.vcf', '')), reference_position_perc, oneto5_perc, sixto10_perc, elevento14_perc, fifteenorabove_perc) - f_bar_perc.write(bar_perc_string) - - -def extract_only_ref_variant_fasta(): - create_job_fasta(args.jobrun, vcf_filenames) - # f = Fasta(args.reference) - # if len(f.keys()) == 1: - # ref_id = str(f.keys()) - # #print ref_id - # for i in vcf_filenames: - # ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines() - # core_vcf_file = i.replace('_filter2_final.vcf_no_proximate_snp.vcf', '_filter2_final.vcf_core.vcf.gz') - # print core_vcf_file - # #fasta_string = ">%s\n" % core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', '') - # fasta_string = "" - # firstLine = ffp.pop(0) - # #print len(ffp) - # for lines in ffp: - # #next(ffp) - # lines = lines.strip() - 
# #grep_position = "zcat %s | grep -w \'%s\' | awk -F\'\\t\' \'{print $5}\' | wc -l" % (core_vcf_file, lines) - # grep_position = "zcat %s | grep -v \'#\' | awk -F\'\\t\' \'{ if ($2 == %s) print $0 }\' | awk -F\'\\t\' \'{print $5}\'" % (core_vcf_file, lines) - # #print grep_position - # proc = subprocess.Popen([grep_position], stdout=subprocess.PIPE, shell=True) - # (out, err) = proc.communicate() - # out = out.strip() - # if out and "," not in out: - # print out - # fasta_string = fasta_string + out - # else: - # extract_base = "tr -d \'\\n\' < %s | cut -b%s" % (args.reference, lines) - # #print extract_base - # # proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) - # # (out, err) = proc.communicate() - # # out = out.strip() - # # fasta_string = fasta_string + out - # # if not out: - # # print "Error extracting reference allele" - # #out = str(f.sequence({'chr': str(f.keys()[0]), 'start': int(lines), 'stop': int(lines)})) - # #print str(f.sequence({'chr': str(f.keys()[0]), 'start': int(lines), 'stop': int(lines)})) - # fasta_string = fasta_string + str(f.sequence({'chr': str(f.keys()[0]), 'start': int(lines), 'stop': int(lines)})) - # - # pattern = re.compile(r'\s+') - # fasta_string = re.sub(pattern, '', fasta_string) - # final_fasta_string = ">%s\n" % os.path.basename(core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', '')) + fasta_string - # fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(core_vcf_file.replace('_filter2_final.vcf_core.vcf.gz', ''))), 'w+') - # print final_fasta_string - # fp.write(final_fasta_string) - # fp.close() - # #ffp.close() - # sequence_lgth_cmd = "for i in %s/*_variants.fa; do bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % args.filter2_only_snp_vcf_dir - # os.system(sequence_lgth_cmd) - - - - - - - - - - -# def extract_only_ref_variant_fasta(): -# # f = Fasta(args.reference) -# # if len(f.keys()) == 1: -# # ref_id = str(f.keys()) -# if args.numcores: -# 
num_cores = int(num_cores) -# else: -# num_cores = multiprocessing.cpu_count() -# #results = Parallel(n_jobs=num_cores)(delayed(extract_only_ref_variant_fasta_jobs)(file) for file in vcf_filenames) -# for file in vcf_filenames: -# t1 = FuncThread(extract_only_ref_variant_fasta_jobs, file) -# t1.start() -# t1.join() - -def extract_only_ref_variant_fasta_from_reference(): - ffp = open("%s/Only_ref_variant_positions_for_closely" % args.filter2_only_snp_vcf_dir).readlines() - fasta_string = "" - firstLine = ffp.pop(0) - for lines in ffp: - #next(ffp) - lines = lines.strip() - extract_base = "grep -v \'>\' %s | tr -d \'\\n\'| cut -b%s" % (args.reference, lines) - proc = subprocess.Popen([extract_base], stdout=subprocess.PIPE, shell=True) - (out, err) = proc.communicate() - out = out.strip() - fasta_string = fasta_string + out - if not out: - print "Error extracting reference allele" - - pattern = re.compile(r'\s+') - fasta_string = re.sub(pattern, '', fasta_string) - final_fasta_string = ">%s\n" % os.path.basename(args.reference.replace('.fasta', '')) + fasta_string - fp = open("%s/%s_variants.fa" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.reference.replace('.fasta', ''))), 'w+') - print final_fasta_string - fp.write(final_fasta_string) - fp.close() - #ffp.close() - #sequence_lgth_cmd = "for i in %s/*_variants.fa; do bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % args.filter2_only_snp_vcf_dir - #os.system(sequence_lgth_cmd) - -class FuncThread(threading.Thread): - def __init__(self, target, *args): - self._target = target - self._args = args - threading.Thread.__init__(self) - - def run(self): - self._target(*self._args) - -def someOtherFunc(data, key): - print "someOtherFunc was called : data=%s; key=%s" % (str(data), str(key)) - -def run_phaster(reference_genome): - print "\nRunning Phaster on input reference genome: %s\n" % reference_genome - out_name = (os.path.basename(reference_genome)).split('.') - phaster_post_cmd = "wget 
--post-file=\"%s\" \"http://phaster.ca/phaster_api\" -O %s/%s" % (reference_genome, args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json") - - print "Running: %s\n" % phaster_post_cmd - #os.system(phaster_post_cmd) - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json")) as json_data: - data = json.load(json_data) - print "Status: %s\njob_id: %s\n" % (data["status"], data["job_id"]) - -def parse_phaster(reference_genome): - out_name = (os.path.basename(reference_genome)).split('.') - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_post.json")) as json_data: - data = json.load(json_data) - phaster_get_cmd = "wget \"http://phaster.ca/phaster_api?acc=%s\" -O %s/%s" % (data["job_id"], args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_get.json") - print phaster_get_cmd - - with open('%s/%s' % (args.filter2_only_snp_vcf_dir, str(out_name[0]) + "_phaster_get.json")) as json_get_data: - get_data = json.load(json_get_data) - print get_data["zip"] - phaster_zip_cmd = "wget \"http://%s\" -O %s/%s_phaster_get.zip" % (str(get_data["zip"]), args.filter2_only_snp_vcf_dir, str(out_name[0])) - phaster_unzip_cmd = "unzip %s/%s_phaster_get.zip" % (args.filter2_only_snp_vcf_dir, str(out_name[0])) - print phaster_zip_cmd - print phaster_unzip_cmd - # for key, value in get_data.items(): - # print get_data["zip"][0] - -#####Back up - # def temp_generate_position_label_data_matrix_All_label(): - # - # """ - # Read **temp_label_final_raw.txt** SNP position label data matrix for generating barplot statistics. 
- # """ - # temp_position_label = OrderedDict() - # #changed 20 september - # f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # print_string_header = "\t" - # for i in vcf_filenames: - # print_string_header = print_string_header + os.path.basename(i) + "\t" - # f33.write('\t' + print_string_header.strip() + '\n') - # print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # ref_var = ['reference_allele', 'VARIANT'] - # #changed 20 september - # # changed 20 september newline changed from 'rU' to newline='' - # with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, newline='') as csv_file: - # # print "Reading temporary label positions file: %s/temp_label_final_raw.txt \n" % args.filter2_only_snp_vcf_dir - # csv_reader = csv.reader(csv_file, delimiter='\t') - # next(csv_reader, None) - # for row in csv_reader: - # temp_position_label[row[0]] = row[1:] - # #changed 20 september - # # f33=open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # # print_string_header = "\t" - # # for i in vcf_filenames: - # # print_string_header = print_string_header + os.path.basename(i) + "\t" - # # f33.write('\t' + print_string_header.strip() + '\n') - # #changed 20 september - # - # for value in temp_position_label: - # #changed 20 september - # # lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 
'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # # ref_var = ['reference_allele', 'VARIANT'] - # #changed 20 september - # if set(ref_var) & set(temp_position_label[value]): - # if set(lll) & set(temp_position_label[value]): - # print_string = "" - # for i in temp_position_label[value]: - # print_string = print_string + "\t" + i - # STRR2 = value + print_string + "\n" - # f33.write(STRR2) - # #changed 20 september - # #f33.close() - # #changed 20 september - # csv_file.close() - # #changed 20 september - # f33.close() - # #changed 20 september - # """ - # Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of FQ - # """ - # #temp_position_label_FQ = OrderedDict() - # #changed 20 september - # #f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # #changed 20 september - # #with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - # # print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - # # csv_reader = csv.reader(csv_file, delimiter='\t') - # - # # next(csv_reader, None) - # # for row in csv_reader: - # # temp_position_label_FQ[row[0]] = row[1:] - # # #changed 20 september - # # #f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # # #changed 20 september - # # print_string_header = "\t" - # # for i in vcf_filenames: - # # print_string_header = 
print_string_header + os.path.basename(i) + "\t" - # # f44.write('\t' + print_string_header.strip() + '\n') - # # for value in temp_position_label_FQ: - # # #lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # # lll = ['LowFQ'] - # # #ref_var = ['reference_allele', 'VARIANT'] - # # if set(lll) & set(temp_position_label_FQ[value]): - # # print_string = "" - # # for i in temp_position_label_FQ[value]: - # # print_string = print_string + "\t" + i - # # STRR2 = value + print_string + "\n" - # # f44.write(STRR2) - # #changed 20 september - # #f44.close() - # #changed 20 september - # #csv_file.close() - # #changed 20 september - # #f44.close() - # #changed 20 september - # ## Perform Sed - # # subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 
's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 
's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/LowFQ/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # # subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_FQ.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # """ - # Read temp_Only_filtered_positions_for_closely_matrix file and generate a matrix of positions that are being filtered just because of Dp - # """ - # #temp_position_label_DP = OrderedDict() - # #changed 20 september - # #f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir, 'w+') - # #changed 20 september - # #with open("%s/temp_Only_filtered_positions_for_closely_matrix.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file: - # # print "Reading temporary Only_filtered_positions label file: %s/temp_Only_filtered_positions_for_closely_matrix.txt \n" % args.filter2_only_snp_vcf_dir - # # csv_reader = csv.reader(csv_file, delimiter='\t') - # # next(csv_reader, None) - # # for row in csv_reader: - # # temp_position_label_DP[row[0]] = row[1:] - # # #changed 20 september - # # # f44=open("%s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % 
args.filter2_only_snp_vcf_dir, 'w+') - # # #changed 20 september - # # print_string_header = "\t" - # # for i in vcf_filenames: - # # print_string_header = print_string_header + os.path.basename(i) + "\t" - # # f44.write('\t' + print_string_header.strip() + '\n') - # # for value in temp_position_label_DP: - # # #lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP'] - # # lll = ['HighFQ_DP'] - # #ref_var = ['reference_allele', 'VARIANT'] - # # if set(lll) & set(temp_position_label_FQ[value]): - # # print_string = "" - # # for i in temp_position_label_FQ[value]: - # # print_string = print_string + "\t" + i - # # STRR2 = value + print_string + "\n" - # # f44.write(STRR2) - # #changed 20 september - # #f44.close() - # #changed 20 september - # #csv_file.close() - # #changed 20 september - # #f44.close() - # #changed 20 september - # - # #Perform Sed - # #subprocess.call(["sed -i 's/_filter2_final.vcf_no_proximate_snp.vcf//g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/reference_unmapped_position/0/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/reference_allele/1/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/VARIANT/2/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 
's/LowFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_QUAL_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_DP_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_QUAL_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 
's/HighFQ_DP_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_proximate_SNP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_QUAL_DP/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_DP_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_QUAL/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ_DP/3/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/LowFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # #subprocess.call(["sed -i 's/HighFQ/4/g' %s/temp_Only_filtered_positions_for_closely_matrix_DP.txt" % args.filter2_only_snp_vcf_dir], shell=True) - # - # - # - - - -#Main Steps -if __name__ == '__main__': - - """Start Timer""" - start_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - start_time_2 = datetime.now() - - print "\nThe Script started at: %s\n" % start_time - - print "\nThe Script: parse_vcf_for_reason.py will parse the final vcf files generated from Variant Calling Pipeline to generate:\n\n" \ - "1. Final Core SNP Positions list(Variant positions that were not filtered out in any of the samples and passed all the filters)\n" \ - "2. SNP Positions that were filtered out with labels indicating the reason (Depth, FQ, MQ, Unmapped in one or other samples, Proximate SNPS, Quality of Variant) why they were filtered out.\n" \ - "3. 
Barplot Statistics about the filtered variants and their reason for getting filtered.\n" \ - "4. Final Consensus fasta file generating using Final Core SNP Positions list\n" - - """ Create Temp Directory for storing unwanted temp files generated while running script """ - temp_dir = args.filter2_only_snp_vcf_dir + "/temp" - make_sure_path_exists(temp_dir) - - filter2_only_snp_vcf_filenames = args.filter2_only_snp_vcf_filenames - vcf_filenames = [] - with open(filter2_only_snp_vcf_filenames) as fp: - for line in fp: - line = line.strip() - line = args.filter2_only_snp_vcf_dir + line - vcf_filenames.append(line) - fp.close() - - if "1" in args.steps: - """ - Gather SNP positions from each final *_no_proximate_snp.vcf file (that passed the variant filter parameters - from variant calling pipeline) and write to *_no_proximate_snp.vcf_position files for use in downstream methods - """ - create_positions_filestep(vcf_filenames) - - """ Get the cluster option; create and run jobs based on given parameter """ - create_job(args.jobrun, vcf_filenames) - - """ Find ProPhage region in reference genome """ - #run_phaster(args.reference) - - if "2" in args.steps: - """ Generate SNP Filter Label Matrix """ - #generate_paste_command() - - """ Generate different list of Positions from the **All_label_final_sorted_header.txt** SNP position label data matrix. 
""" - #generate_position_label_data_matrix() - - """ Generate VCF files from final list of variants in Only_ref_variant_positions_for_closely; generate commands for consensus generation """ - generate_vcf_files() - - #extract_only_ref_variant_fasta() - - #extract_only_ref_variant_fasta_from_reference() - - if "3" in args.steps: - """ Analyze the FQ values of all the unique variant """ - #FQ_analysis() - - # """ Analyze the positions that were filtered out only due to insufficient depth""" - #DP_analysis() - # - # """ Generate DP barplots data """ - DP_analysis_barplot() - - if "4" in args.steps: - parse_phaster(args.reference) - - time_taken = datetime.now() - start_time_2 - if args.remove_temp: - del_command = "rm -r %s" % temp_dir - os.system(del_command) - - - - - - - - - - - - - - - diff --git a/modules/variant_diagnostics/reason_job.py b/modules/variant_diagnostics/reason_job.py deleted file mode 100755 index 6d0c4d0..0000000 --- a/modules/variant_diagnostics/reason_job.py +++ /dev/null @@ -1,165 +0,0 @@ -__author__ = 'alipirani' - -import argparse -import re -import os -import csv -import subprocess -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -from cyvcf2 import VCF -import timeit -import ConfigParser -from config_settings import ConfigSectionMap -from logging_subprocess import * -from log_modules import * - -parser = argparse.ArgumentParser(description='Creating Label files individual jobs') -parser.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') -parser.add_argument('-filter2_only_snp_vcf_file', action='store', dest="filter2_only_snp_vcf_file", - help='Names of filter2 only SNP vcf file') -parser.add_argument('-unique_position_file', action='store', dest="unique_position_file", - help='Names of unique positions file') -parser.add_argument('-tmp_dir', 
action='store', dest="tmp_dir", - help='Names of temporary directory') -args = parser.parse_args() - -"""Set variables and set up the tmp directories""" -dir = args.filter2_only_snp_vcf_dir -unique_positions_file = args.unique_position_file -os.system("mkdir %s" % args.tmp_dir) -os.system("cp %s %s/%s" % (args.filter2_only_snp_vcf_file, args.tmp_dir, os.path.basename(args.filter2_only_snp_vcf_file))) - -""" Generate unique positions array""" -position_array_sort = [] -f = open(unique_positions_file, 'r+') -for line in f: - line = line.strip() - position_array_sort.append(line) -f.close() - -""" Prepare output label file """ -file = args.tmp_dir + "/" + os.path.basename(args.filter2_only_snp_vcf_file) -print "Processing %s" % file -out_file_name = args.filter2_only_snp_vcf_file + "_positions_label" - -""" Get the prefix for all the arrays """ -array_name = os.path.basename(out_file_name) - -#Changed 8 March -""" Generate proximate, unmapped, variant positions array""" -ori_unmapped_file = out_file_name.replace("filter2_final.vcf_no_proximate_snp.vcf_positions_label", "unmapped.bed_positions") -ori_proximate_file = out_file_name.replace("filter2_final.vcf_no_proximate_snp.vcf_positions_label", "filter2_final.vcf_no_proximate_snp.vcf_positions_array") -ori_variant_position_file = out_file_name.replace("filter2_final.vcf_no_proximate_snp.vcf_positions_label", "filter2_final.vcf_no_proximate_snp.vcf") -ori_mpileup_file = out_file_name.replace("filter2_final.vcf_no_proximate_snp.vcf_positions_label", "aln_mpileup_raw.vcf_5bp_indel_removed.vcf") - -current_unmapped_file = args.tmp_dir + "/%s" % (os.path.basename(ori_unmapped_file)) -current_proximate_file = args.tmp_dir + "/%s" % (os.path.basename(ori_proximate_file)) -current_variant_position_file = args.tmp_dir + "/%s" % (os.path.basename(ori_variant_position_file)) -current_mpileup_file = args.tmp_dir + "/%s" % (os.path.basename(ori_mpileup_file)) - -os.system("cp %s %s/" % (ori_unmapped_file, args.tmp_dir)) 
-os.system("cp %s %s/" % (ori_proximate_file, args.tmp_dir)) -os.system("cp %s %s/" % (ori_variant_position_file, args.tmp_dir)) -os.system("cp %s %s/" % (ori_mpileup_file, args.tmp_dir)) - - - -#variant position array -variant_position_array = "variant_" + str(array_name) -variant_position_array = [] -with open(current_variant_position_file, 'rU') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - variant_position_array.append(row[1]) -csv_file.close() - -#unmapped position array -unmapped_array = "unmapped_" + str(array_name) -unmapped_array = {} -with open(current_unmapped_file, 'rU') as fp1: - for line in fp1: - line = line.strip() - unmapped_array[line] = "" -fp1.close() - -#proximate position array -proximate_array = "proximate_" + str(array_name) -proximate_array = {} -with open(current_proximate_file, 'rU') as fp2: - for liness in fp2: - liness = liness.strip() - proximate_array[liness] = "" -fp2.close() - -""" Prepare cyvcf vcf files """ -#bgzip_cmd = "for i in %s/*.vcf; do bgzip -c $i > $i%s; done" % (args.filter2_only_snp_vcf_dir, ".gz") -#tabix_cmd = "for i in %s/*.vcf.gz; do tabix $i; done" % (args.filter2_only_snp_vcf_dir) -#os.system(bgzip_cmd) -#os.system(tabix_cmd) - -""" Load Cyvcf objects """ -vcf_final_file = VCF(args.filter2_only_snp_vcf_file + ".gz") -mpileup_file = VCF(ori_mpileup_file + ".gz") - -reference_genome = vcf_final_file.seqnames[0] - -positions_final_vcf = defaultdict(list) -positions_mpileup_vcf = defaultdict(list) - -for variants in VCF(args.filter2_only_snp_vcf_file + ".gz"): - positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - -for variants in VCF(ori_mpileup_file + ".gz"): - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) - 
positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) - - - -""" Generate label files and check why the positions were filtered out from the final vcf file """ -def get_reason(): - f1=open(out_file_name, 'w+') - for j in position_array_sort: - """ Check if the unique position is present in the final no_proximate_snp.vcf file """ - if int(j) not in positions_final_vcf.keys(): - if int(j) not in positions_mpileup_vcf.keys(): - if j in unmapped_array.keys(): - st = "reference_unmapped_position\n" - f1.write(st) - else: - st = "reference_allele\n" - f1.write(st) - else: - if j in proximate_array.keys(): - pst = "_proximate_SNP" - else: - pst = "" - if positions_mpileup_vcf[int(j)][1] < -40: - st = "HighFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 15: - st = st + "_DP" - else: - st = "LowFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 15: - st = st + "_DP" - st = st + pst + "\n" - f1.write(st) - else: - st = "VARIANT" + "\n" - f1.write(st) - f1.close() - - -print "Time taken to execute this code block: %s" % (timeit.timeit(get_reason, number=1)) diff --git a/modules/variant_diagnostics/reason_job_debug.py b/modules/variant_diagnostics/reason_job_debug.py index a75afad..b357698 100755 --- a/modules/variant_diagnostics/reason_job_debug.py +++ b/modules/variant_diagnostics/reason_job_debug.py @@ -11,10 +11,12 @@ import multiprocessing from cyvcf2 import VCF import timeit +import time import ConfigParser from config_settings import ConfigSectionMap from logging_subprocess import * from log_modules import * +from memory_profiler import profile parser = argparse.ArgumentParser(description='Creating Label files individual jobs') parser.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", @@ -27,6 +29,7 @@ help='Names 
of temporary directory') args = parser.parse_args() + """Set variables and set up the tmp directories""" dir = args.filter2_only_snp_vcf_dir unique_positions_file = args.unique_position_file @@ -67,64 +70,152 @@ os.system("cp -f %s %s/" % (ori_mpileup_file, args.tmp_dir)) +# Optimization changes +# #variant position array +# variant_position_array = "variant_" + str(array_name) +# variant_position_array = [] +# with open(current_variant_position_file, 'rU') as csv_file: +# csv_reader = csv.reader(csv_file, delimiter='\t') +# for row in csv_reader: +# position = row[0] +# if not position.startswith('#'): +# variant_position_array.append(row[1]) +# csv_file.close() + +# Optimization changes +def generate_dicts(): + #unmapped position dict + program_starts = time.time() + global unmapped_array + unmapped_array = "unmapped_" + str(array_name) + unmapped_array = {} + with open(current_unmapped_file, 'rU') as fp1: + for line in fp1: + line = line.strip() + unmapped_array[line] = "" + fp1.close() + now = time.time() + print "Time taken to load unmapped positions array - {0} seconds".format(now - program_starts) + + #proximate position dict + program_starts = time.time() + global proximate_array + proximate_array = "proximate_" + str(array_name) + proximate_array = {} + with open(current_proximate_file, 'rU') as fp2: + for liness in fp2: + liness = liness.strip() + proximate_array[liness] = "" + fp2.close() + now = time.time() + print "Time taken to load proximate positions array - {0} seconds".format(now - program_starts) + + """ Prepare cyvcf vcf files - Load Cyvcf objects """ + # Optimization changes + program_starts = time.time() + global positions_final_vcf + global positions_mpileup_vcf + positions_final_vcf = defaultdict(list) + positions_mpileup_vcf = defaultdict(list) + + for variants in VCF(args.filter2_only_snp_vcf_file + ".gz"): + positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) + now = time.time() + print "Time taken to load filtered 
positions array - {0} seconds".format(now - program_starts) + + program_starts = time.time() + for variants in VCF(ori_mpileup_file + ".gz"): + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) + positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) + now = time.time() + print "Time taken to load raw vcf data array - {0} seconds".format(now - program_starts) + +# @profile +def get_reason(): + generate_dicts() + #print "Time taken to generate dictionaries: %s" % (timeit.timeit(generate_dicts, number=1)) + #program_starts = time.time() + f1=open(out_file_name, 'w+') + + # Older chunk of code -slower + # for j in position_array_sort: + # """ Check if the unique position is present in the final no_proximate_snp.vcf file """ + # if int(j) not in positions_final_vcf.keys(): + # if int(j) not in positions_mpileup_vcf.keys(): + # if j in unmapped_array.keys(): + # st = "reference_unmapped_position\n" + # f1.write(st) + # else: + # st = "reference_allele\n" + # f1.write(st) + # else: + # if j in proximate_array.keys(): + # pst = "_proximate_SNP" + # else: + # pst = "" + # if positions_mpileup_vcf[int(j)][1] < -40: + # st = "HighFQ" + # if positions_mpileup_vcf[int(j)][2] < 100.00: + # st = st + "_QUAL" + # if positions_mpileup_vcf[int(j)][0] < 15: + # st = st + "_DP" + # else: + # st = "LowFQ" + # if positions_mpileup_vcf[int(j)][2] < 100.00: + # st = st + "_QUAL" + # if positions_mpileup_vcf[int(j)][0] < 15: + # st = st + "_DP" + # st = st + pst + "\n" + # f1.write(st) + # else: + # st = "VARIANT" + "\n" + # f1.write(st) + # now = time.time() + # print "Time taken to iterate the loop once - {0} seconds".format(now - program_starts) + + # Newer chunk of code -faster + for j in position_array_sort: + """ Check if the 
unique position is present in the final no_proximate_snp.vcf file """ + if not positions_final_vcf.has_key(int(j)): + if not positions_mpileup_vcf.has_key(int(j)): + if unmapped_array.has_key(j): + st = "reference_unmapped_position\n" + f1.write(st) + else: + st = "reference_allele\n" + f1.write(st) + else: + if proximate_array.has_key(j): + pst = "_proximate_SNP" + else: + pst = "" + if positions_mpileup_vcf[int(j)][1] < -40: + st = "HighFQ" + if positions_mpileup_vcf[int(j)][2] < 100.00: + st = st + "_QUAL" + if positions_mpileup_vcf[int(j)][0] < 15: + st = st + "_DP" + else: + st = "LowFQ" + if positions_mpileup_vcf[int(j)][2] < 100.00: + st = st + "_QUAL" + if positions_mpileup_vcf[int(j)][0] < 15: + st = st + "_DP" + st = st + pst + "\n" + f1.write(st) + else: + st = "VARIANT" + "\n" + f1.write(st) + now = time.time() + #print "Time taken to iterate the loop once - {0} seconds".format(now - program_starts) + f1.close() + +print "Time taken to execute this code block: %s" % (timeit.timeit(get_reason, number=1)) -#variant position array -variant_position_array = "variant_" + str(array_name) -variant_position_array = [] -with open(current_variant_position_file, 'rU') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - variant_position_array.append(row[1]) -csv_file.close() - -#unmapped position array -unmapped_array = "unmapped_" + str(array_name) -unmapped_array = {} -with open(current_unmapped_file, 'rU') as fp1: - for line in fp1: - line = line.strip() - unmapped_array[line] = "" -fp1.close() - -#proximate position array -proximate_array = "proximate_" + str(array_name) -proximate_array = {} -with open(current_proximate_file, 'rU') as fp2: - for liness in fp2: - liness = liness.strip() - proximate_array[liness] = "" -fp2.close() - -""" Prepare cyvcf vcf files """ -#bgzip_cmd = "for i in %s/*.vcf; do bgzip -c $i > $i%s; done" % (args.filter2_only_snp_vcf_dir, ".gz") 
-#tabix_cmd = "for i in %s/*.vcf.gz; do tabix $i; done" % (args.filter2_only_snp_vcf_dir) -#os.system(bgzip_cmd) -#os.system(tabix_cmd) - -""" Load Cyvcf objects """ -vcf_final_file = VCF(args.filter2_only_snp_vcf_file + ".gz") -mpileup_file = VCF(ori_mpileup_file + ".gz") - -reference_genome = vcf_final_file.seqnames[0] - -positions_final_vcf = defaultdict(list) -positions_mpileup_vcf = defaultdict(list) - -for variants in VCF(args.filter2_only_snp_vcf_file + ".gz"): - positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - -for variants in VCF(ori_mpileup_file + ".gz"): - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) - - - -""" Generate label files and check why the positions were filtered out from the final vcf file """ +# """ Generate label files and check why the positions were filtered out from the final vcf file """ # def get_reason(): # f1=open(out_file_name, 'w+') # for j in position_array_sort: @@ -159,43 +250,4 @@ # else: # st = "VARIANT" + "\n" # f1.write(st) -# f1.close() - -def get_reason(): - f1=open(out_file_name, 'w+') - for j in position_array_sort: - """ Check if the unique position is present in the final no_proximate_snp.vcf file """ - if int(j) not in positions_final_vcf.keys(): - if int(j) not in positions_mpileup_vcf.keys(): - if j in unmapped_array.keys(): - st = "reference_unmapped_position\n" - f1.write(st) - else: - st = "reference_allele\n" - f1.write(st) - else: - if j in proximate_array.keys(): - pst = "_proximate_SNP" - else: - pst = "" - if positions_mpileup_vcf[int(j)][1] < -40: - st = "HighFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 
15: - st = st + "_DP" - else: - st = "LowFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 15: - st = st + "_DP" - st = st + pst + "\n" - f1.write(st) - else: - st = "VARIANT" + "\n" - f1.write(st) - f1.close() - - -print "Time taken to execute this code block: %s" % (timeit.timeit(get_reason, number=1)) +# f1.close() \ No newline at end of file diff --git a/modules/variant_diagnostics/reason_job_indel_debug.py b/modules/variant_diagnostics/reason_job_indel_debug.py deleted file mode 100755 index 0772a49..0000000 --- a/modules/variant_diagnostics/reason_job_indel_debug.py +++ /dev/null @@ -1,169 +0,0 @@ -__author__ = 'alipirani' - -import argparse -import re -import os -import csv -import subprocess -from collections import OrderedDict -from collections import defaultdict -from joblib import Parallel, delayed -import multiprocessing -from cyvcf2 import VCF -import timeit -import ConfigParser -from config_settings import ConfigSectionMap -from logging_subprocess import * -from log_modules import * - -parser = argparse.ArgumentParser(description='Creating Label files individual jobs') -parser.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", - help='Directory where all the filter2 only SNP vcf files are saved.') -parser.add_argument('-filter2_only_snp_vcf_file', action='store', dest="filter2_only_snp_vcf_file", - help='Names of filter2 only SNP vcf file') -parser.add_argument('-unique_position_file', action='store', dest="unique_position_file", - help='Names of unique positions file') -parser.add_argument('-tmp_dir', action='store', dest="tmp_dir", - help='Names of temporary directory') -args = parser.parse_args() - -indel_file = (args.filter2_only_snp_vcf_file).replace('_final.vcf_no_proximate_snp.vcf', '_indel_final.vcf') - -"""Set variables and set up the tmp directories""" -dir = args.filter2_only_snp_vcf_dir -unique_positions_file = args.unique_position_file 
-os.system("mkdir %s" % args.tmp_dir) -os.system("cp %s %s/%s" % (indel_file, args.tmp_dir, os.path.basename(indel_file))) - -""" Generate unique positions array""" -position_array_sort = [] -f = open(unique_positions_file, 'r+') -for line in f: - line = line.strip() - position_array_sort.append(line) -f.close() - -""" Prepare output label file """ -file = args.tmp_dir + "/" + os.path.basename(indel_file) -print "Processing %s" % file -out_file_name = indel_file + "_indel_positions_label" - -""" Get the prefix for all the arrays """ -array_name = os.path.basename(out_file_name) - -#Changed 8 March -""" Generate proximate, unmapped, variant positions array""" -ori_unmapped_file = out_file_name.replace("filter2_indel_final.vcf_indel_positions_label", "unmapped.bed_positions") -ori_proximate_file = out_file_name.replace("filter2_indel_final.vcf_indel_positions_label", "filter2_final.vcf_no_proximate_snp.vcf_positions_array") -ori_variant_position_file = out_file_name.replace("filter2_indel_final.vcf_indel_positions_label", "filter2_indel_final.vcf") -ori_mpileup_file = out_file_name.replace("filter2_indel_final.vcf_indel_positions_label", "aln_mpileup_raw.vcf") - -current_unmapped_file = args.tmp_dir + "/%s" % (os.path.basename(ori_unmapped_file)) -current_proximate_file = args.tmp_dir + "/%s" % (os.path.basename(ori_proximate_file)) -current_variant_position_file = args.tmp_dir + "/%s" % (os.path.basename(ori_variant_position_file)) -current_mpileup_file = args.tmp_dir + "/%s" % (os.path.basename(ori_mpileup_file)) - -os.system("cp %s %s/" % (ori_unmapped_file, args.tmp_dir)) -os.system("cp %s %s/" % (ori_proximate_file, args.tmp_dir)) -os.system("cp %s %s/" % (ori_variant_position_file, args.tmp_dir)) -os.system("cp %s %s/" % (ori_mpileup_file, args.tmp_dir)) - - - -#variant position array -variant_position_array = "variant_" + str(array_name) -variant_position_array = [] -with open(current_variant_position_file, 'rU') as csv_file: - csv_reader = 
csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - variant_position_array.append(row[1]) -csv_file.close() - -#unmapped position array -unmapped_array = "unmapped_" + str(array_name) -unmapped_array = {} -with open(current_unmapped_file, 'rU') as fp1: - for line in fp1: - line = line.strip() - unmapped_array[line] = "" -fp1.close() - -#proximate position array -proximate_array = "proximate_" + str(array_name) -proximate_array = {} -with open(current_proximate_file, 'rU') as fp2: - for liness in fp2: - liness = liness.strip() - proximate_array[liness] = "" -fp2.close() - -""" Prepare cyvcf vcf files """ -#bgzip_cmd = "for i in %s/*.vcf; do bgzip -c $i > $i%s; done" % (args.filter2_only_snp_vcf_dir, ".gz") -#tabix_cmd = "for i in %s/*.vcf.gz; do tabix $i; done" % (args.filter2_only_snp_vcf_dir) -#os.system(bgzip_cmd) -#os.system(tabix_cmd) - - - -""" Load Cyvcf objects """ -vcf_final_file = VCF(indel_file + ".gz") -mpileup_file = VCF(ori_mpileup_file + ".gz") - -reference_genome = vcf_final_file.seqnames[0] - -positions_final_vcf = defaultdict(list) -positions_mpileup_vcf = defaultdict(list) - -for variants in VCF(indel_file + ".gz"): - positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - -for variants in VCF(ori_mpileup_file + ".gz"): - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) - - - -""" Generate label files and check why the positions were filtered out from the final vcf file """ -def get_reason(): - f1=open(out_file_name, 'w+') - for j in position_array_sort: - """ Check if the unique position is present in the final no_proximate_snp.vcf file """ - if int(j) 
not in positions_final_vcf.keys(): - if int(j) not in positions_mpileup_vcf.keys(): - if j in unmapped_array.keys(): - st = "reference_unmapped_position\n" - f1.write(st) - else: - st = "reference_allele\n" - f1.write(st) - else: - if j in proximate_array.keys(): - pst = "_proximate_SNP" - else: - pst = "" - if positions_mpileup_vcf[int(j)][1] < -40: - st = "HighFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 10: - st = st + "_DP" - else: - st = "LowFQ" - if positions_mpileup_vcf[int(j)][2] < 100.00: - st = st + "_QUAL" - if positions_mpileup_vcf[int(j)][0] < 10: - st = st + "_DP" - st = st + pst + "\n" - f1.write(st) - else: - st = "VARIANT" + "\n" - f1.write(st) - f1.close() - - -print "Time taken to execute this code block: %s" % (timeit.timeit(get_reason, number=1)) diff --git a/modules/variant_diagnostics/reason_job_indel_debug_gatk.py b/modules/variant_diagnostics/reason_job_indel_debug_gatk.py index 01bf438..d46578d 100755 --- a/modules/variant_diagnostics/reason_job_indel_debug_gatk.py +++ b/modules/variant_diagnostics/reason_job_indel_debug_gatk.py @@ -11,10 +11,12 @@ import multiprocessing from cyvcf2 import VCF import timeit +import time import ConfigParser from config_settings import ConfigSectionMap from logging_subprocess import * from log_modules import * +from memory_profiler import profile parser = argparse.ArgumentParser(description='Creating Label files individual jobs') parser.add_argument('-filter2_only_snp_vcf_dir', action='store', dest="filter2_only_snp_vcf_dir", @@ -72,81 +74,93 @@ -#variant position array -variant_position_array = "variant_" + str(array_name) -variant_position_array = [] -with open(current_variant_position_file, 'rU') as csv_file: - csv_reader = csv.reader(csv_file, delimiter='\t') - for row in csv_reader: - position = row[0] - if not position.startswith('#'): - variant_position_array.append(row[1]) -csv_file.close() - -#unmapped position array -unmapped_array 
= "unmapped_" + str(array_name) -unmapped_array = {} -with open(current_unmapped_file, 'rU') as fp1: - for line in fp1: - line = line.strip() - unmapped_array[line] = "" -fp1.close() - -#proximate position array -proximate_array = "proximate_" + str(array_name) -proximate_array = {} -with open(current_proximate_file, 'rU') as fp2: - for liness in fp2: - liness = liness.strip() - proximate_array[liness] = "" -fp2.close() - -""" Prepare cyvcf vcf files """ -#bgzip_cmd = "for i in %s/*.vcf; do bgzip -c $i > $i%s; done" % (args.filter2_only_snp_vcf_dir, ".gz") -#tabix_cmd = "for i in %s/*.vcf.gz; do tabix $i; done" % (args.filter2_only_snp_vcf_dir) -#os.system(bgzip_cmd) -#os.system(tabix_cmd) - - - -""" Load Cyvcf objects """ -vcf_final_file = VCF(indel_file + ".gz") -mpileup_file = VCF(ori_mpileup_file + ".gz") - -reference_genome = vcf_final_file.seqnames[0] - -positions_final_vcf = defaultdict(list) -positions_mpileup_vcf = defaultdict(list) - -for variants in VCF(indel_file + ".gz"): - positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - -for variants in VCF(ori_mpileup_file + ".gz"): - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) - #positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) - #positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('QD')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) - positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF')) - #positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) - - +# #variant position array +# variant_position_array = "variant_" + str(array_name) +# variant_position_array = [] +# with open(current_variant_position_file, 'rU') as csv_file: +# csv_reader = csv.reader(csv_file, delimiter='\t') +# for row in csv_reader: +# position = row[0] +# if not position.startswith('#'): +# 
variant_position_array.append(row[1]) +# csv_file.close() + +# Optimization changes +# @profile +def generate_dicts(): + #unmapped position dict + program_starts = time.time() + global unmapped_array + unmapped_array = "unmapped_" + str(array_name) + unmapped_array = {} + with open(current_unmapped_file, 'rU') as fp1: + for line in fp1: + line = line.strip() + unmapped_array[line] = "" + fp1.close() + now = time.time() + print "Time taken to load unmapped positions array - {0} seconds".format(now - program_starts) + + #proximate position array + program_starts = time.time() + global proximate_array + proximate_array = "proximate_" + str(array_name) + proximate_array = {} + with open(current_proximate_file, 'rU') as fp2: + for liness in fp2: + liness = liness.strip() + proximate_array[liness] = "" + fp2.close() + now = time.time() + print "Time taken to load proximate positions array - {0} seconds".format(now - program_starts) + + """ Prepare cyvcf vcf files; Load Cyvcf objects """ + # Optimization changes + program_starts = time.time() + global positions_final_vcf + global positions_mpileup_vcf + positions_final_vcf = defaultdict(list) + positions_mpileup_vcf = defaultdict(list) + + for variants in VCF(indel_file + ".gz"): + positions_final_vcf[int(variants.POS)].append(variants.INFO.get('DP')) + now = time.time() + print "Time taken to load filtered positions array - {0} seconds".format(now - program_starts) + + program_starts = time.time() + for variants in VCF(ori_mpileup_file + ".gz"): + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('DP')) + #positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('FQ')) + #positions_mpileup_vcf[int(variants.POS)].append(variants.QUAL) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('QD')) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('MQ')) + positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF')) + 
#positions_mpileup_vcf[int(variants.POS)].append(variants.INFO.get('AF1')) + + now = time.time() + print "Time taken to load raw vcf data array - {0} seconds".format(now - program_starts) """ Generate label files and check why the positions were filtered out from the final vcf file """ def get_reason(): + generate_dicts() + #print "Time taken to generate dictionaries: %s" % (timeit.timeit(generate_dicts, number=1)) f1=open(out_file_name, 'w+') for j in position_array_sort: """ Check if the unique position is present in the final no_proximate_snp.vcf file """ - if int(j) not in positions_final_vcf.keys(): - if int(j) not in positions_mpileup_vcf.keys(): - if j in unmapped_array.keys(): + #if int(j) not in positions_final_vcf.keys(): + if not positions_final_vcf.has_key(int(j)): + #if int(j) not in positions_mpileup_vcf.keys(): + if not positions_mpileup_vcf.has_key(int(j)): + #if j in unmapped_array.keys(): + if unmapped_array.has_key(j): st = "reference_unmapped_position\n" f1.write(st) else: st = "reference_allele\n" f1.write(st) else: - if j in proximate_array.keys(): + #if j in proximate_array.keys(): + if proximate_array.has_key(j): pst = "_proximate_SNP" else: pst = "" @@ -169,5 +183,4 @@ def get_reason(): f1.write(st) f1.close() - print "Time taken to execute this code block: %s" % (timeit.timeit(get_reason, number=1)) diff --git a/modules/variant_diagnostics/scripts/__pycache__/fasta_functions.cpython-37.pyc b/modules/variant_diagnostics/scripts/__pycache__/fasta_functions.cpython-37.pyc new file mode 100644 index 0000000..e470971 Binary files /dev/null and b/modules/variant_diagnostics/scripts/__pycache__/fasta_functions.cpython-37.pyc differ diff --git a/modules/variant_diagnostics/scripts/fasta_functions.py b/modules/variant_diagnostics/scripts/fasta_functions.py new file mode 100755 index 0000000..1e5826e --- /dev/null +++ b/modules/variant_diagnostics/scripts/fasta_functions.py @@ -0,0 +1,284 @@ +# Functions to manipulate fasta files + +# Functions: +# 
subset_fasta +# get_fasta_subsets +# rm_invar_sites +# mask_positions +# count_invar_sites +# + +# Import modules +from subprocess import call +from Bio import AlignIO +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +import pandas as pd +from collections import defaultdict +from collections import Counter +import re +import os + +def subset_fasta(ids, fastain, fastaout, keep=True): + """Creates a new fasta file with a subset of original sequences. + + Args: + ids: List of sequence names to be extracted (default) or removed (change keep to False). + fastain: Path to fasta file containing all sequences of interest. + fastaout: Name of output fasta file containing only the sequences of interest. + keep: If ids is a list of sequences to keep (True) or remove (False). + + Output: + Fasta file containing only the sequences of interest. + """ + + with open(fastaout, 'w') as f: + for rec in SeqIO.parse(fastain, 'fasta'): + if keep: + if str(rec.id) in ids: + f.write('>' + ids[ids.index(rec.id)] + '\n') + f.write(str(rec.seq) + '\n') + else: + if str(rec.id) not in ids: + f.write('>' + ids[ids.index(rec.id)] + '\n') + f.write(str(rec.seq) + '\n') + + +def get_fasta_subsets(csv, fasta): + """Creates new fasta files with subsets of the original sequences. + + Args: + csv: csv file where each line contains a different subset of sequence names to be extracted. + fasta: Path to fasta file containing all sequences of interest. + + Output: + Fasta files containing only the sequences of interest. + + Returns: + List of paths to subsetted fasta files. 
+ """ + + fas = [] + + with open(csv) as f: + csv_reader = csv.reader(f, delimiter=',') + count = 0 + for subset in csv_reader: + count += 1 + fastaout = fasta.split('.')[0] + '_subset' + str(count) + '.fasta' + fa = subset_fasta(subset, fasta, fastaout) + fas.append(fa) + + return(fas) + + +def rm_invar_sites(fasta, outfile=None, outfmt=['fasta','vcf'], outdir='.', + path={'snpsites':'/nfs/esnitkin/bin_group/anaconda3/bin/snp-sites'}): + """Removes invariant sites from a multifasta file using snp-sites. + + Args: + fasta: Alignment fasta file from which to get variant sites. + outfile: (Optional) Output file name. + outfmt: Format of output file (fasta or vcf). + outdir: (Optional) Output directory (default: current working directory). + path: Dictionary of paths including the path to snp-sites (key: snpsites, value: path) + + Output: + Fasta or VCF file of variant sites. + + Returns: + Path to output file. + """ + + if len(outfmt) > 1: + outfmt = outfmt[0] + + # Get output fasta file name + if outfile == None: + outfile = outdir + '/' + re.split('/|\.', fasta)[-2] + 'var_sites.' + outfmt + + # Get output format + if outfmt == 'fasta': + flag = ' -m ' + elif outfmt == 'vcf': + flag = ' -v ' + + # Get variant sites and write to fasta file using snp-sites + print('Getting variant sites using snp-sites.') + cmd = path['snpsites'] + ' ' + fasta + \ + flag + ' -o ' + outfile + + os.system(cmd) + print(outfile, 'written.') + + return(outfile) + + +def mask_positions(fasta, gff, outdir='.', masked_sites_file=False): + """Masks positions in GFF file in alignment file in fasta format. + + If fasta is the whole genome alignment fasta used in gubbins + and gff is the gubbins output GFF file, then recombinant + regions identified by gubbins are masked (on a per-genome basis). + Optionally returns a text file with list of masked positions in each genome. + + Args: + fasta: Alignment fasta file in which sites will be masked. 
+ gff: GFF file containing sites that will be masked in certain genomes + (ex. gubbins output GFF). + outdir: Output file directory (default: current working directory). + masked_sites_file: If true, generates a text file with a list of masked positions in each genome. + + Output: + Masked whole genome alignment file in FASTA format. + (Optional) Text file with list of masked positions in each genome. + + Returns: + Path to masked fasta file. + """ + + # Read in alignment and gff file + print('Reading ', gff, '.', sep='') + gff = pd.read_csv(gff, sep='\t', skiprows=2, header=None) + print('Reading ', fasta, '.', sep='') + aln = AlignIO.read(fasta, 'fasta') + + # Get indices/positions of recombinant regions identified by gubbins + print('Getting recombinant positions.') + recomb_regions = defaultdict(list) + + for row in gff.iterrows(): + start = row[1][3] + end = row[1][4] + region = list(range(start, end)) + taxa = row[1][8].split(';')[2] + taxa = taxa.replace('taxa=\"', '') + taxa = taxa.replace('\"', '') + taxa = list(taxa.split()) + for isolate in taxa: + for position in region: + recomb_regions[isolate].append(position) + + # Mask indices/positions of recombinant regions identified by gubbins + print('Masking recombinant positions in whole genome alignment.') + sample_masked_indices = defaultdict(list) + new_aln = list() + + for record in aln: + seq_str = list(str(record.seq)) + masked_indices = recomb_regions.get(record.id, []) + for index in masked_indices: + seq_str[index] = 'N' + seq_str = ''.join(seq_str) + new_record = SeqRecord(Seq(seq_str), id=record.id, description='') + sample_masked_indices[record.id] = masked_indices + new_aln.append(new_record) + + # Write new FASTA file with recombinant regions masked + fasta_outfile = outdir + '/' + re.split('/|\.', fasta)[-2] + \ + '_gubbins_masked.fa' + + print('Writing', fasta_outfile) + with open(fasta_outfile, 'w') as handle: + SeqIO.write(new_aln, handle, 'fasta') + + if masked_sites_file: + # Write text 
file with list of recombinant sites for each genome + text_outfile = outdir + '/' + re.split('/|\.', fasta)[-2] + \ + '_masked_recomb_positions.txt' + print('Writing', text_outfile) + with open(text_outfile, 'w') as handle: + for sample, positions in sample_masked_indices.items(): + line = str(sample) + '\t' + ','.join(map(str, positions)) + '\n' + handle.write(line) + + return(fasta_outfile) + +def count_invar_sites(fasta,gff=None,outdir='.',path={'snpsites':'/nfs/esnitkin/bin_group/anaconda3/bin/snp-sites'}): + """Counts invariant sites in an alignment file (fasta format). + + Gets invariant site count for As, Cs, Gs, and Ts from an alignment file. + If gff is not None, positions in the GFF file will be masked before + invariant sites are counted. + + Args: + fasta: Path to alignment file in fasta format. + gff: [Optional] GFF file of sections of genomes to mask (ex. gubbins output GFF). + outdir: Output file directory (default: current working directory). + path: Dictionary of paths including the path to snp-sites (key: snpsites, value: path) + + Output: + Text file (*_invar_site_counts.txt) with invariant site counts in the following order: A,C,G,T. + VCF file of variants (created by snp-sites) + If GFF path given, masked fasta file (*_gubbins_masked.fa). + + Returns: + Name of text file with invariant site counts (*_invar_site_counts.txt). 
+ + """ + + # Mask recombinant regions before counting invariant sites + if gff is not None: + aln_file = mask_positions(fasta, gff) + else: + aln_file = fasta + + # Count invariant sites in whole genome alignment + + # Read in alignment + print('Reading ', aln_file, '.', sep='') + aln = AlignIO.read(aln_file, 'fasta') + + # Get variant positions + var_site_outfile = outdir + '/' + re.split('/|\.', aln_file)[-2] + \ + '_snp-sites.vcf' + print('Getting variant positions using snp-sites.') + cmd = path['snpsites'] + ' ' + aln_file + \ + ' -v ' + ' -o ' + var_site_outfile + + os.system(cmd) + + positions = [] + with open(var_site_outfile) as f: + for line in f: + li=line.strip() + if not li.startswith("#"): + positions.append(line.split('\t')[1]) + + # Get allele for invariant sites + invar_sites = [] + for record in aln: + seq_str = list(str(record.seq)) + for index in positions: + index = int(index) + seq_str[index] = '' + #tmp = ''.join(seq_str) + if len(invar_sites) == 0: + invar_sites = seq_str + else: + for i,b in enumerate(invar_sites): + if b is 'N' or b is 'n': + invar_sites[i] = seq_str[i] + invar_counts = Counter(invar_sites) + if invar_counts['N'] == 0 and invar_counts['n'] == 0: + break + + del invar_counts[''] + + # Get invariant site count for each base + #print('Counting bases.') + #invar_counts = Counter(invar_sites) + + # Write base counts to files + invar_counts_file = outdir + '/' + re.split('/|\.', aln_file)[-2] + \ + '_invar_site_counts.txt' + print('Writing ', invar_counts_file, ' (order: A C G T).', sep='') + with open(invar_counts_file, 'w') as f: + for base, count in sorted(invar_counts.items()): + if base in ['A','a','C','c','G','g','T','t']: + print(base + ' ' + str(count)) + f.write('%s ' % (count)) + #else: + # print(base + ' ' + str(count)) + return(invar_counts_file) diff --git a/modules/variant_diagnostics/scripts/gubbins_iqtree.py b/modules/variant_diagnostics/scripts/gubbins_iqtree.py new file mode 100755 index 0000000..52ef749 --- 
/dev/null +++ b/modules/variant_diagnostics/scripts/gubbins_iqtree.py @@ -0,0 +1,103 @@ +# script to run gubbins and make tree +# written in python3 + +# modules needed: gubbins + +# import modules +import argparse +import sys +import re +import os + +# set path +sys.path.insert(1, '/nfs/esnitkin/bin_group/pipeline/Github/beast/') + +# import functions +import fasta_functions + +# parse command line arguments +parser = argparse.ArgumentParser(description='''Make tree using iqtree; + optionally run gubbins first.''') +parser.add_argument('alignment', metavar='ALN', + help='Alignment file in FASTA format.') +parser.add_argument('-ng', '--nogubbins', + help='Don\'t do recombination filtering using gubbins.', dest="ng") +parser.add_argument('-w', '--whole_genome', + action='store_true', + help='Make tree using whole genome alignment.', dest="w") +parser.add_argument('-v', '--variants_only', + action='store_true', + help='Make tree using just variant sites.', dest="v") +parser.add_argument('-m', '--model', metavar='MODEL', default='MFP', + help='Nucleotide substitution model to use in iqtree.', dest="m") +parser.add_argument('-o', '--outgroup', metavar='OG', default=None, nargs='*', + help='Outgroup sample(s) in alignment file.', dest="o") + +args = parser.parse_args() + + + +if not args.w and not args.v: + parser.error('At least one of -w and -v required.') + +# get prefix for output files +pref = args.alignment.split('/')[-1] +pref = pref.rsplit('.', 1)[-1] +print('prefix: ' + pref) + +# change working directory to where alignment file is +wd = args.alignment.split('/')[0] +os.chdir(wd) + +# modules to load +# modules = 'ml gubbins' +# os.system(modules) + + + +# if perform recombination filtering with gubbins +if args.ng: + print('Not running gubbins.') + # use unmasked alignment + fasta_wga = args.alignment +else: + # create new alignment if outgroups present + if args.o is not None: + no_og_fasta = re.sub('.fa*', '_no-outgroup.fa', args.alignment) + 
subset_fasta(args.o, args.alignment, no_og_fasta) + fasta = no_og_fasta + else: + fasta = args.alignment + # run gubbins + gub = 'run_gubbins.py --prefix ' + pref + ' --threads 12 ' + fasta + print('Gubbins command: ' + gub) + os.system(gub) + # mask recombinant variants in whole genome alignment + fasta_wga = mask_positions(args.alignment, + pref + '.recombination_predictions.gff', + mask_all=args.o) + +# if build tree with only variants +if args.v: + fasta_vars = rm_invar_sites(fasta_wga) + # make iqtree_var directory + os.mkdir('iqtree_var') + os.chdir('iqtree_var') + # run iqtree + iqtree_var = '/nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s' + \ + fasta_vars + '-nt AUTO -bb 1000 -m ' + args.m + '-pre ' + pref + print('iqtree variant sites command: ' + iqtree_var) + os.system(iqtree_var) + +# if build tree with whole genome alignment +if args.w: + # make iqtree_wga directory + #os.mkdir('iqtree_wga') + os.system('mkdir iqtree_wga') + os.chdir('iqtree_wga') + # run iqtree + iqtree_wga = '/nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s' + \ + fasta_wga + '-nt AUTO -bb 1000 -m ' + args.m + '-pre ' + pref + print('iqtree WGA command: ' + iqtree_wga) + os.system(iqtree_wga) + diff --git a/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.py b/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.py new file mode 100644 index 0000000..17b5f79 --- /dev/null +++ b/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.py @@ -0,0 +1,99 @@ +# script to run gubbins and make tree +# written in python3 + +# modules needed: gubbins + +# import modules +import argparse +import sys +import re +import os + +# set path +sys.path.insert(1, '/nfs/esnitkin/bin_group/pipeline/Github/beast/') + +# import functions +import fasta_functions + +# parse command line arguments +parser = argparse.ArgumentParser(description='''Make tree using iqtree; + optionally run gubbins first.''') +parser.add_argument('alignment', metavar='ALN', + help='Alignment file in FASTA format.') 
+parser.add_argument('-ng', '--nogubbins', + action='store_false', + help='Don\'t do recombination filtering using gubbins.') +parser.add_argument('-w', '--whole_genome', + action='store_true', + help='Make tree using whole genome alignment.') +parser.add_argument('-v', '--variants_only', + action='store_true', + help='Make tree using just variant sites.') +parser.add_argument('-m', '--model', metavar='MODEL', default='MFP', + help='Nucleotide substitution model to use in iqtree.') +parser.add_argument('-o', '--outgroup', metavar='OG', default=None, nargs='*', + help='Outgroup sample(s) in alignment file.') + +args = parser.parse_args() + +if not args.w and not args.v: + parser.error('At least one of -w and -v required.') + +# get prefix for output files +pref = args.alignment.split('/')[-1] +pref = pref.rsplit('.', 1)[-1] +print('prefix: ' + pref) + +# change working directory to where alignment file is +wd = args.alignment.split('/')[0] +os.chdir(wd) + +# modules to load +modules = 'ml gubbins' +os.system(modules) + +# if perform recombination filtering with gubbins +if not args.ng: + # create new alignment if outgroups present + if args.o is not None: + no_og_fasta = re.sub('.fa*', '_no-outgroup.fa', args.alignment) + subset_fasta(args.o, args.alignment, no_og_fasta) + fasta = no_og_fasta + else: + fasta = args.alignment + # run gubbins + gub = 'run_gubbins.py --prefix ' + pref + ' --threads 12 ' + fasta + print('Gubbins command: ' + gub) + os.system(gub) + # mask recombinant variants in whole genome alignment + fasta_wga = mask_positions(args.alignment, + pref + '.recombination_predictions.gff', + mask_all=args.o) +else: + print('Not running gubbins.') + # use unmasked alignment + fasta_wga = args.alignment + +# if build tree with only variants +if args.v: + fasta_vars = rm_invar_sites(fasta_wga) + # make iqtree_var directory + os.mkdir('iqtree_var') + os.chdir('iqtree_var') + # run iqtree + iqtree_var = '/nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s' + \ + 
fasta_vars + '-nt AUTO -bb 1000 -m ' + args.m + '-pre ' + pref + print('iqtree variant sites command: ' + iqtree_var) + os.system(iqtree_var) + +# if build tree with whole genome alignment +if args.w: + # make iqtree_wga directory + os.mkdir('iqtree_wga') + os.chdir('iqtree_wga') + # run iqtree + iqtree_wga = '/nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s' + \ + fasta_wga + '-nt AUTO -bb 1000 -m ' + args.m + '-pre ' + pref + print('iqtree WGA command: ' + iqtree_wga) + os.system(iqtree_wga) + diff --git a/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.sh b/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.sh index b6ef080..f521827 100755 --- a/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.sh +++ b/modules/variant_diagnostics/scripts/gubbins_iqtree_raxml.sh @@ -23,6 +23,9 @@ cd $wd # modules to load (some of these might not be necessary) modules=$(echo python-anaconda2/201607 biopython fasttree dendropy reportlab RAxML raxml bioperl fastml/gub gubbins openmpi/1.10.2/gcc/4.8.5 gcc/4.8.5) +# Get Directory name where the bash script is located: Changes by Ali 31 Oct 2019 +DIRECTORY=`dirname $0` + # get account if [ -z "$3" ]; then echo Will submit jobs to esnitkin_flux. 
@@ -47,13 +50,13 @@ if [ $2 = 1 ]; then echo $raxml > ${pref}_raxml_command.sh # iqtree command - iqtree=$(echo /nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s ../$pref.filtered_polymorphic_sites.fasta -nt AUTO -bb 1000 -m MFP -pre $pref) + iqtree=$(echo iqtree -s ../$pref.filtered_polymorphic_sites.fasta -nt AUTO -bb 1000 -m MFP -pre $pref) echo $iqtree > ${pref}_iqtree_command.sh # generate pbs scripts for gubbins, raxml, iqtree - /nfs/esnitkin/bin_group/anaconda3/bin/python /nfs/esnitkin/bin_group/pipeline/Github/scripts/pbs_script_maker.py -c ${pref}_gubbins_command.sh -o ${pref}_gubbins.pbs -M "$modules" -a $acct -wd $wd - /nfs/esnitkin/bin_group/anaconda3/bin/python /nfs/esnitkin/bin_group/pipeline/Github/scripts/pbs_script_maker.py -c ${pref}_iqtree_command.sh -o ${pref}_iqtree.pbs -M "$modules" -a $acct -wd $wd/iqtree_results - /nfs/esnitkin/bin_group/anaconda3/bin/python /nfs/esnitkin/bin_group/pipeline/Github/scripts/pbs_script_maker.py -c ${pref}_raxml_command.sh -o ${pref}_raxml.pbs -M "$modules" -a $acct -wd $wd/raxml_results + python $DIRECTORY/pbs_script_maker.py -c ${pref}_gubbins_command.sh -o ${pref}_gubbins.pbs -M "$modules" -a $acct -wd $wd + python $DIRECTORY/pbs_script_maker.py -c ${pref}_iqtree_command.sh -o ${pref}_iqtree.pbs -M "$modules" -a $acct -wd $wd/iqtree_results + python $DIRECTORY/pbs_script_maker.py -c ${pref}_raxml_command.sh -o ${pref}_raxml.pbs -M "$modules" -a $acct -wd $wd/raxml_results # start gubbins, iqtree, raxml jobs echo qsub ${pref}_gubbins.pbs @@ -80,19 +83,19 @@ else echo Will not run gubbins. echo Finding and removing invariant sites. 
- /nfs/esnitkin/bin_group/anaconda3/bin/snp-sites -o ${pref}_varSites.fa $1 + snp-sites -o ${pref}_varSites.fa $1 # raxml command raxml=$(echo mpirun -np 2 raxmlHPC-HYBRID-SSE3 -f a -x 12345 -p 12345 -N autoMRE -m ASC_GTRGAMMA --asc-corr=lewis -s ../${pref}_varSites.fa -n ${pref}_raxML -T 6) echo $raxml > ${pref}_raxml_command.sh # iqtree command - iqtree=$(echo /nfs/esnitkin/bin_group/anaconda3/bin/iqtree -s ../${pref}_varSites.fa -nt AUTO -bb 1000 -m MFP+ASC -pre ${pref}_varSites) + iqtree=$(echo iqtree -s ../${pref}_varSites.fa -nt AUTO -bb 1000 -m MFP+ASC -pre ${pref}_varSites) echo $iqtree > ${pref}_iqtree_command.sh # generate pbs scripts for iqtree, raxml - /nfs/esnitkin/bin_group/anaconda3/bin/python /nfs/esnitkin/bin_group/pipeline/Github/scripts/pbs_script_maker.py -c ${pref}_iqtree_command.sh -o ${pref}_iqtree.pbs -M "$modules" -a $acct -wd $wd/iqtree_results - /nfs/esnitkin/bin_group/anaconda3/bin/python /nfs/esnitkin/bin_group/pipeline/Github/scripts/pbs_script_maker.py -c ${pref}_raxml_command.sh -o ${pref}_raxml.pbs -M "$modules" -a $acct -wd $wd/raxml_results + python $DIRECTORY/pbs_script_maker.py -c ${pref}_iqtree_command.sh -o ${pref}_iqtree.pbs -M "$modules" -a $acct -wd $wd/iqtree_results + python $DIRECTORY/pbs_script_maker.py -c ${pref}_raxml_command.sh -o ${pref}_raxml.pbs -M "$modules" -a $acct -wd $wd/raxml_results # start iqtree, raxml jobs diff --git a/modules/variant_diagnostics/scripts/pbs_script_maker.py b/modules/variant_diagnostics/scripts/pbs_script_maker.py index 63566dd..6f328d7 100755 --- a/modules/variant_diagnostics/scripts/pbs_script_maker.py +++ b/modules/variant_diagnostics/scripts/pbs_script_maker.py @@ -48,6 +48,10 @@ default='esnitkin_flux', help='''flux account to submit pbs script to (default: esnitkin_flux)''') +parser.add_argument('-wd', '--wd', metavar='WD', type=str, + default='$PBS_O_WORKDIR', + help='''directory to submit pbs script from + (default: $PBS_O_WORKDIR)''') args = parser.parse_args() @@ -62,6 +66,7 @@ 
modules = args.modules commands = args.commands outfile = args.outfile +wd = args.wd if args.jobname is None: @@ -83,36 +88,39 @@ num_nodes, num_cores, mem, walltime) # print pbs script to output file -print('#!/bin/sh', - '#### PBS preamble', - '', - '#PBS -N {}'.format(job_name), - '', - '# User info', - '#PBS -M {}'.format(email), - '#PBS -m abe', - '', - '# Number of cores, amount of memory, and walltime', - info, - '#PBS -j oe', - '#PBS -V', - '', - '#PBS -A {}'.format(acct), - '#PBS -q {}'.format(qos), - '#PBS -l qos=flux', - '', - '#### End PBS preamble', - '', - '# Show list of CPUs you ran on, if you\'re running under PBS', - 'if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi', - '', - '# Change to the directory you submitted from', - 'cd $PBS_O_WORKDIR', - '', - '# Load modules', - 'module load {}'.format(modules), - '', - '# Job commands', - 'bash {}'.format(commands), - sep='\n', - file=open(outfile, 'w')) +file = open(outfile, 'w') +file.write('#!/bin/sh\n') +file.write('\n') +file.write('#!/bin/sh\n') +file.write('#### PBS preamble\n') +file.write('\n') +file.write('#PBS -N {}\n'.format(job_name)) +file.write('\n') +file.write('# User info\n') +file.write('#PBS -M {}\n'.format(email)) +file.write('#PBS -m abe\n') +file.write('\n') +file.write('# Number of cores, amount of memory, and walltime\n') +file.write('%s\n' % info) +file.write('#PBS -j oe\n') +file.write('#PBS -V\n') +file.write('\n') +file.write('#PBS -A {}\n'.format(acct)) +file.write('#PBS -q {}\n'.format(qos)) +file.write('#PBS -l qos=flux\n') +file.write('\n') +file.write('#### End PBS preamble\n') +file.write('\n') +file.write('# Show list of CPUs you ran on, if you\'re running under PBS\n') +file.write('if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi\n') +file.write('\n') +file.write('# Change to the directory you submitted from\n') +file.write('#cd $PBS_O_WORKDIR\n') +file.write('cd {}\n'.format(wd)) +file.write('echo {}\n'.format(wd)) +file.write('\n') +file.write('# Load 
modules\n') +file.write('module load {}\n'.format(modules)) +file.write('\n') +file.write('# Job commands\n') +file.write('bash {}\n'.format(commands)) diff --git a/modules/variant_diagnostics/scripts/script_maker.py b/modules/variant_diagnostics/scripts/script_maker.py new file mode 100755 index 0000000..8ab5bb1 --- /dev/null +++ b/modules/variant_diagnostics/scripts/script_maker.py @@ -0,0 +1,168 @@ +# Create pbs script based on input + +# import libraries +import argparse +import subprocess + +# parse command line arguments +parser = argparse.ArgumentParser( + description='''Create pbs script based on input arguments.''') + +parser.add_argument('-c', '--commands', metavar='COMMANDS_FILE', type=str, + required=True, + help='bash file with commands to run') +parser.add_argument('-o', '--outfile', metavar='OUT_FILE', type=str, + required=True, + help='output pbs file name') +parser.add_argument('-M', '--modules', metavar='MODULES', type=str, + default='', + help='modules to load (space-delimited; default: None)') + +parser.add_argument('-n', '--nodes', metavar='NUM_NODES', type=str, + default='1', + help='number of nodes (default: 1)') +parser.add_argument('-p', '--ppn', metavar='NUM_CORES', type=str, + default='12', + help='number of processers (cores) per node (default: 12)') +parser.add_argument('-m', '--mem', metavar='MEMORY', type=str, + default='47', + help='amount of memory (gb; default: 47)') +parser.add_argument('-w', '--walltime', metavar='WALL_TIME', type=str, + default='10:00:00:00', + help='''amount of time needed to run program + (default: 10 days -- 10:00:00:00)''') + +parser.add_argument('-N', '--jobname', metavar='JOB_NAME', type=str, + default=None, + help='name of pbs job (default: output file prefix)') +parser.add_argument('-e', '--email', metavar='EMAIL', type=str, + default=None, + help='''umich email, + default: flux login uniqname umich email''') + +parser.add_argument('-P', '--pmem', action='store_true', + help='''flag - use pmem (memory 
per processor) + instead of mem (total memory)''') + +parser.add_argument('-a', '--acct', metavar='ACCOUNT', type=str, + default='esnitkin_flux', + help='''account to submit script to + (default: esnitkin_flux)''') +parser.add_argument('-wd', '--wd', metavar='WD', type=str, + default=None, + help='''directory to submit script from + (default: location of script)''') +parser.add_argument('-s', '--slurm', action='store_true', + help='''flag - create slurm script (rather than pbs script)''') + +args = parser.parse_args() + +num_nodes = args.nodes +num_cores = args.ppn +ppn = args.ppn +mem = args.mem +pmem = args.pmem +walltime = args.walltime +acct = args.acct +qos = acct.split('_')[-1] +modules = args.modules +commands = args.commands +outfile = args.outfile +wd = args.wd +slurm = args.slurm + + +if args.wd is None: + if slurm: + wd = '$SLURM_SUBMIT_DIR' + else: + wd = '$PBS_O_WORKDIR' + +if args.jobname is None: + job_name = outfile.split('.')[0] +else: + job_name = args.jobname + +if args.email is None: + uniqname = str(subprocess.check_output('whoami').rsplit()).split('\'')[1] + email = '{}@umich.edu'.format(uniqname) +else: + email = args.email + +if pmem: + info = '#PBS -l nodes={}:ppn={},pmem={}gb,walltime={}'.format( + num_nodes, num_cores, mem, walltime) +else: + info = '#PBS -l nodes={}:ppn={},mem={}gb,walltime={}'.format( + num_nodes, num_cores, mem, walltime) + +if slurm: + + if pmem: + info = '#SBATCH --nodes={} --ntasks=1 --cpus-per-task={} --mem-per-cpu={}g --time={}'.format( + num_nodes, num_cores, mem, walltime) + else: + info = '#SBATCH --nodes={} --ntasks=1 --cpus-per-task={} --mem={}g --time={}'.format( + num_nodes, num_cores, mem, walltime) + + # print slurm script to output file + print('#!/bin/sh', + '# Job name', + '#SBATCH --job-name={}'.format(job_name), + '# User info', + '#SBATCH --mail-user={}'.format(email), + '#SBATCH --mail-type=BEGIN,END,NONE,FAIL,REQUEUE', + '#SBATCH --export=ALL', + '#SBATCH --partition=standard', + '#SBATCH 
--account={}'.format(acct), + '# Number of cores, amount of memory, and walltime', + info, + '# Change to the directory you submitted from', + 'cd {}'.format(wd), + 'echo {}'.format(wd), + '', + '# Load modules', + 'module load {}'.format(modules), + '', + '# Job commands', + 'bash {}'.format(commands), + sep='\n', + file=open(outfile, 'w')) + +else: + # print pbs script to output file + print('#!/bin/sh', + '#### PBS preamble', + '', + '#PBS -N {}'.format(job_name), + '', + '# User info', + '#PBS -M {}'.format(email), + '#PBS -m abe', + '', + '# Number of cores, amount of memory, and walltime', + info, + '#PBS -j oe', + '#PBS -V', + '', + '#PBS -A {}'.format(acct), + '#PBS -q {}'.format(qos), + '#PBS -l qos=flux', + '', + '#### End PBS preamble', + '', + '# Show list of CPUs you ran on, if you\'re running under PBS', + 'if [ -n "$PBS_NODEFILE" ]; then cat $PBS_NODEFILE; fi', + '', + '# Change to the directory you submitted from', + '#cd $PBS_O_WORKDIR', + 'cd {}'.format(wd), + 'echo {}'.format(wd), + '', + '# Load modules', + 'module load {}'.format(modules), + '', + '# Job commands', + 'bash {}'.format(commands), + sep='\n', + file=open(outfile, 'w')) diff --git a/modules/variant_diagnostics/test b/modules/variant_diagnostics/test deleted file mode 100755 index 21f3210..0000000 --- a/modules/variant_diagnostics/test +++ /dev/null @@ -1,22 +0,0 @@ -import csv -import re - -c_reader = csv.reader(open('SNP_matrix_allele_new.csv', 'r'), delimiter='\t') -columns = list(zip(*c_reader)) -counts = 1 -end = 415 + 1 - - -for i in xrange(1, end, 1): - print_string = "" - sample_name = str(columns[i][0]) - sample_name_re = re.sub('_*1*.fastq.gz', '', sample_name) - print_string = print_string + ">%s\n" % sample_name_re - variant_allele = ''.join(columns[i][1:]) - print_string = print_string + str(variant_allele) - allele_variant_fasta = open("%s_allele_variants.fa" % (sample_name_re), 'w+') - #print print_string - #print print_string.replace('_.*1.fastq.gz', '') - #print 
print_string_re - allele_variant_fasta.write(print_string) - allele_variant_fasta.close() \ No newline at end of file diff --git a/modules/variant_diagnostics/test.py b/modules/variant_diagnostics/test.py deleted file mode 100755 index b278851..0000000 --- a/modules/variant_diagnostics/test.py +++ /dev/null @@ -1,17 +0,0 @@ -for i in xrange(1, end, 1): - -print_string = "" -sample_name = str(columns[i][0]) -sample_name_re = re.sub('_*1*.fastq.gz', '', sample_name) -print sample_name -print_string = print_string + ">%s\n\n" % sample_name_re -variant_allele = ''.join(columns[i][1:]) -print_string = print_string + str(variant_allele) -allele_variant_fasta = open("%s/%s.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+') -#print print_string -#print print_string.replace('_.*1.fastq.gz', '') -#print print_string_re -allele_variant_fasta.write(print_string) -allele_variant_fasta.close() -variant_allele_array = [] -variant_allele_array.append(columns[i][1:]) \ No newline at end of file diff --git a/modules/vcftools.pyc b/modules/vcftools.pyc index 1fe4369..5f415a7 100755 Binary files a/modules/vcftools.pyc and b/modules/vcftools.pyc differ diff --git a/pipeline.py b/pipeline.py index f09f7e7..a55d97e 100755 --- a/pipeline.py +++ b/pipeline.py @@ -669,7 +669,7 @@ def downsample(args, logger): files_to_delete = [] Config = ConfigParser.ConfigParser() Config.read(config_file) - pipeline(args, logger) + #pipeline(args, logger) cleanup(args, logger) keep_logging('End: Pipeline', 'End: Pipeline', logger, 'info') time_taken = datetime.now() - start_time_2 diff --git a/spec-file.txt b/spec-file.txt new file mode 100644 index 0000000..3f86ca3 --- /dev/null +++ b/spec-file.txt @@ -0,0 +1,53 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/bioconda/linux-64/bwa-0.7.12-1.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/samtools-1.2-2.tar.bz2 
+https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2019.9.11-hecc5488_0.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/java-jdk-8.0.92-1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.14.6-0.tar.bz2 +https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/ncurses-5.9-10.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/openjdk-8.0.192-h14c3975_1003.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.13.0-1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.5.19-2.tar.bz2 +https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/picard-2.5.0-2.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/qualimap-2.2.2a-1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/readline-6.2-0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h516909a_1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/capnproto-0.6.1-hfc679d8_1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.7-h6e990d7_1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/openssl-1.0.2r-h14c3975_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/perl-5.26.2-h516909a_1006.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/raxml-8.2.12-h14c3975_1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.4-h14c3975_1001.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.11-h516909a_1006.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/bedtools-2.23.0-hdbcaa40_3.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-h84994c4_4.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.8.0-13_openblas.tar.bz2 
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.8.0-h1ad7b7a_1003.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/mummer-3.23-pl526_8.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/python-2.7.12-2.tar.bz2 +https://conda.anaconda.org/bioconda/noarch/snpeff-4.3.1t-2.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/tabix-0.2.6-ha92aebf_0.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/vcftools-0.1.16-he860b03_3.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/bowtie2-2.2.6-py27_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/certifi-2019.9.11-py27_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/curl-7.61.0-h93b3f91_2.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/gatk-3.8-py27_0.tar.bz2 +https://conda.anaconda.org/bioconda/noarch/gatk4-4.1.3.0-0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.8.0-13_openblas.tar.bz2 +https://conda.anaconda.org/bioconda/noarch/pilon-1.22-1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/subprocess32-3.5.4-py27h516909a_0.tar.bz2 +https://conda.anaconda.org/bioconda/noarch/trimmomatic-0.39-1.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/bcftools-1.2-h02bfda8_4.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/gsl-2.5-h294904e_1.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/setuptools-41.2.0-py27_0.tar.bz2 +https://conda.anaconda.org/conda-forge/noarch/joblib-0.14.0-py_0.tar.bz2 +https://conda.anaconda.org/bioconda/linux-64/mash-2.2.1-h3d38be6_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/wheel-0.33.6-py27_0.tar.bz2 +https://conda.anaconda.org/conda-forge/linux-64/pip-19.2.3-py27_0.tar.bz2 diff --git a/test b/test deleted file mode 100644 index e69de29..0000000 diff --git a/variant_call.py b/variant_call.py index 0d6888c..f7221ca 100755 --- a/variant_call.py +++ b/variant_call.py @@ -32,10 +32,12 @@ def parser(): '1. 
All: Run all variant calling steps starting from trimming the reads, mapping, post-processing the alignments and calling variants;\n' '2. core_All: Extract core snps and generate different types of alignments, SNP/Indel Matrices and diagnostics plots.') required.add_argument('-analysis', action='store', dest="analysis_name", help='Unique analysis name that will be used as prefix to saving results and log files.', required=True) + optional.add_argument('-gubbins_env', action='store', dest="gubbins_env", + help='Name of the Gubbins Raxml Iqtree environment to load for Phylogenetic analysis') optional.add_argument('-config', action='store', dest="config", help='Path to Config file, Make sure to check config settings before running pipeline', required=False) optional.add_argument('-suffix', action='store', dest="suffix", help='Fastq reads suffix such as fastq, fastq.gz, fq.gz, fq; Default: fastq.gz', required=False) optional.add_argument('-filenames', action='store', dest="filenames", help='fastq filenames with one single-end filename per line. \nIf the type is set to PE, it will detect the second paired-end filename with the suffix from first filename. \nUseful for running variant calling pipeline on selected files in a reads directory or extracting core snps for selected samples in input reads directory. \nOtherwise the pipeline will consider all the samples available in reads directory.', required=False) - optional.add_argument('-cluster', action='store', dest='cluster', help='Run variant calling pipeline in one of the four modes. Default: local. Suggested mode for core snp is \"cluster\" that will run all the steps in parallel with the available cores. Make sure to provide a large memory node for this option\nThe possible modes are: cluster/parallel-local/local\nSet your specific hpc cluster parameters in config file under the [scheduler] section. Supports only PBS scheduling system. 
') + optional.add_argument('-cluster', action='store', dest='cluster', help='Run variant calling pipeline in local or cluster mode.\nDefault: local.\nSet your specific hpc cluster parameters in config file under the [scheduler] section. Supports PBS/SLURM scheduling system.') optional.add_argument('-clean', action="store_true", help='clean up intermediate files. Default: OFF') optional.add_argument('-extract_unmapped', action='store', dest="extract_unmapped", help='Extract unmapped reads, assemble it and detect AMR genes using ariba') optional.add_argument('-datadir', action='store', dest="datadir", help='Path to snpEff data directory') @@ -48,7 +50,7 @@ def parser(): optional.add_argument('-coverage_depth', action='store', dest="coverage_depth", help='Downsample Reads to this user specified depth') optional.add_argument('-scheduler', action='store', dest="scheduler", - help='Type of Scheduler for generating cluster jobs') + help='Type of Scheduler for generating cluster jobs: PBS, SLURM, LOCAL') return parser @@ -113,6 +115,38 @@ def get_filenames(dir, type, filenames, analysis, suffix): list_of_files.append(line) return list_of_files +def get_scheduler_directive(scheduler, Config): + """ Generate Cluster Directive lines for a scheduler provided with args.scheduler""" + # Scheduler Changes here; current changes + if scheduler and scheduler == "SLURM": + script_Directive = "#SBATCH" + job_name_flag = "--job-name=" + scheduler_directives = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n" \ + % (ConfigSectionMap("slurm", Config)['email'], + ConfigSectionMap("slurm", Config)['notification'], + ConfigSectionMap("slurm", Config)['partition'], + ConfigSectionMap("slurm", Config)['flux_account'], + ConfigSectionMap("slurm", Config)['resources']) + elif scheduler and scheduler == "PBS": + script_Directive = "#PBS" + job_name_flag = "-N" + scheduler_directives = "#PBS -M %s\n#PBS -m %s\n#PBS 
-V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ + % (ConfigSectionMap("scheduler", Config)['email'], + ConfigSectionMap("scheduler", Config)['notification'], + ConfigSectionMap("scheduler", Config)['resources'], + ConfigSectionMap("scheduler", Config)['queue'], + ConfigSectionMap("scheduler", Config)['flux_account']) + else: + script_Directive = "#SBATCH" + job_name_flag = "--job-name=" + scheduler_directives = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n" \ + % (ConfigSectionMap("slurm", Config)['email'], + ConfigSectionMap("slurm", Config)['notification'], + ConfigSectionMap("slurm", Config)['partition'], + ConfigSectionMap("slurm", Config)['flux_account'], + ConfigSectionMap("slurm", Config)['resources']) + return scheduler_directives, script_Directive, job_name_flag + """ Methods to generate jobs for various pipeline tasks """ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, config_file, logger): """Takes a list of files and other arguments, generate variant calling jobs. 
@@ -129,28 +163,12 @@ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, make_sure_path_exists(jobs_temp_dir) keep_logging('Generating cluster jobs in temporary directory %s' % jobs_temp_dir, 'Generating cluster jobs in temporary directory %s' % jobs_temp_dir, logger, 'exception') - # # Scheduler Changes here; current changes - # if args.scheduler and args.scheduler == "slurm": - # script_Directive = "#SBATCH" - # Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ - # % (ConfigSectionMap("scheduler", Config)['email'], - # ConfigSectionMap("scheduler", Config)['notification'], - # ConfigSectionMap("scheduler", Config)['resources'], - # ConfigSectionMap("scheduler", Config)['queue'], - # ConfigSectionMap("scheduler", Config)['flux_account']) - # elif args.scheduler and args.scheduler == "flux": - # script_Directive = "#PBS" - # Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ - # % (ConfigSectionMap("scheduler", Config)['email'], - # ConfigSectionMap("scheduler", Config)['notification'], - # ConfigSectionMap("scheduler", Config)['resources'], - # ConfigSectionMap("scheduler", Config)['queue'], - # ConfigSectionMap("scheduler", Config)['flux_account']) + scheduler_directives, script_Directive, job_name_flag = get_scheduler_directive(args.scheduler, Config) - - Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n"\ - % (ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account']) + # Deprecated + # Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n"\ + # % (ConfigSectionMap("scheduler", Config)['email'], 
ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['resources'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account']) @@ -165,52 +183,52 @@ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, second_part = filename_base.replace("R1_001_final.fastq.gz", "R2_001_final.fastq.gz") first_part_split = filename_base.split('R1_001_final.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "R1_001.fastq.gz" in filename_base: second_part = filename_base.replace("R1_001.fastq.gz", "R2_001.fastq.gz") first_part_split = filename_base.split('R1_001.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "_R1.fastq.gz" in filename_base: second_part = filename_base.replace("_R1.fastq.gz", "_R2.fastq.gz") first_part_split = filename_base.split('_R1.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "R1.fastq.gz" in filename_base: second_part = filename_base.replace("R1.fastq.gz", "R2.fastq.gz") first_part_split = filename_base.split('R1.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "1_combine.fastq.gz" in filename_base: second_part = filename_base.replace("1_combine.fastq.gz", "2_combine.fastq.gz") first_part_split = filename_base.split('1_combine.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "1_sequence.fastq.gz" in filename_base: second_part = 
filename_base.replace("1_sequence.fastq.gz", "2_sequence.fastq.gz") first_part_split = filename_base.split('1_sequence.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "_forward.fastq.gz" in filename_base: second_part = filename_base.replace("_forward.fastq.gz", "_reverse.fastq.gz") first_part_split = filename_base.split('_forward.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "R1_001.fastq.gz" in filename_base: second_part = filename_base.replace("R1_001.fastq.gz", "R2_001.fastq.gz") first_part_split = filename_base.split('R1_001.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif "_1.fastq.gz" in filename_base: second_part = filename_base.replace("_1.fastq.gz", "_2.fastq.gz") first_part_split = filename_base.split('_1.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) elif ".1.fastq.gz" in filename_base: second_part = filename_base.replace(".1.fastq.gz", ".2.fastq.gz") first_part_split = filename_base.split('.1.fastq.gz') first_part = first_part_split[0].replace('_L001', '') - first_part = re.sub("_S.*_", "", first_part) + first_part = re.sub("_S[0-9].*_", "", first_part) """ Have a standard filename preparation step""" # else: @@ -220,24 +238,27 @@ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, # first_part = first_part_split[0].replace('_L001', '') # first_part = re.sub("_S.*_", "", first_part) second_file = args.dir + "/" + second_part - job_name = jobs_temp_dir + "/" + first_part + ".pbs" + if args.scheduler == "SLURM": + job_name = jobs_temp_dir + "/" + 
first_part + ".sbat" + else: + job_name = jobs_temp_dir + "/" + first_part + ".pbs" if not steps: steps == "All" if type == "SE": if args.clean: - command = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/pipeline.py -PE1 %s -o %s/%s -analysis %s -index %s -type SE -config %s -steps %s -clean" % ( + command = "python %s/pipeline.py -PE1 %s -o %s/%s -analysis %s -index %s -type SE -config %s -steps %s -clean" % ( os.path.dirname(os.path.abspath(__file__)), first_file, output_folder, first_part, first_part, reference, config_file, steps) else: - command = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/pipeline.py -PE1 %s -o %s/%s -analysis %s -index %s -type SE -config %s -steps %s" % (os.path.dirname(os.path.abspath(__file__)), first_file, output_folder, first_part, first_part, reference, config_file, steps) + command = "python %s/pipeline.py -PE1 %s -o %s/%s -analysis %s -index %s -type SE -config %s -steps %s" % (os.path.dirname(os.path.abspath(__file__)), first_file, output_folder, first_part, first_part, reference, config_file, steps) else: if args.clean: - command = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/pipeline.py -PE1 %s -PE2 %s -o %s/%s -analysis %s -index %s -type PE -config %s -steps %s -clean" % ( + command = "python %s/pipeline.py -PE1 %s -PE2 %s -o %s/%s -analysis %s -index %s -type PE -config %s -steps %s -clean" % ( os.path.dirname(os.path.abspath(__file__)), first_file, second_file, output_folder, first_part, first_part, reference, config_file, steps) else: - command = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/pipeline.py -PE1 %s -PE2 %s -o %s/%s -analysis %s -index %s -type PE -config %s -steps %s" % (os.path.dirname(os.path.abspath(__file__)), first_file, second_file, output_folder, first_part, first_part, reference, config_file, steps) + command = "python %s/pipeline.py -PE1 %s -PE2 %s -o %s/%s -analysis %s -index %s -type PE -config %s -steps %s" % (os.path.dirname(os.path.abspath(__file__)), first_file, second_file, 
output_folder, first_part, first_part, reference, config_file, steps) # # Adding Downsampling support 2019-06-20 @@ -250,12 +271,10 @@ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, command = command + " -downsample yes -coverage_depth %s" % depth with open(job_name, 'w') as out: - job_title = "#PBS -N %s" % first_part + job_title = "%s %s%s" % (script_Directive, job_name_flag, first_part) + out.write("#!/bin/sh" + '\n') out.write(job_title+'\n') - out.write(Pbs_model_lines+'\n') - #out.write(cd_command+'\n') ## changed it to automatically change to PBS working directory - out.write("# Change to the directory you submitted from\nif [ -n \"$PBS_O_WORKDIR\" ]; then cd $PBS_O_WORKDIR; fi" + '\n') - out.write("echo $PBS_O_WORKDIR" + '\n') + out.write(scheduler_directives+'\n') out.write("cd %s/temp_jobs" % output_folder + '\n') out.write(command+'\n') elif "R2_001_final.fastq.gz" in filename_base or "R2.fastq.gz" in filename_base or "2_combine.fastq.gz" in filename_base or "2_sequence.fastq.gz" in filename_base or "_reverse.fastq.gz" in filename_base or "R2_001.fastq.gz" in filename_base or "_2.fastq.gz" in filename_base or ".2.fastq.gz" in filename_base or "_R2.fastq.gz" in filename_base: @@ -264,7 +283,10 @@ def create_varcall_jobs(filenames_array, type, output_folder, reference, steps, keep_logging('Error while generating cluster jobs. Make sure the fastq filenames ends with one of these suffix: R1_001_final.fastq.gz, R1.fastq.gz, 1_combine.fastq.gz, 1_sequence.fastq.gz, _forward.fastq.gz, R1_001.fastq.gz, _1.fastq.gz, .1.fastq.gz, _R1.fastq.gz', 'Error while generating cluster jobs. 
Make sure the fastq filenames ends with one of these suffix: R1_001_final.fastq.gz, R1.fastq.gz, 1_combine.fastq.gz, 1_sequence.fastq.gz, _forward.fastq.gz, R1_001.fastq.gz, _1.fastq.gz, .1.fastq.gz, _R1.fastq.gz', logger, 'exception') print filename_base exit() - list_of_jobs = glob.glob("%s/*.pbs" % jobs_temp_dir) + if args.scheduler == "SLURM": + list_of_jobs = glob.glob("%s/*.sbat" % jobs_temp_dir) + else: + list_of_jobs = glob.glob("%s/*.pbs" % jobs_temp_dir) return list_of_jobs def generate_custom_vcf_file_list(filenames_array, logger): @@ -336,25 +358,33 @@ def run_command_list(command): return done def run_varcall_jobs(list_of_jobs, cluster, log_unique_time, analysis_name, output_folder, logger): - #Generate command list to run on single cluster or in parallel command_list = "" command_list_qsub = [] for job in list_of_jobs: command_list = command_list + "bash %s\n" % job command_list_qsub.append(job) - #cluster mode + job_id_array = [] if cluster == "cluster": keep_logging('Running Jobs in cluster mode', 'Running Jobs in cluster mode', logger, 'info') for job in command_list_qsub: - keep_logging('Submitting Job: qsub %s' % job, 'Submitting Job: qsub %s' % job, logger, 'info') - call("qsub %s" % job, logger) - - elif cluster == "parallel-cluster": - keep_logging('Running Jobs in parallel-cluster mode', 'Running Jobs in parallel-cluster mode', logger, 'info') - for job in command_list_qsub: - keep_logging('Submitting Job: qsub %s' % job, 'Submitting Job: qsub %s' % job, logger, 'info') - call("qsub %s" % job, logger) + keep_logging('Submitting Job: %s' % job, 'Submitting Job: %s' % job, logger, 'info') + if args.scheduler == "SLURM": + #call("sbatch %s" % job, logger) + keep_logging("sbatch %s" % job, "sbatch %s" % job, logger, 'info') + #proc = subprocess.Popen(["sbatch %s" % job], stdout=subprocess.PIPE, shell=True) + #(out, err) = proc.communicate() + #job_id_array.append(out.split(' ')[3].strip()) + # job_id_array.append('123xx') + # break + elif 
args.scheduler == "PBS": + #call("qsub %s" % job, logger) + keep_logging("qsub %s" % job, "qsub %s" % job, logger, 'info') + #proc = subprocess.Popen(["qsub %s" % job], stdout=subprocess.PIPE, shell=True) + #(out, err) = proc.communicate() + #job_id_array.append(out.split('.')[0].strip()) + # job_id_array.append('123xx') + # break elif cluster == "parallel-local": keep_logging('Running Jobs in parallel-local mode', 'Running Jobs in parallel-local mode', logger, 'info') @@ -366,21 +396,27 @@ def run_varcall_jobs(list_of_jobs, cluster, log_unique_time, analysis_name, outp num_cores = multiprocessing.cpu_count() keep_logging('Number of cores available: %s' % num_cores, 'Number of cores available: %s' % num_cores, logger, 'info') results = Parallel(n_jobs=num_cores)(delayed(run_command)(i) for i in command_list_qsub) + elif cluster == "local": keep_logging('Running Jobs in local mode', 'Running Jobs in local mode', logger, 'info') for job in command_list_qsub: keep_logging('Running Job: bash %s' % job, 'Running Job: bash %s' % job, logger, 'info') call("bash %s" % job, logger) + return job_id_array """ Pipeline individual task methods """ def run_core_prep_analysis(core_temp_dir, reference, analysis_name, log_unique_time, cluster, logger, config_file): file_exists(reference) if args.debug_mode == "yes": - core_prep_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 1 -jobrun %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, config_file) + core_prep_pipeline = "python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 1 -jobrun %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, config_file) 
else: - core_prep_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 1 -jobrun %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, config_file) + core_prep_pipeline = "python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 1 -jobrun %s -config %s -scheduler %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, config_file, args.scheduler) - job_name = core_temp_dir + "/" + log_unique_time + "_" + analysis_name + ".pbs" + + if args.scheduler == "SLURM": + job_name = core_temp_dir + "/" + log_unique_time + "_" + analysis_name + ".sbat" + else: + job_name = core_temp_dir + "/" + log_unique_time + "_" + analysis_name + ".pbs" Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ % (ConfigSectionMap("scheduler", Config)['email'], @@ -425,9 +461,9 @@ def run_core_prep_analysis(core_temp_dir, reference, analysis_name, log_unique_t def run_core_analysis(core_temp_dir, reference, analysis_name, log_unique_time, cluster, logger, core_results_dir, config_file): file_exists(reference) if args.debug_mode == "yes": - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 2 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 2 
-jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) else: - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 2 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 2 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) job_name = core_temp_dir + "/" + log_unique_time + "_" + analysis_name + ".pbs" Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ @@ -468,9 +504,9 @@ def run_core_analysis(core_temp_dir, reference, analysis_name, log_unique_time, def run_report_analysis(core_temp_dir, reference, analysis_name, log_unique_time, cluster, logger, core_results_dir, config_file): file_exists(reference) if args.debug_mode == "yes": - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 3 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 3 -jobrun %s -results_dir %s -config %s" % 
(os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) else: - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 3 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 3 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) job_name = core_temp_dir + "/" + log_unique_time + "_" + analysis_name + ".pbs" Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l nodes=1:ppn=4,pmem=4000mb,walltime=92:00:00\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n"\ % (ConfigSectionMap("scheduler", Config)['email'], ConfigSectionMap("scheduler", Config)['notification'], ConfigSectionMap("scheduler", Config)['queue'], ConfigSectionMap("scheduler", Config)['flux_account']) @@ -504,13 +540,15 @@ def run_report_analysis(core_temp_dir, reference, analysis_name, log_unique_time def run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, cluster, logger, core_results_dir, config_file): if args.debug_mode == "yes": - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 4 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python 
%s/modules/variant_diagnostics/core_pipeline_debug.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 4 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) if args.gubbins == "yes": core_pipeline = core_pipeline + " -gubbins %s" % args.gubbins + if args.gubbins_env: + core_pipeline = core_pipeline + " -gubbins_env %s" % args.gubbins_env if args.outgroup: core_pipeline = core_pipeline + " -outgroup %s" % args.outgroup else: - core_pipeline = "/nfs/esnitkin/bin_group/anaconda2/bin/python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 4 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) + core_pipeline = "python %s/modules/variant_diagnostics/core_pipeline.py -filter2_only_snp_vcf_dir %s -filter2_only_snp_vcf_filenames %s/vcf_filenames -reference %s -steps 4 -jobrun %s -results_dir %s -config %s" % (os.path.dirname(os.path.abspath(__file__)), core_temp_dir, core_temp_dir, reference, cluster, core_results_dir, config_file) if args.gubbins == "yes": core_pipeline = core_pipeline + " -gubbins %s" % args.gubbins if args.outgroup: @@ -603,13 +641,18 @@ def run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, """ Main Variant calling Methods: Generate and Run the jobs""" list_of_files = get_filenames(args.dir, args.type, args.filenames, args.analysis_name, args.suffix) list_of_jobs = create_varcall_jobs(list_of_files, args.type, args.output_folder, args.index, args.steps, config_file, logger) - run_varcall_jobs(list_of_jobs, cluster_mode, log_unique_time, args.analysis_name, args.output_folder, logger) + job_submitted = run_varcall_jobs(list_of_jobs, cluster_mode, 
log_unique_time, args.analysis_name, args.output_folder, logger) + print job_submitted time_taken = datetime.now() - start_time_2 keep_logging('Logs were recorded in file with extension log.txt in %s' % vc_logs_folder, 'Logs were recorded in file with extension log.txt in %s' % vc_logs_folder, logger, 'info') keep_logging('Total Time taken: {}'.format(time_taken), 'Total Time taken: {}'.format(time_taken), logger, 'info') keep_logging('End: Variant calling Pipeline', 'End: Variant calling Pipeline', logger, 'info') elif "core_All" in args.steps or "2" in args.steps: + # job_submitted = [] + # job_submitted.append('123xx') + # job_submitted.append('123xxx') + core_All_cmds = [] """ Set Up Core Prep logs folder/logger object, cluster mode and copy config files to it""" core_prep_logs_folder = logs_folder + "/core_prep" @@ -622,8 +665,6 @@ def run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, keep_logging('\nCopying vcf files to %s\n' % core_temp_dir, '\nCopying vcf files to %s\n' % core_temp_dir, logger, 'info') cp_command = "cp %s/*/*_vcf_results/*_filter2_indel_final.vcf %s/*/*_vcf_results/*_aln_mpileup_raw.vcf %s/*/*_vcf_results/*_raw.vcf_5bp_indel_removed.vcf* %s/*/*_vcf_results/*filter2_final.vcf* %s/*/*_vcf_results/*vcf_no_proximate_snp.vcf* %s/*/*_vcf_results/*array %s/*/*unmapped.bed_positions %s/*/*_vcf_results/*_indel_gatk.vcf %s/*/*_stats_results/*_depth_* %s/*/*_stats_results/*_markduplicates_metrics %s/*/*_stats_results/*_markduplicates_metrics %s" % (args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, args.output_folder, core_temp_dir) - - call(cp_command, logger) """ Decompress zipped files in core temp folder""" @@ -683,9 +724,6 @@ def run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, exit() - - - reference = ConfigSectionMap(args.index, 
Config)['ref_path'] + "/" + ConfigSectionMap(args.index, Config)['ref_name'] core_prep_pipeline_cmd = run_core_prep_analysis(core_temp_dir, reference, args.analysis_name, log_unique_time, args.cluster, logger, config_file) core_All_cmds.append(core_prep_pipeline_cmd) @@ -846,23 +884,19 @@ def run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, core_All_cmds.append(run_tree_analysis_cmd) - combine_job_name = core_temp_dir + "/" + log_unique_time + "_" + args.analysis_name + "_core_All.pbs" + if args.scheduler == "SLURM": + combine_job_name = core_temp_dir + "/" + log_unique_time + "_" + args.analysis_name + "_core_All.sbat" + else: + combine_job_name = core_temp_dir + "/" + log_unique_time + "_" + args.analysis_name + "_core_All.pbs" - Pbs_model_lines = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n" \ - % (ConfigSectionMap("scheduler", Config)['email'], - ConfigSectionMap("scheduler", Config)['notification'], - ConfigSectionMap("scheduler", Config)['large_resources'], - ConfigSectionMap("scheduler", Config)['queue'], - ConfigSectionMap("scheduler", Config)['flux_account']) + scheduler_directives, script_Directive, job_name_flag = get_scheduler_directive(args.scheduler, Config) with open(combine_job_name, 'w') as out: - job_title = "#PBS -N %s_%s_core_All" % (log_unique_time, args.analysis_name) + job_title = "%s %s%s" % (script_Directive, job_name_flag, os.path.basename(combine_job_name)) + out.write("#!/bin/sh" + '\n') out.write(job_title + '\n') - out.write(Pbs_model_lines + '\n') - out.write( - "# Change to the directory you submitted from\nif [ -n \"$PBS_O_WORKDIR\" ]; then cd $PBS_O_WORKDIR; fi" + '\n') - out.write("echo \"PBS working directory: $PBS_O_WORKDIR\"" + '\n') - out.write("cd %s" % core_temp_dir + '\n') + out.write(scheduler_directives + '\n') + out.write("cd %s/" % core_temp_dir + '\n') for cmds in core_All_cmds: out.write(cmds + '\n') out.close() @@ -871,8 +905,11 @@ def 
run_tree_analysis(core_temp_dir, reference, analysis_name, log_unique_time, 'Running: %s\n' % combine_job_name, logger, 'info') - call("qsub %s" % combine_job_name, logger) + #call("qsub %s" % combine_job_name, logger) + # keep_logging('Running: sbatch --dependency=afterany:%s %s\n' % (",".join(job_submitted), combine_job_name), + # 'Running: sbatch --dependency=afterany:%s %s\n' % (",".join(job_submitted), combine_job_name), + # logger, 'info') elif "core_prep" in args.steps: """ Set Up Core Prep logs folder/logger object, cluster mode and copy config files to it"""