From a96fbceab663188da07cde6f6a5be32648716d42 Mon Sep 17 00:00:00 2001
From: Ali Pirani <alipirani88@gmail.com>
Date: Mon, 21 Oct 2019 09:39:01 -0400
Subject: [PATCH] Making changes to Barplot code - excludeing Phage positions

---
 modules/variant_diagnostics/core_pipeline.py | 118 ++++++++++++++-----
 1 file changed, 86 insertions(+), 32 deletions(-)

diff --git a/modules/variant_diagnostics/core_pipeline.py b/modules/variant_diagnostics/core_pipeline.py
index 51f156d..99f21ec 100755
--- a/modules/variant_diagnostics/core_pipeline.py
+++ b/modules/variant_diagnostics/core_pipeline.py
@@ -1101,6 +1101,31 @@ def temp_generate_position_label_data_matrix_All_label():
                 print_string_header = print_string_header + os.path.basename(i) + "\t"
 
         f33.write('\t' + print_string_header.strip() + '\n')
+
+        """ GET individual PHAGE/Repetitive/masked region positions to assign functional class group string """
+
+        phage_positions = []
+        repetitive_positions = []
+        mask_positions = []
+
+        phage_region_positions = "%s/phage_region_positions.txt" % args.filter2_only_snp_vcf_dir
+        if os.path.isfile(phage_region_positions):
+            with open(phage_region_positions, 'rU') as fphage:
+                for line in fphage:
+                    phage_positions.append(line.strip())
+            fphage.close()
+        else:
+            raise IOError('%s/phage_region_positions.txt does not exist.' % args.filter2_only_snp_vcf_dir)
+            exit()
+
+        """ End: Generate a list of functional class positions from Phaster, Mummer and Custom Masking results/files"""
+
+        f_open_temp_Only_filtered_positions_for_closely_matrix = open("%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir, 'w+')
+
+        f_open_temp_Only_filtered_positions_for_closely_matrix.write('\t' + print_string_header.strip() + '\n')
+
+
+
         keep_logging('Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, 'Reading temporary label positions file: %s/temp_label_final_raw.txt \n' % args.filter2_only_snp_vcf_dir, logger, 'info')
         lll = ['reference_unmapped_position', 'LowFQ', 'LowFQ_DP', 'LowFQ_QUAL', 'LowFQ_DP_QUAL', 'LowFQ_QUAL_DP', 'HighFQ_DP', 'HighFQ_QUAL', 'HighFQ_DP_QUAL', 'HighFQ_QUAL_DP', 'HighFQ', 'LowFQ_proximate_SNP', 'LowFQ_DP_proximate_SNP', 'LowFQ_QUAL_proximate_SNP', 'LowFQ_DP_QUAL_proximate_SNP', 'LowFQ_QUAL_DP_proximate_SNP', 'HighFQ_DP_proximate_SNP', 'HighFQ_QUAL_proximate_SNP', 'HighFQ_DP_QUAL_proximate_SNP', 'HighFQ_QUAL_DP_proximate_SNP', 'HighFQ_proximate_SNP', '_proximate_SNP']
         ref_var = ['reference_allele', 'VARIANT']
@@ -1120,8 +1145,16 @@ def temp_generate_position_label_data_matrix_All_label():
                                     print_string = print_string + "\t" + i
                                 STRR2 = row[0] + print_string + "\n"
                                 f33.write(STRR2)
+
+                                if str(row[0]) not in phage_positions:
+                                    print_string_2 = ""
+                                    for i in row[1:]:
+                                        print_string_2 = print_string_2 + "\t" + i
+                                    STRR3 = row[0] + print_string_2 + "\n"
+                                    f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3)
             csv_file.close()
             f33.close()
+            f_open_temp_Only_filtered_positions_for_closely_matrix.close()
 
         else:
             with open("%s/temp_label_final_raw.txt" % args.filter2_only_snp_vcf_dir, 'r') as csv_file:
@@ -1130,12 +1163,19 @@ def temp_generate_position_label_data_matrix_All_label():
                 for row in csv_reader:
                     if set(ref_var) & set(row[1:]):
                         if set(lll) & set(row[1:]):
-
                             print_string = ""
                             for i in row[1:]:
                                 print_string = print_string + "\t" + i
                             STRR2 = row[0] + print_string + "\n"
                             f33.write(STRR2)
+                            if str(row[0]) not in phage_positions:
+                                print_string_2 = ""
+                                for i in row[1:]:
+                                    print_string_2 = print_string_2 + "\t" + i
+                                STRR3 = row[0] + print_string_2 + "\n"
+                                f_open_temp_Only_filtered_positions_for_closely_matrix.write(STRR3)
+
+
             csv_file.close()
             f33.close()
         """
@@ -1259,8 +1299,19 @@ def barplot_stats():
         This will give a visual explanation of how many positions in each samples were filtered out because of different reason
         """
 
-        c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
-        columns = list(zip(*c_reader))
+        print "Exluding Phage regions from temp_Only_filtered_positions_for_closely_matrix.txt file. The results will be outputed to temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt"
+
+
+
+
+
+        temp_Only_filtered_positions_for_closely_matrix_exclude_phage = "%s/temp_Only_filtered_positions_for_closely_matrix_exclude_phage.txt" % args.filter2_only_snp_vcf_dir
+        print temp_Only_filtered_positions_for_closely_matrix_exclude_phage
+        #c_reader = csv.reader(open('%s/temp_Only_filtered_positions_for_closely_matrix.txt' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
+        c_reader_2 = csv.reader(
+            open(temp_Only_filtered_positions_for_closely_matrix_exclude_phage, 'r'), delimiter='\t')
+        columns_2 = list(zip(*c_reader_2))
+        print len(columns_2)
         keep_logging('Finished reading columns...', 'Finished reading columns...', logger, 'info')
         counts = 1
 
@@ -1277,14 +1328,14 @@ def barplot_stats():
 
         for i in xrange(1, end, 1):
             """ Bar Count Statistics: Variant Position Count Statistics """
-            true_variant = columns[i].count('VARIANT')
-            unmapped_positions = columns[i].count('reference_unmapped_position')
-            reference_allele = columns[i].count('reference_allele')
-            Only_low_FQ = columns[i].count('LowFQ')
-            Only_DP = columns[i].count('HighFQ_DP')
-            Only_low_MQ = columns[i].count('HighFQ')
-            low_FQ_other_parameters = columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP')
-            high_FQ_other_parameters = columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')
+            true_variant = columns_2[i].count('VARIANT')
+            unmapped_positions = columns_2[i].count('reference_unmapped_position')
+            reference_allele = columns_2[i].count('reference_allele')
+            Only_low_FQ = columns_2[i].count('LowFQ')
+            Only_DP = columns_2[i].count('HighFQ_DP')
+            Only_low_MQ = columns_2[i].count('HighFQ')
+            low_FQ_other_parameters = columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP')
+            high_FQ_other_parameters = columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')
             other = low_FQ_other_parameters + high_FQ_other_parameters
 
             total = true_variant + unmapped_positions + reference_allele + Only_low_FQ + Only_DP + low_FQ_other_parameters + high_FQ_other_parameters + Only_low_MQ
@@ -1302,35 +1353,35 @@ def barplot_stats():
             #f_bar_count.write(bar_string)
             """ Bar Count Percentage Statistics: Variant Position Percentage Statistics """
             try:
-                true_variant_perc = float((columns[i].count('VARIANT') * 100) / total)
+                true_variant_perc = float((columns_2[i].count('VARIANT') * 100) / total)
             except ZeroDivisionError:
                 true_variant_perc = 0
             try:
-                unmapped_positions_perc = float((columns[i].count('reference_unmapped_position') * 100) / total)
+                unmapped_positions_perc = float((columns_2[i].count('reference_unmapped_position') * 100) / total)
             except ZeroDivisionError:
                 unmapped_positions_perc = 0
             try:
-                reference_allele_perc = float((columns[i].count('reference_allele') * 100) / total)
+                reference_allele_perc = float((columns_2[i].count('reference_allele') * 100) / total)
             except ZeroDivisionError:
                 reference_allele_perc = 0
             try:
-                Only_low_FQ_perc = float((columns[i].count('LowFQ') * 100) / total)
+                Only_low_FQ_perc = float((columns_2[i].count('LowFQ') * 100) / total)
             except ZeroDivisionError:
                 Only_low_FQ_perc = 0
             try:
-                Only_DP_perc = float((columns[i].count('HighFQ_DP') * 100) / total)
+                Only_DP_perc = float((columns_2[i].count('HighFQ_DP') * 100) / total)
             except ZeroDivisionError:
                 Only_DP_perc = 0
             try:
-                Only_low_MQ_perc = float((columns[i].count('HighFQ') * 100) / total)
+                Only_low_MQ_perc = float((columns_2[i].count('HighFQ') * 100) / total)
             except ZeroDivisionError:
                 Only_low_MQ_perc = 0
             try:
-                low_FQ_other_parameters_perc = float(((columns[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns[i].count('LowFQ_QUAL_proximate_SNP') + columns[i].count('LowFQ_DP_proximate_SNP') + columns[i].count('LowFQ_proximate_SNP') + columns[i].count('LowFQ_QUAL_DP') + columns[i].count('LowFQ_DP_QUAL') + columns[i].count('LowFQ_QUAL') + columns[i].count('LowFQ_DP'))  * 100) / total)
+                low_FQ_other_parameters_perc = float(((columns_2[i].count('LowFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('LowFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_proximate_SNP') + columns_2[i].count('LowFQ_DP_proximate_SNP') + columns_2[i].count('LowFQ_proximate_SNP') + columns_2[i].count('LowFQ_QUAL_DP') + columns_2[i].count('LowFQ_DP_QUAL') + columns_2[i].count('LowFQ_QUAL') + columns_2[i].count('LowFQ_DP'))  * 100) / total)
             except ZeroDivisionError:
                 low_FQ_other_parameters_perc = 0
             try:
-                high_FQ_other_parameters_perc = float(((columns[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns[i].count('HighFQ_QUAL_proximate_SNP') + columns[i].count('HighFQ_DP_proximate_SNP') + columns[i].count('HighFQ_proximate_SNP') + columns[i].count('HighFQ_QUAL_DP') + columns[i].count('HighFQ_DP_QUAL') + columns[i].count('HighFQ_QUAL')) * 100) / total)
+                high_FQ_other_parameters_perc = float(((columns_2[i].count('HighFQ_QUAL_DP_proximate_SNP') + columns_2[i].count('HighFQ_DP_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_proximate_SNP') + columns_2[i].count('HighFQ_DP_proximate_SNP') + columns_2[i].count('HighFQ_proximate_SNP') + columns_2[i].count('HighFQ_QUAL_DP') + columns_2[i].count('HighFQ_DP_QUAL') + columns_2[i].count('HighFQ_QUAL')) * 100) / total)
             except ZeroDivisionError:
                 high_FQ_other_parameters_perc = 0
 
@@ -2988,11 +3039,6 @@ def annotated_snp_matrix():
     """ End: Prepare a All_indel_label_final_ordered_sorted.txt file with sorted unique variant positions. """
 
 
-
-
-
-
-
     """ Generate a position_label and position_indel_label dictionary that will contain information about each unique variant position that passed variant filters in any sample and reasons for being filtered out in any sample """
     position_label = OrderedDict()
     with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
@@ -3024,9 +3070,6 @@ def annotated_snp_matrix():
     """ End: Generate a position_label and position_indel_label dictionary """
 
 
-
-
-
     """ Generate mask_fq_mq_positions array with positions where a variant was filtered because of LowFQ or LowMQ """
     mask_fq_mq_positions = []
     mask_fq_mq_positions_outgroup_specific = []
@@ -4532,8 +4575,19 @@ def mask_fq_mq_positions_specific_to_outgroup():
 
 
     else:
+        position_label = OrderedDict()
+
+        with open("%s/All_label_final_ordered_sorted.txt" % args.filter2_only_snp_vcf_dir, 'rU') as csv_file:
+            keep_logging(
+                'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir,
+                'Reading All label positions file: %s/All_label_final_sorted_header.txt \n' % args.filter2_only_snp_vcf_dir,
+                logger, 'info')
+            csv_reader = csv.reader(csv_file, delimiter='\t')
+            next(csv_reader, None)
+            for row in csv_reader:
+                position_label[row[0]] = row[1:]
         for key in position_label.keys():
-            label_sep_array = position_label[key].split(',')
+            label_sep_array = str(position_label[key]).split(',')
             for i in label_sep_array:
                 if "LowFQ" in str(i):
                     if key not in mask_fq_mq_positions:
@@ -4656,14 +4710,14 @@ def someOtherFunc(data, key):
         unique_position_file = create_positions_filestep(vcf_filenames)
         unique_indel_position_file = create_indel_positions_filestep(vcf_filenames)
 
-        # bgzip and tabix all the vcf files in core_temp_dir.
-        files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
-        tabix(files_for_tabix, "vcf", logger, Config)
+        # # bgzip and tabix all the vcf files in core_temp_dir.
+        # files_for_tabix = glob.glob("%s/*.vcf" % args.filter2_only_snp_vcf_dir)
+        # tabix(files_for_tabix, "vcf", logger, Config)
 
         # Get the cluster option; create and run jobs based on given parameter. The jobs will parse all the intermediate vcf file to extract information such as if any unique variant position was unmapped in a sample, if it was filtered out dur to DP,MQ, FQ, proximity to indel, proximity to other SNPs and other variant filter parameters set in config file.
         tmp_dir = "/tmp/temp_%s/" % log_unique_time
 
-        create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)
+        #create_job(args.jobrun, vcf_filenames, unique_position_file, tmp_dir)
 
         create_indel_job(args.jobrun, vcf_filenames, unique_indel_position_file, tmp_dir)