-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparse_blast_xml.py
161 lines (135 loc) · 5.32 KB
/
parse_blast_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from Bio.Blast import NCBIXML
import sys
import argparse
from glob import glob
#from joblib import Parallel, delayed
import os.path
#hsps with larger e-value are discarded
significant_e_value = 0.001
#if query coverage fraction is smaller then alignment is discarded
significant_query_fraction = 0.1
#if best_hit_query_coverage * significant_ratio < other_hit_query_coverage
significant_ratio = 0.9
def parse_args(args):
###### Command Line Argument Parser
parser = argparse.ArgumentParser(description="Check the help flag")
parser.add_argument('-i', help='Path to file with XML output')
parser.add_argument('-o', help='Path to output folder')
return parser.parse_args()
###### Return total length of alignment with respect of significant_e_value (0.001) threshold
#
def get_total_len(alignment):
global significant_e_value
total_len = 0
for hsp in alignment.hsps:
if hsp.expect < significant_e_value:
total_len += hsp.align_length
return total_len
def get_query_coverage (alignment):
global significant_e_value
total_len = 0
coords = []
for hsp in alignment.hsps:
if hsp.expect < significant_e_value:
coords.append([hsp.query_start, 1])
coords.append([hsp.query_end, -1])
coords.sort()
if len(coords) > 0:
layers = coords[0][1]
for j in range (1, len(coords)):
if layers > 0:
total_len += coords[j][0] - coords[j - 1][0]
layers += coords[j][1]
return total_len
def get_identity(alignment):
global significant_e_value
identity = 0
for hsp in alignment.hsps:
if hsp.expect < significant_e_value:
identity += hsp.identities
return identity
def get_hsp_count(alignment):
global significant_e_value
hsp_count = 0
for hsp in alignment.hsps:
if hsp.expect < significant_e_value:
hsp_count += 1
return hsp_count
def parse_name(seq):
if (seq.find('hage') != -1 or seq.find('irus') != -1 ):
return "Virus"
elif seq.find('lasmid') != -1:
return "Plasmid"
elif (seq.find('hromosome') != -1 or seq.find("omplete genome") != -1):
return "Chromosome"
else:
return "Other"
def report_file(file, query, alignment):
file.write(query + '\n')
file.write(alignment[1] + "\t query coverage: " + str(1.0 * alignment[0]/alignment[6]) + "\t identity: " + str(1.0 * alignment[4]/alignment[3]) + "\t hsp_num: " + str(alignment[5]))
file.write('\n')
return
def parser(f, out_dir):
global significant_query_fraction
global significant_ratio
scat_out={}
xml_file = open(f,"r")
name = os.path.basename(f)
name = os.path.join(out_dir, name)
# print(name)
nosig = open(name[:-4]+"_no_significant.names", "w")
chrom = open(name[:-4]+"_chromosome.names", "w")
plasmids = open(name[:-4]+"_plasmid.names", "w")
other = open(name[:-4]+"_other.names", "w")
records= NCBIXML.parse(xml_file)
viral = open(name[:-4]+"_viruses.names", "w")
files = {"Virus": viral, "Plasmid": plasmids, "Chromosome": chrom, "Other": other}
for item in records:
pl_len = item.query_length
###### No alignment - put in non-significant
if len(item.alignments) == 0:
# print ("No Significant")
nosig.write(item.query + '\n\n')
continue
good_aln = []
scat_good_aln = []
alignments = []
for alignment in item.alignments:
seq_type = parse_name(alignment.title) # Virus, Plasmid, Chromosome or Other
query_coverage = get_query_coverage(alignment)
if get_query_coverage(alignment) < significant_query_fraction * pl_len:
continue
alignments.append([get_query_coverage(alignment), alignment.title, parse_name(alignment.title), get_total_len(alignment), get_identity(alignment), get_hsp_count(alignment), pl_len])
if len(alignments)== 0:
nosig.write(item.query + '\n')
al = item.alignments[0]
nosig.write(al.title + "\t" + "total query cov " + str(get_query_coverage(al)) + " of " + str(pl_len))
nosig.write('\n')
else:
alignments.sort()
alignments.reverse()
type = alignments[0][2]
best_query_cov = alignments[0][0]
for i in range (1, len(alignments)):
if alignments[i][0] < best_query_cov * significant_ratio:
break
if type != alignments[i][2]:
if alignments[i][2] == "Virus" or alignments[i][2] == "Plasmid":
report_file(files[alignments[i][2]], item.query,alignments[0])
break
else:
# print (type)
report_file(files[type], item.query,alignments[0])
return
def main ():
parsed_args = parse_args(sys.argv[1:])
files = glob(str(parsed_args.i)+"/*.xml")
# print (files)
parser(parsed_args.i, parsed_args.o)
# Parallel(n_jobs=30)(delayed(parser) (infile, parsed_args.o) for infile in files)
#Parallel(n_jobs=30)(delayed(parser) (args) for args in files)
# print('score:', hsp.score)
# print('gaps:', hsp.gaps)
# print('e value:', hsp.expect)
if __name__ == '__main__':
main()