-
Notifications
You must be signed in to change notification settings - Fork 1
/
cds_annotation.py
77 lines (52 loc) · 2.12 KB
/
cds_annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sys
import re
from Bio import SeqIO
import subprocess
from random import randrange
r2=str(randrange(10000))
#T. Allnutt, CSIRO, 2015
#read unannotated protein fasta and add annotations from a blast db
#usage: python ~/s/cds_annotation.py centroids.fasta nr cds.blast p 1e-5
#'p' can be 'p' for protein or 'n' for nucleotide
#'1e-5' is evalue threshold for blast hits
#blast > v2.2.30+ required
f = sys.argv[1] #centroids.fasta
db = sys.argv[2] # database
h = open(sys.argv[3],'w') #annotated list output
#h=sys.argv[4] #blast output file
type = sys.argv[4] #type
e = sys.argv[5]
#m = open(sys.argv[2]+".un.fa",'w') #unknowns file
print
print f
print 'blasting'
n=0
cent=open(f,'r')
#get list of cds from centroids
cds=[]
for i in SeqIO.parse(cent,'fasta'):
cds.append(i.id)
if type == "p":
p1= subprocess.Popen("~/bin/ncbi-blast-2.2.30+/bin/blastp -query %s -db %s -out temp%s.blast -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle' -max_target_seqs 1 -evalue %s -num_threads 24" %(f,db,r2,e), shell=True).wait()
if type == "n":
p1= subprocess.Popen("~/bin/ncbi-blast-2.2.30+/bin/blastn -task blastn -query %s -db %s -out temp%s.blast -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle' -max_target_seqs 1 -evalue %s -num_threads 24" %(f,db,r2,e), shell=True).wait()
if type == "x":
p1= subprocess.Popen("~/bin/ncbi-blast-2.2.30+/bin/blastx -query %s -db %s -out temp%s.blast -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle' -max_target_seqs 1 -evalue %s -num_threads 24" %(f,db,r2,e), shell=True).wait()
j = open("temp%s.blast" %r2,'r')
data={}
#gets blast hits in same order as cds if cds not in hits then also report
cnt=0
for i in j:
cnt=cnt+1
k= i.split("\t")
line = "\t".join(str(p) for p in k)
if k[0] not in data.keys():
data[k[0]]=line #has return on end
print "%s hits" %str(cnt)
for i in cds:
if i in data.keys():
h.write(data[i])
else:
h.write(i+"\tno hits\n")
p2= subprocess.Popen("rm temp%s.blast" %r2, shell=True).wait()
print "done"