Skip to content

Commit 8854ff7

Browse files
author
Jody Phelan
committed
init
0 parents  commit 8854ff7

File tree

8 files changed

+149780
-0
lines changed

8 files changed

+149780
-0
lines changed

ann.txt

Lines changed: 47167 additions & 0 deletions
Large diffs are not rendered by default.

barcode.bed

Lines changed: 414 additions & 0 deletions
Large diffs are not rendered by default.

genes.fasta

Lines changed: 7960 additions & 0 deletions
Large diffs are not rendered by default.

genes.txt

Lines changed: 7193 additions & 0 deletions
Large diffs are not rendered by default.

genome.fasta

Lines changed: 73527 additions & 0 deletions
Large diffs are not rendered by default.

genome.gff

Lines changed: 12032 additions & 0 deletions
Large diffs are not rendered by default.

parse_db.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import csv
2+
import json
3+
import re
4+
from collections import defaultdict
5+
import argparse
6+
from shutil import copyfile
7+
8+
def fa2dict(filename):
9+
fa_dict = {}
10+
seq_name = ""
11+
for l in open(filename):
12+
line = l.rstrip()
13+
if line[0] == ">":
14+
seq_name = line[1:].split()[0]
15+
fa_dict[seq_name] = []
16+
else:
17+
fa_dict[seq_name].append(line)
18+
result = {}
19+
for seq in fa_dict:
20+
result[seq] = "".join(fa_dict[seq])
21+
return result
22+
23+
def parse_mutation(mut,gene,fasta_dict):
24+
aa_long2short = {
25+
"Ala":"A","Arg":"R","Asn":"N","Asp":"D","Cys":"C",
26+
"Gln":"Q","Glu":"E","Gly":"G","His":"H","Ile":"I",
27+
"Leu":"L","Lys":"K","Met":"M","Phe":"F","Pro":"P",
28+
"Ser":"S","Thr":"T","Trp":"W","Tyr":"Y","Val":"V",
29+
"Stop":"*", "-":"-"
30+
}
31+
# AA change
32+
re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)([A-Z][a-z][a-z])",mut)
33+
if re_obj:
34+
ref_aa = aa_long2short[re_obj.group(1)]
35+
alt_aa = aa_long2short[re_obj.group(3)]
36+
codon_num = re_obj.group(2)
37+
return "%s%s>%s%s" % (codon_num,ref_aa,codon_num,alt_aa)
38+
# Stop codon
39+
re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)(\*)",mut)
40+
if re_obj:
41+
ref_aa = aa_long2short[re_obj.group(1)]
42+
alt_aa = re_obj.group(3)
43+
codon_num = re_obj.group(2)
44+
return "%s%s>%s%s" % (codon_num,ref_aa,codon_num,alt_aa)
45+
# Deletion
46+
re_obj = re.search("c.([0-9]+)_([0-9]+)del",mut)
47+
if re_obj:
48+
start_nt = int(re_obj.group(1))
49+
end_nt = int(re_obj.group(2))
50+
seq = fasta_dict[gene][start_nt-2:end_nt]
51+
return "%s%s>%s" % (start_nt-1,seq,seq[0])
52+
# Insertion
53+
re_obj = re.search("c.([0-9]+)_([0-9]+)ins([A-Z]+)",mut)
54+
if re_obj:
55+
start_nt = int(re_obj.group(1))
56+
seq_ins = re_obj.group(3)
57+
seq_start = fasta_dict[gene][start_nt-1]
58+
return "%s%s>%s" % (start_nt,seq_start,seq_start+seq_ins)
59+
## Promoter Mutation
60+
## c.-16G>C
61+
re_obj = re.search("c.(\-[0-9]+)([A-Z])>([A-Z])",mut)
62+
if re_obj:
63+
nt_pos = re_obj.group(1)
64+
ref_nt = re_obj.group(2)
65+
alt_nt = re_obj.group(2)
66+
return "%s%s>%s" % (nt_pos,ref_nt,alt_nt)
67+
## ncRNA Mutation
68+
## r.514a>c
69+
re_obj = re.search("r.([0-9]+)([a-z])>([a-z])",mut)
70+
if re_obj:
71+
nt_pos = re_obj.group(1)
72+
ref_nt = re_obj.group(2)
73+
alt_nt = re_obj.group(3)
74+
return "%s%s>%s" % (nt_pos,ref_nt.upper(),alt_nt.upper())
75+
def write_bed(gene_dict,gene_info,outfile):
76+
O = open(outfile,"w")
77+
for gene in gene_dict:
78+
if gene not in gene_info:
79+
sys.stderr.write("%s not found in the 'gene_info' dictionary... Exiting!" % gene)
80+
quit()
81+
O.write("Chromosome\t%s\t%s\t%s\t%s\t%s\n" % (gene_info[gene]["start"],gene_info[gene]["end"],gene_info[gene]["locus_tag"],gene_info[gene]["gene"],",".join(gene_dict[gene])))
82+
O.close()
83+
84+
def load_gene_info(filename):
85+
gene_info = {}
86+
for l in open(filename):
87+
row = l.rstrip().split()
88+
gene_info[row[0]] = {"locus_tag":row[0],"gene":row[1],"start":row[2],"end":row[3],"strand":row[4]}
89+
gene_info[row[1]] = {"locus_tag":row[0],"gene":row[1],"start":row[2],"end":row[3],"strand":row[4]}
90+
return gene_info
91+
92+
def main(args):
93+
fasta_dict = fa2dict("genes.fasta")
94+
gene_info = load_gene_info("genes.txt")
95+
db = {}
96+
locus_tag_to_drug_dict = defaultdict(set)
97+
for row in csv.DictReader(open(args.csv)):
98+
locus_tag = gene_info[row["Gene"]]["locus_tag"]
99+
mut = parse_mutation(row["Mutation"],locus_tag,fasta_dict)
100+
locus_tag_to_drug_dict[locus_tag].add(row["Drug"].lower())
101+
if locus_tag not in db:
102+
db[locus_tag] = {}
103+
if mut not in db[locus_tag]:
104+
db[locus_tag][mut] = {"drugs":[]}
105+
db[locus_tag][mut]["drugs"].append(row["Drug"].lower())
106+
annotation_columns = set(row.keys()) - set(["Gene","Mutation","Drug"])
107+
for col in annotation_columns:
108+
if col not in db[locus_tag][mut]:
109+
db[locus_tag][mut][col] = [row[col]]
110+
else:
111+
db[locus_tag][mut][col].append(row[col])
112+
113+
copyfile("genome.fasta", "%s.fasta" % args.prefix)
114+
copyfile("genome.gff", "%s.gff" % args.prefix)
115+
copyfile("ann.txt", "%s.ann.txt" % args.prefix)
116+
copyfile("barcode.bed", "%s.barcode.bed" % args.prefix)
117+
write_bed(locus_tag_to_drug_dict,gene_info,"%s.bed"%args.prefix)
118+
json.dump(db,open("%s.dr.json"%args.prefix,"w"))
119+
120+
parser = argparse.ArgumentParser(description='TBProfiler pipeline',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
121+
parser.add_argument('--prefix','-p',default="tbdb",type=str,help='NGS Platform')
122+
parser.add_argument('--csv','-c',default="tbdb.csv",type=str,help='NGS Platform')
123+
parser.set_defaults(func=main)
124+
args = parser.parse_args()
125+
args.func(args)

0 commit comments

Comments
 (0)