-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdrugbank.py
executable file
·239 lines (213 loc) · 9.4 KB
/
drugbank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python
""" Index DrugBank xml dataset with MongoDB or Elasticsearch,
can also save drug interactions as graph files (experimental) """
from __future__ import print_function
import argparse
import os
from pprint import pprint
from zipfile import ZipFile
import networkx as nx
import pymongo
import xmltodict
from pymongo import IndexModel
from nosqlbiosets.dbutils import DBconnection
from nosqlbiosets.objutils import *
SOURCE_URL = "https://www.drugbank.ca/releases/latest"
DOCTYPE = 'drugbank' # MongoDB default collection name
# List attributes, processed by function unifylistattributes()
LIST_ATTRS = ["transporters", "drug-interactions", "food-interactions",
"atc-codes", "affected-organisms", "targets", "enzymes",
"carriers", "groups", "salts", "products",
'pathways', 'go-classifiers', 'external-links',
'external-identifiers']
# Update DrugBank entry for better database representation
def update_entry_forindexing(e, slim=True):
unifylistattributes(e, LIST_ATTRS)
unifylistattribute(e, 'categories', 'category')
if 'pathways' in e:
unifylistattributes(e['pathways'], ['drugs'])
if "products" in e:
for product in e["products"]:
checkbooleanattributes(product,
["generic", "approved", "over-the-counter"])
# Make sure type of numeric attributes are numeric
atts = ["carriers", "enzymes", "targets", "transporters"]
for att in atts:
if att in e:
for i in e[att]:
if 'position' in i:
i['position'] = int(i['position'])
if 'polypeptide' in i:
if isinstance(i['polypeptide'], list):
for pp in i['polypeptide']:
if slim is True:
del pp['amino-acid-sequence']
del pp['gene-sequence']
unifylistattributes(pp, LIST_ATTRS)
else:
if slim is True:
del i['polypeptide']['amino-acid-sequence']
del i['polypeptide']['gene-sequence']
unifylistattributes(i['polypeptide'], LIST_ATTRS)
if slim is True:
if 'sequences' in e:
del e['sequences']
if 'patents' in e:
del e['patents']
atts = ["average-mass", "monoisotopic-mass"]
for att in atts:
if att in e:
e[att] = float(e[att])
if 'salts' in e and att in e['salts']:
e['salts'][att] = float(e['salts'][att])
# Read DrugBank xml files, index using the function indexf
def parse_drugbank_xmlfile(infile, indexf):
infile = str(infile)
print("Reading/indexing %s " % infile)
if infile.endswith(".zip"):
with ZipFile(infile) as zipf:
for fname in zipf.namelist():
with zipf.open(fname) as inf:
xmltodict.parse(inf, item_depth=2, attr_prefix='',
item_callback=indexf)
else:
with open(infile, 'rb', buffering=1000) as inf:
xmltodict.parse(inf, item_depth=2, attr_prefix='',
item_callback=indexf)
print("\nCompleted")
class Indexer(DBconnection):
def __init__(self, db, index, host, port, doctype, slim=True):
self.doctype = doctype
self.index = index
self.slim = slim
super(Indexer, self).__init__(db, index, host, port,
mdbcollection=doctype, recreateindex=True)
if db == "MongoDB":
self.mcl = self.mdbi[doctype]
# Index DrugBank entry with MongoDB
def mongodb_index_entry(self, _, entry):
try:
update_entry_forindexing(entry, slim=self.slim)
docid = self.getdrugid(entry)
entry["_id"] = docid
self.mcl.insert_one(entry)
self.reportprogress()
r = True
except Exception as e:
pprint(e)
r = False
return r
# Index DrugBank entry with Elasticsearch
def es_index_entry(self, _, entry):
try:
update_entry_forindexing(entry)
docid = self.getdrugid(entry)
entry['drugbank-id'] = docid
self.es.index(index=self.index, doc_type='_doc',
id=docid, body=entry)
self.reportprogress()
r = True
except Exception as e:
print(e)
r = False
return r
def getdrugid(self, e):
if isinstance(e['drugbank-id'], list):
eid = e['drugbank-id'][0]['#text']
else:
eid = e['drugbank-id']['#text']
return eid
interactions = set()
def saveinteractions(self, _, e):
eid = self.getdrugid(e)
if e['drug-interactions'] is not None:
if isinstance(e['drug-interactions']['drug-interaction'], list):
for i in e['drug-interactions']['drug-interaction']:
did = i['drugbank-id']
self.interactions.add((eid, did))
else:
did = e['drug-interactions']['drug-interaction']['drugbank-id']
self.interactions.add((eid, did))
return True
# Save drug-drug interactions as graph files in GML format
# Both Cytoscape and Gephi are able to read GML files
# For saving networks other than drug-drug interactions
# and for saving subsets of the data see queries.py in this folder
def saveasgraph(self):
print("#edges = %d" % len(self.interactions))
graph = nx.MultiDiGraph(list(self.interactions))
nx.write_gml(graph, self.index + ".gml")
return graph
# Fields for text indexing
TEXT_FIELDS = ["description", "atc-codes.level.#text",
"go-classifiers.description",
"mechanism-of-action",
"general-references.references.articles.article.citation",
"targets.references.articles.article.citation",
"targets.polypeptide.gene-name",
"drug interactions.description",
"targets.polypeptide.general-function",
"targets.polypeptide.specific-function"]
def mongodb_indices(mdb):
indx = [(field, pymongo.TEXT) for field in TEXT_FIELDS]
mdb.create_indexes([IndexModel(indx,
name="text-index-for-selected-fields")])
indx_fields = [
"name", "products.name",
"classification.class", "drug-interactions.name",
"targets.polypeptide.gene-name",
"protein_associations.protein.protein_accession",
"metabolite_associations.metabolite.accession",
"metabolite_associations.metabolite.name",
[('name', pymongo.ASCENDING),
('drug-interactions.name', pymongo.ASCENDING)]
]
for field in indx_fields:
mdb.create_index(field)
def main(infile, db, index, doctype=DOCTYPE, host=None, port=None, slim=True):
indxr = Indexer(db, index, host, port, doctype, slim)
if db == 'MongoDB':
parse_drugbank_xmlfile(infile, indxr.mongodb_index_entry)
mongodb_indices(indxr.mdbi[doctype])
elif db == 'Elasticsearch':
parse_drugbank_xmlfile(infile, indxr.es_index_entry)
indxr.es.indices.refresh(index=index)
else:
parse_drugbank_xmlfile(infile, indxr.saveinteractions)
indxr.saveasgraph()
if __name__ == '__main__':
d = os.path.dirname(os.path.abspath(__file__))
parser = argparse.ArgumentParser(
description='Index DrugBank xml dataset with MongoDB or Elasticsearch, '
'downloaded from ' + SOURCE_URL +
', can also save drug interactions as NetworkX graph file')
parser.add_argument('-infile', '--infile',
required=True,
help='Input file name')
parser.add_argument('--index',
default="biosets",
help='Name of the MongoDB database or Elasticsearch'
' index, or filename for NetworkX graph')
parser.add_argument('--mdbcollection',
default=DOCTYPE,
help='MongoDB collection name')
parser.add_argument('--host',
help='MongoDB or Elasticsearch server hostname')
parser.add_argument('--port',
help="MongoDB or Elasticsearch server port number")
parser.add_argument('--db', default='Elasticsearch',
help="Database: 'MongoDB' or 'Elasticsearch',"
" if not set drug-drug interaction"
" network is saved to a graph file specified with"
" the '--graphfile' option")
parser.add_argument('--graphfile', default='Elasticsearch',
help="Database: 'MongoDB' or 'Elasticsearch',"
"or if 'graphfile' drug-drug interaction"
"network saved as graph file")
parser.add_argument('--allfields', default=False, action='store_true',
help="By default sequence fields"
" and the patents field is not indexed."
" Select this option to index all fields")
args = parser.parse_args()
main(args.infile, args.db, args.index, args.mdbcollection,
args.host, args.port, not args.allfields)