-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2873826_DatabasePyScript.py
429 lines (348 loc) · 17.2 KB
/
2873826_DatabasePyScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
#importing logging
try:
import logging
except ModuleNotFoundError:
print("Hello, the logging module is not installed. Please install it and run the script again. Raising SystemExit.")
raise SystemExit(1)
#setting up the logger
#getting the logger
logger = logging.getLogger('logger')
#setting the level of the logger
logger.setLevel(logging.WARNING)
#creating a stream handler.
shandler = logging.StreamHandler()
#formatting the stream handler
shandler.setFormatter(logging.Formatter('%(levelname)s - %(asctime)s - {%(pathname)s:%(lineno)d} - %(message)s'))
#adding the stream handler to the logger.
logger.addHandler(shandler)
#importing necessary modules
try:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import sqlite3
from STOUT import translate_forward
import os
except ModuleNotFoundError as e:
logger.error(f"Hello, the following library was not found. Please install it and run the script again, Raising SystemExit.\n{e}")
raise SystemExit(1)
#setting file paths
database_path = '2873826.db'
sdf_filepath = 'Molecules8.sdf'
#making sure the sdf database filepath exists
try:
assert os.path.isfile(sdf_filepath) == True
except AssertionError:
logger.error(f"Hello the given sdf file {sdf_filepath} does not exist. Please fix the error or make sure the file exists. Raising SystemExit.")
raise SystemExit(1)
#creating an rdkit database of the sdf file
db = Chem.SDMolSupplier(sdf_filepath)
#defining functions
#defining a function to calculate the number of the fused aromatic rings in a compounds
def fused_aro_rings_calculator(list_supplied): #as an argument, it takes a list of lists where each list is a tuple of the atom numbers in an aromatic ring
dictionary_results = {} #starting with an empty dictionary to store the results in
fuses = 0 #starting with a zero number of fuses counted, each fuse is defined as a common bonds between any two aromatic rings in the input list supplied.
for item in list_supplied:
dictionary_results[str(item)] = 0 #using the ring as a key
for item_compared in list_supplied:
common_atoms = set(item) & set(item_compared)
if len(common_atoms) == 2:
dictionary_results[str(item)] = dictionary_results[str(item)] + 1 #counts how many rings the given ring is fused to
#print(dictionary_results)
for value in dictionary_results.values(): #for the dictionary in which the data has been collected, for each ring, we count it to be fused once if at least one common bond has been counted.
if int(value) >= 1:
fuses = fuses + 1
return fuses
#defining functions to test for the following filters -
#lipinski's, lead-likeness, bioavailablity, ghose, veber, rule of 3, REOS, drug-like QED, and one for testing if all filters were passed or not. for the given file of compounds, none of them pass all the filters.
def lipinski_scorer(mass: float, logP: float, donorCount: int, acceptorCount: int):
score = 0
if mass <= 500:
score = score + 1
if logP <= 5:
score = score + 1
if donorCount <= 5:
score = score + 1
if acceptorCount <= 10:
score = score + 1
return score
def lead_likeness_scorer(mass: float,logD: float, ringCount: int,rotatableBondCount: int,donorCount: int, acceptorCount: int):
score = 0
if mass <= 450:
score = score + 1
if logD >= (-4) and logD <= 4:
score = score + 1
if ringCount <= 4:
score = score + 1
if rotatableBondCount <= 10:
score = score + 1
if donorCount <= 5:
score = score + 1
if acceptorCount <= 8:
score = score + 1
return score
def bioavailability_scorer(mass: float, logP: float, donorCount: int, acceptorCount: int, rotatableBondCount: int, PSA: float, fusedAromaticRingCount: int):
score = 0
if mass <= 500:
score = score + 1
if logP <= 5:
score = score + 1
if donorCount <= 5:
score = score + 1
if acceptorCount <= 10:
score = score + 1
if rotatableBondCount <= 10:
score = score + 1
if PSA <= 200:
score = score + 1
if fusedAromaticRingCount <= 5:
score = score + 1
return score
def ghose_scorer(mass: float, logP: float, atom_count: int, molar_refractivity: float):
score = 0
if mass > 160 and mass < 480:
score = score + 1
if logP > (-0.4) and logP < 5.6:
score = score + 1
if atom_count > 20 and atom_count < 70:
score = score + 1
if molar_refractivity > 40 and molar_refractivity < 130:
score = score + 1
return score
def veber_scorer(rotatableBondCount: int,TPSA: float):
score = 0
if rotatableBondCount <= 10:
score = score + 1
if TPSA <= 140:
score = score + 1
return score
def REOS_scorer(mass: float, logP: float, donorCount: int, acceptorCount: int, formal_charge: float, rotatableBondCount: int, heavy_atom_count: int):
score = 0
if mass > 200 and mass < 500:
score = score + 1
if logP > (-5) and logP < 5:
score = score + 1
if donorCount > 0 and donorCount < 5:
score = score + 1
if acceptorCount > 0 and acceptorCount < 10:
score = score + 1
if formal_charge > (-2) and formal_charge < 2:
score = score + 1
if rotatableBondCount > 0 and rotatableBondCount < 8:
score = score + 1
if heavy_atom_count > 15 and heavy_atom_count < 50:
score = score + 1
return score
def rule_of_3_scorer(mass: float, logP: float, donorCount: int, acceptorCount: int, rotatableBondCount: int):
score = 0
if mass <= 300:
score = score + 1
if logP <= 3:
score = score + 1
if donorCount <= 3:
score = score + 1
if acceptorCount <= 3:
score = score + 1
if rotatableBondCount <= 3:
score = score + 1
return score
def drug_like_QED_scorer(mass: float, ringCount: int, rotatableBondCount: int, donorCount: int, acceptorCount: int, logP: float):
score = 0
if mass < 400:
score = score + 1
if ringCount > 0:
score = score + 1
if rotatableBondCount < 5:
score = score + 1
if donorCount <= 5:
score = score + 1
if acceptorCount <= 10:
score = score + 1
if logP < 5:
score = score + 1
return score
def all_filters_test(lipinski_score,lead_likeness_score,bioavailability_score,ghose_score,veber_score,REOS_score,rule_of_3_score,drug_like_QED_score):
if lipinski_score == 4 and lead_likeness_score == 6 and bioavailability_score >= 6 and ghose_score == 4 and veber_score == 2 and REOS_score == 7 and rule_of_3_score == 5 and drug_like_QED_score == 6:
return True
else:
return False
'''
test = 0
for compound in db:
print(compound.GetNumAtoms()) #Return the number of atoms
print(Chem.MolToSmiles(compound)) #get the SMILES code
print(AllChem.Compute2DCoords(compound)) #2D coordinates #all are zero for me
test = test + 1
print(f"Number of compounds is {test}.") #ok this works
for compound in db:
print(compound.GetBonds()[0].GetBondType()) #gets bond type of the first bond of the first compound
for atom in compound.GetAtoms():
print(atom.GetAtomicNum()) #gets the atomic number
break
'''
'''
for compound in db:
tmp=AllChem.Compute2DCoords(compound)
'''
#this loop is for my personal use. it extract the dictionary of the descriptors of the first compound.
for compound in db:
descriptors_list = []
for key in (Descriptors.CalcMolDescriptors(compound)).keys():
descriptors_list.append(key)
break
#creating the database schema. first creating a connection and cursor.
connection = sqlite3.connect(database_path)
cur = connection.cursor()
#making sure the ddl file exists
try:
assert os.path.isfile('sqlddl.sql') == True
except AssertionError:
logger.error(f"Hello, the sql ddl script was not found in the current working directory. Please fix it and re-run the script. Raising SystemExit.")
raise SystemExit(1)
#reading the sql script.
with open('sqlddl.sql') as sqlfile:
sql_script = sqlfile.read() #reading the script and saving it
#print(sql_script)
cur.executescript(sql_script) #executing the script in the given schema.
connection.commit() #committing the changes to the database
cur.close()
connection.close() #closing the cursor and the connection.
#opening another connection to load the database
connection = sqlite3.connect(database_path)
cur = connection.cursor()
#this extracts the parameters for the database.
for compound in db:
descriptors = Descriptors.CalcMolDescriptors(compound) #this returns a dictionary, where the descriptor is the key and the value is the respective value.
#QED stands for quantitative estimation of drug-likeness
qed = descriptors['qed']
#print(qed)
qed = round(qed,3)
#print(qed)
#We propose the spacial score (SPS) as an empirical scoring system that builds upon the principle underlying Fsp3 and FCstereo and expresses the spacial complexity of a compound in a uniform manner on a highly granular scale.
SPS = descriptors['SPS']
#print(SPS)
SPS = round(SPS,3)
#print(SPS)
'''
Exact Molecular Mass versus Molecular Weight
"In the discussion in the last section, the term exact mass was introduced. Again this is the mass of a molecule calculated with only the most abundant isotopes present; these are usually the lightest isotopes. The molecular weights given in the Handbook of Chemistry and Physics are different; they are the average molecular weights for all the molecules with different isotopic compositions present in a compound, each weighted for the natural abundance of the respective isotope or isotopes. For example the molecular weight of propane is 44.0956 g/mol and for acetaldehyde, 44.0526 g/mol. The exact molecular masses for the molecules with the lightest isotopes are 44.0626 amu and 44.0262 amu, respectively. Note, we use different units for molecular weight than we use for molecular mass."
Copyright information: Original content © University of Colorado, Boulder, Chemistry and Biochemistry Department, 2011. The information on these pages is available for academic use without restriction.
Accessed: https://www.orgchemboulder.com/Spectroscopy/MS/molmassmw.shtml [15 February 2024]
'''
MolWt = descriptors['MolWt']
MolWt = round(MolWt,3)
#print(MolWt)
ExactMolWt = descriptors['ExactMolWt']
ExactMolWt = round(ExactMolWt,3)
#print(ExactMolWt)
NumValenceElectrons = descriptors['NumValenceElectrons']
#print(NumValenceElectrons)
NumRadicalElectrons = descriptors['NumRadicalElectrons']
#print(NumRadicalElectrons)
HBA = descriptors['NOCount']
#print(HBA)
#print(f"flag for me {descriptors['NumHAcceptors']}")
HBD = descriptors['NHOHCount']
#print(HBD)
#print(f"flag for me {descriptors['NumHDonors']}")
#those two flags returned the same values
LogP = descriptors['MolLogP']
LogP = round(LogP,3)
#print(LogP)
NumAromaticRings = descriptors['NumAromaticRings']
#print(NumAromaticRings)
#print(descriptors)
#print('flag 1')
#getting the formula
formula = Chem.rdMolDescriptors.CalcMolFormula(compound)
#print(formula)
name = compound.GetProp('Name')
#print(name)
LogD = compound.GetProp('LogD')
LogD = round(float(LogD),3)
#print(LogD)
RingCount = descriptors['RingCount']
#print(RingCount)
#print('flag 2')
NumRotatableBonds = descriptors['NumRotatableBonds']
#print(NumRotatableBonds)
#something abour surface area
TPSA = descriptors['TPSA']
TPSA = round(TPSA,3)
#print(TPSA)
molar_refractivity = Chem.Crippen.MolMR(compound)
molar_refractivity = round(molar_refractivity,3)
#print(molar_refractivity)
Mol_ID = compound.GetProp('Mol_ID')
#print(Mol_ID)
CdId = compound.GetProp('CdId')
#print(CdId)
number_of_atoms = Chem.rdchem.Mol.GetNumAtoms(compound)
#print(number_of_atoms)
formal_charge = Chem.rdmolops.GetFormalCharge(compound)
#print(formal_charge)
heavy_atoms = Chem.rdchem.Mol.GetNumHeavyAtoms(compound)
#print(heavy_atoms) #an isotopic form of an atom that contains more than the common number of neutrons and is thus of greater relative atomic mass than the most abundant or most commonly observed isotope.
#this extracts a list of lists where each list contains information about a ring that is aromatic. the said information is a tuple of atom numbers contained in that ring (with respect to the whole compound, so numbering/labelling is consistent.)
#for example, this is the list for the first compound (CdId 1):
#[(4, 5, 6, 15, 16, 3), (7, 8, 9, 14, 5, 6), (10, 11, 12, 13, 14, 9)]
#this compound has three aromatic rings, so the list contains three tuples. each tuple says which atoms are in a ring. therefore, the first compound's atom numbers (4, 5, 6, 15, 16, 3) are in an aromatic ring.
aromatic_rings = [ring for ring in compound.GetRingInfo().AtomRings() if all(compound.GetAtomWithIdx(atom_idx).GetIsAromatic() for atom_idx in ring)]
#print(aromatic_rings)
fused_aro_rings = fused_aro_rings_calculator(aromatic_rings)
#print(fused_aro_rings)
lipinski_score = lipinski_scorer(mass=float(MolWt),logP=float(LogP),donorCount=int(HBD),acceptorCount=int(HBA))
#print(lipinski_score)
#print('flag 3')
lead_likeness_score = lead_likeness_scorer(mass=float(MolWt),logD=float(LogD),ringCount=int(RingCount),rotatableBondCount=int(NumRotatableBonds),donorCount=int(HBD),acceptorCount=int(HBA))
#print(lead_likeness_score)
bioavailability_score = bioavailability_scorer(mass=float(MolWt),logP=float(LogP),donorCount=int(HBD),acceptorCount=int(HBA),rotatableBondCount=int(NumRotatableBonds),PSA=float(TPSA),fusedAromaticRingCount=int(fused_aro_rings))
#print(bioavailability_score)
ghose_score = ghose_scorer(float(MolWt),float(LogP),int(number_of_atoms),float(molar_refractivity))
#print(ghose_score)
veber_score = veber_scorer(int(NumRotatableBonds),float(TPSA))
#print(veber_score)
REOS_score = REOS_scorer(float(MolWt),float(LogP),int(HBD),int(HBA),float(formal_charge),int(NumRotatableBonds),int(heavy_atoms))
#print(REOS_score)
rule_of_3_score = rule_of_3_scorer(float(MolWt),float(LogP),int(HBD),int(HBA),int(NumRotatableBonds))
#print(rule_of_3_score)
drug_like_QED_score = drug_like_QED_scorer(float(MolWt),int(RingCount),int(NumRotatableBonds),int(HBD),int(HBA),float(LogP))
#print(drug_like_QED_score)
smiles = Chem.MolToSmiles(compound)
#print(smiles)
try:
IUPAC = translate_forward(smiles)
except: #translate_forward is a function from the library STOUT which is a neural net trained on many known SMILES and their respective IUPAC codes.
IUPAC = None
#print(IUPAC)
#print(name)
#print('flag 4')
Draw.MolToFile(db[(int(CdId)-1)],'image.png') #drawing the image using the CdId as an index to access the original database created from the sdf file.
with open('image.png', 'rb') as image_file:
binaryData = image_file.read() #converting the image into binary data (BLOB) so it can be inserted into the data using SQL.
#print(binaryData)
all_filters_boolean = all_filters_test(lipinski_score,lead_likeness_score,bioavailability_score,ghose_score,veber_score,REOS_score,rule_of_3_score,drug_like_QED_score)
#now the instance of data to be input into the database can be created in the form of a list of parameters. #this follows the chronological order in which the columns exist in the sql ddl file, so it respects the database schema.
list_to_input = [CdId,name,formula,binaryData,IUPAC,Mol_ID,MolWt,ExactMolWt,LogP,LogD,HBD,HBA,NumRotatableBonds,RingCount,TPSA,NumAromaticRings,fused_aro_rings,smiles,qed,SPS,NumValenceElectrons,NumRadicalElectrons,molar_refractivity,number_of_atoms,formal_charge,heavy_atoms,lipinski_score,lead_likeness_score,bioavailability_score,ghose_score,veber_score,REOS_score,rule_of_3_score,drug_like_QED_score,all_filters_boolean]
#print(list_to_input)
#print(len(list_to_input)) #we have 35 attributes of the entity Compounds.
#creating the insert statement
question_marks = '?,' * 35
question_marks = question_marks.rstrip(',')
insert_statement = "INSERT INTO Compounds VALUES (%s);" % question_marks
#print(question_marks)
#print(insert_statement)
#the data is now ready to be inserted.
try:
cur.execute(insert_statement,list_to_input)
connection.commit() #committing the changes to the database
except:
logger.warning(f"The data for compound {CdId} could not be inserted into the database, skipping this.")
continue
#print(descriptors_list)
#closing the filehandle to the database
cur.close()
connection.close()
#removing the image file
os.remove("image.png")
#for 100 compounds, this takes about 1.5 hours to run on a regular virtual machine.