Skip to content

Commit 869db9c

Browse files
authored
bingo: elastic: new exact search, major refactorings
1 parent 688ca8c commit 869db9c

File tree

12 files changed

+206
-159
lines changed

12 files changed

+206
-159
lines changed

bingo/bingo-elastic/python/bingo_elastic/elastic.py

Lines changed: 31 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
except ImportError:
2323
pass
2424

25-
from indigo import Indigo # type: ignore
25+
from indigo import Indigo, IndigoObject # type: ignore
2626

2727
from bingo_elastic.model.record import (
2828
IndigoRecord,
@@ -105,6 +105,8 @@ def get_client(
105105
"sub_fingerprint": {"type": "keyword", "similarity": "boolean"},
106106
"sub_fingerprint_len": {"type": "integer"},
107107
"cmf": {"type": "binary"},
108+
"hash": {"type": "unsigned_long"},
109+
"has_error": {"type": "integer"},
108110
}
109111
}
110112
}
@@ -151,13 +153,16 @@ def prepare(
151153

152154

153155
def response_to_records(
154-
res, index_name, postprocess_actions
156+
res: dict,
157+
index_name: str,
158+
postprocess_actions: PostprocessType = None,
159+
indigo_session: Indigo = None,
160+
options: str = "",
155161
) -> Generator[IndigoRecord, None, None]:
156-
indigo_session = Indigo()
157162
for el_response in res.get("hits", {}).get("hits", []):
158163
record = get_record_by_index(el_response, index_name)
159-
for action_fn in postprocess_actions:
160-
record = action_fn(record, indigo_session) # type: ignore
164+
for action_fn in postprocess_actions: # type: ignore
165+
record = action_fn(record, indigo_session, options) # type: ignore
161166
if not record:
162167
continue
163168
yield record
@@ -218,27 +223,25 @@ async def index_records(self, records: Generator, chunk_size: int = 500):
218223

219224
async def filter(
220225
self,
221-
similarity: Union[BaseMatch] = None,
222-
exact: IndigoRecord = None,
223-
substructure: IndigoRecord = None,
224-
limit=10,
226+
query_subject: Union[BaseMatch, IndigoObject, IndigoRecord] = None,
227+
indigo_session: Indigo = None,
228+
limit: int = 10,
229+
options: str = "",
225230
**kwargs,
226231
) -> AsyncGenerator[IndigoRecord, None]:
227232

228233
# actions needed to be called on elastic_search result
229234
postprocess_actions: PostprocessType = []
230235

231236
query = compile_query(
232-
similarity=similarity,
233-
exact=exact,
234-
substructure=substructure,
237+
query_subject=query_subject,
235238
limit=limit,
236239
postprocess_actions=postprocess_actions,
237240
**kwargs,
238241
)
239242
res = await self.el_client.search(index=self.index_name, body=query)
240243
for record in response_to_records(
241-
res, self.index_name, postprocess_actions
244+
res, self.index_name, postprocess_actions, indigo_session, options
242245
):
243246
yield record
244247

@@ -313,40 +316,33 @@ def delete_all_records(self):
313316

314317
def filter(
315318
self,
316-
similarity: Union[BaseMatch] = None,
317-
exact: IndigoRecord = None,
318-
substructure: IndigoRecord = None,
319-
limit=10,
319+
query_subject: Union[BaseMatch, IndigoObject, IndigoRecord] = None,
320+
indigo_session: Indigo = None,
321+
limit: int = 10,
322+
options: str = "",
320323
**kwargs,
321324
) -> Generator[IndigoRecord, None, None]:
322325

323326
# actions needed to be called on elastic_search result
324327
postprocess_actions: PostprocessType = []
325-
326328
query = compile_query(
327-
similarity=similarity,
328-
exact=exact,
329-
substructure=substructure,
329+
query_subject=query_subject,
330330
limit=limit,
331331
postprocess_actions=postprocess_actions,
332332
**kwargs,
333333
)
334334
res = self.el_client.search(index=self.index_name, body=query)
335335
yield from response_to_records(
336-
res, self.index_name, postprocess_actions
336+
res, self.index_name, postprocess_actions, indigo_session, options
337337
)
338338

339339

340-
# pylint: disable=too-many-arguments
341340
def compile_query(
342-
similarity: BaseMatch = None,
343-
exact: IndigoRecord = None,
344-
substructure: IndigoRecord = None,
341+
query_subject: Union[BaseMatch, IndigoObject, IndigoRecord] = None,
345342
limit: int = 10,
346343
postprocess_actions: PostprocessType = None,
347344
**kwargs,
348345
) -> Dict:
349-
350346
query = {
351347
"size": limit,
352348
"_source": {
@@ -359,17 +355,15 @@ def compile_query(
359355
],
360356
},
361357
}
362-
if similarity and substructure:
363-
raise AttributeError(
364-
"similarity and substructure search is not supported"
365-
)
366358

367-
if similarity:
368-
similarity.compile(query, postprocess_actions)
369-
elif exact:
370-
query_factory("exact", exact).compile(query, postprocess_actions)
371-
elif substructure:
372-
query_factory("substructure", substructure).compile(
359+
if isinstance(query_subject, BaseMatch):
360+
query_subject.compile(query, postprocess_actions)
361+
elif isinstance(query_subject, IndigoRecord):
362+
query_factory("exact", query_subject).compile(
363+
query, postprocess_actions
364+
)
365+
elif isinstance(query_subject, IndigoObject):
366+
query_factory("substructure", query_subject).compile(
373367
query, postprocess_actions
374368
)
375369

bingo/bingo-elastic/python/bingo_elastic/model/record.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55

66
from indigo import Indigo, IndigoException, IndigoObject # type: ignore
77

8+
# Strings reported by IndigoObject.dbgInternalType() for molecule-like objects.
# Type id 03 is the query *molecule*; "<query reaction>" is id 05 and belongs
# to REAC_TYPES, so listing it here under id 03 could never match.
MOL_TYPES = ["#02: <molecule>", "#03: <query molecule>", "#12: <RDFMolecule>"]
9+
REAC_TYPES = ["#04: <reaction>", "#05: <query reaction>"]
10+
811

912
# pylint: disable=unused-argument
1013
def skip_errors(instance: IndigoRecord, err: BaseException) -> None:
@@ -29,36 +32,60 @@ def __set__(self, instance: IndigoRecord, value: Dict):
2932

3033
class WithIndigoObject:
3134
def __set__(self, instance: IndigoRecord, value: IndigoObject) -> None:
35+
value_dup = value.clone()
36+
value_dup.aromatize()
37+
3238
fingerprints = (
3339
"sim",
3440
"sub",
3541
)
42+
3643
for f_print in fingerprints:
3744
try:
3845
setattr(instance, f"{f_print}_fingerprint", [])
3946
setattr(instance, f"{f_print}_fingerprint_len", 0)
40-
fp_ = [
41-
int(feature)
42-
for feature in value.fingerprint(f_print)
43-
.oneBitsList()
44-
.split(" ")
45-
]
46-
setattr(instance, f"{f_print}_fingerprint", fp_)
47-
setattr(instance, f"{f_print}_fingerprint_len", len(fp_))
47+
48+
fp_list = value_dup.fingerprint(f_print).oneBitsList()
49+
if fp_list:
50+
fp_ = [int(feature) for feature in fp_list.split(" ")]
51+
setattr(instance, f"{f_print}_fingerprint", fp_)
52+
setattr(instance, f"{f_print}_fingerprint_len", len(fp_))
4853
except ValueError as err_:
4954
check_error(instance, err_)
5055
except IndigoException as err_:
5156
check_error(instance, err_)
5257

5358
try:
54-
setattr(instance, "name", value.name())
55-
except IndigoException:
56-
pass
59+
cmf = " ".join(map(str, list(value_dup.serialize())))
60+
setattr(instance, "cmf", cmf)
61+
except IndigoException as err_:
62+
setattr(instance, "cmf", "")
63+
check_error(instance, err_)
64+
65+
try:
66+
setattr(instance, "name", value_dup.name())
67+
except IndigoException as err_:
68+
setattr(instance, "name", "")
69+
check_error(instance, err_)
70+
71+
try:
72+
internal_type = value_dup.dbgInternalType()
73+
if internal_type in MOL_TYPES:
74+
hash_ = [
75+
component.clone().hash()
76+
for component in value_dup.iterateComponents()
77+
]
78+
setattr(instance, "hash", sorted(set(hash_)))
79+
elif internal_type in REAC_TYPES:
80+
setattr(instance, "hash", [value_dup.hash()])
81+
except IndigoException as err_:
82+
check_error(instance, err_)
5783

5884
try:
59-
setattr(
60-
instance, "cmf", " ".join(map(str, list(value.serialize())))
61-
)
85+
if value_dup.checkBadValence():
86+
setattr(instance, "has_error", 1)
87+
else:
88+
setattr(instance, "has_error", 0)
6289
except IndigoException as err_:
6390
check_error(instance, err_)
6491

@@ -74,6 +101,7 @@ class IndigoRecord:
74101

75102
cmf: Optional[str] = None
76103
name: Optional[str] = None
104+
rawData: Optional[str] = None
77105
sim_fingerprint: Optional[List[str]] = None
78106
sub_fingerprint: Optional[List[str]] = None
79107
indigo_object = WithIndigoObject()
@@ -120,8 +148,9 @@ def as_dict(self) -> Dict:
120148
}
121149

122150
def as_indigo_object(self, session: Indigo):
123-
assert self.cmf
124-
return session.deserialize(list(map(int, self.cmf.split(" "))))
151+
if self.cmf:
152+
return session.deserialize(list(map(int, self.cmf.split(" ")))) # type: ignore
153+
raise ValueError("Unexpected cmf value")
125154

126155

127156
class IndigoRecordMolecule(IndigoRecord):

0 commit comments

Comments
 (0)