Skip to content

Commit

Permalink
Cleaner mongo_vs_sql_query.py and faster mongo query
Browse files Browse the repository at this point in the history
  • Loading branch information
AbdelrhmanBassiouny committed Jul 4, 2024
1 parent 7a2e500 commit e9f0278
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 74 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "neem_to_sql"
version = "1.0.8"
version = "1.0.9"
description = "Convert NEEMs database from Mongo to SQL"
readme = "README.md"
authors = [{ name = "Abdelrhman Bassiouny", email = "[email protected]" }]
Expand Down
153 changes: 80 additions & 73 deletions src/mongo_vs_sql_query.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,44 @@
from time import time

from neems_to_sql.neems_to_sql import mongo_collection_to_list_of_dicts, parse_arguments, \
get_mongo_uri, connect_to_mongo_and_get_client, filter_and_select_neems_in_batches, filter_neems
from neems_to_sql.logger import CustomLogger, logging
from bson import ObjectId
from typing_extensions import List, Optional, Dict

if __name__ == "__main__":
from neems_to_sql.logger import CustomLogger, logging
from neems_to_sql.neems_to_sql import mongo_collection_to_list_of_dicts, parse_arguments, \
get_mongo_uri, connect_to_mongo_and_get_client, filter_neems

LOGGER = CustomLogger("MONGO_VS_SQL_QUERY",
"mongo_vs_sql_query.txt",
logging.DEBUG, reset_handlers=True).get_logger()

# Replace the uri string with your MongoDB deployment's connection string.
args = parse_arguments()
if args.mongo_uri is not None:
MONGODB_URI = args.mongo_uri
else:
MONGODB_URI = get_mongo_uri(args.mongo_username, args.mongo_password, args.mongo_host,
args.mongo_port, args.mongo_database)
# set a 5-second connection timeout
mongo_client = connect_to_mongo_and_get_client(MONGODB_URI)
db = mongo_client.neems

# Get neem ids
meta = db.meta
meta_lod = mongo_collection_to_list_of_dicts(meta)
meta_lod = filter_neems(meta_lod, {'visibility': True})
if len(meta_lod) == 0:
LOGGER.error("NO NEEMS FOUND (Probably no meta data collection OR no neems with the given filters)")
raise ValueError("NO NEEMS FOUND (Probably no meta data collection OR no neems with the given filters)")
neem_ids = [doc['_id'] for doc in meta_lod]
LOGGER.debug(f"NEEM IDS: {neem_ids}")

total_time = 0
all_docs = []
get_collection_time = []
def apply_query_on_all_neems(db, neem_ids: List, query_name: str, number_of_repeats: int = 10):
    """Run the task query over all NEEMs as one aggregation and log timing statistics.

    The pipeline built by ``get_task_query_for_neem`` for the first NEEM is executed on
    that NEEM's ``<neem_id>_triples`` collection. Every *other* NEEM is attached through
    a ``$unionWith`` stage that runs its own pipeline on its own triples collection.

    :param db: database handle; only ``get_collection`` is used here.
    :param neem_ids: ids of the NEEMs to query; must contain at least one id.
    :param query_name: human-readable label written to the log.
    :param number_of_repeats: how many times the aggregation is executed for timing;
        must be at least 1.
    :raises ValueError: if ``neem_ids`` is empty or ``number_of_repeats`` is below 1.
    """
    if not neem_ids:
        raise ValueError("neem_ids must contain at least one NEEM id")
    if number_of_repeats < 1:
        raise ValueError("number_of_repeats must be at least 1")

    first_neem_id, *remaining_neem_ids = neem_ids
    triples = db.get_collection(f"{first_neem_id}_triples")
    query = get_task_query_for_neem(first_neem_id)
    number_of_query_lines_per_neem = len(query)
    # The base pipeline already covers the first NEEM, so only the remaining NEEMs are
    # unioned in; including the first one again would return its documents twice.
    query.extend(
        {
            "$unionWith": {
                "coll": f"{neem_id}_triples",
                "pipeline": get_task_query_for_neem(neem_id),
            }
        }
        for neem_id in remaining_neem_ids
    )

    single_query_time = []
    all_docs = []
    for _ in range(number_of_repeats):
        start = time()
        cursor = triples.aggregate(query)
        # Draining the cursor is part of the measured time.
        all_docs = list(cursor)
        single_query_time.append(time() - start)

    LOGGER.info(f"Query: {query_name}")
    LOGGER.info(f"ALL DOCS: {all_docs}")
    LOGGER.info(f"Avg time for {number_of_repeats} repeats: {sum(single_query_time)/number_of_repeats}")
    LOGGER.info(f"Total number of documents: {len(all_docs)}")
    LOGGER.info(f"Number of query lines per neem: {number_of_query_lines_per_neem}")
    LOGGER.info(f"Number of neems: {len(neem_ids)}")
    LOGGER.info(f"Number of query lines: {number_of_query_lines_per_neem * len(neem_ids)}")


def get_task_query_for_neem(neem_id):
return [{"$match": {"p": "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#executesTask"}},
{
"$lookup":
{
Expand All @@ -55,39 +48,53 @@
"as": f"{neem_id}"
}
},
{
"$unwind": f"${neem_id}"
},
{"$match": {f'{neem_id}.p': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
f'{neem_id}.o': 'http://www.ease-crc.org/ont/SOMA.owl#Gripping'}},
{
"$project": {
f"{neem_id}.p*": 0,
f"{neem_id}._id": 0,
f"{neem_id}.graph": 0,
f"{neem_id}.scope": 0,
f"{neem_id}.o*": 0,
"p*": 0,
"o*": 0,
"_id": 0,
"graph": 0,
"scope": 0
f"{neem_id}.s": 1,
f"{neem_id}.p": 1,
f"{neem_id}.o": 1
}
},
{"$match": {f'{neem_id}.p': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
f'{neem_id}.o': 'http://www.ease-crc.org/ont/SOMA.owl#Gripping'}}
])
single_query_time.append(time() - start)
start = time()
all_docs.extend([doc for doc in cursor])
append_time.append(time() - start)
total_per_neem_time.append(get_collection_time[-1] + single_query_time[-1] + append_time[-1])
}]


def filter_neems_and_get_neem_ids(db, filters: Optional[Dict] = None) -> List[ObjectId]:
    """Collect the ``_id`` of every NEEM meta document, optionally filtered.

    The ``meta`` collection of ``db`` is converted with
    ``mongo_collection_to_list_of_dicts``; when ``filters`` is given, the resulting
    documents are passed through ``filter_neems`` together with it.

    :param db: database handle exposing a ``meta`` collection.
    :param filters: filter specification forwarded to ``filter_neems``; ``None`` skips filtering.
    :return: the ``_id`` values of the remaining meta documents.
    :raises ValueError: if no meta document remains.
    """
    neem_docs = mongo_collection_to_list_of_dicts(db.meta)
    if filters is not None:
        neem_docs = filter_neems(neem_docs, filters)

    if len(neem_docs) == 0:
        no_neems_msg = "NO NEEMS FOUND (Probably no meta data collection OR no neems with the given filters)"
        LOGGER.error(no_neems_msg)
        raise ValueError(no_neems_msg)

    neem_ids = []
    for neem_doc in neem_docs:
        neem_ids.append(neem_doc['_id'])
    LOGGER.debug(f"NEEM IDS: {neem_ids}")
    return neem_ids


def connect_to_mongo_and_get_neems_database(args):
    """Connect to MongoDB using the parsed arguments and return the ``neems`` database.

    An explicit ``args.mongo_uri`` is used as-is; otherwise the URI is built by
    ``get_mongo_uri`` from the username, password, host, port and database arguments.

    :param args: parsed command-line arguments carrying the ``mongo_*`` attributes.
    :return: the ``neems`` attribute of the connected client.
    """
    if args.mongo_uri is None:
        uri = get_mongo_uri(args.mongo_username, args.mongo_password, args.mongo_host,
                            args.mongo_port, args.mongo_database)
    else:
        uri = args.mongo_uri
    return connect_to_mongo_and_get_client(uri).neems


if __name__ == "__main__":
    # LOGGER stays a module-level global: filter_neems_and_get_neem_ids and
    # apply_query_on_all_neems look it up by that name.
    LOGGER = CustomLogger(
        "MONGO_VS_SQL_QUERY", "mongo_vs_sql_query.txt", logging.DEBUG, reset_handlers=True
    ).get_logger()

    # The MongoDB connection details are taken from the command-line arguments.
    args = parse_arguments()
    db = connect_to_mongo_and_get_neems_database(args)

    # Restrict the run to NEEMs whose meta document has visibility set to True.
    neem_ids = filter_neems_and_get_neem_ids(db, {'visibility': True})

    query_name = "Find all tasks that are of type Gripping."
    apply_query_on_all_neems(db, neem_ids, query_name)



LOGGER.info(f"ALL DOCS: {all_docs}")
LOGGER.info(f"Total time: {sum(total_per_neem_time)}")
LOGGER.info(f"Total get collection time: {sum(get_collection_time)}")
LOGGER.info(f"Total single query time: {sum(single_query_time)}")
LOGGER.info(f"Total append time: {sum(append_time)}")
LOGGER.info(f"Avg per neem time: {sum(total_per_neem_time) / len(neem_ids)}")
LOGGER.info(f"Avg get collection time: {sum(get_collection_time) / len(neem_ids)}")
LOGGER.info(f"Avg single query time: {sum(single_query_time) / len(neem_ids)}")
LOGGER.info(f"Avg append time: {sum(append_time) / len(neem_ids)}")
LOGGER.info(f"Total number of documents: {len(all_docs)}")

0 comments on commit e9f0278

Please sign in to comment.