euro-bioimaging extension and assistant. #129

Open · wants to merge 9 commits into base: main

16 changes: 15 additions & 1 deletion bioimageio_chatbot/chatbot.py
@@ -196,7 +196,16 @@ class ThoughtsSchema(BaseModel):
model="gpt-4-0125-preview",
)


elara_instructions = (
"As Elara, your focus is on serving as an assistant for helping users accessing the services and technologies within the EuroBioImaging network."
"Address only inquiries related to euro-bioimaging, ensuring your responses are not only accurate, concise, and logical, but also educational and engaging. "
)

elara = Role(
instructions=elara_instructions,
actions=[respond_to_user],
model="gpt-4o",
)
# convert to a list
all_extensions = [
{"id": ext.id, "name": ext.name, "description": ext.description} for ext in builtin_extensions
@@ -218,11 +227,16 @@ class ThoughtsSchema(BaseModel):
]

skyler_extensions = []

elara_extensions = [
ext for ext in all_extensions if ext["id"] == "eurobioimaging"
]
return [
{"name": "Melman", "agent": melman, "extensions": melman_extensions, "code_interpreter": False, "alias": "BioImage Seeker", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Melman. I am help you navigate the bioimaging tools and provide information about bioimaging. How can I help you today?"},
{"name": "Nina", "agent": nina, "extensions": nina_extensions, "code_interpreter": False, "alias": "BioImage Tutor", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Nina, I can help with your learning journey in bioimaging. How can I help you today?"},
{"name": "Bridget", "agent": bridget, "extensions": bridget_extensions, "code_interpreter": True, "alias": "BioImage Analyst", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Bridget, I can help you with your bioimaging tasks. Please mount your data folder and let me know how I can assist you today."},
{"name": "Skyler", "agent": skyler, "extensions": skyler_extensions, "code_interpreter": False, "alias": "BioImage GPT", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Skyler. How can I help you today?"},
{"name": "Elara", "agent": elara, "extensions": elara_extensions, "code_interpreter": False, "alias": "EuroBioImaging", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Elara. I can help you with EuroBioImaging services and technologies. How can I help you today?"}
]


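
Reviewer note: Elara only receives tools if an extension whose id is "eurobioimaging" is registered among the built-in extensions; otherwise elara_extensions is simply empty, like skyler_extensions. A minimal sketch of that contract (the surrounding wiring here is assumed, with placeholder descriptions):

all_extensions = [
    {"id": "docs", "name": "Search BioImage Docs", "description": "..."},
    {"id": "eurobioimaging", "name": "EuroBioImaging Service Index", "description": "..."},
]
elara_extensions = [ext for ext in all_extensions if ext["id"] == "eurobioimaging"]
assert elara_extensions, "Elara has no tools unless the eurobioimaging extension is registered"
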
7 changes: 4 additions & 3 deletions bioimageio_chatbot/chatbot_extensions/docs_extension.py
@@ -135,9 +135,6 @@ def get_extension():
)
os.makedirs(knowledge_base_path, exist_ok=True)

knowledge_base_path = os.environ.get(
"BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base"
)
docs_store_dict = load_knowledge_base(knowledge_base_path)

docs_tools = {}
@@ -155,12 +152,16 @@ def get_extension():
description="Search information in the documents of the bioimage.io knowledge base. Provide a list of keywords to search information in the documents. Returns a list of relevant documents.",
tools=docs_tools,
)
else:
sinfo1 = None
if books_tools:
sinfo2 = ChatbotExtension(
id="books",
name="Search BioImage Books",
description="Search information in BioImage books. Provide a list of keywords to search information in the books. Returns a list of relevant documents.",
tools=books_tools,
)
else:
sinfo2 = None

return sinfo1, sinfo2
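
Reviewer note: with the added else branches, get_extension() now returns None for an entry whose docs store failed to load, where previously sinfo1 or sinfo2 could be referenced before assignment. A minimal sketch of how a caller might consume the pair (the calling pattern is an assumption):

sinfo1, sinfo2 = get_extension()
builtin_docs_extensions = [ext for ext in (sinfo1, sinfo2) if ext is not None]
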
319 changes: 319 additions & 0 deletions bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
@@ -0,0 +1,319 @@
from bioimageio_chatbot.utils import ChatbotExtension
from bioimageio_chatbot.knowledge_base import load_docs_store
from schema_agents import schema_tool
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from pydantic import Field, BaseModel
from typing import Any, Dict
import json
import requests
from markdownify import markdownify as md
import re
import os

import pandas as pd

# Publications CSV path: originally hardcoded to the author's machine; the
# EUROBIOIMAGING_PUBLICATIONS_CSV env var used here is an assumed convention.
file_path = os.environ.get("EUROBIOIMAGING_PUBLICATIONS_CSV", "./tests/publications.csv")
publication_df = pd.read_csv(file_path)


class DocWithScore(BaseModel):
"""A document with an associated relevance score."""

doc: str = Field(description="The document retrieved.")
score: float = Field(description="The relevance score of the retrieved document.")
metadata: Dict[str, Any] = Field(description="The metadata of the retrieved document.")

def load_eurobioimaging_base(db_path):
docs_store_dict = {}
print(f"Loading EuroBioImaging docs store from {db_path}")
for collection in ['technologies', 'nodes', 'publication']:
try:
docs_store = load_docs_store(db_path, f"eurobioimaging-{collection}")
length = len(docs_store.docstore._dict.keys())
assert length > 0, f"Please make sure the docs store {collection} is not empty."
print(f"- Loaded {length} documents from {collection}")
docs_store_dict[collection] = docs_store
except Exception as e:
print(f"Failed to load docs store for {collection}. Error: {e}")

if len(docs_store_dict) == 0:
raise Exception("No docs store is loaded, please make sure the docs store is not empty.")
# load the node index
with open(os.path.join(db_path, "eurobioimaging-node-index.json"), "r") as f:
node_index = json.load(f)
return docs_store_dict, node_index

def chunkify(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]

def create_eurobioimaging_vector_database(output_dir=None):
if output_dir is None:
output_dir = os.environ.get("BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base")
os.makedirs(output_dir, exist_ok=True)

EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN")
assert EUROBIOIMAGING_TOKEN is not None, "Please set the EUROBIOIMAGING_TOKEN environment variable to access the EuroBioImaging API."

embeddings = OpenAIEmbeddings()

response = requests.get(f"https://www.eurobioimaging.eu/api-general-data.php?a={EUROBIOIMAGING_TOKEN}")
collections = response.json()

for name in ["technologies", "nodes"]:
all_metadata = []
all_contents = []
for item in collections[name]:
print(f"Fetch description from {item['url']}...")
response = requests.get(f"https://www.eurobioimaging.eu/api-page-content.php?a={EUROBIOIMAGING_TOKEN}&url={item['url']}")
content = md(response.text, heading_style="ATX")
description = re.sub(r"\n{3,}", "\n\n", content).strip()

# Generate embeddings for the item
item_content = f"# {item['name']}\n\n{description}"
all_contents.append(item_content)
if "description" in item:
item.pop("description")
all_metadata.append(item)

print(f"Embedding {len(all_contents)} documents...")
all_embedding_pairs = []
for chunk in chunkify(all_contents, 300):
item_embeddings = embeddings.embed_documents(chunk)
all_embedding_pairs.extend(zip(chunk, item_embeddings))

# Create the FAISS index from all the embeddings
vectordb = FAISS.from_embeddings(all_embedding_pairs, embeddings, metadatas=all_metadata)
print("Saving the vector database...")
vectordb.save_local(output_dir, index_name="eurobioimaging-" + name)
print("Created a vector database from the downloaded documents.")

node_index = {}
for item in collections["nodes"]:
node_index[item['node_id']] = item
with open(os.path.join(output_dir, "eurobioimaging-node-index.json"), "w") as f:
json.dump(node_index, f)


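# The three helpers below talk to the NCBI E-utilities API: esearch resolves a
# DOI to a PMID, esummary fetches article metadata, and efetch returns the
# abstract as XML.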
def get_pmid_from_doi(doi):
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
"db": "pubmed",
"term": doi,
"retmode": "json"
}
response = requests.get(esearch_url, params=params)
data = response.json()
idlist = data["esearchresult"]["idlist"]
if idlist:
return idlist[0]
else:
return None

def get_article_details(pmids):
esummary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
params = {
"db": "pubmed",
"id": ",".join(pmids),
"retmode": "json"
}
response = requests.get(esummary_url, params=params)
data = response.json()
return data["result"]

def get_article_abstract(pmid):
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
"db": "pubmed",
"id": pmid,
"retmode": "xml",
"rettype": "abstract"
}
response = requests.get(efetch_url, params=params)
return response.text


def read_publication_abstract(
doi: str = Field(..., description="The DOI of the publication"),
):
"""Read the abstract of the publication with the provided DOI"""
# get the doi and title
pmid = get_pmid_from_doi(doi)
if pmid:
article = get_article_details([pmid])  # esummary expects a list of PMIDs
# article_details = article[pmid]
# title = article_details['title']
abstract_xml = get_article_abstract(pmid)

# Extract the abstract text from the XML
from xml.etree import ElementTree as ET
root = ET.fromstring(abstract_xml)
abstract = ""
for elem in root.findall(".//AbstractText"):
abstract += (elem.text or "") + " "  # AbstractText nodes may carry no text
# print(f"Abstract: {abstract.strip()}")
return abstract.strip()
else:
# print("No article found for the provided DOI.")
return "No article found for the provided DOI."

def create_tools(docs_store_dict, node_index):
async def search_publication(
keywords: str = Field(..., description="The keywords used for searching and extracting use cases from the publications."),
):
"""Search the relevant publications in the list"""

results = []
for index, row in publication_df.iterrows():
# search if keywords in the title
if keywords.lower() in row['Title'].lower():
results.append(row)

if results:
# convert the results to dictionary
results_dict = [obj.to_dict() for obj in results]
# get the DOI of the publication
for result in results_dict:
result['DOI'] = result['DOI'].split(' ')[0]
abstract = read_publication_abstract(result['DOI'])
result['Abstract'] = abstract
return results_dict
else:
return f"No publication found with keywords: {keywords}"



async def search_technology(
keywords: str = Field(..., description="The keywords used for searching the technology in EuroBioImaging service index."),
top_k: int = Field(..., description="Return the top_k number of search results")
):
"""Search Technology in EuroBioImaging service index"""
results = []
collection = "technologies"
top_k = max(1, min(top_k, 15))
docs_store = docs_store_dict[collection]

print(f"Retrieving documents from database {collection} with keywords: {keywords}")
results.append(
await docs_store.asimilarity_search_with_relevance_scores(
keywords, k=top_k
)
)
docs_with_score = [
DocWithScore(
doc=doc.page_content,
score=round(score, 2),
metadata=doc.metadata,
)
for results_with_scores in results
for doc, score in results_with_scores
]
# sort by relevance score
docs_with_score = sorted(docs_with_score, key=lambda x: x.score, reverse=True)[
: top_k
]

if len(docs_with_score) > 2:
print(
f"Retrieved documents:\n{docs_with_score[0].doc[:20] + '...'} (score: {docs_with_score[0].score})\n{docs_with_score[1].doc[:20] + '...'} (score: {docs_with_score[1].score})\n{docs_with_score[2].doc[:20] + '...'} (score: {docs_with_score[2].score})"
)
else:
print(f"Retrieved documents:\n{docs_with_score}")
return docs_with_score


async def get_node_details(
node_id: str = Field(..., description="The EuroBioImaging node id"),
):
"""Get details of the EuroBioImaging node who provide services and technoliges to users"""
if node_id in node_index:
return node_index[node_id]
else:
return f"Node not found: {node_id}"


async def search_node(
keywords: str = Field(..., description="The keywords for searching the service nodes"),
top_k: int = Field(..., description="Return the top_k number of search results")
):
"""Search a service node in the EuroBioImaging network"""
results = []
collection = "nodes"
top_k = max(1, min(top_k, 15))
docs_store = docs_store_dict[collection]

print(f"Retrieving documents from database {collection} with query: {keywords}")
results.append(
await docs_store.asimilarity_search_with_relevance_scores(
keywords, k=top_k
)
)

docs_with_score = [
DocWithScore(
doc=doc.page_content,
score=round(score, 2),
metadata=doc.metadata,
)
for results_with_scores in results
for doc, score in results_with_scores
]
# sort by relevance score
docs_with_score = sorted(docs_with_score, key=lambda x: x.score, reverse=True)[
: top_k
]

if len(docs_with_score) > 2:
print(
f"Retrieved documents:\n{docs_with_score[0].doc[:20] + '...'} (score: {docs_with_score[0].score})\n{docs_with_score[1].doc[:20] + '...'} (score: {docs_with_score[1].score})\n{docs_with_score[2].doc[:20] + '...'} (score: {docs_with_score[2].score})"
)
else:
print(f"Retrieved documents:\n{docs_with_score}")
return docs_with_score
return schema_tool(search_technology), schema_tool(search_node), schema_tool(get_node_details), schema_tool(search_publication)


def get_extension():
# collections = get_manifest()["collections"]
knowledge_base_path = os.environ.get(
"BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base"
)
assert (
knowledge_base_path is not None
), "Please set the BIOIMAGEIO_KNOWLEDGE_BASE_PATH environment variable to the path of the knowledge base."

# check if node_index exists
if not os.path.exists(os.path.join(knowledge_base_path, "eurobioimaging-node-index.json")):
EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN")
if EUROBIOIMAGING_TOKEN is None:
print("Warning: Disable EuroBioImaging extension because the EUROBIOIMAGING_TOKEN is not set.")
return None
print("Creating EuroBioImaging vector database...")
create_eurobioimaging_vector_database(knowledge_base_path)

docs_store_dict, node_index = load_eurobioimaging_base(knowledge_base_path)
search_technology, search_node, get_node_details, search_publication = create_tools(docs_store_dict, node_index)

return ChatbotExtension(
id="eurobioimaging",
name="EuroBioImaging Service Index",
description="Help users find bioimaging services and technologies in the EuroBioimaging network. For a given study or research, you should firstly search publications to find relevant use cases and the technologies used in the study. Then you can search by keywords for the imaging technology, and use the returned node_id to find out details about the service providing node in the EuroBioimang network",
tools=dict(
search_technology=search_technology,
search_node=search_node,
get_node_details=get_node_details,
search_publication=search_publication,
# read_publication_abstract=read_publication_abstract,
)
)

if __name__ == "__main__":
# import asyncio
# async def main():
# extension = get_extension()
# query = "mouse embryos"
# top_k = 2
# print(await extension.tools["search_technology"](keywords=query, top_k=top_k))
# asyncio.run(main())
create_eurobioimaging_vector_database()
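
Reviewer note: to exercise the extension locally, here is a minimal sketch based on the commented-out snippet above. It assumes the knowledge base has already been built (set EUROBIOIMAGING_TOKEN on the first run); the query string and the node_id metadata key are illustrative assumptions rather than guaranteed fields.

import asyncio

from bioimageio_chatbot.chatbot_extensions.euro_bioimaging_extension import get_extension

async def main():
    extension = get_extension()  # None when EUROBIOIMAGING_TOKEN is unset and no index exists
    if extension is None:
        print("EuroBioImaging extension disabled; set EUROBIOIMAGING_TOKEN first.")
        return
    # Search for an imaging technology relevant to a study...
    techs = await extension.tools["search_technology"](
        keywords="light sheet microscopy of mouse embryos", top_k=2
    )
    for doc in techs:
        print(doc.doc[:80], "score:", doc.score)
        # ...then look up the node that provides it.
        node_id = doc.metadata.get("node_id")
        if node_id:
            print(await extension.tools["get_node_details"](node_id=node_id))

asyncio.run(main())
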
3 changes: 2 additions & 1 deletion bioimageio_chatbot/knowledge_base.py
@@ -35,13 +35,14 @@ def load_knowledge_base(db_path):
collections = get_manifest()['collections']
docs_store_dict = {}

print(f"Loading knowledge base from {db_path}")
for collection in collections:
channel_id = collection['id']
try:
docs_store = load_docs_store(db_path, channel_id)
length = len(docs_store.docstore._dict.keys())
assert length > 0, f"Please make sure the docs store {channel_id} is not empty."
print(f"Loaded {length} documents from {channel_id}")
print(f"- Loaded {length} documents from {channel_id}")
docs_store_dict[channel_id] = docs_store
except Exception as e:
print(f"Failed to load docs store for {channel_id}. Error: {e}")