From a40481f69f1c825e378ae627786734e1b034923a Mon Sep 17 00:00:00 2001
From: alalulu8668
Date: Mon, 20 May 2024 01:06:12 +0200
Subject: [PATCH 1/9] add euro-bioimaging extension and assistant.

---
 bioimageio_chatbot/chatbot.py                 | 16 ++-
 .../euro_bioimaging_extension.py              | 99 +++++++++++++++++++
 pyproject.toml                                |  3 +-
 requirements.txt                              |  3 +-
 4 files changed, 118 insertions(+), 3 deletions(-)
 create mode 100644 bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py

diff --git a/bioimageio_chatbot/chatbot.py b/bioimageio_chatbot/chatbot.py
index 5fd981e..09adab4 100644
--- a/bioimageio_chatbot/chatbot.py
+++ b/bioimageio_chatbot/chatbot.py
@@ -196,7 +196,16 @@ class ThoughtsSchema(BaseModel):
         model="gpt-4-0125-preview",
     )
-
+    elara_instructions = (
+        "As Elara, your focus is on serving as an assistant that helps users access the services and technologies within the EuroBioImaging network. "
+        "Address only inquiries related to EuroBioImaging, ensuring your responses are not only accurate, concise, and logical, but also educational and engaging. "
+    )
+
+    elara = Role(
+        instructions=elara_instructions,
+        actions=[respond_to_user],
+        model="gpt-4o",
+    )
     # convert to a list
     all_extensions = [
         {"id": ext.id, "name": ext.name, "description": ext.description} for ext in builtin_extensions
@@ -218,11 +227,16 @@ class ThoughtsSchema(BaseModel):
     ]
 
     skyler_extensions = []
+
+    elara_extensions = [
+        ext for ext in all_extensions if ext["id"] == "eurobioimaging"
+    ]
     return [
         {"name": "Melman", "agent": melman, "extensions": melman_extensions, "code_interpreter": False, "alias": "BioImage Seeker", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Melman. I am help you navigate the bioimaging tools and provide information about bioimaging. How can I help you today?"},
         {"name": "Nina", "agent": nina, "extensions": nina_extensions, "code_interpreter": False, "alias": "BioImage Tutor", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Nina, I can help with your learning journey in bioimaging. How can I help you today?"},
         {"name": "Bridget", "agent": bridget, "extensions": bridget_extensions, "code_interpreter": True, "alias": "BioImage Analyst", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Bridget, I can help you with your bioimaging tasks. Please mount your data folder and let me know how I can assist you today."},
         {"name": "Skyler", "agent": skyler, "extensions": skyler_extensions, "code_interpreter": False, "alias": "BioImage GPT", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Skyler. How can I help you today?"},
+        {"name": "Elara", "agent": elara, "extensions": elara_extensions, "code_interpreter": False, "alias": "EuroBioImaging Assistant", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Elara. I can help you with EuroBioImaging services and technologies. How can I help you today?"}
     ]
 
diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
new file mode 100644
index 0000000..2106340
--- /dev/null
+++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
@@ -0,0 +1,99 @@
+from bioimageio_chatbot.utils import ChatbotExtension
+from schema_agents import schema_tool
+from pydantic import Field, BaseModel
+from typing import Optional, List, Dict, Any
+import pandas as pd
+from pathlib import Path
+import requests
+from markdownify import markdownify as md
+import re
+import os
+from bioimageio_chatbot.utils import download_file
+import urllib3
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+
+EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN")
+response = requests.get(f"https://www.eurobioimaging.eu/api-general-data.php?a={EUROBIOIMAGING_TOKEN}")
+collections = response.json()
+
+for name in ["technologies", "nodes"]:
+    for item in collections[name]:
+        print(f"Download description from {item['url']}")
+        response = requests.get(f"https://www.eurobioimaging.eu/api-page-content.php?a={EUROBIOIMAGING_TOKEN}&url={item['url']}")
+        content = md(response.text, heading_style="ATX")
+        item["description"] = re.sub(r"\n{3,}", "\n\n", content).strip()
+
+node_index = {}
+for item in collections["nodes"]:
+    node_index[item['node_id']] = item
+
+
+@schema_tool
+async def search_technology(
+    keywords: List[str] = Field(..., description="A list of keywords for searching the technology"),
+    top_k: int = Field(..., description="Return the top_k number of search results")
+):
+    """Search Technology in EuroBioImaging service index"""
+    results = []
+    for item in collections["technologies"]:
+        for k in keywords:
+            if k.lower() in item["description"].lower() or k.lower() in item["name"].lower():
+                results.append(item)
+                # TODO sort by relevance
+                if len(results) >= top_k:
+                    break
+        if len(results) >= top_k:
+            break
+    return results
+
+
+@schema_tool
+async def get_node_details(
+    node_id: str = Field(..., description="The EuroBioImaging node id"),
+):
+    """Get details of the EuroBioImaging node that provides services and technologies to users"""
+    if node_id in node_index:
+        return node_index[node_id]
+    else:
+        return f"Node not found: {node_id}"
+
+@schema_tool
+async def search_node(
+    keywords: List[str] = Field(..., description="A list of keywords for searching the service nodes"),
+    top_k: int = Field(..., description="Return the top_k number of search results")
+):
+    """Search a service node in the EuroBioImaging network"""
+    results = []
+    for item in collections["nodes"]:
+        for k in keywords:
+            if k.lower() in item["description"].lower() or k.lower() in item["name"].lower():
+                results.append(item)
+                # TODO sort by relevance
+                if len(results) >= top_k:
+                    break
+        if len(results) >= top_k:
+            break
+    return results
+
+
+def get_extension():
+    return ChatbotExtension(
+        id="eurobioimaging",
+        name="EuroBioImaging Service Index",
+        description="Help users find bioimaging services in the EuroBioImaging network; You can search by keywords for the imaging technology, then use the returned node_id to find out details about the service providing node in the EuroBioImaging network",
+        tools=dict(
+            search_technology=search_technology,
+            search_node=search_node,
+            get_node_details=get_node_details
+        )
+    )
+
+if __name__ == "__main__":
+    import asyncio
+    async def main():
+        extension = get_extension()
+        query = "mouse embryos"
+        top_k = 2
print(await extension.tools["search_technology"](keywords=query, top_k=top_k)) + asyncio.run(main()) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3795a3c..991b174 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ version = "0.1.98" readme = "README.md" description = "Your Personal Assistant in Computational BioImaging." dependencies = [ - "schema-agents>=0.1.49", + "schema-agents>=0.1.50", "imjoy-rpc>=0.5.48.post2", "requests", "pypdf", @@ -25,6 +25,7 @@ dependencies = [ "langchain-core>=0.1.31", "langchain-community>=0.0.27", "html2text", + "markdownify", ] [tool.setuptools] diff --git a/requirements.txt b/requirements.txt index 1192a9b..6577fc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -schema-agents>=0.1.45 +schema-agents>=0.1.50 imjoy-rpc>=0.5.48.post2 requests pypdf @@ -18,3 +18,4 @@ langchain-openai==0.0.8 rank-bm25==0.2.2 html2text==2020.1.16 setuptools +markdownify From 46b762953d6da5722d15dc694012307f8b43e370 Mon Sep 17 00:00:00 2001 From: alalulu8668 Date: Mon, 20 May 2024 08:30:30 +0200 Subject: [PATCH 2/9] update search tools for euro bioimaging. --- .../chatbot_extensions/docs_extension.py | 3 - .../euro_bioimaging_extension.py | 255 +++++++++++++----- 2 files changed, 186 insertions(+), 72 deletions(-) diff --git a/bioimageio_chatbot/chatbot_extensions/docs_extension.py b/bioimageio_chatbot/chatbot_extensions/docs_extension.py index b9d740b..415adfd 100644 --- a/bioimageio_chatbot/chatbot_extensions/docs_extension.py +++ b/bioimageio_chatbot/chatbot_extensions/docs_extension.py @@ -135,9 +135,6 @@ def get_extension(): ) os.makedirs(knowledge_base_path, exist_ok=True) - knowledge_base_path = os.environ.get( - "BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base" - ) docs_store_dict = load_knowledge_base(knowledge_base_path) docs_tools = {} diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py index 2106340..92ab70f 100644 --- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py +++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py @@ -1,83 +1,199 @@ from bioimageio_chatbot.utils import ChatbotExtension +from bioimageio_chatbot.knowledge_base import load_docs_store from schema_agents import schema_tool +from langchain_community.vectorstores import FAISS +from langchain_openai import OpenAIEmbeddings from pydantic import Field, BaseModel -from typing import Optional, List, Dict, Any -import pandas as pd -from pathlib import Path +import json import requests from markdownify import markdownify as md import re import os -from bioimageio_chatbot.utils import download_file import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN") -response = requests.get(f"https://www.eurobioimaging.eu/api-general-data.php?a={EUROBIOIMAGING_TOKEN}") -collections = response.json() - -for name in ["technologies", "nodes"]: - for item in collections[name]: - print(f"Download description from {item['url']}") - response = requests.get(f"https://www.eurobioimaging.eu/api-page-content.php?a={EUROBIOIMAGING_TOKEN}&url={item['url']}") - content = md(response.text, heading_style="ATX") - item["description"] = re.sub(r"\n{3,}", "\n\n", content).strip() - -node_index = {} -for item in collections["nodes"]: - node_index[item['node_id']] = item + +class DocWithScore(BaseModel): + """A document with an associated 
relevance score."""
+
+    doc: str = Field(description="The document retrieved.")
+    score: float = Field(description="The relevance score of the retrieved document.")
+
+def load_eurobioimaging_base(db_path):
+    docs_store_dict = {}
+    for collection in ['technologies', 'nodes']:
+        channel_id = collection
+        try:
+            docs_store = load_docs_store(db_path, channel_id)
+            length = len(docs_store.docstore._dict.keys())
+            assert length > 0, f"Please make sure the docs store {channel_id} is not empty."
+            print(f"Loaded {length} documents from {channel_id}")
+            docs_store_dict[channel_id] = docs_store
+        except Exception as e:
+            print(f"Failed to load docs store for {channel_id}. Error: {e}")
+
+    if len(docs_store_dict) == 0:
+        raise Exception("No docs store is loaded, please make sure the docs store is not empty.")
+    # load the node index
+    with open(os.path.join(db_path, "eurobioimaging_node_index.json"), "r") as f:
+        node_index = json.load(f)
+    return docs_store_dict, node_index
 
-@schema_tool
-async def search_technology(
-    keywords: List[str] = Field(..., description="A list of keywords for searching the technology"),
-    top_k: int = Field(..., description="Return the top_k number of search results")
-):
-    """Search Technology in EuroBioImaging service index"""
-    results = []
-    for item in collections["technologies"]:
-        for k in keywords:
-            if k.lower() in item["description"].lower() or k.lower() in item["name"].lower():
-                results.append(item)
-                # TODO sort by relevance
-                if len(results) >= top_k:
-                    break
-        if len(results) >= top_k:
-            break
-    return results
-
-
-@schema_tool
-async def get_node_details(
-    node_id: str = Field(..., description="The EuroBioImaging node id"),
-):
-    """Get details of the EuroBioImaging node that provides services and technologies to users"""
-    if node_id in node_index:
-        return node_index[node_id]
-    else:
-        return f"Node not found: {node_id}"
-
-@schema_tool
-async def search_node(
-    keywords: List[str] = Field(..., description="A list of keywords for searching the service nodes"),
-    top_k: int = Field(..., description="Return the top_k number of search results")
-):
-    """Search a service node in the EuroBioImaging network"""
-    results = []
+
+def create_eurobioimaging_vector_database(output_dir=None):
+    if output_dir is None:
+        output_dir = os.environ.get("BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base")
+    os.makedirs(output_dir, exist_ok=True)
+
+    embeddings = OpenAIEmbeddings()
+
+    response = requests.get(f"https://www.eurobioimaging.eu/api-general-data.php?a={EUROBIOIMAGING_TOKEN}")
+    collections = response.json()
+
+    for name in ["technologies", "nodes"]:
+        all_embedding_pairs = []
+        all_metadata = []
+        for item in collections[name]:
+            print(f"Download description from {item['url']} and create embeddings...")
+            response = requests.get(f"https://www.eurobioimaging.eu/api-page-content.php?a={EUROBIOIMAGING_TOKEN}&url={item['url']}")
+            content = md(response.text, heading_style="ATX")
+            description = re.sub(r"\n{3,}", "\n\n", content).strip()
+
+            # Generate embeddings for the item
+            item_content = f"# {item['name']}\n\n{description}"
+            item_embedding = embeddings.embed_documents([item_content])[0]
+
+            # Append the item_embedding to the all_embedding_pairs list
+            all_embedding_pairs.append((item_content, item_embedding))
+            if "description" in item:
+                item.pop("description")
+
+            all_metadata.append(item)
+
+        # Create the FAISS index from all the embeddings
+        vectordb = FAISS.from_embeddings(all_embedding_pairs, embeddings, metadatas=all_metadata)
+        print("Saving the vector database...")
+        vectordb.save_local(output_dir, index_name=name)
+        print("Created a vector database from the downloaded documents.")
+
+    node_index = {}
     for item in collections["nodes"]:
+        node_index[item['node_id']] = item
+    with open(os.path.join(output_dir, "eurobioimaging_node_index.json"), "w") as f:
+        json.dump(node_index, f)
+
+
+def create_tools(docs_store_dict, node_index):
+    async def search_technology(
+        keywords: str = Field(..., description="The keywords used for searching the technology in EuroBioImaging service index."),
+        top_k: int = Field(..., description="Return the top_k number of search results")
+    ):
+        """Search Technology in EuroBioImaging service index"""
+        results = []
+        collection = "technologies"
+        top_k = max(1, min(top_k, 15))
+        docs_store = docs_store_dict[collection]
+
+        print(f"Retrieving documents from database {collection} with keywords: {keywords}")
+        results.append(
+            await docs_store.asimilarity_search_with_relevance_scores(
+                keywords, k=top_k
+            )
+        )
+        docs_with_score = [
+            DocWithScore(
+                doc=doc.page_content,
+                score=round(score, 2),
+                metadata=doc.metadata,
+            )
+            for results_with_scores in results
+            for doc, score in results_with_scores
+        ]
+        # sort by relevance score
+        docs_with_score = sorted(docs_with_score, key=lambda x: x.score, reverse=True)[
+            : top_k
+        ]
+
+        if len(docs_with_score) > 2:
+            print(
+                f"Retrieved documents:\n{docs_with_score[0].doc[:20] + '...'} (score: {docs_with_score[0].score})\n{docs_with_score[1].doc[:20] + '...'} (score: {docs_with_score[1].score})\n{docs_with_score[2].doc[:20] + '...'} (score: {docs_with_score[2].score})"
+            )
+        else:
+            print(f"Retrieved documents:\n{docs_with_score}")
+        return docs_with_score
+
+
+    async def get_node_details(
+        node_id: str = Field(..., description="The EuroBioImaging node id"),
+    ):
+        """Get details of the EuroBioImaging node that provides services and technologies to users"""
+        if node_id in node_index:
+            return node_index[node_id]
+        else:
+            return f"Node not found: {node_id}"
+
+
+    async def search_node(
+        keywords: str = Field(..., description="The keywords for searching the service nodes"),
+        top_k: int = Field(..., description="Return the top_k number of search results")
+    ):
+        """Search a service node in the EuroBioImaging network"""
+        results = []
+        collection = "nodes"
+        top_k = max(1, min(top_k, 15))
+        docs_store = docs_store_dict[collection]
+
+        print(f"Retrieving documents from database {collection} with query: {keywords}")
+        results.append(
+            await docs_store.asimilarity_search_with_relevance_scores(
+                keywords, k=top_k
+            )
+        )
+
+        docs_with_score = [
+            DocWithScore(
+                doc=doc.page_content,
+                score=round(score, 2),
+                metadata=doc.metadata,
+            )
+            for results_with_scores in results
+            for doc, score in results_with_scores
+        ]
+        # sort by relevance score
+        docs_with_score = sorted(docs_with_score, key=lambda x: x.score, reverse=True)[
+            : top_k
+        ]
+
+        if len(docs_with_score) > 2:
+            print(
+                f"Retrieved documents:\n{docs_with_score[0].doc[:20] + '...'} (score: {docs_with_score[0].score})\n{docs_with_score[1].doc[:20] + '...'} (score: {docs_with_score[1].score})\n{docs_with_score[2].doc[:20] + '...'} (score: {docs_with_score[2].score})"
+            )
+        else:
+            print(f"Retrieved documents:\n{docs_with_score}")
+        return docs_with_score
+    return
schema_tool(search_technology), schema_tool(search_node), schema_tool(get_node_details) def get_extension(): + # collections = get_manifest()["collections"] + knowledge_base_path = os.environ.get( + "BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base" + ) + assert ( + knowledge_base_path is not None + ), "Please set the BIOIMAGEIO_KNOWLEDGE_BASE_PATH environment variable to the path of the knowledge base." + + # check if node_index exists + if not os.path.exists(os.path.join(knowledge_base_path, "eurobioimaging_node_index.json")): + print("Creating EuroBioImaging vector database...") + create_eurobioimaging_vector_database(knowledge_base_path) + + docs_store_dict, node_index = load_eurobioimaging_base(knowledge_base_path) + search_technology, search_node, get_node_details = create_tools(docs_store_dict, node_index) + return ChatbotExtension( id="eurobioimaging", name="EuroBioImaging Service Index", @@ -90,10 +206,11 @@ def get_extension(): ) if __name__ == "__main__": - import asyncio - async def main(): - extension = get_extension() - query = "mouse embryos" - top_k = 2 - print(await extension.tools["search_technology"](keywords=query, top_k=top_k)) - asyncio.run(main()) \ No newline at end of file + # import asyncio + # async def main(): + # extension = get_extension() + # query = "mouse embryos" + # top_k = 2 + # print(await extension.tools["search_technology"](keywords=query, top_k=top_k)) + # asyncio.run(main()) + create_eurobioimaging_vector_database() \ No newline at end of file From 204ad5bc5317239cd046a167b1c7b31eb0f541b2 Mon Sep 17 00:00:00 2001 From: alalulu8668 Date: Mon, 20 May 2024 09:03:44 +0200 Subject: [PATCH 3/9] update metadata --- .../chatbot_extensions/euro_bioimaging_extension.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py index 92ab70f..dcc1277 100644 --- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py +++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py @@ -4,6 +4,7 @@ from langchain_community.vectorstores import FAISS from langchain_openai import OpenAIEmbeddings from pydantic import Field, BaseModel +from typing import Any, Dict import json import requests from markdownify import markdownify as md @@ -20,6 +21,7 @@ class DocWithScore(BaseModel): doc: str = Field(description="The document retrieved.") score: float = Field(description="The relevance score of the retrieved document.") + metadata: Dict[str, Any] = Field(description="The metadata of the retrieved document.") def load_eurobioimaging_base(db_path): docs_store_dict = {} From b44dce8801d4778d548359153669c94b94d7c6a8 Mon Sep 17 00:00:00 2001 From: Wei Ouyang Date: Mon, 20 May 2024 19:31:29 +0200 Subject: [PATCH 4/9] Refactoring extension --- bioimageio_chatbot/chatbot.py | 2 +- .../euro_bioimaging_extension.py | 67 ++++++++++--------- bioimageio_chatbot/knowledge_base.py | 3 +- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/bioimageio_chatbot/chatbot.py b/bioimageio_chatbot/chatbot.py index 09adab4..beab376 100644 --- a/bioimageio_chatbot/chatbot.py +++ b/bioimageio_chatbot/chatbot.py @@ -236,7 +236,7 @@ class ThoughtsSchema(BaseModel): {"name": "Nina", "agent": nina, "extensions": nina_extensions, "code_interpreter": False, "alias": "BioImage Tutor", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! 
I'm Nina, I can help with your learning journey in bioimaging. How can I help you today?"}, {"name": "Bridget", "agent": bridget, "extensions": bridget_extensions, "code_interpreter": True, "alias": "BioImage Analyst", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Bridget, I can help you with your bioimaging tasks. Please mount your data folder and let me know how I can assist you today."}, {"name": "Skyler", "agent": skyler, "extensions": skyler_extensions, "code_interpreter": False, "alias": "BioImage GPT", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Skyler. How can I help you today?"}, - {"name": "Elara", "agent": elara, "extensions": elara_extensions, "code_interpreter": False, "alias": "EuroBioImaging Assistant", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Elara. I can help you with EuroBioImaging services and technologies. How can I help you today?"} + {"name": "Elara", "agent": elara, "extensions": elara_extensions, "code_interpreter": False, "alias": "EuroBioImaging", "icon": "https://bioimage.io/static/img/bioimage-io-icon.svg", "welcome_message": "Hi there! I'm Elara. I can help you with EuroBioImaging services and technologies. How can I help you today?"} ] diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py index dcc1277..96136c7 100644 --- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py +++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py @@ -10,12 +10,8 @@ from markdownify import markdownify as md import re import os -import urllib3 -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) -EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN") - class DocWithScore(BaseModel): """A document with an associated relevance score.""" @@ -25,66 +21,74 @@ class DocWithScore(BaseModel): def load_eurobioimaging_base(db_path): docs_store_dict = {} - + print(f"Loading EuroBioImaging docs store from {db_path}") for collection in ['technologies', 'nodes']: - channel_id = collection try: - docs_store = load_docs_store(db_path, channel_id) + docs_store = load_docs_store(db_path, f"eurobioimaging-{collection}") length = len(docs_store.docstore._dict.keys()) - assert length > 0, f"Please make sure the docs store {channel_id} is not empty." - print(f"Loaded {length} documents from {channel_id}") - docs_store_dict[channel_id] = docs_store + assert length > 0, f"Please make sure the docs store {collection} is not empty." + print(f"- Loaded {length} documents from {collection}") + docs_store_dict[collection] = docs_store except Exception as e: - print(f"Failed to load docs store for {channel_id}. Error: {e}") + print(f"Failed to load docs store for {collection}. 
Error: {e}") if len(docs_store_dict) == 0: raise Exception("No docs store is loaded, please make sure the docs store is not empty.") # load the node index - with open(os.path.join(db_path, "eurobioimaging_node_index.json"), "r") as f: + with open(os.path.join(db_path, "eurobioimaging-node-index.json"), "r") as f: node_index = json.load(f) return docs_store_dict, node_index +def chunkify(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] def create_eurobioimaging_vector_database(output_dir=None): if output_dir is None: output_dir = os.environ.get("BIOIMAGEIO_KNOWLEDGE_BASE_PATH", "./bioimageio-knowledge-base") os.makedirs(output_dir, exist_ok=True) + EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN") + assert EUROBIOIMAGING_TOKEN is not None, "Please set the EUROBIOIMAGING_TOKEN environment variable to access the EuroBioImaging API." + embeddings = OpenAIEmbeddings() response = requests.get(f"https://www.eurobioimaging.eu/api-general-data.php?a={EUROBIOIMAGING_TOKEN}") collections = response.json() for name in ["technologies", "nodes"]: - all_embedding_pairs = [] all_metadata = [] + all_contents = [] for item in collections[name]: - print(f"Download description from {item['url']} and create embeddings...") + print(f"Fetch description from {item['url']}...") response = requests.get(f"https://www.eurobioimaging.eu/api-page-content.php?a={EUROBIOIMAGING_TOKEN}&url={item['url']}") content = md(response.text, heading_style="ATX") description = re.sub(r"\n{3,}", "\n\n", content).strip() # Generate embeddings for the item item_content = f"# {item['name']}\n\n{description}" - item_embedding = embeddings.embed_documents([item_content])[0] - - # Append the item_embedding to the all_embedding_pairs list - all_embedding_pairs.append((item_content, item_embedding)) + all_contents.append(item_content) if "description" in item: item.pop("description") - all_metadata.append(item) + + print(f"Embedding {len(all_contents)} documents...") + all_embedding_pairs = [] + for chunk in chunkify(all_contents, 300): + item_embeddings = embeddings.embed_documents(chunk) + all_embedding_pairs.extend(zip(chunk, item_embeddings)) # Create the FAISS index from all the embeddings vectordb = FAISS.from_embeddings(all_embedding_pairs, embeddings, metadatas=all_metadata) print("Saving the vector database...") - vectordb.save_local(output_dir, index_name=name) + vectordb.save_local(output_dir, index_name="eurobioimaging-" + name) print("Created a vector database from the downloaded documents.") node_index = {} for item in collections["nodes"]: node_index[item['node_id']] = item - with open(os.path.join(output_dir, "eurobioimaging_node_index.json"), "w") as f: + with open(os.path.join(output_dir, "eurobioimaging-node-index.json"), "w") as f: json.dump(node_index, f) @@ -189,7 +193,11 @@ def get_extension(): ), "Please set the BIOIMAGEIO_KNOWLEDGE_BASE_PATH environment variable to the path of the knowledge base." 
# check if node_index exists - if not os.path.exists(os.path.join(knowledge_base_path, "eurobioimaging_node_index.json")): + if not os.path.exists(os.path.join(knowledge_base_path, "eurobioimaging-node-index.json")): + EUROBIOIMAGING_TOKEN = os.getenv("EUROBIOIMAGING_TOKEN") + if EUROBIOIMAGING_TOKEN is None: + print("Warning: Disable EuroBioImaging extension because the EUROBIOIMAGING_TOKEN is not set.") + return None print("Creating EuroBioImaging vector database...") create_eurobioimaging_vector_database(knowledge_base_path) @@ -208,11 +216,10 @@ def get_extension(): ) if __name__ == "__main__": - # import asyncio - # async def main(): - # extension = get_extension() - # query = "mouse embryos" - # top_k = 2 - # print(await extension.tools["search_technology"](keywords=query, top_k=top_k)) - # asyncio.run(main()) - create_eurobioimaging_vector_database() \ No newline at end of file + import asyncio + async def main(): + extension = get_extension() + query = "mouse embryos" + top_k = 2 + print(await extension.tools["search_technology"](keywords=query, top_k=top_k)) + asyncio.run(main()) \ No newline at end of file diff --git a/bioimageio_chatbot/knowledge_base.py b/bioimageio_chatbot/knowledge_base.py index f290de0..d69e6fa 100644 --- a/bioimageio_chatbot/knowledge_base.py +++ b/bioimageio_chatbot/knowledge_base.py @@ -35,13 +35,14 @@ def load_knowledge_base(db_path): collections = get_manifest()['collections'] docs_store_dict = {} + print(f"Loading knowledge base from {db_path}") for collection in collections: channel_id = collection['id'] try: docs_store = load_docs_store(db_path, channel_id) length = len(docs_store.docstore._dict.keys()) assert length > 0, f"Please make sure the docs store {channel_id} is not empty." - print(f"Loaded {length} documents from {channel_id}") + print(f"- Loaded {length} documents from {channel_id}") docs_store_dict[channel_id] = docs_store except Exception as e: print(f"Failed to load docs store for {channel_id}. Error: {e}") From 0e1a86464491969cfee891c1f0c8c6574a4c8d7a Mon Sep 17 00:00:00 2001 From: Wei Ouyang Date: Mon, 20 May 2024 19:33:11 +0200 Subject: [PATCH 5/9] Bump version --- pyproject.toml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 991b174..829389a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ version = "0.1.98" readme = "README.md" description = "Your Personal Assistant in Computational BioImaging." 
dependencies = [ - "schema-agents>=0.1.50", + "schema-agents>=0.1.51", "imjoy-rpc>=0.5.48.post2", "requests", "pypdf", diff --git a/requirements.txt b/requirements.txt index 6577fc6..b365289 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -schema-agents>=0.1.50 +schema-agents>=0.1.51 imjoy-rpc>=0.5.48.post2 requests pypdf From 285570751b9a9b88c58c8401cf96801aac386967 Mon Sep 17 00:00:00 2001 From: alalulu8668 Date: Sun, 26 May 2024 22:38:21 +0200 Subject: [PATCH 6/9] update docs_extension, add test_gemini Co-authored-by: Wei Ouyang --- .../chatbot_extensions/docs_extension.py | 4 + tests/test_gemini_chatbot.py | 74 +++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/test_gemini_chatbot.py diff --git a/bioimageio_chatbot/chatbot_extensions/docs_extension.py b/bioimageio_chatbot/chatbot_extensions/docs_extension.py index 415adfd..0d6d791 100644 --- a/bioimageio_chatbot/chatbot_extensions/docs_extension.py +++ b/bioimageio_chatbot/chatbot_extensions/docs_extension.py @@ -152,6 +152,8 @@ def get_extension(): description="Search information in the documents of the bioimage.io knowledge base. Provide a list of keywords to search information in the documents. Returns a list of relevant documents.", tools=docs_tools, ) + else: + sinfo1 = None if books_tools: sinfo2 = ChatbotExtension( id="books", @@ -159,5 +161,7 @@ def get_extension(): description="Search information in BioImage books. Provide a list of keywords to search information in the books. Returns a list of relevant documents.", tools=books_tools, ) + else: + sinfo2 = None return sinfo1, sinfo2 diff --git a/tests/test_gemini_chatbot.py b/tests/test_gemini_chatbot.py new file mode 100644 index 0000000..4cf4dc2 --- /dev/null +++ b/tests/test_gemini_chatbot.py @@ -0,0 +1,74 @@ +import os +from bioimageio_chatbot.chatbot import create_assistants, get_builtin_extensions, QuestionWithHistory, UserProfile +from schema_agents.role import Role +from schema_agents.schema import Message +import json +import pytest + +KNOWLEDGE_BASE_PATH = "./bioimageio-knowledge-base" + +@pytest.fixture +def builtin_extensions(): + return get_builtin_extensions() + +@pytest.fixture +def elara(builtin_extensions): + assistants = create_assistants(builtin_extensions) + # find an assistant name Elara + m = [assistant for assistant in assistants if assistant['name'] == "Elara"][0] + return m['agent'] + + + +# @pytest.mark.asyncio +# async def test_respond_user_str(): +# async def respond_to_user(query: str, role: Role) -> str: +# """Respond to user.""" +# response = await role.aask(query, str) +# return response + + +# role = Role(name="Alice", +# profile="Customer service", +# goal="Efficiently communicate with the user and translate the user's needs to technical requirements", +# constraints=None, +# actions=[respond_to_user], +# backend="gemini",) + +# messages = ["hi"] +# responses = await role.handle(messages) +# assert responses +# @pytest.mark.asyncio +# async def test_tool_call(builtin_extensions, elara): +# # load saved json file +# with open('test_messages.json', 'r') as file: +# test_messages = json.load(file) + +# messages = elara._llm.format_msg(test_messages) +# assert + +@pytest.mark.asyncio +async def test_chatbot(builtin_extensions, elara): + select_extensions = [ + {"id": "eurobioimaging"} + ] + chat_history=[] + question = "Which technique can I use to image neurons?" 
+    profile = UserProfile(name="lulu", occupation="data scientist", background="machine learning and AI")
+    m = QuestionWithHistory(question=question, chat_history=chat_history, user_profile=UserProfile.model_validate(profile), channel_id=None, chatbot_extensions=select_extensions)
+    resp = await elara.handle(Message(content="", data=m, role="User"))
+    assert resp
+    str_resp = [str(element) for element in resp]
+    assert any(["EurobioimagingSearchTechnology" in element for element in str_resp])
+
+    chat_history.append(
+        {"role": "user", "content": question}
+    )
+    chat_history.append(
+        {"role": "assistant", "content": resp.text}
+    )
+    # question2 = "tell me where i can find this technique?"
+
+
+
From 011de47fa91c6468c402800ec3bf799b682fbd71 Mon Sep 17 00:00:00 2001
From: alalulu8668
Date: Sun, 2 Jun 2024 16:41:38 +0200
Subject: [PATCH 7/9] add search for publication func (for titles only)

---
 .../euro_bioimaging_extension.py | 63 ++++++++++++++++---
 knowledge-base-manifest.yaml     |  7 ++-
 2 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
index 96136c7..41b3aeb 100644
--- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
+++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
@@ -11,7 +11,6 @@
 import re
 import os
 
-
 class DocWithScore(BaseModel):
     """A document with an associated relevance score."""
@@ -22,7 +21,7 @@ class DocWithScore(BaseModel):
 def load_eurobioimaging_base(db_path):
     docs_store_dict = {}
     print(f"Loading EuroBioImaging docs store from {db_path}")
-    for collection in ['technologies', 'nodes']:
+    for collection in ['technologies', 'nodes', 'publication']:
         try:
             docs_store = load_docs_store(db_path, f"eurobioimaging-{collection}")
             length = len(docs_store.docstore._dict.keys())
@@ -90,9 +89,58 @@ def create_eurobioimaging_vector_database(output_dir=None):
         node_index[item['node_id']] = item
     with open(os.path.join(output_dir, "eurobioimaging-node-index.json"), "w") as f:
         json.dump(node_index, f)
-
+
 
 def create_tools(docs_store_dict, node_index):
+    # async def search_publication(
+    #     keywords: str = Field(..., description="The keywords used for searching and extracting use cases from the publications."),
+    #     top_k: int = Field(..., description="Return the top_k number of search results")
+    # ):
+    #     """Search the relevant use cases in the publications"""
+    #     results = []
+    #     results.append(
+    #         await docs_store_dict["publication"].asimilarity_search_with_relevance_scores(
+    #             keywords, k=top_k
+    #         )
+    #     )
+    #     docs_with_score = [
+    #         DocWithScore(
+    #             doc=doc.page_content,
+    #             score=round(score, 2),
+    #             metadata=doc.metadata,
+    #         )
+    #         for results_with_scores in results
+    #         for doc, score in results_with_scores
+    #     ]
+    #     # sort by relevance score
+    #     docs_with_score = sorted(docs_with_score, key=lambda x: x.score, reverse=True)[
+    #         : top_k
+    #     ]
+    #     if len(docs_with_score) > 2:
+    #         print(
+    #             f"Retrieved documents:\n{docs_with_score[0].doc[:20] + '...'} (score: {docs_with_score[0].score})\n{docs_with_score[1].doc[:20] + '...'} (score: {docs_with_score[1].score})\n{docs_with_score[2].doc[:20] + '...'} (score: {docs_with_score[2].score})"
+    #         )
+    #     else:
+    #         print(f"Retrieved documents:\n{docs_with_score}")
+    #     return docs_with_score
+
+    async def search_publication(
+        keywords: str = Field(..., description="The keywords used for searching and extracting use cases from the publications."),
+    ):
+        """Search the relevant publications in the list"""
+        import pandas as pd
+        file_path = '/home/alalulu/workspace/chatbot_bmz/chatbot/tests/publications.csv'
+        df = pd.read_csv(file_path)
+        results = []
+        for index, row in df.iterrows():
+            # search if keywords in the title
+            if keywords.lower() in row['Title'].lower():
+                results.append(row)
+        # convert the results to a list of dictionaries
+        results = [row.to_dict() for row in results]
+        return results
+
+
     async def search_technology(
         keywords: str = Field(..., description="The keywords used for searching the technology in EuroBioImaging service index."),
         top_k: int = Field(..., description="Return the top_k number of search results")
@@ -180,7 +228,7 @@ async def search_node(
         else:
             print(f"Retrieved documents:\n{docs_with_score}")
         return docs_with_score
-    return schema_tool(search_technology), schema_tool(search_node), schema_tool(get_node_details)
+    return schema_tool(search_technology), schema_tool(search_node), schema_tool(get_node_details), schema_tool(search_publication)
 
 
 def get_extension():
@@ -202,16 +250,17 @@ def get_extension():
         create_eurobioimaging_vector_database(knowledge_base_path)
 
     docs_store_dict, node_index = load_eurobioimaging_base(knowledge_base_path)
-    search_technology, search_node, get_node_details = create_tools(docs_store_dict, node_index)
+    search_technology, search_node, get_node_details, search_publication = create_tools(docs_store_dict, node_index)
 
     return ChatbotExtension(
         id="eurobioimaging",
         name="EuroBioImaging Service Index",
-        description="Help users find bioimaging services in the EuroBioImaging network; You can search by keywords for the imaging technology, then use the returned node_id to find out details about the service providing node in the EuroBioImaging network",
+        description="Help users find bioimaging services and technologies in the EuroBioImaging network. For a given study or research question, first search the publications to find relevant use cases and the technologies used in the study. Then search by keywords for the imaging technology, and use the returned node_id to find out details about the service providing node in the EuroBioImaging network",
         tools=dict(
             search_technology=search_technology,
             search_node=search_node,
-            get_node_details=get_node_details
+            get_node_details=get_node_details,
+            search_publication=search_publication,
         )
     )
diff --git a/knowledge-base-manifest.yaml b/knowledge-base-manifest.yaml
index 31012aa..2921bf1 100644
--- a/knowledge-base-manifest.yaml
+++ b/knowledge-base-manifest.yaml
@@ -132,7 +132,12 @@ collections:
     description: "High-performance spatial analysis in a web browser, Node.js, and across programming languages and hardware architectures"
     base_url: https://wasm.itk.org
     format: markdown
-
+  - name: Euro-BioImaging-Publication
+    id: eurobioimaging-publication
+    source: https://raw.githubusercontent.com/oeway/hypha-knowledge-base/gh-pages/eurobioimaging-ragaller-et-al-2024-quantifying-fluorescence-lifetime-responsiveness-of-environment-sensitive-probes-for-membrane.pdf
+    description: "Quantifying fluorescence lifetime responsiveness of environment-sensitive probes for membrane"
+    base_url: https://www.eurobioimaging.eu/
+    format: pdf
 additional_channels:
   - name: "biii.eu"
     id: biii.eu
From 84c7df0b2d776ba3382f35416e7191411c89c74a Mon Sep 17 00:00:00 2001
From: alalulu8668
Date: Mon, 10 Jun 2024 17:05:36 +0200
Subject: [PATCH 8/9] update search publication

---
 .../euro_bioimaging_extension.py | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
index 41b3aeb..548f962 100644
--- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
+++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
@@ -11,6 +11,11 @@
 import re
 import os
 
+import pandas as pd
+file_path = '/home/alalulu/workspace/chatbot_bmz/chatbot/tests/publications.csv'
+publication_df = pd.read_csv(file_path)
+
+
 class DocWithScore(BaseModel):
     """A document with an associated relevance score."""
 
@@ -128,17 +133,19 @@ async def search_publication(
         keywords: str = Field(..., description="The keywords used for searching and extracting use cases from the publications."),
     ):
         """Search the relevant publications in the list"""
-        import pandas as pd
-        file_path = '/home/alalulu/workspace/chatbot_bmz/chatbot/tests/publications.csv'
-        df = pd.read_csv(file_path)
+
         results = []
-        for index, row in df.iterrows():
+        for index, row in publication_df.iterrows():
             # search if keywords in the title
             if keywords.lower() in row['Title'].lower():
                 results.append(row)
-        # convert the results to a list of dictionaries
-        results = [row.to_dict() for row in results]
-        return results
+
+        if results:
+            # convert the results to dictionary
+            results_dict = [obj.to_dict() for obj in results]
+            return results_dict
+        else:
+            return f"No publication found with keywords: {keywords}"
From 417142d8fc43e1f5595571959585f91ca3362db0 Mon Sep 17 00:00:00 2001
From: alalulu8668
Date: Mon, 10 Jun 2024 18:54:10 +0200
Subject: [PATCH 9/9] get abstract by pubmed

---
 .../euro_bioimaging_extension.py | 118 ++++++++++++------
 1 file changed, 78 insertions(+), 40 deletions(-)

diff --git a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
index 548f962..4635cb5 100644
--- a/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
+++ b/bioimageio_chatbot/chatbot_extensions/euro_bioimaging_extension.py
@@ -95,40 +95,70 @@ def create_eurobioimaging_vector_database(output_dir=None):
     with open(os.path.join(output_dir, "eurobioimaging-node-index.json"), "w") as f:
         json.dump(node_index, f)
 
+
+def get_pmid_from_doi(doi):
+    esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {
+        "db": "pubmed",
+        "term": doi,
+        "retmode": "json"
+    }
+    response = requests.get(esearch_url, params=params)
+    data = response.json()
+    idlist = data["esearchresult"]["idlist"]
+    if idlist:
+        return idlist[0]
+    else:
+        return None
+
+def get_article_details(pmids):
+    esummary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+    params = {
+        "db": "pubmed",
+        "id": ",".join(pmids),
+        "retmode": "json"
+    }
+    response = requests.get(esummary_url, params=params)
+    data = response.json()
+    return data["result"]
+
+def get_article_abstract(pmid):
+    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    params = {
+        "db": "pubmed",
+        "id": pmid,
+        "retmode": "xml",
+        "rettype": "abstract"
+    }
+    response = requests.get(efetch_url, params=params)
+    return response.text
+
+
+def read_publication_abstract(
+        doi: str = Field(..., description="The DOI of the publication"),
+    ):
+    """Read the abstract of the publication with the provided DOI"""
+    # resolve the DOI to a PubMed ID (PMID)
+    pmid = get_pmid_from_doi(doi)
+    if pmid:
+        article = get_article_details([pmid])
+        # article_details = article[pmid]
+        # title = article_details['title']
+        abstract_xml = get_article_abstract(pmid)
+        # Extract the abstract text from the XML
+        from xml.etree import ElementTree as ET
+        root = ET.fromstring(abstract_xml)
+        abstract = ""
+        for elem in root.findall(".//AbstractText"):
+            abstract += (elem.text or "") + " "
+        # print(f"Abstract: {abstract.strip()}")
+        return abstract.strip()
+    else:
+        # print("No article found for the provided DOI.")
+        return "No article found for the provided DOI."
+
+def create_tools(docs_store_dict, node_index):
     async def search_publication(
         keywords: str = Field(..., description="The keywords used for searching and extracting use cases from the publications."),
     ):
         """Search the relevant publications in the list"""
 
         results = []
         for index, row in publication_df.iterrows():
             # search if keywords in the title
             if keywords.lower() in row['Title'].lower():
                 results.append(row)
 
         if results:
             # convert the results to dictionary
             results_dict = [obj.to_dict() for obj in results]
+            # normalize the DOI and attach the PubMed abstract
+            for result in results_dict:
+                result['DOI'] = result['DOI'].split(' ')[0]
+                abstract = read_publication_abstract(result['DOI'])
+                result['Abstract'] = abstract
             return results_dict
         else:
             return f"No publication found with keywords: {keywords}"
-
+
     async def search_technology(
         keywords: str = Field(..., description="The keywords used for searching the technology in EuroBioImaging service index."),
         top_k: int = Field(..., description="Return the top_k number of search results")
@@ -268,14 +304,16 @@ def get_extension():
             search_technology=search_technology,
             search_node=search_node,
             get_node_details=get_node_details,
             search_publication=search_publication,
+            # read_publication_abstract=read_publication_abstract,
         )
     )
 
 if __name__ == "__main__":
-    import asyncio
-    async def main():
-        extension = get_extension()
-        query = "mouse embryos"
-        top_k = 2
-        print(await extension.tools["search_technology"](keywords=query, top_k=top_k))
-    asyncio.run(main())
\ No newline at end of file
+    # import asyncio
+    # async def main():
+    #     extension = get_extension()
+    #     query = "mouse embryos"
+    #     top_k = 2
+    #     print(await extension.tools["search_technology"](keywords=query, top_k=top_k))
+    # asyncio.run(main())
+    create_eurobioimaging_vector_database()
\ No newline at end of file
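
Reviewer note (not part of the patches): below is a minimal smoke test for the final state of this series. It is a sketch under stated assumptions: OPENAI_API_KEY and EUROBIOIMAGING_TOKEN are exported, the knowledge base has been built under BIOIMAGEIO_KNOWLEDGE_BASE_PATH (default ./bioimageio-knowledge-base), and the keyword strings are made-up examples. The node_id lookup relies on the node metadata that create_eurobioimaging_vector_database stores alongside each embedding.

    import asyncio

    from bioimageio_chatbot.chatbot_extensions.euro_bioimaging_extension import get_extension

    async def main():
        extension = get_extension()
        if extension is None:
            # patch 4 makes get_extension() return None when no token is set
            # and the index has not been built yet
            print("EuroBioImaging extension is disabled (no token).")
            return
        # Semantic search over the embedded technology descriptions;
        # each hit is a DocWithScore with doc, score, and metadata fields.
        techs = await extension.tools["search_technology"](
            keywords="light sheet imaging of embryos", top_k=3
        )
        for hit in techs:
            print(hit.score, hit.doc[:60])
        # Find candidate service nodes, then resolve the best hit to its
        # full record via the node_id carried in the document metadata.
        nodes = await extension.tools["search_node"](
            keywords="super resolution microscopy", top_k=3
        )
        if nodes:
            details = await extension.tools["get_node_details"](
                node_id=nodes[0].metadata["node_id"]
            )
            print(details)

    asyncio.run(main())

If the publication CSV from patches 7-9 is present at the hard-coded path, calling extension.tools["search_publication"](keywords="fluorescence lifetime") should likewise return the matching titles together with the abstracts fetched from PubMed by read_publication_abstract.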