diff --git a/README.md b/README.md
index 6eb02aaea..67c807d6a 100644
--- a/README.md
+++ b/README.md
@@ -35,10 +35,6 @@ According to enviornment we are configuring the models which is indicated by VIT
 EX:
 ```env
 VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
-```
-OPENAI_API_KEY="your-openai-key"
-```
-
 You can then run Docker Compose to build and start all components:
 ```bash
 docker-compose up --build
@@ -71,7 +67,6 @@ VITE_CHAT_MODES=""
 If however you want to specify the only vector mode or only graph mode you can do that by specifying the mode in the env:
 ```env
 VITE_CHAT_MODES="vector,graph"
-VITE_CHAT_MODES="vector,graph"
 ```
 
 #### Running Backend and Frontend separately (dev environment)
@@ -88,7 +83,7 @@ Alternatively, you can run the backend and frontend separately:
    ```
 
 - For the backend:
-1. Create the backend/.env file by copy/pasting the backend/example.env. To streamline the initial setup and testing of the application, you can preconfigure user credentials directly within the .env file. This bypasses the login dialog and allows you to immediately connect with a predefined user.
+1. Create the backend/.env file by copy/pasting the backend/example.env. To streamline the initial setup and testing of the application, you can preconfigure user credentials directly within the backend .env file. This bypasses the login dialog and allows you to immediately connect with a predefined user.
 - **NEO4J_URI**:
 - **NEO4J_USERNAME**:
 - **NEO4J_PASSWORD**:
@@ -122,6 +117,8 @@ Allow unauthenticated request : Yes
 ## ENV
 | Env Variable Name | Mandatory/Optional | Default Value | Description |
 |-------------------------|--------------------|---------------|--------------------------------------------------------------------------------------------------|
+| |
+| **BACKEND ENV**
 | EMBEDDING_MODEL | Optional | all-MiniLM-L6-v2 | Model for generating the text embedding (all-MiniLM-L6-v2 , openai , vertexai) |
 | IS_EMBEDDING | Optional | true | Flag to enable text embedding |
 | KNN_MIN_SCORE | Optional | 0.94 | Minimum score for KNN algorithm |
@@ -135,7 +132,13 @@ Allow unauthenticated request : Yes
 | LANGCHAIN_API_KEY | Optional | | API key for Langchain |
 | LANGCHAIN_PROJECT | Optional | | Project for Langchain |
 | LANGCHAIN_TRACING_V2 | Optional | true | Flag to enable Langchain tracing |
+| GCS_FILE_CACHE | Optional | False | If set to True, files to be processed are saved to GCS; if set to False, they are saved locally |
 | LANGCHAIN_ENDPOINT | Optional | https://api.smith.langchain.com | Endpoint for Langchain API |
+| ENTITY_EMBEDDING | Optional | False | If set to True, embeddings are added for each entity in the database |
+| LLM_MODEL_CONFIG_ollama_ | Optional | | Set the Ollama config as model_name,model_local_url for local deployments (see the example below) |
+| RAGAS_EMBEDDING_MODEL | Optional | openai | Embedding model used by the Ragas evaluation framework |
+| |
+| **FRONTEND ENV**
 | VITE_BACKEND_API_URL | Optional | http://localhost:8000 | URL for backend API |
 | VITE_BLOOM_URL | Optional | https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true | URL for Bloom visualization |
 | VITE_REACT_APP_SOURCES | Mandatory | local,youtube,wiki,s3 | List of input sources that will be available |
@@ -146,10 +149,6 @@ Allow unauthenticated request : Yes
 | VITE_GOOGLE_CLIENT_ID | Optional | | Client ID for Google authentication |
 | VITE_LLM_MODELS_PROD | Optional | openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash | To Distinguish models based on the Enviornment PROD or DEV
 | VITE_LLM_MODELS | Optional | 'diffbot,openai_gpt_3.5,openai_gpt_4o,openai_gpt_4o_mini,gemini_1.5_pro,gemini_1.5_flash,azure_ai_gpt_35,azure_ai_gpt_4o,ollama_llama3,groq_llama3_70b,anthropic_claude_3_5_sonnet' | Supported Models For the application
-| GCS_FILE_CACHE | Optional | False | If set to True, will save the files to process into GCS. If set to False, will save the files locally |
-| ENTITY_EMBEDDING | Optional | False | If set to True, It will add embeddings for each entity in database |
-| LLM_MODEL_CONFIG_ollama_ | Optional | | Set ollama config as - model_name,model_local_url for local deployments |
-| RAGAS_EMBEDDING_MODEL | Optional | openai | embedding model used by ragas evaluation framework |
 
 ## LLMs Supported
 1. OpenAI
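For reference, the backend settings described in the table above can be combined in `backend/.env` roughly as follows. This is an illustrative sketch: the Neo4j credentials, the Ollama model name and the local URL are placeholder assumptions, while the variable names and the new tuning values come from `backend/example.env` in the diff below.

```env
# Placeholder connection details - replace with your own deployment values.
NEO4J_URI="neo4j://localhost:7687"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="your-password"
# Local Ollama deployment, configured as model_name,model_local_url (values assumed for illustration).
LLM_MODEL_CONFIG_ollama_llama3="llama3,http://localhost:11434"
# New backend settings introduced by this change.
EFFECTIVE_SEARCH_RATIO=5
GRAPH_CLEANUP_MODEL="openai_gpt_4o"
CHUNKS_TO_BE_PROCESSED="50"
```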
diff --git a/backend/example.env b/backend/example.env
index 6bef36f78..f747a94e8 100644
--- a/backend/example.env
+++ b/backend/example.env
@@ -1,8 +1,7 @@
-OPENAI_API_KEY = ""
-#EMBEDDING_MODEL can be openai or vertexai or by default all-MiniLM-L6-v2
-EMBEDDING_MODEL = "all-MiniLM-L6-v2"
-RAGAS_EMBEDDING_MODEL = "openai"
-IS_EMBEDDING = "true"
+OPENAI_API_KEY = "" # Required if you are using the openai embedding model
+EMBEDDING_MODEL = "all-MiniLM-L6-v2" # Can be openai or vertexai; defaults to all-MiniLM-L6-v2
+RAGAS_EMBEDDING_MODEL = "openai" # Keep blank to use all-MiniLM-L6-v2 for Ragas embeddings
+IS_EMBEDDING = "TRUE"
 KNN_MIN_SCORE = "0.94"
 # Enable Gemini (default is False) | Can be False or True
 GEMINI_ENABLED = False
@@ -24,7 +23,7 @@ GCS_FILE_CACHE = "" #save the file into GCS or local, SHould be True or False
 NEO4J_USER_AGENT=""
 ENABLE_USER_AGENT = ""
 LLM_MODEL_CONFIG_model_version=""
-ENTITY_EMBEDDING="" True or False
+ENTITY_EMBEDDING="TRUE" # TRUE or FALSE; set TRUE to create entity embeddings (required for entity vector mode)
 DUPLICATE_SCORE_VALUE =0.97
 DUPLICATE_TEXT_DISTANCE =3
 DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model specified here , need to add config for that model in below format)
@@ -43,4 +42,6 @@ LLM_MODEL_CONFIG_fireworks_llama_v3_70b="model_name,fireworks_api_key"
 LLM_MODEL_CONFIG_bedrock_claude_3_5_sonnet="model_name,aws_access_key_id,aws_secret__access_key,region_name"
 LLM_MODEL_CONFIG_ollama_llama3="model_name,model_local_url"
 YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port"
-
+EFFECTIVE_SEARCH_RATIO=5
+GRAPH_CLEANUP_MODEL="openai_gpt_4o"
+CHUNKS_TO_BE_PROCESSED="50"
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 7af041a09..ee6a49bff 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,6 +1,6 @@
 asyncio==3.4.3
-boto3==1.35.69
-botocore==1.35.69
+boto3==1.35.90
+botocore==1.35.90
 certifi==2024.8.30
 fastapi==0.115.6
 fastapi-health==0.4.0
@@ -10,24 +10,24 @@ google_auth_oauthlib==1.2.1
 google-cloud-core==2.4.1
 json-repair==0.30.2
 pip-install==1.3.5
-langchain==0.3.8
-langchain-aws==0.2.7
+langchain==0.3.13
+langchain-aws==0.2.10
 langchain-anthropic==0.3.0
 langchain-fireworks==0.2.5
-langchain-community==0.3.8
-langchain-core==0.3.21
-langchain-experimental==0.3.3
+langchain-community==0.3.13
+langchain-core==0.3.28
+langchain-experimental==0.3.4
 langchain-google-vertexai==2.0.7
 langchain-groq==0.2.1
-langchain-openai==0.2.9
-langchain-text-splitters==0.3.2
+langchain-openai==0.2.14
+langchain-text-splitters==0.3.4
 langchain-huggingface==0.1.2
 langdetect==1.0.9
-langsmith==0.1.146
+langsmith==0.2.4
 langserve==0.3.0
 neo4j-rust-ext
 nltk==3.9.1
-openai==1.55.1
+openai==1.58.1
 opencv-python==4.10.0.84
 psutil==6.1.0
 pydantic==2.9.2
@@ -58,4 +58,5 @@ graphdatascience==1.12
 Secweb==1.11.0
 ragas==0.2.6
 rouge_score==0.1.2
-langchain-neo4j==0.1.1
+langchain-neo4j==0.2.0
+
diff --git a/backend/score.py b/backend/score.py
index a5da52aab..6869b1b85 100644
--- a/backend/score.py
+++ b/backend/score.py
@@ -13,7 +13,7 @@ from src.graphDB_dataAccess import graphDBdataAccess
 from src.graph_query import get_graph_results,get_chunktext_results
 from src.chunkid_entities import get_entities_from_chunkids
-from src.post_processing import create_vector_fulltext_indexes, create_entity_embedding
+from src.post_processing import create_vector_fulltext_indexes, create_entity_embedding, graph_schema_consolidation
 from sse_starlette.sse import EventSourceResponse
 from src.communities
import create_communities from src.neighbours import get_neighbour_nodes @@ -30,8 +30,9 @@ from Secweb.XFrameOptions import XFrame from fastapi.middleware.gzip import GZipMiddleware from src.ragas_eval import * -from starlette.types import ASGIApp, Message, Receive, Scope, Send +from starlette.types import ASGIApp, Receive, Scope, Send from langchain_neo4j import Neo4jGraph +from src.entities.source_node import sourceNode logger = CustomLogger() CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks") @@ -76,8 +77,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send): ) await gzip_middleware(scope, receive, send) app = FastAPI() -# SecWeb(app=app, Option={'referrer': False, 'xframe': False}) -# app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False) app.add_middleware(XContentTypeOptions) app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'}) app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5,paths=["/sources_list","/url/scan","/extract","/chat_bot","/chunk_entities","/get_neighbours","/graph_query","/schema","/populate_graph_schema","/get_unconnected_nodes_list","/get_duplicate_nodes","/fetch_chunktext"]) @@ -99,7 +98,6 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send): @app.post("/url/scan") async def create_source_knowledge_graph_url( - request: Request, uri=Form(), userName=Form(), password=Form(), @@ -150,11 +148,18 @@ async def create_source_knowledge_graph_url( 'gcs_project_id':gcs_project_id, 'logging_time': formatted_time(datetime.now(timezone.utc))} logger.log_struct(json_obj, "INFO") result ={'elapsed_api_time' : f'{elapsed_time:.2f}'} - return create_api_response("Success",message=message,success_count=success_count,failed_count=failed_count,file_name=lst_file_name,data=result) + return create_api_response("Success",message=message,success_count=success_count,failed_count=failed_count,file_name=lst_file_name,data=result) + except LLMGraphBuilderException as e: + error_message = str(e) + message = f" Unable to create source node for source type: {source_type} and source: {source}" + # Set the status "Success" becuase we are treating these error already handled by application as like custom errors. 
+ json_obj = {'error_message':error_message, 'status':'Success','db_url':uri, 'userName':userName, 'database':database,'success_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} + logger.log_struct(json_obj, "INFO") + return create_api_response('Failed',message=message + error_message[:80],error=error_message,file_source=source_type) except Exception as e: error_message = str(e) message = f" Unable to create source node for source type: {source_type} and source: {source}" - json_obj = {'error_message':error_message, 'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} + json_obj = {'error_message':error_message, 'status':'Failed','db_url':uri, 'userName':userName, 'database':database,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} logger.log_struct(json_obj, "ERROR") logging.exception(f'Exception Stack trace:') return create_api_response('Failed',message=message + error_message[:80],error=error_message,file_source=source_type) @@ -172,7 +177,6 @@ async def extract_knowledge_graph_from_file( aws_access_key_id=Form(None), aws_secret_access_key=Form(None), wiki_query=Form(None), - max_sources=Form(None), gcs_project_id=Form(None), gcs_bucket_name=Form(None), gcs_bucket_folder=Form(None), @@ -183,7 +187,8 @@ async def extract_knowledge_graph_from_file( allowedRelationship=Form(None), language=Form(None), access_token=Form(None), - retry_condition=Form(None) + retry_condition=Form(None), + additional_instructions=Form(None) ): """ Calls 'extract_graph_from_file' in a new thread to create Neo4jGraph from a @@ -206,22 +211,22 @@ async def extract_knowledge_graph_from_file( if source_type == 'local file': merged_file_path = os.path.join(MERGED_DIR,file_name) logging.info(f'File path:{merged_file_path}') - uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) elif source_type == 's3 bucket' and source_url: - uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) elif source_type == 'web-url': - uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) elif source_type == 'youtube' and source_url: - uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, 
allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) elif source_type == 'Wikipedia' and wiki_query: - uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) elif source_type == 'gcs bucket' and gcs_bucket_name: - uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition) + uri_latency, result = await extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions) else: return create_api_response('Failed',message='source_type is other than accepted source') extract_api_time = time.time() - start_time @@ -256,31 +261,32 @@ async def extract_knowledge_graph_from_file( result['gcs_bucket_folder'] = gcs_bucket_folder result['gcs_blob_filename'] = gcs_blob_filename result['gcs_project_id'] = gcs_project_id - result['allowedNodes'] = allowedNodes - result['allowedRelationship'] = allowedRelationship result['language'] = language result['retry_condition'] = retry_condition logger.log_struct(result, "INFO") result.update(uri_latency) logging.info(f"extraction completed in {extract_api_time:.2f} seconds for file name {file_name}") return create_api_response('Success', data=result, file_source= source_type) + except LLMGraphBuilderException as e: + error_message = str(e) + graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition) + failed_file_process(uri,file_name, merged_file_path, source_type) + node_detail = graphDb_data_Access.get_current_status_document_node(file_name) + # Set the status "Completed" in logging becuase we are treating these error already handled by application as like custom errors. 
+ json_obj = {'api_name':'extract','message':error_message,'file_created_at':node_detail[0]['created_time'],'error_message':error_message, 'file_name': file_name,'status':'Completed', + 'db_url':uri, 'userName':userName, 'database':database,'success_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} + logger.log_struct(json_obj, "INFO") + return create_api_response("Failed", message = error_message, error=error_message, file_name=file_name) except Exception as e: message=f"Failed To Process File:{file_name} or LLM Unable To Parse Content " error_message = str(e) graphDb_data_Access.update_exception_db(file_name,error_message, retry_condition) - gcs_file_cache = os.environ.get('GCS_FILE_CACHE') - if source_type == 'local file': - if gcs_file_cache == 'True': - folder_name = create_gcs_bucket_folder_name_hashed(uri,file_name) - copy_failed_file(BUCKET_UPLOAD, BUCKET_FAILED_FILE, folder_name, file_name) - time.sleep(5) - delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name) - else: - logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}') - delete_uploaded_local_file(merged_file_path,file_name) - json_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} + failed_file_process(uri,file_name, merged_file_path, source_type) + node_detail = graphDb_data_Access.get_current_status_document_node(file_name) + + json_obj = {'api_name':'extract','message':message,'file_created_at':node_detail[0]['created_time'],'error_message':error_message, 'file_name': file_name,'status':'Failed', + 'db_url':uri, 'userName':userName, 'database':database,'failed_count':1, 'source_type': source_type, 'source_url':source_url, 'wiki_query':wiki_query, 'logging_time': formatted_time(datetime.now(timezone.utc))} logger.log_struct(json_obj, "ERROR") - logging.exception(f'File Failed in extraction: {json_obj}') return create_api_response('Failed', message=message + error_message[:100], error=error_message, file_name = file_name) finally: gc.collect() @@ -329,10 +335,15 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database await asyncio.to_thread(create_entity_embedding, graph) api_name = 'post_processing/create_entity_embedding' logging.info(f'Entity Embeddings created') + + if "graph_schema_consolidation" in tasks : + await asyncio.to_thread(graph_schema_consolidation, graph) + api_name = 'post_processing/graph_schema_consolidation' + logging.info(f'Updated nodes and relationship labels') if "enable_communities" in tasks: api_name = 'create_communities' - await asyncio.to_thread(create_communities, uri, userName, password, database) + await asyncio.to_thread(create_communities, uri, userName, password, database) logging.info(f'created communities') graph = create_graph_database_connection(uri, userName, password, database) @@ -342,10 +353,12 @@ async def post_processing(uri=Form(), userName=Form(), password=Form(), database if count_response: count_response = [{"filename": filename, **counts} for filename, counts in count_response.items()] logging.info(f'Updated source node with community related counts') + + end = time.time() elapsed_time = end - start - json_obj = {'api_name': api_name, 'db_url': uri, 'userName':userName, 'database':database, 
'tasks':tasks, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'} - # logger.log_struct(json_obj) + json_obj = {'api_name': api_name, 'db_url': uri, 'userName':userName, 'database':database, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'} + logger.log_struct(json_obj) return create_api_response('Success', data=count_response, message='All tasks completed successfully') except Exception as e: @@ -491,7 +504,7 @@ async def connect(uri=Form(), userName=Form(), password=Form(), database=Form()) gcs_file_cache = os.environ.get('GCS_FILE_CACHE') end = time.time() elapsed_time = end - start - json_obj = {'api_name':'connect','db_url':uri, 'userName':userName, 'database':database,'status':result, 'count':1, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'} + json_obj = {'api_name':'connect','db_url':uri, 'userName':userName, 'database':database, 'count':1, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'} logger.log_struct(json_obj, "INFO") result['elapsed_api_time'] = f'{elapsed_time:.2f}' result['gcs_file_cache'] = gcs_file_cache @@ -615,8 +628,7 @@ async def delete_document_and_entities(uri=Form(), start = time.time() graph = create_graph_database_connection(uri, userName, password, database) graphDb_data_Access = graphDBdataAccess(graph) - result, files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri) - # entities_count = result[0]['deletedEntities'] if 'deletedEntities' in result[0] else 0 + files_list_size = await asyncio.to_thread(graphDb_data_Access.delete_file_from_graph, filenames, source_types, deleteEntities, MERGED_DIR, uri) message = f"Deleted {files_list_size} documents with entities from database" end = time.time() elapsed_time = end - start @@ -825,13 +837,17 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas try: start = time.time() graph = create_graph_database_connection(uri, userName, password, database) - await asyncio.to_thread(set_status_retry, graph,file_name,retry_condition) + chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name}) end = time.time() elapsed_time = end - start json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}'} logger.log_struct(json_obj, "INFO") - return create_api_response('Success',message=f"Status set to Ready to Reprocess for filename : {file_name}") + if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks : + return create_api_response('Success',message=f"Chunks are not created for the file{file_name}. 
Please upload again the file to re-process.",data=chunks) + else: + await asyncio.to_thread(set_status_retry, graph,file_name,retry_condition) + return create_api_response('Success',message=f"Status set to Ready to Reprocess for filename : {file_name}") except Exception as e: job_status = "Failed" message="Unable to set status to Retry" @@ -921,16 +937,6 @@ async def fetch_chunktext( page_no: int = Form(1) ): try: - payload_json_obj = { - 'api_name': 'fetch_chunktext', - 'db_url': uri, - 'userName': userName, - 'database': database, - 'document_name': document_name, - 'page_no': page_no, - 'logging_time': formatted_time(datetime.now(timezone.utc)) - } - logger.log_struct(payload_json_obj, "INFO") start = time.time() result = await asyncio.to_thread( get_chunktext_results, @@ -968,6 +974,7 @@ async def fetch_chunktext( @app.post("/backend_connection_configuration") async def backend_connection_configuration(): try: + start = time.time() uri = os.getenv('NEO4J_URI') username= os.getenv('NEO4J_USERNAME') database= os.getenv('NEO4J_DATABASE') @@ -988,6 +995,11 @@ async def backend_connection_configuration(): result["database"] = database result["password"] = encoded_password result['gcs_file_cache'] = gcs_file_cache + end = time.time() + elapsed_time = end - start + result['api_name'] = 'backend_connection_configuration' + result['elapsed_api_time'] = f'{elapsed_time:.2f}' + logger.log_struct(result, "INFO") return create_api_response('Success',message=f"Backend connection successful",data=result) else: graph_connection = False @@ -1000,7 +1012,7 @@ async def backend_connection_configuration(): logging.exception(f'{error_message}') return create_api_response(job_status, message=message, error=error_message.rstrip('.') + ', or fill from the login dialog.', data=graph_connection) finally: - gc.collect() - + gc.collect() + if __name__ == "__main__": uvicorn.run(app) \ No newline at end of file diff --git a/backend/src/QA_integration.py b/backend/src/QA_integration.py index f50a36efb..4c68030f8 100644 --- a/backend/src/QA_integration.py +++ b/backend/src/QA_integration.py @@ -364,12 +364,13 @@ def initialize_neo4j_vector(graph, chat_mode_settings): raise return neo_db -def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_threshold): +def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_threshold,ef_ratio): if document_names and chat_mode_settings["document_filter"]: retriever = neo_db.as_retriever( search_type="similarity_score_threshold", search_kwargs={ 'k': search_k, + 'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold, 'filter': {'fileName': {'$in': document_names}} } @@ -378,7 +379,7 @@ def create_retriever(neo_db, document_names, chat_mode_settings,search_k, score_ else: retriever = neo_db.as_retriever( search_type="similarity_score_threshold", - search_kwargs={'k': search_k, 'score_threshold': score_threshold} + search_kwargs={'k': search_k,'effective_search_ratio': ef_ratio, 'score_threshold': score_threshold} ) logging.info(f"Successfully created retriever with search_k={search_k}, score_threshold={score_threshold}") return retriever @@ -389,7 +390,8 @@ def get_neo4j_retriever(graph, document_names,chat_mode_settings, score_threshol neo_db = initialize_neo4j_vector(graph, chat_mode_settings) # document_names= list(map(str.strip, json.loads(document_names))) search_k = chat_mode_settings["top_k"] - retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold) + ef_ratio = 
int(os.getenv("EFFECTIVE_SEARCH_RATIO", "2")) if os.getenv("EFFECTIVE_SEARCH_RATIO", "2").isdigit() else 2 + retriever = create_retriever(neo_db, document_names,chat_mode_settings, search_k, score_threshold,ef_ratio) return retriever except Exception as e: index_name = chat_mode_settings.get("index_name") diff --git a/backend/src/diffbot_transformer.py b/backend/src/diffbot_transformer.py index e16e54efb..03e1ba69e 100644 --- a/backend/src/diffbot_transformer.py +++ b/backend/src/diffbot_transformer.py @@ -1,11 +1,5 @@ -from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer -#from langchain_community.graphs import Neo4jGraph -from langchain_neo4j import Neo4jGraph -from langchain.docstore.document import Document from typing import List -import os import logging -import uuid from src.llm import get_combined_chunks, get_llm logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO') @@ -14,6 +8,4 @@ def get_graph_from_diffbot(graph,chunkId_chunkDoc_list:List): combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list) llm,model_name = get_llm('diffbot') graph_documents = llm.convert_to_graph_documents(combined_chunk_document_list) - return graph_documents - - \ No newline at end of file + return graph_documents \ No newline at end of file diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py index 3aaf42e12..d50635571 100644 --- a/backend/src/document_sources/gcs_bucket.py +++ b/backend/src/document_sources/gcs_bucket.py @@ -6,6 +6,7 @@ from langchain_core.documents import Document from PyPDF2 import PdfReader import io +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException from google.oauth2.credentials import Credentials import time import nltk @@ -34,12 +35,12 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder file_name='' message=f" Bucket:{gcs_bucket_name} does not exist in Project:{gcs_project_id}. 
Please provide valid GCS bucket name" logging.info(f"Bucket : {gcs_bucket_name} does not exist in project : {gcs_project_id}") - raise Exception(message) + raise LLMGraphBuilderException(message) except Exception as e: error_message = str(e) logging.error(f"Unable to create source node for gcs bucket file {file_name}") logging.exception(f'Exception Stack trace: {error_message}') - raise Exception(error_message) + raise LLMGraphBuilderException(error_message) def load_pdf(file_path): return PyMuPDFLoader(file_path) @@ -47,7 +48,7 @@ def load_pdf(file_path): def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None): nltk.download('punkt') nltk.download('averaged_perceptron_tagger') - if gcs_bucket_folder is not None: + if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="": if gcs_bucket_folder.endswith('/'): blob_name = gcs_bucket_folder+gcs_blob_filename else: @@ -66,7 +67,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content) pages = loader.load() else : - raise Exception('File does not exist, Please re-upload the file and try again.') + raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.') else: creds= Credentials(access_token) storage_client = storage.Client(project=gcs_project_id, credentials=creds) @@ -83,7 +84,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g text += page.extract_text() pages = [Document(page_content = text)] else: - raise Exception(f'File Not Found in GCS bucket - {gcs_bucket_name}') + raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}') return gcs_blob_filename, pages def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed): @@ -123,7 +124,6 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed logging.info('save the merged file from chunks in gcs') file_io = io.BytesIO(merged_file) blob.upload_from_file(file_io) - # pdf_reader = PdfReader(file_io) file_size = len(merged_file) return file_size diff --git a/backend/src/document_sources/local_file.py b/backend/src/document_sources/local_file.py index 3d5bc08db..f674a202f 100644 --- a/backend/src/document_sources/local_file.py +++ b/backend/src/document_sources/local_file.py @@ -1,23 +1,9 @@ import logging -import shutil from pathlib import Path -from tempfile import NamedTemporaryFile -# from langchain_community.document_loaders import PyPDFLoader from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.document_loaders import UnstructuredFileLoader from langchain_core.documents import Document -# def get_documents_from_file_by_bytes(file): -# file_name = file.filename -# logging.info(f"get_documents_from_file called for filename = {file_name}") -# suffix = Path(file.filename).suffix -# with NamedTemporaryFile(delete=True, suffix=suffix) as tmp: -# shutil.copyfileobj(file.file, tmp) -# tmp_path = Path(tmp.name) -# loader = PyPDFLoader(str(tmp_path)) -# pages = loader.load_and_split() -# return file_name, pages - def load_document_content(file_path): if Path(file_path).suffix.lower() == '.pdf': return PyMuPDFLoader(file_path) @@ -27,8 +13,7 @@ def load_document_content(file_path): def get_documents_from_file_by_path(file_path,file_name): file_path = Path(file_path) if 
file_path.exists(): - logging.info(f'file {file_name} processing') - # loader = PyPDFLoader(str(file_path)) + logging.info(f'file {file_name} processing') file_extension = file_path.suffix.lower() try: loader = load_document_content(file_path) @@ -58,14 +43,10 @@ def get_pages_with_page_numbers(unstructured_pages): if page.metadata['page_number']>page_number: page_number+=1 - # if not metadata: - # metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']} pages.append(Document(page_content = page_content)) page_content='' if page == unstructured_pages[-1]: - # if not metadata: - # metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']} pages.append(Document(page_content = page_content)) elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]: diff --git a/backend/src/document_sources/s3_bucket.py b/backend/src/document_sources/s3_bucket.py index 908e8474e..cdcd7fa0f 100644 --- a/backend/src/document_sources/s3_bucket.py +++ b/backend/src/document_sources/s3_bucket.py @@ -1,5 +1,6 @@ from langchain_community.document_loaders import S3DirectoryLoader import logging +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException import boto3 import os from urllib.parse import urlparse @@ -74,4 +75,4 @@ def get_documents_from_s3(s3_url, aws_access_key_id, aws_secret_access_key): except Exception as e: error_message = str(e) logging.exception(f'Exception in reading content from S3:{error_message}') - raise Exception(error_message) \ No newline at end of file + raise LLMGraphBuilderException(error_message) \ No newline at end of file diff --git a/backend/src/document_sources/web_pages.py b/backend/src/document_sources/web_pages.py index 659a81267..91c87510c 100644 --- a/backend/src/document_sources/web_pages.py +++ b/backend/src/document_sources/web_pages.py @@ -1,11 +1,16 @@ -import logging from langchain_community.document_loaders import WebBaseLoader -from src.api_response import create_api_response +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException +from src.shared.common_fn import last_url_segment def get_documents_from_web_page(source_url:str): try: pages = WebBaseLoader(source_url, verify_ssl=False).load() - file_name = pages[0].metadata['title'] + try: + file_name = pages[0].metadata['title'] + if not file_name: + file_name = last_url_segment(source_url) + except: + file_name = last_url_segment(source_url) return file_name, pages except Exception as e: - raise Exception(str(e)) \ No newline at end of file + raise LLMGraphBuilderException(str(e)) \ No newline at end of file diff --git a/backend/src/document_sources/wikipedia.py b/backend/src/document_sources/wikipedia.py index 71820a69e..e4d7742b1 100644 --- a/backend/src/document_sources/wikipedia.py +++ b/backend/src/document_sources/wikipedia.py @@ -1,6 +1,6 @@ import logging from langchain_community.document_loaders import WikipediaLoader -from src.api_response import create_api_response +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException def get_documents_from_Wikipedia(wiki_query:str, language:str): try: @@ -9,9 +9,8 @@ def get_documents_from_Wikipedia(wiki_query:str, language:str): logging.info(f"Total Pages from Wikipedia = {len(pages)}") return file_name, pages except Exception as e: - job_status = "Failed" message="Failed To Process Wikipedia Query" error_message = str(e) - logging.error(f"Failed To Process Wikipedia Query: {file_name}") - logging.exception(f'Exception Stack trace: {error_message}') - return 
create_api_response(job_status,message=message,error=error_message,file_name=file_name) \ No newline at end of file + logging.exception(f'Failed To Process Wikipedia Query: {file_name}, Exception Stack trace: {error_message}') + raise LLMGraphBuilderException(error_message+' '+message) + \ No newline at end of file diff --git a/backend/src/document_sources/youtube.py b/backend/src/document_sources/youtube.py index dee97e230..82e9a9219 100644 --- a/backend/src/document_sources/youtube.py +++ b/backend/src/document_sources/youtube.py @@ -1,11 +1,11 @@ from langchain.docstore.document import Document +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException from youtube_transcript_api import YouTubeTranscriptApi import logging from urllib.parse import urlparse,parse_qs from difflib import SequenceMatcher from datetime import timedelta from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS -from typing import List, Dict, Any import os import re @@ -17,7 +17,7 @@ def get_youtube_transcript(youtube_id): return transcript_pieces except Exception as e: message = f"Youtube transcript is not available for youtube Id: {youtube_id}" - raise Exception(message) + raise LLMGraphBuilderException(message) def get_youtube_combined_transcript(youtube_id): try: @@ -28,7 +28,7 @@ def get_youtube_combined_transcript(youtube_id): return transcript except Exception as e: message = f"Youtube transcript is not available for youtube Id: {youtube_id}" - raise Exception(message) + raise LLMGraphBuilderException(message) def create_youtube_url(url): @@ -64,7 +64,7 @@ def get_documents_from_youtube(url): except Exception as e: error_message = str(e) logging.exception(f'Exception in reading transcript from youtube:{error_message}') - raise Exception(error_message) + raise LLMGraphBuilderException(error_message) def get_calculated_timestamps(chunks, youtube_id): logging.info('Calculating timestamps for chunks') diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py index 1780b6203..89a1c0651 100644 --- a/backend/src/graphDB_dataAccess.py +++ b/backend/src/graphDB_dataAccess.py @@ -160,24 +160,33 @@ def update_KNN_graph(self): logging.info("Vector index does not exist, So KNN graph not update") def check_account_access(self, database): - query = """ - SHOW USER PRIVILEGES - YIELD * - WHERE graph = $database AND action IN ['read'] - RETURN COUNT(*) AS readAccessCount - """ try: - logging.info(f"Checking access for database: {database}") + query_dbms_componenet = "call dbms.components() yield edition" + result_dbms_componenet = self.graph.query(query_dbms_componenet) - result = self.graph.query(query, params={"database": database}) - read_access_count = result[0]["readAccessCount"] if result else 0 + if result_dbms_componenet[0]["edition"] == "enterprise": + query = """ + SHOW USER PRIVILEGES + YIELD * + WHERE graph = $database AND action IN ['read'] + RETURN COUNT(*) AS readAccessCount + """ + + logging.info(f"Checking access for database: {database}") - logging.info(f"Read access count: {read_access_count}") + result = self.graph.query(query, params={"database": database}) + read_access_count = result[0]["readAccessCount"] if result else 0 - if read_access_count > 0: - logging.info("The account has read access.") - return False + logging.info(f"Read access count: {read_access_count}") + + if read_access_count > 0: + logging.info("The account has read access.") + return False + else: + logging.info("The account has write access.") + return True else: + #Community version 
have no roles to execute admin command, so assuming write access as TRUE logging.info("The account has write access.") return True @@ -261,17 +270,18 @@ def get_current_status_document_node(self, file_name): d.entityNodeCount AS entityNodeCount, d.entityEntityRelCount AS entityEntityRelCount, d.communityNodeCount AS communityNodeCount, - d.communityRelCount AS communityRelCount + d.communityRelCount AS communityRelCount, + d.createdAt AS created_time """ param = {"file_name" : file_name} return self.execute_query(query, param) def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, merged_dir:str, uri): - # filename_list = filenames.split(',') + filename_list= list(map(str.strip, json.loads(filenames))) source_types_list= list(map(str.strip, json.loads(source_types))) gcs_file_cache = os.environ.get('GCS_FILE_CACHE') - # source_types_list = source_types.split(',') + for (file_name,source_type) in zip(filename_list, source_types_list): merged_file_path = os.path.join(merged_dir, file_name) if source_type == 'local file' and gcs_file_cache == 'True': @@ -280,18 +290,22 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me else: logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}') delete_uploaded_local_file(merged_file_path,file_name) - query_to_delete_document=""" - MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list - with collect(d) as documents - unwind documents as d + + query_to_delete_document=""" + MATCH (d:Document) + WHERE d.fileName IN $filename_list AND coalesce(d.fileSource, "None") IN $source_types_list + WITH COLLECT(d) AS documents + CALL (documents) { + UNWIND documents AS d optional match (d)<-[:PART_OF]-(c:Chunk) detach delete c, d - return count(*) as deletedChunks + } IN TRANSACTIONS OF 1 ROWS """ - query_to_delete_document_and_entities=""" + query_to_delete_document_and_entities = """ MATCH (d:Document) - WHERE d.fileName IN $filename_list AND d.fileSource IN $source_types_list + WHERE d.fileName IN $filename_list AND coalesce(d.fileSource, "None") IN $source_types_list WITH COLLECT(d) AS documents + CALL (documents) { UNWIND documents AS d OPTIONAL MATCH (d)<-[:PART_OF]-(c:Chunk) OPTIONAL MATCH (c:Chunk)-[:HAS_ENTITY]->(e) @@ -304,7 +318,8 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me FOREACH (chunk IN chunks | DETACH DELETE chunk) FOREACH (entity IN entities | DETACH DELETE entity) DETACH DELETE d - """ + } IN TRANSACTIONS OF 1 ROWS + """ query_to_delete_communities = """ MATCH (c:`__Community__`) WHERE c.level = 0 AND NOT EXISTS { ()-[:IN_COMMUNITY]->(c) } @@ -326,7 +341,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me else : result = self.execute_query(query_to_delete_document, param) logging.info(f"Deleting {len(filename_list)} documents = '{filename_list}' from '{source_types_list}' with their entities from database") - return result, len(filename_list) + return len(filename_list) def list_unconnected_nodes(self): query = """ @@ -520,5 +535,4 @@ def update_node_relationship_count(self,document_name): "nodeCount" : nodeCount, "relationshipCount" : relationshipCount } - return response \ No newline at end of file diff --git a/backend/src/graph_query.py b/backend/src/graph_query.py index f9af6102b..aefaacbd1 100644 --- a/backend/src/graph_query.py +++ b/backend/src/graph_query.py @@ -4,9 +4,6 @@ import os import json from src.shared.constants import 
GRAPH_CHUNK_LIMIT,GRAPH_QUERY,CHUNK_TEXT_QUERY,COUNT_CHUNKS_QUERY -# from neo4j.debug import watch - -# watch("neo4j") def get_graphDB_driver(uri, username, password,database="neo4j"): """ @@ -28,7 +25,6 @@ def get_graphDB_driver(uri, username, password,database="neo4j"): except Exception as e: error_message = f"graph_query module: Failed to connect to the database at {uri}." logging.error(error_message, exc_info=True) - # raise Exception(error_message) from e def execute_query(driver, query,document_names,doc_limit=None): @@ -201,7 +197,7 @@ def get_graph_results(uri, username, password,database,document_names): try: logging.info(f"Starting graph query process") driver = get_graphDB_driver(uri, username, password,database) - document_names= list(map(str.strip, json.loads(document_names))) + document_names= list(map(str, json.loads(document_names))) query = GRAPH_QUERY.format(graph_chunk_limit=GRAPH_CHUNK_LIMIT) records, summary , keys = execute_query(driver, query.strip(), document_names) document_nodes = extract_node_elements(records) diff --git a/backend/src/llm.py b/backend/src/llm.py index 381a38a68..0a7f74b08 100644 --- a/backend/src/llm.py +++ b/backend/src/llm.py @@ -13,6 +13,7 @@ from langchain_community.chat_models import ChatOllama import boto3 import google.auth +from src.shared.constants import ADDITIONAL_INSTRUCTIONS def get_llm(model: str): """Retrieve the specified language model based on the model name.""" @@ -160,14 +161,14 @@ def get_chunk_id_as_doc_metadata(chunkId_chunkDoc_list): async def get_graph_document_list( - llm, combined_chunk_document_list, allowedNodes, allowedRelationship + llm, combined_chunk_document_list, allowedNodes, allowedRelationship, additional_instructions=None ): futures = [] graph_document_list = [] if "diffbot_api_key" in dir(llm): llm_transformer = llm else: - if "get_name" in dir(llm) and llm.get_name() != "ChatOenAI" or llm.get_name() != "ChatVertexAI" or llm.get_name() != "AzureChatOpenAI": + if "get_name" in dir(llm) and llm.get_name() != "ChatOpenAI" or llm.get_name() != "ChatVertexAI" or llm.get_name() != "AzureChatOpenAI": node_properties = False relationship_properties = False else: @@ -180,6 +181,7 @@ async def get_graph_document_list( allowed_nodes=allowedNodes, allowed_relationships=allowedRelationship, ignore_tool_usage=True, + additional_instructions=ADDITIONAL_INSTRUCTIONS+ (additional_instructions if additional_instructions else "") ) if isinstance(llm,DiffbotGraphTransformer): @@ -189,7 +191,8 @@ async def get_graph_document_list( return graph_document_list -async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship): + +async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship, additional_instructions=None): try: llm, model_name = get_llm(model) combined_chunk_document_list = get_combined_chunks(chunkId_chunkDoc_list) @@ -204,7 +207,7 @@ async def get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowed allowedRelationship = allowedRelationship.split(',') graph_document_list = await get_graph_document_list( - llm, combined_chunk_document_list, allowedNodes, allowedRelationship + llm, combined_chunk_document_list, allowedNodes, allowedRelationship, additional_instructions ) return graph_document_list except Exception as e: diff --git a/backend/src/main.py b/backend/src/main.py index d47061c46..852740365 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -1,5 +1,5 @@ from langchain_neo4j import Neo4jGraph -from src.shared.constants 
import (BUCKET_UPLOAD, PROJECT_ID, QUERY_TO_GET_CHUNKS, +from src.shared.constants import (BUCKET_UPLOAD,BUCKET_FAILED_FILE, PROJECT_ID, QUERY_TO_GET_CHUNKS, QUERY_TO_DELETE_EXISTING_ENTITIES, QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, @@ -30,6 +30,7 @@ import shutil import urllib.parse import json +from src.shared.llm_graph_builder_exception import LLMGraphBuilderException warnings.filterwarnings("ignore") load_dotenv() @@ -40,7 +41,7 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id, lst_file_name = [] files_info = get_s3_files_info(source_url,aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key) if len(files_info)==0: - raise Exception('No pdf files found.') + raise LLMGraphBuilderException('No pdf files found.') logging.info(f'files info : {files_info}') success_count=0 failed_count=0 @@ -48,7 +49,7 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id, for file_info in files_info: file_name=file_info['file_key'] obj_source_node = sourceNode() - obj_source_node.file_name = file_name.split('/')[-1] + obj_source_node.file_name = file_name.split('/')[-1].strip() if isinstance(file_name.split('/')[-1], str) else file_name.split('/')[-1] obj_source_node.file_type = 'pdf' obj_source_node.file_size = file_info['file_size_bytes'] obj_source_node.file_source = source_type @@ -70,7 +71,6 @@ def create_source_node_graph_url_s3(graph, model, source_url, aws_access_key_id, except Exception as e: failed_count+=1 - # error_message = str(e) lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url,'status':'Failed'}) return lst_file_name,success_count,failed_count @@ -83,7 +83,7 @@ def create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_na lst_file_metadata= get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, credentials) for file_metadata in lst_file_metadata : obj_source_node = sourceNode() - obj_source_node.file_name = file_metadata['fileName'] + obj_source_node.file_name = file_metadata['fileName'].strip() if isinstance(file_metadata['fileName'], str) else file_metadata['fileName'] obj_source_node.file_size = file_metadata['fileSize'] obj_source_node.url = file_metadata['url'] obj_source_node.file_source = source_type @@ -121,15 +121,24 @@ def create_source_node_graph_web_url(graph, model, source_url, source_type): if pages==None or len(pages)==0: failed_count+=1 message = f"Unable to read data for given url : {source_url}" - raise Exception(message) + raise LLMGraphBuilderException(message) + try: + title = pages[0].metadata['title'] + if not title: + title = last_url_segment(source_url) + language = pages[0].metadata['language'] + except: + title = last_url_segment(source_url) + language = "N/A" + obj_source_node = sourceNode() obj_source_node.file_type = 'text' obj_source_node.file_source = source_type obj_source_node.model = model obj_source_node.url = urllib.parse.unquote(source_url) obj_source_node.created_at = datetime.now() - obj_source_node.file_name = pages[0].metadata['title'] - obj_source_node.language = pages[0].metadata['language'] + obj_source_node.file_name = title.strip() if isinstance(title, str) else title + obj_source_node.language = language obj_source_node.file_size = sys.getsizeof(pages[0].page_content) obj_source_node.chunkNodeCount=0 obj_source_node.chunkRelCount=0 @@ -163,14 +172,12 @@ def 
create_source_node_graph_url_youtube(graph, model, source_url, source_type): obj_source_node.communityRelCount=0 match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',obj_source_node.url) logging.info(f"match value: {match}") - video_id = parse_qs(urlparse(youtube_url).query).get('v') obj_source_node.file_name = match.group(1) transcript= get_youtube_combined_transcript(match.group(1)) logging.info(f"Youtube transcript : {transcript}") if transcript==None or len(transcript)==0: message = f"Youtube transcript is not available for : {obj_source_node.file_name}" - logging.info(f"Youtube transcript is not available for : {obj_source_node.file_name}") - raise Exception(message) + raise LLMGraphBuilderException(message) else: obj_source_node.file_size = sys.getsizeof(transcript) @@ -185,14 +192,13 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type success_count=0 failed_count=0 lst_file_name=[] - #queries_list = wiki_query.split(',') wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query) logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}") pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load() if pages==None or len(pages)==0: failed_count+=1 message = f"Unable to read data for given Wikipedia url : {wiki_query}" - raise Exception(message) + raise LLMGraphBuilderException(message) else: obj_source_node = sourceNode() obj_source_node.file_name = wiki_query_id.strip() @@ -215,7 +221,7 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'}) return lst_file_name,success_count,failed_count -async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, retry_condition): +async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, retry_condition, additional_instructions): logging.info(f'Process file name :{fileName}') if not retry_condition: @@ -226,63 +232,63 @@ async def extract_graph_from_file_local_file(uri, userName, password, database, else: file_name, pages, file_extension = get_documents_from_file_by_path(merged_file_path,fileName) if pages==None or len(pages)==0: - raise Exception(f'File content is not available for file : {file_name}') + raise LLMGraphBuilderException(f'File content is not available for file : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path) else: - return await processing_source(uri, userName, password, database, model, fileName, [], allowedNodes, allowedRelationship, True, merged_file_path, retry_condition) + return await processing_source(uri, userName, password, database, model, fileName, [], allowedNodes, allowedRelationship, True, merged_file_path, retry_condition, additional_instructions=additional_instructions) -async def extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition): +async def extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, 
aws_secret_access_key, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions): if not retry_condition: if(aws_access_key_id==None or aws_secret_access_key==None): - raise Exception('Please provide AWS access and secret keys') + raise LLMGraphBuilderException('Please provide AWS access and secret keys') else: logging.info("Insert in S3 Block") file_name, pages = get_documents_from_s3(source_url, aws_access_key_id, aws_secret_access_key) if pages==None or len(pages)==0: - raise Exception(f'File content is not available for file : {file_name}') + raise LLMGraphBuilderException(f'File content is not available for file : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship) else: - return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition) + return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions) -async def extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition): +async def extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions): if not retry_condition: file_name, pages = get_documents_from_web_page(source_url) if pages==None or len(pages)==0: - raise Exception(f'Content is not available for given URL : {file_name}') + raise LLMGraphBuilderException(f'Content is not available for given URL : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship) else: - return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition) + return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions) -async def extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition): +async def extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions): if not retry_condition: file_name, pages = get_documents_from_youtube(source_url) if pages==None or len(pages)==0: - raise Exception(f'Youtube transcript is not available for file : {file_name}') + raise LLMGraphBuilderException(f'Youtube transcript is not available for file : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship) else: - return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition) + return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions) -async def extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, 
allowedRelationship, retry_condition): +async def extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions): if not retry_condition: file_name, pages = get_documents_from_Wikipedia(wiki_query, language) if pages==None or len(pages)==0: - raise Exception(f'Wikipedia page is not available for file : {file_name}') + raise LLMGraphBuilderException(f'Wikipedia page is not available for file : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship) else: - return await processing_source(uri, userName, password, database, model, file_name,[], allowedNodes, allowedRelationship, retry_condition=retry_condition) + return await processing_source(uri, userName, password, database, model, file_name,[], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions) -async def extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition): +async def extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, retry_condition, additional_instructions): if not retry_condition: file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token) if pages==None or len(pages)==0: - raise Exception(f'File content is not available for file : {file_name}') + raise LLMGraphBuilderException(f'File content is not available for file : {file_name}') return await processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship) else: - return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition) + return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, retry_condition=retry_condition, additional_instructions=additional_instructions) -async def processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None, retry_condition=None): +async def processing_source(uri, userName, password, database, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None, retry_condition=None, additional_instructions=None): """ Extracts a Neo4jGraph from a PDF file based on the model. @@ -299,6 +305,7 @@ async def processing_source(uri, userName, password, database, model, file_name, status and model as attributes. 
""" uri_latency = {} + response = {} start_time = datetime.now() processing_source_start_time = time.time() start_create_connection = time.time() @@ -332,7 +339,7 @@ async def processing_source(uri, userName, password, database, model, file_name, if result[0]['Status'] != 'Processing': obj_source_node = sourceNode() status = "Processing" - obj_source_node.file_name = file_name + obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name obj_source_node.status = status obj_source_node.total_chunks = total_chunks obj_source_node.model = model @@ -346,7 +353,7 @@ async def processing_source(uri, userName, password, database, model, file_name, start_update_source_node = time.time() graphDb_data_Access.update_source_node(obj_source_node) - count_response = graphDb_data_Access.update_node_relationship_count(file_name) + graphDb_data_Access.update_node_relationship_count(file_name) end_update_source_node = time.time() elapsed_update_source_node = end_update_source_node - start_update_source_node logging.info(f'Time taken to update the document source node: {elapsed_update_source_node:.2f} seconds') @@ -354,11 +361,14 @@ async def processing_source(uri, userName, password, database, model, file_name, logging.info('Update the status as Processing') update_graph_chunk_processed = int(os.environ.get('UPDATE_GRAPH_CHUNKS_PROCESSED')) + chunk_to_be_processed = int(os.environ.get('CHUNKS_TO_BE_PROCESSED', '50')) # selected_chunks = [] is_cancelled_status = False job_status = "Completed" for i in range(0, len(chunkId_chunkDoc_list), update_graph_chunk_processed): select_chunks_upto = i+update_graph_chunk_processed + if select_chunks_upto > chunk_to_be_processed: + break logging.info(f'Selected Chunks upto: {select_chunks_upto}') if len(chunkId_chunkDoc_list) <= select_chunks_upto: select_chunks_upto = len(chunkId_chunkDoc_list) @@ -373,7 +383,7 @@ async def processing_source(uri, userName, password, database, model, file_name, break else: processing_chunks_start_time = time.time() - node_count,rel_count,latency_processed_chunk = await processing_chunks(selected_chunks,graph,uri, userName, password, database,file_name,model,allowedNodes,allowedRelationship,node_count, rel_count) + node_count,rel_count,latency_processed_chunk = await processing_chunks(selected_chunks,graph,uri, userName, password, database,file_name,model,allowedNodes,allowedRelationship,node_count, rel_count, additional_instructions) processing_chunks_end_time = time.time() processing_chunks_elapsed_end_time = processing_chunks_end_time - processing_chunks_start_time logging.info(f"Time taken {update_graph_chunk_processed} chunks processed upto {select_chunks_upto} completed in {processing_chunks_elapsed_end_time:.2f} seconds for file name {file_name}") @@ -395,7 +405,7 @@ async def processing_source(uri, userName, password, database, model, file_name, obj_source_node.node_count = node_count obj_source_node.relationship_count = rel_count graphDb_data_Access.update_source_node(obj_source_node) - count_response = graphDb_data_Access.update_node_relationship_count(file_name) + graphDb_data_Access.update_node_relationship_count(file_name) result = graphDb_data_Access.get_current_status_document_node(file_name) is_cancelled_status = result[0]['is_cancelled'] @@ -406,12 +416,12 @@ async def processing_source(uri, userName, password, database, model, file_name, end_time = datetime.now() processed_time = end_time - start_time obj_source_node = sourceNode() - obj_source_node.file_name = file_name + 
obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name obj_source_node.status = job_status obj_source_node.processing_time = processed_time graphDb_data_Access.update_source_node(obj_source_node) - count_response = graphDb_data_Access.update_node_relationship_count(file_name) + graphDb_data_Access.update_node_relationship_count(file_name) logging.info('Updated the nodeCount and relCount properties in Document node') logging.info(f'file:{file_name} extraction has been completed') @@ -432,7 +442,7 @@ async def processing_source(uri, userName, password, database, model, file_name, uri_latency["Per_entity_latency"] = 'N/A' else: uri_latency["Per_entity_latency"] = f'{int(processing_source_func)/node_count}/s' - response = {} + response["fileName"] = file_name response["nodeCount"] = node_count response["relationshipCount"] = rel_count @@ -442,15 +452,17 @@ async def processing_source(uri, userName, password, database, model, file_name, response["success_count"] = 1 return uri_latency, response - else: - logging.info('File does not process because it\'s already in Processing status') + else: + logging.info("File does not process because its already in Processing status") + return uri_latency,response else: error_message = "Unable to get the status of document node." logging.error(error_message) - raise Exception(error_message) + raise LLMGraphBuilderException(error_message) -async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, database,file_name,model,allowedNodes,allowedRelationship, node_count, rel_count): +async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, database,file_name,model,allowedNodes,allowedRelationship, node_count, rel_count, additional_instructions=None): #create vector index and update chunk node with embedding + latency_processing_chunk = {} if graph is not None: if graph._driver._closed: graph = create_graph_database_connection(uri, userName, password, database) @@ -462,11 +474,11 @@ async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, end_update_embedding = time.time() elapsed_update_embedding = end_update_embedding - start_update_embedding logging.info(f'Time taken to update embedding in chunk node: {elapsed_update_embedding:.2f} seconds') - latency_processing_chunk = {"update_embedding" : f'{elapsed_update_embedding:.2f}'} + latency_processing_chunk["update_embedding"] = f'{elapsed_update_embedding:.2f}' logging.info("Get graph document list from models") start_entity_extraction = time.time() - graph_documents = await get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship) + graph_documents = await get_graph_from_llm(model, chunkId_chunkDoc_list, allowedNodes, allowedRelationship, additional_instructions) end_entity_extraction = time.time() elapsed_entity_extraction = end_entity_extraction - start_entity_extraction logging.info(f'Time taken to extract enitities from LLM Graph Builder: {elapsed_entity_extraction:.2f} seconds') @@ -528,7 +540,7 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition): chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name}) if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks : - raise Exception(f"Chunks are not created for {file_name}. Please re-upload file and try again.") + raise LLMGraphBuilderException(f"Chunks are not created for {file_name}. 
Please re-upload file and try again.") else: for chunk in chunks: chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']}) @@ -546,7 +558,7 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition): return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:] else: - raise Exception(f"All chunks of file are alreday processed. If you want to re-process, Please start from begnning") + raise LLMGraphBuilderException(f"All chunks of file {file_name} are already processed. If you want to re-process, Please start from begnning") else: logging.info(f"Retry : start_from_beginning with chunks {len(chunkId_chunkDoc_list)}") @@ -642,7 +654,7 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina logging.info("File merged successfully") file_extension = originalname.split('.')[-1] obj_source_node = sourceNode() - obj_source_node.file_name = originalname + obj_source_node.file_name = originalname.strip() if isinstance(originalname, str) else originalname obj_source_node.file_type = file_extension obj_source_node.file_size = file_size obj_source_node.file_source = 'local file' @@ -668,7 +680,7 @@ def get_labels_and_relationtypes(graph): return label order by label limit 100 } as labels, collect { CALL db.relationshipTypes() yield relationshipType as type - WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK'] + WHERE NOT type IN ['PART_OF', 'NEXT_CHUNK', 'HAS_ENTITY', '_Bloom_Perspective_','FIRST_CHUNK','SIMILAR','IN_COMMUNITY','PARENT_COMMUNITY'] return type order by type LIMIT 100 } as relationshipTypes """ graphDb_data_Access = graphDBdataAccess(graph) @@ -685,7 +697,7 @@ def manually_cancelled_job(graph, filenames, source_types, merged_dir, uri): for (file_name,source_type) in zip(filename_list, source_types_list): obj_source_node = sourceNode() - obj_source_node.file_name = file_name + obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name obj_source_node.is_cancelled = True obj_source_node.status = 'Cancelled' obj_source_node.updated_at = datetime.now() @@ -720,7 +732,7 @@ def set_status_retry(graph, file_name, retry_condition): graphDb_data_Access = graphDBdataAccess(graph) obj_source_node = sourceNode() status = "Ready to Reprocess" - obj_source_node.file_name = file_name + obj_source_node.file_name = file_name.strip() if isinstance(file_name, str) else file_name obj_source_node.status = status obj_source_node.retry_condition = retry_condition obj_source_node.is_cancelled = False @@ -732,3 +744,15 @@ def set_status_retry(graph, file_name, retry_condition): obj_source_node.relationship_count=0 logging.info(obj_source_node) graphDb_data_Access.update_source_node(obj_source_node) + +def failed_file_process(uri,file_name, merged_file_path, source_type): + gcs_file_cache = os.environ.get('GCS_FILE_CACHE') + if source_type == 'local file': + if gcs_file_cache == 'True': + folder_name = create_gcs_bucket_folder_name_hashed(uri,file_name) + copy_failed_file(BUCKET_UPLOAD, BUCKET_FAILED_FILE, folder_name, file_name) + time.sleep(5) + delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name) + else: + logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}') + delete_uploaded_local_file(merged_file_path,file_name) \ No newline at end of file diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py index 410a383fd..333f0c550 100644 --- 
a/backend/src/make_relationships.py +++ b/backend/src/make_relationships.py @@ -16,7 +16,7 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list): batch_data = [] logging.info("Create HAS_ENTITY relationship between chunks and entities") - chunk_node_id_set = 'id:"{}"' + for graph_doc_chunk_id in graph_documents_chunk_chunk_Id: for node in graph_doc_chunk_id['graph_doc'].nodes: query_data={ @@ -25,10 +25,6 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume 'node_id': node.id } batch_data.append(query_data) - #node_id = node.id - #Below query is also unable to change as parametrize because we can't make parameter of Label or node type - #https://neo4j.com/docs/cypher-manual/current/syntax/parameters/ - #graph.query('MATCH(c:Chunk {'+chunk_node_id_set.format(graph_doc_chunk_id['chunk_id'])+'}) MERGE (n:'+ node.type +'{ id: "'+node_id+'"}) MERGE (c)-[:HAS_ENTITY]->(n)') if batch_data: unwind_query = """ @@ -41,19 +37,15 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name): - #create embedding isEmbedding = os.getenv('IS_EMBEDDING') - # embedding_model = os.getenv('EMBEDDING_MODEL') embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION logging.info(f'embedding model:{embeddings} and dimesion:{dimension}') data_for_query = [] logging.info(f"update embedding and vector index for chunks") for row in chunkId_chunkDoc_list: - # for graph_document in row['graph_doc']: if isEmbedding.upper() == "TRUE": embeddings_arr = embeddings.embed_query(row['chunk_doc'].page_content) - # logging.info(f'Embedding list {embeddings_arr}') data_for_query.append({ "chunkId": row['chunk_id'], @@ -82,7 +74,6 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li current_chunk_id = page_content_sha1.hexdigest() position = i + 1 if i>0: - #offset += len(tiktoken.encoding_for_model("gpt2").encode(chunk.page_content)) offset += len(chunks[i-1].page_content) if i == 0: firstChunk = True diff --git a/backend/src/post_processing.py b/backend/src/post_processing.py index 47fafebda..8b79f93bc 100644 --- a/backend/src/post_processing.py +++ b/backend/src/post_processing.py @@ -4,6 +4,11 @@ from langchain_neo4j import Neo4jGraph import os from src.shared.common_fn import load_embedding_model +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.prompts import ChatPromptTemplate +from src.shared.constants import GRAPH_CLEANUP_PROMPT +from src.llm import get_llm +from src.main import get_labels_and_relationtypes DROP_INDEX_QUERY = "DROP INDEX entities IF EXISTS;" LABELS_QUERY = "CALL db.labels()" @@ -187,4 +192,61 @@ def update_embeddings(rows, graph): MATCH (e) WHERE elementId(e) = row.elementId CALL db.create.setNodeVectorProperty(e, "embedding", row.embedding) """ - return graph.query(query,params={'rows':rows}) \ No newline at end of file + return graph.query(query,params={'rows':rows}) + +def graph_schema_consolidation(graph): + nodes_and_relations = get_labels_and_relationtypes(graph) + logging.info(f"nodes_and_relations in existing graph : {nodes_and_relations}") + node_labels = [] + relation_labels = [] + + node_labels.extend(nodes_and_relations[0]['labels']) + relation_labels.extend(nodes_and_relations[0]['relationshipTypes']) + + parser = JsonOutputParser() + prompt = ChatPromptTemplate(messages=[("system",GRAPH_CLEANUP_PROMPT),("human", "{input}")], + 
partial_variables={"format_instructions": parser.get_format_instructions()}) + + graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL",'openai_gpt_4o') + llm, _ = get_llm(graph_cleanup_model) + chain = prompt | llm | parser + nodes_dict = chain.invoke({'input':node_labels}) + relation_dict = chain.invoke({'input':relation_labels}) + + node_match = {} + relation_match = {} + for new_label , values in nodes_dict.items() : + for old_label in values: + if new_label != old_label: + node_match[old_label]=new_label + + for new_label , values in relation_dict.items() : + for old_label in values: + if new_label != old_label: + relation_match[old_label]=new_label + + logging.info(f"updated node labels : {node_match}") + logging.info(f"updated relationship labels : {relation_match}") + + # Update node labels in graph + for old_label, new_label in node_match.items(): + query = f""" + MATCH (n:`{old_label}`) + SET n:`{new_label}` + REMOVE n:`{old_label}` + """ + graph.query(query) + + # Update relation types in graph + for old_label, new_label in relation_match.items(): + query = f""" + MATCH (n)-[r:`{old_label}`]->(m) + CREATE (n)-[r2:`{new_label}`]->(m) + DELETE r + """ + graph.query(query) + + return None + + + \ No newline at end of file diff --git a/backend/src/shared/common_fn.py b/backend/src/shared/common_fn.py index 0c0b4bea1..986687e25 100644 --- a/backend/src/shared/common_fn.py +++ b/backend/src/shared/common_fn.py @@ -10,6 +10,8 @@ import re import os from pathlib import Path +from urllib.parse import urlparse + def check_url_source(source_type, yt_url:str=None, wiki_query:str=None): language='' @@ -126,4 +128,10 @@ def create_gcs_bucket_folder_name_hashed(uri, file_name): def formatted_time(current_time): formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S %Z') - return formatted_time \ No newline at end of file + return formatted_time + +def last_url_segment(url): + parsed_url = urlparse(url) + path = parsed_url.path.strip("/") # Remove leading and trailing slashes + last_url_segment = path.split("/")[-1] if path else parsed_url.netloc.split(".")[0] + return last_url_segment \ No newline at end of file diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py index 8307dc3c9..6a69d166d 100644 --- a/backend/src/shared/constants.py +++ b/backend/src/shared/constants.py @@ -377,64 +377,132 @@ END AS paths, e """ +# VECTOR_GRAPH_SEARCH_QUERY_SUFFIX = """ +# WITH apoc.coll.toSet(apoc.coll.flatten(collect(DISTINCT paths))) AS paths, +# collect(DISTINCT e) AS entities + +# // De-duplicate nodes and relationships across chunks +# RETURN +# collect { +# UNWIND paths AS p +# UNWIND relationships(p) AS r +# RETURN DISTINCT r +# } AS rels, +# collect { +# UNWIND paths AS p +# UNWIND nodes(p) AS n +# RETURN DISTINCT n +# } AS nodes, +# entities +# } + +# // Generate metadata and text components for chunks, nodes, and relationships +# WITH d, avg_score, +# [c IN chunks | c.chunk.text] AS texts, +# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails, +# [n IN nodes | elementId(n)] AS entityIds, +# [r IN rels | elementId(r)] AS relIds, +# apoc.coll.sort([ +# n IN nodes | +# coalesce(apoc.coll.removeAll(labels(n), ['__Entity__'])[0], "") + ":" + +# n.id + +# (CASE WHEN n.description IS NOT NULL THEN " (" + n.description + ")" ELSE "" END) +# ]) AS nodeTexts, +# apoc.coll.sort([ +# r IN rels | +# coalesce(apoc.coll.removeAll(labels(startNode(r)), ['__Entity__'])[0], "") + ":" + +# startNode(r).id + " " + type(r) + " " + +# 
coalesce(apoc.coll.removeAll(labels(endNode(r)), ['__Entity__'])[0], "") + ":" + endNode(r).id +# ]) AS relTexts, +# entities + +# // Combine texts into response text +# WITH d, avg_score, chunkdetails, entityIds, relIds, +# "Text Content:\n" + apoc.text.join(texts, "\n----\n") + +# "\n----\nEntities:\n" + apoc.text.join(nodeTexts, "\n") + +# "\n----\nRelationships:\n" + apoc.text.join(relTexts, "\n") AS text, +# entities + +# RETURN +# text, +# avg_score AS score, +# { +# length: size(text), +# source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), +# chunkdetails: chunkdetails, +# entities : { +# entityids: entityIds, +# relationshipids: relIds +# } +# } AS metadata +# """ VECTOR_GRAPH_SEARCH_QUERY_SUFFIX = """ - WITH apoc.coll.toSet(apoc.coll.flatten(collect(DISTINCT paths))) AS paths, - collect(DISTINCT e) AS entities - - // De-duplicate nodes and relationships across chunks - RETURN - collect { - UNWIND paths AS p - UNWIND relationships(p) AS r - RETURN DISTINCT r - } AS rels, - collect { - UNWIND paths AS p - UNWIND nodes(p) AS n - RETURN DISTINCT n - } AS nodes, - entities + WITH apoc.coll.toSet(apoc.coll.flatten(collect(DISTINCT paths))) AS paths, + collect(DISTINCT e) AS entities + // De-duplicate nodes and relationships across chunks + RETURN + collect { + UNWIND paths AS p + UNWIND relationships(p) AS r + RETURN DISTINCT r + } AS rels, + collect { + UNWIND paths AS p + UNWIND nodes(p) AS n + RETURN DISTINCT n + } AS nodes, + entities } - // Generate metadata and text components for chunks, nodes, and relationships WITH d, avg_score, - [c IN chunks | c.chunk.text] AS texts, - [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails, - [n IN nodes | elementId(n)] AS entityIds, - [r IN rels | elementId(r)] AS relIds, - apoc.coll.sort([ - n IN nodes | - coalesce(apoc.coll.removeAll(labels(n), ['__Entity__'])[0], "") + ":" + - n.id + - (CASE WHEN n.description IS NOT NULL THEN " (" + n.description + ")" ELSE "" END) - ]) AS nodeTexts, - apoc.coll.sort([ - r IN rels | - coalesce(apoc.coll.removeAll(labels(startNode(r)), ['__Entity__'])[0], "") + ":" + - startNode(r).id + " " + type(r) + " " + - coalesce(apoc.coll.removeAll(labels(endNode(r)), ['__Entity__'])[0], "") + ":" + endNode(r).id - ]) AS relTexts, - entities - + [c IN chunks | c.chunk.text] AS texts, + [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails, + [n IN nodes | elementId(n)] AS entityIds, + [r IN rels | elementId(r)] AS relIds, + apoc.coll.sort([ + n IN nodes | + coalesce(apoc.coll.removeAll(labels(n), ['__Entity__'])[0], "") + ":" + + coalesce( + n.id, + n[head([k IN keys(n) WHERE k =~ "(?i)(name|title|id|description)$"])], + "" + ) + + (CASE WHEN n.description IS NOT NULL THEN " (" + n.description + ")" ELSE "" END) + ]) AS nodeTexts, + apoc.coll.sort([ + r IN rels | + coalesce(apoc.coll.removeAll(labels(startNode(r)), ['__Entity__'])[0], "") + ":" + + coalesce( + startNode(r).id, + startNode(r)[head([k IN keys(startNode(r)) WHERE k =~ "(?i)(name|title|id|description)$"])], + "" + ) + " " + type(r) + " " + + coalesce(apoc.coll.removeAll(labels(endNode(r)), ['__Entity__'])[0], "") + ":" + + coalesce( + endNode(r).id, + endNode(r)[head([k IN keys(endNode(r)) WHERE k =~ "(?i)(name|title|id|description)$"])], + "" + ) + ]) AS relTexts, + entities // Combine texts into response text WITH d, avg_score, chunkdetails, entityIds, relIds, - "Text Content:\n" + apoc.text.join(texts, "\n----\n") + - "\n----\nEntities:\n" + apoc.text.join(nodeTexts, "\n") + - 
"\n----\nRelationships:\n" + apoc.text.join(relTexts, "\n") AS text, - entities - -RETURN - text, - avg_score AS score, - { - length: size(text), - source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), - chunkdetails: chunkdetails, - entities : { - entityids: entityIds, - relationshipids: relIds - } - } AS metadata + "Text Content:\n" + apoc.text.join(texts, "\n----\n") + + "\n----\nEntities:\n" + apoc.text.join(nodeTexts, "\n") + + "\n----\nRelationships:\n" + apoc.text.join(relTexts, "\n") AS text, + entities +RETURN + text, + avg_score AS score, + { + length: size(text), + source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), + chunkdetails: chunkdetails, + entities : { + entityids: entityIds, + relationshipids: relIds + } + } AS metadata """ VECTOR_GRAPH_SEARCH_QUERY = VECTOR_GRAPH_SEARCH_QUERY_PREFIX+ VECTOR_GRAPH_SEARCH_ENTITY_QUERY.format( @@ -763,45 +831,30 @@ DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning" START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position" -PROMPT_TO_ALL_LLMs = """ -"# Knowledge Graph Instructions for LLMs\n" - "## 1. Overview\n" - "You are a top-tier algorithm designed for extracting information in structured " - "formats to build a knowledge graph.\n" - "Try to capture as much information from the text as possible without " - "sacrificing accuracy. Do not add any information that is not explicitly " - "mentioned in the text.\n" - "- **Nodes** represent entities and concepts.\n" - "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n" - "accessible for a vast audience.\n" - "## 2. Labeling Nodes\n" - "- **Consistency**: Ensure you use available types for node labels.\n" - "Ensure you use basic or elementary types for node labels.\n" - "- For example, when you identify an entity representing a person, " - "always label it as **'person'**. Avoid using more specific terms " - "like 'mathematician' or 'scientist'." - "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be " - "names or human-readable identifiers found in the text.\n" - "- **Relationships** represent connections between entities or concepts.\n" - "Ensure consistency and generality in relationship types when constructing " - "knowledge graphs. Instead of using specific and momentary types " - "such as 'BECAME_PROFESSOR', use more general and timeless relationship types " - "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n" - "## 3. Coreference Resolution\n" - "- **Maintain Entity Consistency**: When extracting entities, it's vital to " - "ensure consistency.\n" - 'If an entity, such as "John Doe", is mentioned multiple times in the text ' - 'but is referred to by different names or pronouns (e.g., "Joe", "he"),' - "always use the most complete identifier for that entity throughout the " - 'knowledge graph. In this example, use "John Doe" as the entity ID.\n' - "Remember, the knowledge graph should be coherent and easily understandable, " - "so maintaining consistency in entity references is crucial.\n" - "## 4. Node Properties\n" - "- Dates, URLs, Time, and Numerical Values: Instead of creating separate nodes for - these elements, represent them as properties of existing nodes." - "- Example: Instead of creating a node labeled "2023-03-15" and connecting it to another node - with the relationship "BORN_ON", add a property called "born_on" to the person node with the - value "2023-03-15"." 
- "## 5. Strict Compliance\n" - "Adhere to the rules strictly. Non-compliance will result in termination." - """ +GRAPH_CLEANUP_PROMPT = """Please consolidate the following list of types into a smaller set of more general, semantically +related types. The consolidated types must be drawn from the original list; do not introduce new types. +Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type +and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and +repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output, +mapped to itself. + +**Input:** A list of strings representing the types to be consolidated. These types may represent either node +labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity. + +Example 1: +Input: +[ "Person", "Human", "People", "Company", "Organization", "Product"] +Output : +[Person": ["Person", "Human", "People"], Organization": ["Company", "Organization"], Product": ["Product"]] + +Example 2: +Input : +["CREATED_FOR", "CREATED_TO", "CREATED", "PLACE", "LOCATION", "VENUE"] +Output: +["CREATED": ["CREATED_FOR", "CREATED_TO", "CREATED"],"PLACE": ["PLACE", "LOCATION", "VENUE"]] +""" + +ADDITIONAL_INSTRUCTIONS = """Your goal is to identify and categorize entities while ensuring that specific data +types such as dates, numbers, revenues, and other non-entity information are not extracted as separate nodes. +Instead, treat these as properties associated with the relevant entities.""" + diff --git a/backend/src/shared/llm_graph_builder_exception.py b/backend/src/shared/llm_graph_builder_exception.py new file mode 100644 index 000000000..60972c09d --- /dev/null +++ b/backend/src/shared/llm_graph_builder_exception.py @@ -0,0 +1,6 @@ +class LLMGraphBuilderException(Exception): + """Exception raised for custom error in the application.""" + + def __init__(self, message): + self.message = message + super().__init__(message) \ No newline at end of file diff --git a/backend/src/shared/schema_extraction.py b/backend/src/shared/schema_extraction.py index 1b7f76c92..d57703e37 100644 --- a/backend/src/shared/schema_extraction.py +++ b/backend/src/shared/schema_extraction.py @@ -1,5 +1,4 @@ from typing import List -#from langchain_core.pydantic_v1 import BaseModel, Field from pydantic.v1 import BaseModel, Field from src.llm import get_llm from langchain_core.prompts import ChatPromptTemplate diff --git a/data/Apple stock during pandemic.pdf b/data/Apple stock during pandemic.pdf new file mode 100644 index 000000000..32c29e6a5 Binary files /dev/null and b/data/Apple stock during pandemic.pdf differ diff --git a/docs/frontend/frontend_docs.adoc b/docs/frontend/frontend_docs.adoc index 34e71f254..fac052739 100644 --- a/docs/frontend/frontend_docs.adoc +++ b/docs/frontend/frontend_docs.adoc @@ -16,17 +16,20 @@ This document provides a comprehensive guide for developers on how we build a Re == Folders . 
+ ├── API + ├── Assets ├── Components | ├─ ChatBot + | | ├─ Chatbot | | ├─ ChatInfoModal - | | ├─ ChatModeToggle - | | ├─ ExpandedChatButtonContainer | | ├─ ChatModesSwitch + | | ├─ ChatModeToggle | | ├─ ChatOnlyComponent | | ├─ ChatInfo | | ├─ CommonChatActions | | ├─ CommunitiesInfo | | ├─ EntitiesInfo + | | ├─ ExpandedChatButtonContainer | | ├─ MetricsCheckbox | | ├─ MetricsTab | | ├─ MultiModeMetrics @@ -64,12 +67,15 @@ This document provides a comprehensive guide for developers on how we build a Re | ├─ UI | | ├─ Alert | | ├─ ButtonWithTooltip + | | |─ BreakDownPopOver | | ├─ CustomButton | | ├─ CustomCheckBox | | ├─ CustomMenu + | | ├─ CustomPopOver | | ├─ CustomProgressBar | | ├─ DatabaseIcon | | ├─ DatabaseStatusIcon + | | ├─ Dropdown | | ├─ ErrorBoundary | | ├─ FallBackDialog | | ├─ HoverableLink @@ -86,7 +92,6 @@ This document provides a comprehensive guide for developers on how we build a Re | | ├─ GenericSourceButton | | ├─ GenericSourceModal | ├─ Content - | ├─ Dropdown | ├─ FileTable | ├─ QuickStarter ├── HOC @@ -103,6 +108,9 @@ This document provides a comprehensive guide for developers on how we build a Re | ├─ UserCredentials | ├─ UserMessages | ├─ UserFiles + ├── HOC + | ├─ CustomModal + | ├─ WithVisibility ├── Hooks | ├─ useSourceInput | ├─ useSpeech @@ -136,72 +144,123 @@ Created a connection modal by adding details including protocol, URI, database n * If GDS Connection is there icon is scientific molecule > Graph enhancement model > Post processing jobs > gives user the leverage to check and uncheck the communities checkbox. * If AURA DB > icon is database icon > Graph enhancement model > Post processing jobs > communities checkbox is disabled. -* Before Connection : - image::images/ConnectionModal.jpg[NoConnection, 600] - * After connection: + * ** Aura DS Connection ** + +image::images/GraphDBConnection.jpg[Connection, 600] + + * ** Aura DB connection ** -image::images/NoFiles.jpg[Connection, 600] +image::images/AuraDBConnection.jpg[Connection, 600] + + * **ReadOnly User** + +image::images/ReadOnlyUser.jpg[ReadOnlyUser, 600] + + * **User not connected** + +image::images/NoConnection.jpg[User not Connection, 600] == 3. File Source integration: Implemented various file source integrations including drag-and-drop, web sources search that includes YouTube video, Wikipedia link, Amazon S3 file access, and Google Cloud Storage (GCS) file access. This allows users to upload PDF files from local storage or directly from the integrated sources. The Api’s are as follows: -* ***/source_list:*** - ** to fetch the list of files in the DB +* ***/source_list:*** to fetch the list of files in the DB -image::images/WithFiles.jpg[Connected, 600] +image::images/WithData.jpg[Connected, 600] -* ***/upload:*** - ** to upload files from Local +* ***/upload:*** to upload files from Local image::images/UploadLocalFile.jpg[Local File, 600] - - - ** status 'Uploading' while file is get uploaded. - -image::images/UploadingStatus.jpg[Upload Status, 600] - -* ***/url/scan:*** - ** to scan the link or sources of YouTube, Wikipedia, and Web Sources +* ***/url/scan:*** to scan the link or sources of YouTube, Wikipedia, and Web Sources image::images/WebSources.jpg[WebSources, 600] -* ***/url/scan:*** - ** to scan the files of S3 and GCS. - *** Add the respective Bucket URL, access key and secret key to access ***S3 files***. +* ***/url/scan:*** to scan the files of S3 and GCS. + +1) Add the respective Bucket URL, access key and secret key to access S3 files. 
image::images/S3BucketScan.jpg[S3 scan, 600] - - **** Add the respective Project ID, Bucket name, and folder to access ***GCS files***. User gets a redirect to the authentication page to authenticate their google account. + +2) Add the respective Project ID, Bucket name, and folder to access GCS files. image::images/GCSbucketFiles.jpg[GCS scan, 600] +3) User gets a redirect to the authentication page to authenticate their google account. + image::images/Gcloud_auth.jpg[auth login scan, 600] == 4. File Source Extraction: -* ***/extract*** - ** to fetch the number of nodes and relationships created. +* ***/extract*** to fetch the number of nodes and relationships created. *** During Extraction the selected files or all files in ‘New’ state go into ‘Processing’ state and then ‘Completed’ state if there are no failures. image::images/GenerateGraph.jpg[Generate Graph, 600] +1) A file with status Completed has an option to be Reprocess with the following options : + +image::images/CompletedReadyToReprocess.jpg[CompletedReadyToReprocess, 600] + +2) A file with status Failed/ Cancelled has an option to be Reprocess with the following options : + +image::images/FailedReadyToReprocess.jpg[FailedReadyToReprocess, 600] == 5. Graph Generation: -* Created a component for generating graphs based on the files in the table, to extract nodes and relationships. When the user clicks on the Preview Graph or on the Table View icon the user can see that the graph model holds three options for viewing: Lexical Graph, Entity Graph and Knowledge Graph. We utilized Neo4j's graph library to visualize the extracted nodes and relationships in the form of a graph query API: ***/graph_query***. There are options for customizing the graph visualization such as layout algorithms [zoom in, zoom out, fit, refresh], node styling, relationship types. -image::images/KnowledgeGraph.jpg[Knowledge Graph, 600] -image::images/EntityGraph.jpg[Entity Graph, 600] -image::images/EntityGraph.jpg[Entity Graph, 600] +* ***/graph_query:*** + *** Created a component for generating graphs based on the files in the table, to extract nodes and relationships. When the user clicks on the Preview Graph or on the Table View icon the user can see that the graph model holds three options for viewing: Lexical Graph, Entity Graph and Knowledge Graph. We utilized Neo4j's graph library to visualize the extracted nodes and relationships in the form of a graph query API: ***/graph_query***. There are options for customizing the graph visualization such as layout algorithms [zoom in, zoom out, fit, refresh], node styling, relationship types. + + * **Preview Graph** + +image::images/AllFilesGraph.jpg[AllFiles Graph, 600] + + * **File Graph** + +image::images/SingleFileQuery.jpg[Single File Graph, 600] + + * **Graph Types** + +1) Document & Chunk + +image::images/DocChunkGraph.jpg[Knowledge Graph, 600] + +2) Entities + +image::images/EntitiesGraph.jpg[Entity Graph, 600] + +3) Communities + +image::images/CommunitiesGraph.jpg[Community Graph, 600] + +* ***/get_neighbours:*** + ** This API is used to retrive the neighbor nodes of the given element id of the node. + +image::images/NeighbourNodeDisconnected.jpg[Neighbourhood Graph, 600] == 6. Chatbot: -* Created a Chatbot Component which has state variables to manage user input and chat messages. Once the user asks the question and clicks on the Ask button API: ***/chatbot*** is triggered to send user input to the backend and receive the response. 
The chat also has options for users to see more details about the chat, text to speech and copy the response. +Created a Chatbot Component which has state variables to manage user input and chat messages. Once the user asks the question and clicks on the Ask button API: ***/chatbot*** is triggered to send user input to the backend and receive the response. The chat also has options for users to see more details about the chat, text to speech and copy the response. + + * **Chat Drawer View** + +image::images/ChatBotSideView.jpg[ChatBotSideView, 600] + + * **Chat Modal View** + +image::images/ChatBotModalView.jpg[ChatBotModalView, 600] + + * **Chat Pop out View** + +image::images/ChatBotNewURL.jpg[ChatBotNewURL, 600] + + +* ***/clear_chat_bot:*** + ** to clear the chat history which is saved in Neo4j DB. + +image::images/ClearChatHistory.jpg[ClearChatHistory, 600] -image::images/ChatResponse.jpg[ChatResponse, 600] * ***/chunk_entities:*** @@ -209,7 +268,7 @@ image::images/ChatResponse.jpg[ChatResponse, 600] ***Sources*** -image::images/ChatInfoModal.jpg[ChatInfoModal, 600] +image::images/Sources.jpg[Sources, 600] ***Entities*** @@ -219,30 +278,43 @@ image::images/EntitiesInfo.jpg[EntitiesInfo, 600] image::images/ChunksInfo.jpg[ChunksInfo, 600] -* There are three modes ***Vector***, ***Graph***, ***Graph+Vector*** that can be provided to the chat to retrieve the answers. +* ***/metric:*** + ** The API responsible for a evaluating chatbot responses on the basis of different metrics such as faithfulness and answer relevancy. This utilises RAGAS library to calculate these metrics. + +image::images/MetricEval.jpg[MetricEval, 600] + +* ***/additional_metrics:*** + ** The API responsible for a evaluating chatbot responses on the basis of different metrics such as context entity recall, semantic score, rouge score. This reuqire additional ground truth to be supplied by user. This utilises RAGAS library to calculate these metrics. + +image::images/AdditionalMetricEval.jpg[AdditionalMetricEval, 600] -image::images/ChatModes.jpg[ChatModes, 600] - • In Vector mode, we only get the sources and chunks . +***Chat Modes*** -image::images/VectorMode.jpg[VectorMode, 600] +* There are five modes ***Vector***, ***Fulltext***, ***Graph+Vector+Fulltext***, ***Entity search+Vector***, ***Graph+Vector+Fulltext*** that can be provided to the chat to retrieve the answers in ***Production*** environment. +* There is one more mode ***Graph*** that can be provided to the chat to retrieve the answers in ***Development*** environment. +* There is one more mode ***Global search+Vector+Fulltext*** that can be provided to the chat to retrieve the answers if aura instance is ***GDS***. - • Graph Mode: Cypher query and Entities [DEV] +1) In Production Environment -image::images/GraphModeDetails.png[GraphMode, 600] -image::images/GraphModeQuery.png[GraphMode, 600] +image::images/ChatModesProd.jpg[ChatModesProd, 600] - • Graph+Vector Mode: Sources, Chunks and Entities +2) In Development Environment -image::images/GraphVectorMode.jpg[GraphVectorMode, 600] +image::images/ChatModesDev.jpg[ChatModesDev, 600] -== 6. Graph Enhancement Settings: + +== 7. Graph Enhancement Settings: Users can now set their own Schema for nodes and relations or can already be an existing schema. - + +* ***Entity Extraction Settings:*** + +image::images/GraphEnhancements.jpg[GraphEnhancements, 600] + * ***/schema:*** ** to fetch the existing schema that already exists in the db. 
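For illustration only (this sketch is not part of the diff above): a minimal example of how a client might fetch the existing database schema through the /schema endpoint described here, assuming the backend is reachable at http://localhost:8000 and accepts the form-encoded connection fields listed in the API reference later in this document; the response handling is likewise an assumption.

[source,python]
----
# Hedged sketch only: base URL, form encoding and response shape are assumptions.
import requests

BASE_URL = "http://localhost:8000"  # assumed local backend

connection = {
    "uri": "neo4j+s://<your-instance>.databases.neo4j.io",  # placeholder connection details
    "userName": "neo4j",
    "password": "<password>",
    "database": "neo4j",
}

# POST /schema is documented below as returning the node labels and relationship
# types already present in the database, which feed the Entity Extraction Settings.
response = requests.post(f"{BASE_URL}/schema", data=connection, timeout=60)
response.raise_for_status()
print(response.json())
----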
-image::images/PredefinedSchema.jpg[PredefinedSchema, 600] +image::images/Schema.jpg[PredefinedSchema, 600] * ***/populate_graph_schema:*** ** to fetch the schema from user entered document text @@ -254,7 +326,28 @@ image::images/UserDefinedSchema.jpg[UserDefinedSchema, 600] image::images/DeleteOrphanNodes.jpg[DeleteOrphanNodes, 600] -== 7. Settings: +* ***/merge_duplicate_nodes:*** + +1) to merge the duplicate entities. + +image::images/MergeDuplicateEntities.jpg[MergeDuplicateEntities, 600] + +2) to get duplicate entities + +image::images/GetDuplicateNodes.jpg[GetDuplicateNodes, 600] + +* ***/post_processing :*** + to fine-tune the knowledge graph for improved performance and deeper analysis + +1) When GDS instance + +image::images/PostProcessingDB.jpg[PostProcessingDB, 600] + +2) When Aura DB instance + +image::images/PostProcessingDB.jpg[PostProcessingDB, 600] + +== 8. Application Options: * ***LLM Model*** @@ -262,26 +355,59 @@ User can select desired LLM models image::images/Dropdown.jpg[Dropdown, 600] -* ***Dark/Light Mode*** +* ***Documentation***: User can navigate to the application overview : https://neo4j.com/labs/genai-ecosystem/llm-graph-builder/ + +image::images/LLMGraphBuilderDocumentation.jpg[LLMGraphBuilderDocumentation, 600] + +* ***GitHub Issues***: User can navigate to the gitHub issues which are in developers bucket list : https://github.com/neo4j-labs/llm-graph-builder/issues + +image::images/GitHubIssues.jpg[GitHubIssues, 600] + -User can choose the application view : both in dark and light mode +* ***Dark/Light Mode***: User can choose the application view : both in dark and light mode + +1) Dark image::images/DarkMode.jpg[DarkMode, 600] +2) Light image::images/LightMode.jpg[LightMode, 600] -* ***Delete Files*** +* ***Chat Only Mode*** -User can delete all number/selected files from the table. +User can also use the chat only feature by navigating to the url at: https://llm-graph-builder.neo4jlabs.com/chat-only to ask questions related to documents which have been completely processed. User is required to pass the login credentials to connect to the database. -image::images/DeleteFiles.jpg[DeleteFiles, 600] +== 9. File Table Options: +User can explore various features available for files in the table, including sorting, filtering, viewing as a graph, examining nodes and relationships, copying file details, and accessing chunks related to the file. -* ***Chat Only Mode*** +***File Status*** + +image::images/FileStatus.jpg[FileStatus, 600] + +***File Nodes*** + +image::images/FileNodes.jpg[FileNodes, 600] + +***File Relationships*** + +image::images/FileRelationships.jpg[FileRelationships, 600] + +***File Actions*** -User can also use the chat only feature by navigating to the url https://dev-frontend-dcavk67s4a-uc.a.run.app/chat-only to ask questions related to documents which have been completely processed. User is required to pass the login credentials to connect to the database. +** ***Graph View*** -== 8. Interface Design: +image::images/GraphActions.jpg[GraphActions, 600] + + ** ***Copy File Data*** + +image::images/CopyFileData.jpg[CopyFileData, 600] + + ** ***Text Chunks*** + +image::images/TextChunks.jpg[TextChunks, 600] + +== 10. Interface Design: Designed a user-friendly interface that guides users through the process of connecting to Neo4j Aura, accessing file sources, uploading PDF files, and generating graphs. 
* ***Components:*** @neo4j-ndl/react @@ -290,7 +416,7 @@ Designed a user-friendly interface that guides users through the process of conn * ***NVL:*** @neo4j-nvl/core * ***CSS:*** Inline styling, tailwind CSS -== 9. Deployment: +== 11. Deployment: Followed best practices for optimizing performance and security of the deployed application. * ***Local Deployment:*** @@ -303,21 +429,19 @@ Followed best practices for optimizing performance and security of the deployed [source,indent=0] ---- - * LLM_MODELS="diffbot,openai-gpt-3.5,openai-gpt-4o" - * REACT_APP_SOURCES="local,youtube,wiki,s3,gcs,web" - * GOOGLE_CLIENT_ID="xxxx" [For Google GCS integration] - * CHAT_MODES="vector,graph+vector" - * CHUNK_SIZE=5242880 - * TIME_PER_BYTE=2 - * TIME_PER_PAGE=50 - * TIME_PER_CHUNK=4 - * LARGE_FILE_SIZE=5242880 - * ENV="PROD"/ ‘DEV’ - * NEO4J_USER_AGENT="LLM-Graph-Builder/v0.2-dev" - * BACKEND_API_URL= - * BLOOM_URL= - * NPM_TOKEN= - * BACKEND_PROCESSING_URL= + * VITE_LLM_MODELS="" + * VITE_REACT_APP_SOURCES="" + * VITE_GOOGLE_CLIENT_ID="xxxx" [For Google GCS integration] + * VITE_CHAT_MODES="" + * VITE_CHUNK_SIZE=5242880 + * VITE_TIME_PER_PAGE=50 + * VITE_LARGE_FILE_SIZE=5242880 + * VITE_ENV="PROD"/ ‘DEV’ + * VITE_BACKEND_API_URL= + * VITE_BLOOM_URL= + * VITE_BACKEND_PROCESSING_URL= + * VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash" + * VITE_BATCH_SIZE=2 ---- * ***Cloud Deployment:*** ** To deploy the app install the gcloud cli , run the following command in the terminal specifically from frontend root folder. @@ -327,7 +451,9 @@ Followed best practices for optimizing performance and security of the deployed *** Allow unauthenticated request : Yes -== 10. API Reference +== 12. API Reference + +=== 1) Connection Modal ----- POST /connect ----- @@ -341,7 +467,15 @@ Neo4j database connection on frontend is done with this API. * `password`= Neo4j db password, * `database`= Neo4j database name -=== Upload Files from Local +=== 2) Backend Database connection +---- +POST /backend_connection_configuation +---- + +The API responsible for create the connection obj from Neo4j DB based on environment variable and return the status for show/hide login dialog on UI + + +=== 3) Upload Files from Local ---- POST /upload ---- @@ -361,7 +495,7 @@ The upload endpoint is designed to handle the uploading of large files by breaki * `database`= Neo4j database name -=== User Defined Schema +=== 4) User Defined Schema ---- POST /schema ---- @@ -375,7 +509,7 @@ User can set schema for graph generation (i.e. Nodes and relationship labels) in * `password`= Neo4j db password, * `database`= Neo4j database name -=== Graph schema from Input Text +=== 5) Graph schema from Input Text ---- POST /populate_graph_schema ---- @@ -388,7 +522,7 @@ The API is used to populate a graph schema based on the provided input text, mod * `model`=The model to be used for populating the graph schema. * `is_schema_description_checked`=A flag indicating whether the schema description should be considered. 
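As a companion to the parameter list above, the following hedged sketch (not part of the diff) shows how a client might ask the backend to suggest a schema from sample text. The base URL, the form encoding, and the `input_text` field name are assumptions; only `model` and `is_schema_description_checked` are spelled out in this reference.

[source,python]
----
# Hedged sketch only: base URL, form encoding and the `input_text` field name
# are assumptions, not confirmed by the API reference above.
import requests

BASE_URL = "http://localhost:8000"  # assumed local backend

payload = {
    "input_text": "Acme Corp acquired Widget Inc. in 2021.",  # assumed field name for the sample text
    "model": "openai_gpt_4o",
    "is_schema_description_checked": "true",
}

response = requests.post(f"{BASE_URL}/populate_graph_schema", data=payload, timeout=120)
response.raise_for_status()
# Expected (assumption) to return suggested node labels and relationship types
# that can be reviewed in the Graph Enhancement settings panel.
print(response.json())
----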
-=== Unstructured Sources +=== 6) Unstructured Sources ---- POST /url/scan ---- @@ -414,7 +548,7 @@ Create Document node for other sources - s3 bucket, gcs bucket, wikipedia, youtu * `access_token`=Form(None) -=== Extration of Nodes and Relations from Data +=== 7) Extration of Nodes and Relations from Data ---- POST /extract ---- @@ -456,7 +590,7 @@ allowedNodes=Node labels passed from settings panel, * `allowedRelationship`=Relationship labels passed from settings panel, * `language`=Language in which wikipedia content will be extracted -=== Get list of sources +=== 8) Get list of sources ---- GET /sources_list ---- @@ -471,7 +605,7 @@ List all sources (Document nodes) present in Neo4j graph database. * `database`= Neo4j database name -=== Post processing after graph generation +=== 9) Post processing after graph generation ---- POST /post_processing : ---- @@ -486,7 +620,7 @@ This API is called at the end of processing of whole document to get create k-ne * `database`= Neo4j database name * `tasks`= List of tasks to perform -=== Chat with Data +=== 10) Chat with Data ---- POST /chat_bot ---- @@ -509,7 +643,7 @@ The API responsible for a chatbot system designed to leverage multiple AI models * `question`= User query for the chatbot * `session_id`= Session ID used to maintain the history of chats during the user's connection -=== Get entities from chunks +=== 11) Get entities from chunks ---- POST/chunk_entities ---- @@ -525,7 +659,7 @@ This API is used to get the entities and relations associated with a particular * `chunk_ids` = Chunk ids of document -=== Clear chat history +=== 12) Clear chat history ---- POST /clear_chat_bot ---- @@ -540,7 +674,7 @@ This API is used to clear the chat history which is saved in Neo4j DB. * `database`= Neo4j database name, * `session_id` = User session id for QA chat -=== View graph for a file +=== 13) View graph for a file ---- POST /graph_query ---- @@ -555,7 +689,22 @@ This API is used to view graph for a particular file. * `query_type`= Neo4j database name * `document_names` = File name for which user wants to view graph -=== SSE event to update processing status +=== 14) Get neighbour nodes +---- +POST /get_neighbours +---- + +This API is used to retrive the neighbor nodes of the given element id of the node. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name, +* `elementId` = Element id of the node to retrive its neighbours + +=== 15) SSE event to update processing status ---- GET /update_extract_status ---- @@ -584,7 +733,7 @@ The API gives the extraction status of a specified file. It uses Server-Sent Eve * `password`= Neo4j db password, * `database`= Neo4j database name -=== Delete selected documents +=== 16) Delete selected documents ---- POST /delete_document_and_entities ---- @@ -601,7 +750,7 @@ Deleteion of nodes and relations for multiple files is done through this API. Us * `source_types`= Document sources(Wikipedia, youtube, etc.), * `deleteEntities`= Boolean value to check entities deletion is requested or not -=== Cancel processing job +=== 17) Cancel processing job ---- POST/cancelled_job ---- @@ -617,7 +766,7 @@ This API is responsible for cancelling an in process job. 
* `filenames`= Name of the file whose processing need to be stopped, * `source_types`= Source of the file -=== Deletion of orpahn nodes +=== 18) Deletion of orpahn nodes ---- POST /delete_unconnected_nodes ---- @@ -632,12 +781,94 @@ The API is used to delete unconnected entities from database. * `database`= Neo4j database name, * `unconnected_entities_list`=selected entities list to delete of unconnected entities. +=== 19) Get the list of orphan nodes +---- +POST /get_unconnected_nodes_list +---- + +The API retrieves a list of nodes in the graph database that are not connected to any other nodes. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name + +=== 20) Get duplicate nodes +---- +POST /get_duplicate_nodes +---- + +The API is used to fetch duplicate entities from database. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name, + + +=== 21) Merge duplicate nodes +---- +POST /merge_duplicate_nodes +---- + +The API is used to merge duplicate entities from database selected by user. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name, +* `duplicate_nodes_list`= selected entities list to merge of with similar entities. + +=== 22) Drop and create vector index +---- +POST /drop_create_vector_index +---- + +The API is used to drop and create the vector index when vector index dimesion are different. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name, +* `isVectorIndexExist`= True or False based on whether vector index exist in database, + +=== 23) Reprocessing of sources +---- +POST /retry_processing +---- + +This API is used to Ready to Reprocess cancelled, completed or failed file sources. +Users have 3 options to Ready to Reprocess files: + +* Start from begnning - In this condition file will be processed from the begnning i.e. 1st chunk again. +* Delete entities and start from begnning - If the file source is already processed and have any existing nodes and relations then those will be deleted and file will be reprocessed from the 1st chunk. +* Start from last processed postion - Cancelled or failed files will be processed from the last successfully processed chunk position. This option is not available for completed files. + +Ones the status is set to 'Ready to Reprocess', user can again click on Generate graph to process the file for knowledge graph creation. + +**API Parameters :** + +* `uri`=Neo4j uri, +* `userName`= Neo4j db username, +* `password`= Neo4j db password, +* `database`= Neo4j database name, +* `file_name`= Name of the file which user want to Ready to Reprocess. +* `retry_condition` = One of the above 3 conditions which is selected for reprocessing. + -== 11. Conclusion: +== 13. Conclusion: In conclusion, this technical document outlines the process of building a React application with Neo4j Aura integration for graph database functionalities. -== 12. Referral Links: +== 14. 
Referral Links: * Dev env : https://dev-frontend-dcavk67s4a-uc.a.run.app/ * Staging env: https://staging-frontend-dcavk67s4a-uc.a.run.app/ * Prod env: https://prod-frontend-dcavk67s4a-uc.a.run.app/ diff --git a/docs/frontend/images/AdditionalMetricEval.jpg b/docs/frontend/images/AdditionalMetricEval.jpg new file mode 100644 index 000000000..2cdae6bec Binary files /dev/null and b/docs/frontend/images/AdditionalMetricEval.jpg differ diff --git a/docs/frontend/images/AllFilesGraph.jpg b/docs/frontend/images/AllFilesGraph.jpg new file mode 100644 index 000000000..21b0f9429 Binary files /dev/null and b/docs/frontend/images/AllFilesGraph.jpg differ diff --git a/docs/frontend/images/AuraDBConnection.jpg b/docs/frontend/images/AuraDBConnection.jpg new file mode 100644 index 000000000..786725a6d Binary files /dev/null and b/docs/frontend/images/AuraDBConnection.jpg differ diff --git a/docs/frontend/images/ChatBotModalView.jpg b/docs/frontend/images/ChatBotModalView.jpg new file mode 100644 index 000000000..9a7a44779 Binary files /dev/null and b/docs/frontend/images/ChatBotModalView.jpg differ diff --git a/docs/frontend/images/ChatBotNewURL.jpg b/docs/frontend/images/ChatBotNewURL.jpg new file mode 100644 index 000000000..63e4cb226 Binary files /dev/null and b/docs/frontend/images/ChatBotNewURL.jpg differ diff --git a/docs/frontend/images/ChatBotSideView.jpg b/docs/frontend/images/ChatBotSideView.jpg new file mode 100644 index 000000000..76c534883 Binary files /dev/null and b/docs/frontend/images/ChatBotSideView.jpg differ diff --git a/docs/frontend/images/ChatInfoModal.jpg b/docs/frontend/images/ChatInfoModal.jpg deleted file mode 100644 index 72c119800..000000000 Binary files a/docs/frontend/images/ChatInfoModal.jpg and /dev/null differ diff --git a/docs/frontend/images/ChatModes.jpg b/docs/frontend/images/ChatModes.jpg deleted file mode 100644 index 1dd835e24..000000000 Binary files a/docs/frontend/images/ChatModes.jpg and /dev/null differ diff --git a/docs/frontend/images/ChatModesDev.jpg b/docs/frontend/images/ChatModesDev.jpg new file mode 100644 index 000000000..204903695 Binary files /dev/null and b/docs/frontend/images/ChatModesDev.jpg differ diff --git a/docs/frontend/images/ChatModesProd.jpg b/docs/frontend/images/ChatModesProd.jpg new file mode 100644 index 000000000..90b5c4215 Binary files /dev/null and b/docs/frontend/images/ChatModesProd.jpg differ diff --git a/docs/frontend/images/ChatResponse.jpg b/docs/frontend/images/ChatResponse.jpg deleted file mode 100644 index 72c119800..000000000 Binary files a/docs/frontend/images/ChatResponse.jpg and /dev/null differ diff --git a/docs/frontend/images/ClearChatHistory.jpg b/docs/frontend/images/ClearChatHistory.jpg new file mode 100644 index 000000000..7db7b7e6a Binary files /dev/null and b/docs/frontend/images/ClearChatHistory.jpg differ diff --git a/docs/frontend/images/CommunitiesGraph.jpg b/docs/frontend/images/CommunitiesGraph.jpg new file mode 100644 index 000000000..ab939a611 Binary files /dev/null and b/docs/frontend/images/CommunitiesGraph.jpg differ diff --git a/docs/frontend/images/CompletedReadyToReprocess.jpg b/docs/frontend/images/CompletedReadyToReprocess.jpg new file mode 100644 index 000000000..3541df6ec Binary files /dev/null and b/docs/frontend/images/CompletedReadyToReprocess.jpg differ diff --git a/docs/frontend/images/CopyFileData.jpg b/docs/frontend/images/CopyFileData.jpg new file mode 100644 index 000000000..6356e633f Binary files /dev/null and b/docs/frontend/images/CopyFileData.jpg differ diff --git 
a/docs/frontend/images/DeleteOrphanNodes.jpg b/docs/frontend/images/DeleteOrphanNodes.jpg index e397cb4a7..5cf0e4cf3 100644 Binary files a/docs/frontend/images/DeleteOrphanNodes.jpg and b/docs/frontend/images/DeleteOrphanNodes.jpg differ diff --git a/docs/frontend/images/DocChunkGraph.jpg b/docs/frontend/images/DocChunkGraph.jpg new file mode 100644 index 000000000..fe74f5d93 Binary files /dev/null and b/docs/frontend/images/DocChunkGraph.jpg differ diff --git a/docs/frontend/images/DownLoadConversation.jpg b/docs/frontend/images/DownLoadConversation.jpg new file mode 100644 index 000000000..856ab85b9 Binary files /dev/null and b/docs/frontend/images/DownLoadConversation.jpg differ diff --git a/docs/frontend/images/EntitiesGraph.jpg b/docs/frontend/images/EntitiesGraph.jpg new file mode 100644 index 000000000..8fff408c0 Binary files /dev/null and b/docs/frontend/images/EntitiesGraph.jpg differ diff --git a/docs/frontend/images/EntityExtraction.jpg b/docs/frontend/images/EntityExtraction.jpg new file mode 100644 index 000000000..94ab74ff1 Binary files /dev/null and b/docs/frontend/images/EntityExtraction.jpg differ diff --git a/docs/frontend/images/EntityGraph.jpg b/docs/frontend/images/EntityGraph.jpg deleted file mode 100644 index 9e25473a9..000000000 Binary files a/docs/frontend/images/EntityGraph.jpg and /dev/null differ diff --git a/docs/frontend/images/FailedReadyToReprocess.jpg b/docs/frontend/images/FailedReadyToReprocess.jpg new file mode 100644 index 000000000..6a1940d8e Binary files /dev/null and b/docs/frontend/images/FailedReadyToReprocess.jpg differ diff --git a/docs/frontend/images/FileNodes.jpg b/docs/frontend/images/FileNodes.jpg new file mode 100644 index 000000000..aaae8f70b Binary files /dev/null and b/docs/frontend/images/FileNodes.jpg differ diff --git a/docs/frontend/images/FileRelationships.jpg b/docs/frontend/images/FileRelationships.jpg new file mode 100644 index 000000000..70de2462d Binary files /dev/null and b/docs/frontend/images/FileRelationships.jpg differ diff --git a/docs/frontend/images/FileStatus.jpg b/docs/frontend/images/FileStatus.jpg new file mode 100644 index 000000000..84b8bc185 Binary files /dev/null and b/docs/frontend/images/FileStatus.jpg differ diff --git a/docs/frontend/images/GEDeleteOrphanNodes.jpg b/docs/frontend/images/GEDeleteOrphanNodes.jpg deleted file mode 100644 index 203aacc57..000000000 Binary files a/docs/frontend/images/GEDeleteOrphanNodes.jpg and /dev/null differ diff --git a/docs/frontend/images/GenerateGraph.jpg b/docs/frontend/images/GenerateGraph.jpg index cc006d969..87dcc7959 100644 Binary files a/docs/frontend/images/GenerateGraph.jpg and b/docs/frontend/images/GenerateGraph.jpg differ diff --git a/docs/frontend/images/GetDuplicateNodes.jpg b/docs/frontend/images/GetDuplicateNodes.jpg new file mode 100644 index 000000000..3f26d8096 Binary files /dev/null and b/docs/frontend/images/GetDuplicateNodes.jpg differ diff --git a/docs/frontend/images/GitHubIssues.jpg b/docs/frontend/images/GitHubIssues.jpg new file mode 100644 index 000000000..18af83465 Binary files /dev/null and b/docs/frontend/images/GitHubIssues.jpg differ diff --git a/docs/frontend/images/GraphActions.jpg b/docs/frontend/images/GraphActions.jpg new file mode 100644 index 000000000..b221aa2b6 Binary files /dev/null and b/docs/frontend/images/GraphActions.jpg differ diff --git a/docs/frontend/images/GraphDBConnection.jpg b/docs/frontend/images/GraphDBConnection.jpg new file mode 100644 index 000000000..cc7684950 Binary files /dev/null and 
b/docs/frontend/images/GraphDBConnection.jpg differ diff --git a/docs/frontend/images/GraphEnhacements.jpg b/docs/frontend/images/GraphEnhacements.jpg deleted file mode 100644 index 8fb3d4fe2..000000000 Binary files a/docs/frontend/images/GraphEnhacements.jpg and /dev/null differ diff --git a/docs/frontend/images/GraphEnhancements.jpg b/docs/frontend/images/GraphEnhancements.jpg new file mode 100644 index 000000000..d5208ea0b Binary files /dev/null and b/docs/frontend/images/GraphEnhancements.jpg differ diff --git a/docs/frontend/images/GraphModeDetails.png b/docs/frontend/images/GraphModeDetails.png deleted file mode 100644 index d11e7dcd1..000000000 Binary files a/docs/frontend/images/GraphModeDetails.png and /dev/null differ diff --git a/docs/frontend/images/GraphModeQuery.png b/docs/frontend/images/GraphModeQuery.png deleted file mode 100644 index cfd7fbaf8..000000000 Binary files a/docs/frontend/images/GraphModeQuery.png and /dev/null differ diff --git a/docs/frontend/images/GraphVectorMode.jpg b/docs/frontend/images/GraphVectorMode.jpg deleted file mode 100644 index d378b860f..000000000 Binary files a/docs/frontend/images/GraphVectorMode.jpg and /dev/null differ diff --git a/docs/frontend/images/KnowledgeGraph.jpg b/docs/frontend/images/KnowledgeGraph.jpg deleted file mode 100644 index eeb20a627..000000000 Binary files a/docs/frontend/images/KnowledgeGraph.jpg and /dev/null differ diff --git a/docs/frontend/images/LLMGraphBuilderDocumentation.jpg b/docs/frontend/images/LLMGraphBuilderDocumentation.jpg new file mode 100644 index 000000000..5df7aa739 Binary files /dev/null and b/docs/frontend/images/LLMGraphBuilderDocumentation.jpg differ diff --git a/docs/frontend/images/LexicalGraph.jpg b/docs/frontend/images/LexicalGraph.jpg deleted file mode 100644 index 7de1543ac..000000000 Binary files a/docs/frontend/images/LexicalGraph.jpg and /dev/null differ diff --git a/docs/frontend/images/MergeDuplicateEntities.jpg b/docs/frontend/images/MergeDuplicateEntities.jpg new file mode 100644 index 000000000..2c22f07ac Binary files /dev/null and b/docs/frontend/images/MergeDuplicateEntities.jpg differ diff --git a/docs/frontend/images/MetricEval.jpg b/docs/frontend/images/MetricEval.jpg new file mode 100644 index 000000000..c1d33e83e Binary files /dev/null and b/docs/frontend/images/MetricEval.jpg differ diff --git a/docs/frontend/images/NeighbourNodeDisconnected.jpg b/docs/frontend/images/NeighbourNodeDisconnected.jpg new file mode 100644 index 000000000..829ac0a34 Binary files /dev/null and b/docs/frontend/images/NeighbourNodeDisconnected.jpg differ diff --git a/docs/frontend/images/NoConnection.jpg b/docs/frontend/images/NoConnection.jpg new file mode 100644 index 000000000..769ef542b Binary files /dev/null and b/docs/frontend/images/NoConnection.jpg differ diff --git a/docs/frontend/images/NoFiles.jpg b/docs/frontend/images/NoFiles.jpg deleted file mode 100644 index 7494026a4..000000000 Binary files a/docs/frontend/images/NoFiles.jpg and /dev/null differ diff --git a/docs/frontend/images/PostProcessingDB.jpg b/docs/frontend/images/PostProcessingDB.jpg new file mode 100644 index 000000000..63ebf0f5a Binary files /dev/null and b/docs/frontend/images/PostProcessingDB.jpg differ diff --git a/docs/frontend/images/PostProcessingGDS.jpg b/docs/frontend/images/PostProcessingGDS.jpg new file mode 100644 index 000000000..4498055d7 Binary files /dev/null and b/docs/frontend/images/PostProcessingGDS.jpg differ diff --git a/docs/frontend/images/PredefinedSchema.jpg 
b/docs/frontend/images/PredefinedSchema.jpg index 6b89ab137..4706af5d3 100644 Binary files a/docs/frontend/images/PredefinedSchema.jpg and b/docs/frontend/images/PredefinedSchema.jpg differ diff --git a/docs/frontend/images/ReadOnlyUser.jpg b/docs/frontend/images/ReadOnlyUser.jpg new file mode 100644 index 000000000..64e7e424f Binary files /dev/null and b/docs/frontend/images/ReadOnlyUser.jpg differ diff --git a/docs/frontend/images/Schema.jpg b/docs/frontend/images/Schema.jpg new file mode 100644 index 000000000..a1ce36d32 Binary files /dev/null and b/docs/frontend/images/Schema.jpg differ diff --git a/docs/frontend/images/SingleFileQuery.jpg b/docs/frontend/images/SingleFileQuery.jpg new file mode 100644 index 000000000..b817767c1 Binary files /dev/null and b/docs/frontend/images/SingleFileQuery.jpg differ diff --git a/docs/frontend/images/Sources.jpg b/docs/frontend/images/Sources.jpg new file mode 100644 index 000000000..d721b7481 Binary files /dev/null and b/docs/frontend/images/Sources.jpg differ diff --git a/docs/frontend/images/SourcesInfo.jpg b/docs/frontend/images/SourcesInfo.jpg deleted file mode 100644 index 80c8cfded..000000000 Binary files a/docs/frontend/images/SourcesInfo.jpg and /dev/null differ diff --git a/docs/frontend/images/TextChunks.jpg b/docs/frontend/images/TextChunks.jpg new file mode 100644 index 000000000..96e5f485c Binary files /dev/null and b/docs/frontend/images/TextChunks.jpg differ diff --git a/docs/frontend/images/UploadingStatus.jpg b/docs/frontend/images/UploadingStatus.jpg deleted file mode 100644 index 779daf239..000000000 Binary files a/docs/frontend/images/UploadingStatus.jpg and /dev/null differ diff --git a/docs/frontend/images/VectorMode.jpg b/docs/frontend/images/VectorMode.jpg deleted file mode 100644 index f0ebf4e37..000000000 Binary files a/docs/frontend/images/VectorMode.jpg and /dev/null differ diff --git a/docs/frontend/images/WithData.jpg b/docs/frontend/images/WithData.jpg new file mode 100644 index 000000000..caf994530 Binary files /dev/null and b/docs/frontend/images/WithData.jpg differ diff --git a/docs/frontend/images/WithFiles.jpg b/docs/frontend/images/WithFiles.jpg deleted file mode 100644 index a789c03bb..000000000 Binary files a/docs/frontend/images/WithFiles.jpg and /dev/null differ diff --git a/experiments/Cleanup_of_graph_model.ipynb b/experiments/Cleanup_of_graph_model.ipynb new file mode 100644 index 000000000..eb720c580 --- /dev/null +++ b/experiments/Cleanup_of_graph_model.ipynb @@ -0,0 +1,456 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain_community.graphs import Neo4jGraph\n", + "import os\n", + "from dotenv import load_dotenv\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_community.document_loaders import PyMuPDFLoader\n", + "from langchain_experimental.graph_transformers import LLMGraphTransformer\n", + "from langchain_text_splitters import TokenTextSplitter\n", + "from langchain_core.output_parsers import JsonOutputParser\n", + "\n", + "load_dotenv('../backend/.env')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read a document data and create chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "loader = 
PyMuPDFLoader('../data/Apple stock during pandemic.pdf')\n", + "pages = loader.load()\n", + "\n", + "texts = \"\"\n", + "for page in pages:\n", + " texts = texts+\" \"+page.page_content\n", + "\n", + "text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)\n", + "chunks = text_splitter.split_documents(pages) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize OpenAI LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "model_name, api_key = os.environ.get('LLM_MODEL_CONFIG_openai_gpt_4o').split(\",\")\n", + "llm = ChatOpenAI(\n", + " api_key=api_key,\n", + " model=model_name,\n", + " temperature=0,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate graph documents via LLMGraphTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "graph_documents = LLMGraphTransformer(llm).convert_to_graph_documents(chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Pandemic', type='Event', properties={}), Node(id='Wendy Sun', type='Person', properties={}), Node(id='Kinglee High School', type='Organization', properties={}), Node(id='Zheng Zhou', type='Location', properties={}), Node(id='He Nan', type='Location', properties={}), Node(id='China', type='Location', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Pandemic', type='Event', properties={}), type='EXPERIENCED_FLUCTUATION', properties={}), Relationship(source=Node(id='Wendy Sun', type='Person', properties={}), target=Node(id='Kinglee High School', type='Organization', properties={}), type='AFFILIATION', properties={}), Relationship(source=Node(id='Kinglee High School', type='Organization', properties={}), target=Node(id='Zheng Zhou', type='Location', properties={}), type='LOCATED_IN', properties={}), Relationship(source=Node(id='Zheng Zhou', type='Location', properties={}), target=Node(id='He Nan', type='Location', properties={}), type='LOCATED_IN', properties={}), Relationship(source=Node(id='He Nan', type='Location', properties={}), target=Node(id='China', type='Location', properties={}), type='LOCATED_IN', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"The Changes of Apple’s Stock Price During the \\nPandemic \\nWanting Sun1,* \\n1WendySun, Kinglee High School, Zheng Zhou, He Nan, China, 450000 \\n*Corresponding author. Email: wendysun040718@163.com \\nABSTRACT \\nDuring the epidemic, the economies of various countries, including many large companies, were subject to very large \\nfluctuations and impacts. Therefore, this article will explore the ups and downs of Apple's stock price during the \\nepidemic. At different points in time and severity, Apple's stock price has risen and fallen. 
By observing the data on \\nYahoo, we can summarize Apple's stock price changes during this period. At the beginning of the epidemic, the global \\neconomy has been damaged, so Apple is no exception. So at the beginning of the epidemic, Apple's stock price fell\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Stock Price', type='Concept', properties={}), Node(id='Covid-19', type='Event', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Stock Price', type='Concept', properties={}), type='INFLUENCES', properties={}), Relationship(source=Node(id='Covid-19', type='Event', properties={}), target=Node(id='Stock Price', type='Concept', properties={}), type='IMPACTS', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\", so Apple is no exception. So at the beginning of the epidemic, Apple's stock price fell \\nsharply. But later, because Apple had a strong overall strength and a good foundation, it only took three months to \\nsolve the impact of the epidemic. Later, when the epidemic returned to a serious situation, Apple's stock price was \\nimplicated and fell slightly, but within a few weeks the stock price decline was resolved and returned to normal levels, \\nand even steadily increased. Because Apple has been constantly launching new products, the epidemic will not have a \\nvery large impact on Apple's stock price. \\nKeywords: Apple, stock price, COVID-19 \\n1. INTRODUCTION \\nAs we know, the outbreak of the epidemic in 2020 \\nhas dealt a lot of blows to many countries and \\ncompanies. Population decline, massive economic \\nlosses, and national unrest. The U\")),\n", + " GraphDocument(nodes=[Node(id='U.S. Economic Recession', type='Event', properties={}), Node(id='Euro Zone Economy', type='Economy', properties={}), Node(id='British Economy', type='Economy', properties={}), Node(id='Epidemic', type='Event', properties={}), Node(id=\"China'S Transportation Industry\", type='Industry', properties={}), Node(id=\"China'S Tourism Industry\", type='Industry', properties={}), Node(id='Chinese Economy', type='Economy', properties={}), Node(id='World Economy', type='Economy', properties={}), Node(id='Apple', type='Company', properties={})], relationships=[Relationship(source=Node(id='U.S. 
Economic Recession', type='Event', properties={}), target=Node(id='Serious', type='Attribute', properties={}), type='SEVERITY', properties={}), Relationship(source=Node(id='Euro Zone Economy', type='Economy', properties={}), target=Node(id='Largest Decline In Record', type='Attribute', properties={}), type='DECLINE', properties={}), Relationship(source=Node(id='British Economy', type='Economy', properties={}), target=Node(id='Largest Decline Since World War Ii', type='Attribute', properties={}), type='DECLINE', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id=\"China'S Transportation Industry\", type='Industry', properties={}), type='IMPACT', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id=\"China'S Tourism Industry\", type='Industry', properties={}), type='IMPACT', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id='Chinese Economy', type='Economy', properties={}), type='IMPACT', properties={}), Relationship(source=Node(id='Chinese Economy', type='Economy', properties={}), target=Node(id='World Economy', type='Economy', properties={}), type='IMPACT', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id='Apple', type='Company', properties={}), type='IMPACT', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"\\ncompanies. Population decline, massive economic \\nlosses, and national unrest. The U.S. economic \\nrecession seems to be more serious than expected. The \\neuro zone economy has experienced the largest decline \\nin record, and the British economy has also suffered the \\nlargest decline since World War II. As an exogenous \\nshock, the epidemic will inevitably have a direct impact \\non China's transportation, tourism and other industries. \\nSimilarly, as China is the main source of power to \\npromote world growth, once the Chinese economy is \\naffected, the world economy will inevitably be affected. \\nThus, in such an environment where the whole world is \\naffected, to what extent have the industry giants that \\ndominate the world been hit? 
Let's take the most \\nfamiliar Apple company as an example, and analyze \\nhow much impact this devastating epidemic has brought \\nto Apple�\")),\n", + " GraphDocument(nodes=[Node(id='Apple_Inc', type='Organization', properties={}), Node(id='Cupertino_California', type='Location', properties={}), Node(id='December_12_1980', type='Date', properties={}), Node(id='$623.5_Billion', type='Value', properties={}), Node(id='2012', type='Date', properties={}), Node(id='June_2014', type='Date', properties={}), Node(id='September_30_2013', type='Date', properties={}), Node(id='Hongmeng_Group', type='Organization', properties={}), Node(id='Coca-Cola', type='Organization', properties={}), Node(id='2014', type='Date', properties={}), Node(id='Google', type='Organization', properties={})], relationships=[Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='Cupertino_California', type='Location', properties={}), type='HEADQUARTERED_IN', properties={}), Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='December_12_1980', type='Date', properties={}), type='WENT_PUBLIC_ON', properties={}), Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='$623.5_Billion', type='Value', properties={}), type='MARKET_VALUE', properties={}), Relationship(source=Node(id='$623.5_Billion', type='Value', properties={}), target=Node(id='2012', type='Date', properties={}), type='RECORDED_IN', properties={}), Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='June_2014', type='Date', properties={}), type='LARGEST_COMPANY_BY_MARKET_CAPITALIZATION', properties={}), Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='September_30_2013', type='Date', properties={}), type='MOST_VALUABLE_BRAND', properties={}), Relationship(source=Node(id='Hongmeng_Group', type='Organization', properties={}), target=Node(id='Coca-Cola', type='Organization', properties={}), type='SURPASSED_BY', properties={}), Relationship(source=Node(id='Apple_Inc', type='Organization', properties={}), target=Node(id='2014', type='Date', properties={}), type='SURPASSED_GOOGLE', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=' an example, and analyze \\nhow much impact this devastating epidemic has brought \\nto Apple’s stocks in several aspects. \\n \\n \\n2. BACKGROUND INFORMATION OF \\nAPPLE COMPANY \\nApple Inc. is an American multinational technology \\ncompany that specializes in consumer electronics, \\ncomputer software, and online services. In it’s \\nheadquartered in Cupertino, California. Apple went \\npublic on December 12, 1980, and set a record of \\n$623.5 billion in market value in 2012. As of June \\n2014, Apple has become the world\\'s largest company by \\nmarket capitalization for three consecutive years. On \\nSeptember 30, 2013, in the Hongmeng Group\\'s \"Best \\nGlobal Brand\" report, Apple surpassed Coca-Cola to \\nbecome the world\\'s most valuable brand. 
In 2014, the \\nApple brand surpassed Google to become the world\\'s')),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Google', type='Organization', properties={}), Node(id='Fortune Global 500', type='List', properties={}), Node(id='Technology Company', type='Industry', properties={}), Node(id='Pc Vendor', type='Industry', properties={}), Node(id='Smartphone Manufacturer', type='Industry', properties={}), Node(id='Us$2 Trillion', type='Value', properties={}), Node(id='2021 3Rd International Conference On Economic Management And Cultural Industry', type='Event', properties={}), Node(id='Atlantis Press International B.V.', type='Organization', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Google', type='Organization', properties={}), type='SURPASSED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Us$2 Trillion', type='Value', properties={}), type='MARKET_VALUE_EXCEEDED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Fortune Global 500', type='List', properties={}), type='RANKED_6TH_ON', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Technology Company', type='Industry', properties={}), type='LARGEST_BY_REVENUE', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Pc Vendor', type='Industry', properties={}), type='FOURTH-LARGEST_BY_UNIT_SALES', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Smartphone Manufacturer', type='Industry', properties={}), type='FOURTH-LARGEST', properties={}), Relationship(source=Node(id='2021 3Rd International Conference On Economic Management And Cultural Industry', type='Event', properties={}), target=Node(id='Atlantis Press International B.V.', type='Organization', properties={}), type='PUBLISHED_BY', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"'s most valuable brand. In 2014, the \\nApple brand surpassed Google to become the world's \\nmost valuable brand. On August 19, 2020 local time, \\nApple's market value exceeded US$2 trillion for the \\nfirst time. Ranked 6th on the Fortune Global 500 list in \\n2021. Apple is the world's largest technology company \\nby revenue and, since January 2021, the world's most \\nvaluable company. As of 2021, Apple is the world's \\nfourth-largest PC vendor by unit sales, and fourth-\\nlargest smartphone manufacturer. \\n \\nAdvances in Economics, Business and Management Research, volume 203\\nProceedings of the 2021 3rd International Conference on Economic Management and\\nCultural Industry (ICEMCI 2021)\\nCopyright © 2021 The Authors. 
Published by Atlantis Press International B.V.\\nThis is an open access article distributed under the CC BY-NC 4.0 license -http\")),\n", + " GraphDocument(nodes=[Node(id='Cc By-Nc 4.0 License', type='License', properties={}), Node(id='Http://Creativecommons.Org/Licenses/By-Nc/4.0/', type='Url', properties={})], relationships=[Relationship(source=Node(id='Cc By-Nc 4.0 License', type='License', properties={}), target=Node(id='Http://Creativecommons.Org/Licenses/By-Nc/4.0/', type='Url', properties={}), type='DISTRIBUTED_UNDER', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 0, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content='\\nThis is an open access article distributed under the CC BY-NC 4.0 license -http://creativecommons.org/licenses/by-nc/4.0/.\\n373\\n')),\n", + " GraphDocument(nodes=[Node(id='Population', type='Concept', properties={}), Node(id='Spending', type='Concept', properties={}), Node(id='Inventories', type='Concept', properties={}), Node(id='Industries', type='Concept', properties={}), Node(id='Global Economy', type='Concept', properties={}), Node(id='International Monetary Fund', type='Organization', properties={}), Node(id='April 2020 Global Economic Survey', type='Publication', properties={}), Node(id='New Crown Epidemic', type='Event', properties={}), Node(id='2008-2009 Financial Crisis', type='Event', properties={}), Node(id='Experts', type='Person', properties={}), Node(id='Global Economic Recession', type='Event', properties={}), Node(id='End Of 2007', type='Date', properties={}), Node(id='June 2009', type='Date', properties={}), Node(id='Apple', type='Organization', properties={}), Node(id='China', type='Location', properties={})], relationships=[Relationship(source=Node(id='Population', type='Concept', properties={}), target=Node(id='Industries', type='Concept', properties={}), type='DECLINE_CAUSES_STAGNATION', properties={}), Relationship(source=Node(id='Spending', type='Concept', properties={}), target=Node(id='Industries', type='Concept', properties={}), type='DECREASE_CAUSES_STAGNATION', properties={}), Relationship(source=Node(id='Inventories', type='Concept', properties={}), target=Node(id='Industries', type='Concept', properties={}), type='INCREASE_CAUSES_STAGNATION', properties={}), Relationship(source=Node(id='Industries', type='Concept', properties={}), target=Node(id='Global Economy', type='Concept', properties={}), type='STAGNATION_IMPACTS', properties={}), Relationship(source=Node(id='International Monetary Fund', type='Organization', properties={}), target=Node(id='April 2020 Global Economic Survey', type='Publication', properties={}), type='STATED_IN', properties={}), Relationship(source=Node(id='New Crown Epidemic', type='Event', properties={}), target=Node(id='Global Economy', type='Concept', properties={}), type='CAUSES_SHRINKAGE', properties={}), Relationship(source=Node(id='Global Economy', type='Concept', properties={}), target=Node(id='2008-2009 Financial Crisis', type='Event', properties={}), type='MORE_SEVERE_THAN', properties={}), Relationship(source=Node(id='Experts', type='Person', properties={}), target=Node(id='Economic Contraction', type='Concept', 
properties={}), type='PREDICTED_IMPACT', properties={}), Relationship(source=Node(id='Global Economic Recession', type='Event', properties={}), target=Node(id='End Of 2007', type='Date', properties={}), type='BEGAN', properties={}), Relationship(source=Node(id='Global Economic Recession', type='Event', properties={}), target=Node(id='June 2009', type='Date', properties={}), type='RECOVERED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='China', type='Location', properties={}), type='COMPONENTS_PRODUCED_IN', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 1, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content='3. MACRO PERSPECTIVE \\nFrom a macro perspective, the decline in population, \\nthe decrease in spending, and the increase in inventories \\nhave caused many industries to stagnate. This is bound \\nto bring a huge blow on the global economy to a serious \\nextent. The International Monetary Fund stated in the \\n\"April 2020 Global Economic Survey\" that due to the \\nnew crown epidemic, the global economy is expected to \\nshrink by 3% in 2020, and the impact will be much \\nmore severe than the 2008-2009 financial crisis. Some \\nexperts have predicted that the impact of the epidemic \\ncrisis on economic contraction will be three times that \\nof the financial crisis. The last global economic \\nrecession began at the end of 2007 and did not fully \\nrecover until June 2009. 
Most of Apple\\'s components \\nare produced in China, so when China is')),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='China', type='Location', properties={}), Node(id='Epidemic', type='Event', properties={}), Node(id='Wall Street Journal', type='Organization', properties={}), Node(id='March 24, 2020', type='Date', properties={}), Node(id='Microsoft', type='Organization', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='China', type='Location', properties={}), type='COMPONENTS_PRODUCED_IN', properties={}), Relationship(source=Node(id='China', type='Location', properties={}), target=Node(id='Epidemic', type='Event', properties={}), type='AFFECTED_BY', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Epidemic', type='Event', properties={}), type='AFFECTED_BY', properties={}), Relationship(source=Node(id='Wall Street Journal', type='Organization', properties={}), target=Node(id='March 24, 2020', type='Date', properties={}), type='REPORTED_ON', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Microsoft', type='Organization', properties={}), type='FELL_OUT_OF_TRILLION_CLUB', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 1, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=' until June 2009. Most of Apple\\'s components \\nare produced in China, so when China is affected by the \\nepidemic and the work is stopped, Apple\\'s products \\ncannot be made at the same time [3]. This will \\ninevitably have a huge impact on Apple. According to \\ninformation reported by the Wall Street Journal on \\nMarch 24, 2020, Apple\\'s stock price closed down 2.12% \\non Monday, and its market value fell below US$1 \\ntrillion to US$981.7 billion. In this way, Apple fell out \\nof the \"trillion club\" of US stocks, of which only \\nMicrosoft was left [7]. \\n4. 
MICRO PERSPECTIVE \\nIt is obvious that Apple has been affected by the \\nepidemic in a great extent, however, we still need to \\ntake a look Apple’s economic changes')),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='February 3, 2020', type='Date', properties={}), Node(id='March 23, 2020', type='Date', properties={}), Node(id='September 1, 2020', type='Date', properties={})], relationships=[Relationship(source=Node(id='February 3, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='OPENING_PRICE_76.07', properties={}), Relationship(source=Node(id='February 3, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='CLOSING_PRICE_80.01', properties={}), Relationship(source=Node(id='March 23, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='OPENING_PRICE_57.02', properties={}), Relationship(source=Node(id='March 23, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='CLOSING_PRICE_56.09', properties={}), Relationship(source=Node(id='September 1, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='OPENING_PRICE_132.76', properties={}), Relationship(source=Node(id='September 1, 2020', type='Date', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='CLOSING_PRICE_134.18', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 1, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=' great extent, however, we still need to \\ntake a look Apple’s economic changes in a micro \\nperspective. At the beginning of the epidemic, which is \\nthe start of February, the price started to decline. On \\nFebruary 3, 2020, the opening price is 76.07; the \\nclosing price is 80.01. From this point, the closing price \\nstarted to diminish. On March 23, 2020, the opening \\nprice is 57.02, and the closing price is 56.09. The stock \\nprice for this day is the lowest price during epidemic in \\nApple. Then, it started to increase gradually until \\nSeptember 1, this day is also a peak, the opening price \\non September 1 is 132.76 and the closing price is \\n134.18. However, after September 1, the price started to \\ndrop again. 
There is a very low price on')),\n", + " GraphDocument(nodes=[Node(id='September 1', type='Date', properties={}), Node(id='September 18, 2020', type='Date', properties={}), Node(id='110.40', type='Price', properties={}), Node(id='106.84', type='Price', properties={}), Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), Node(id='Advances In Economics, Business And Management Research, Volume 203', type='Publication', properties={})], relationships=[Relationship(source=Node(id='September 1', type='Date', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), type='PRICE_DROP_START', properties={}), Relationship(source=Node(id='September 18, 2020', type='Date', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), type='LOW_PRICE_DATE', properties={}), Relationship(source=Node(id='110.40', type='Price', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), type='OPENING_PRICE', properties={}), Relationship(source=Node(id='106.84', type='Price', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), type='CLOSING_PRICE', properties={}), Relationship(source=Node(id=\"Apple'S Stock Price\", type='Stock', properties={}), target=Node(id='Advances In Economics, Business And Management Research, Volume 203', type='Publication', properties={}), type='MENTIONED_IN', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 1, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" after September 1, the price started to \\ndrop again. There is a very low price on September 18, \\n2020. The opening price is 110.40, the closing price is \\n106.84. 
\\n \\nFigure 1 Apple's stock price \\n \\n \\nAdvances in Economics, Business and Management Research, volume 203\\n374\\n\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Iphone 12', type='Product', properties={}), Node(id='Iphone 12 Pro', type='Product', properties={}), Node(id='Iphone 12 Pro Max', type='Product', properties={}), Node(id='October 14, 2020', type='Date', properties={}), Node(id='October 12, 2020', type='Date', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Iphone 12', type='Product', properties={}), type='ANNOUNCED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Iphone 12 Pro', type='Product', properties={}), type='ANNOUNCED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Iphone 12 Pro Max', type='Product', properties={}), type='ANNOUNCED', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='October 14, 2020', type='Date', properties={}), type='ANNOUNCEMENT_DATE', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='October 12, 2020', type='Date', properties={}), type='STOCK_PRICE_INCREASE', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 2, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" \\nFigure 2. Apple's stock price \\nThe reasons which caused this drop might be a lot, \\nthe most common answer might be the delay of the \\nannouncement of new Apple Products. A numbers of \\npeople waiting on this day for a long time, because \\nApple supposed to announce the new products like \\niPhone 12, iPhone 12 Pro and iPhone 12 Pro Max, etc. \\nNevertheless, Apple delayed the publish of their new \\nproducts until October 14, 2020. There is a small peak \\nof rising stock price on October 12, which is two days \\nbefore the publish of the new products. The increasing \\nof Apple stock might because people putted huge \\nexpectations on these new products, and they have been \\nwaited for a long period. This motivation lead them to \\nbuy these products right away, which caused the \\nsuddenly increasing price of Apple’ stock. 
On\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Company', properties={}), Node(id='November 2, 2020', type='Date', properties={}), Node(id='December 28, 2020', type='Date', properties={}), Node(id='January 26, 2021', type='Date', properties={})], relationships=[Relationship(source=Node(id='November 2, 2020', type='Date', properties={}), target=Node(id='Apple', type='Company', properties={}), type='STOCK_PRICE_DROP', properties={}), Relationship(source=Node(id='December 28, 2020', type='Date', properties={}), target=Node(id='Apple', type='Company', properties={}), type='STOCK_PRICE_INCREASE', properties={}), Relationship(source=Node(id='January 26, 2021', type='Date', properties={}), target=Node(id='Apple', type='Company', properties={}), type='STOCK_PRICE_PEAK', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 2, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" products right away, which caused the \\nsuddenly increasing price of Apple’ stock. On \\nNovember 2, there is a small drop on this day. The \\nopening price is 109.11, and the closing price is 108.77. \\nOn December 28, there is another increasing peak on \\nApple’s stock. The opening price is 133.99, and the \\nclosing price 136.69. This is also the highest price of \\nApple stock in 2020. After 2020, Apple's stock price \\nwill continue to increase in 2021. This study used two \\nyears of data for comparison, which means it will only \\nuse data from February 3, 2021 and before. So at the \\nbeginning of 2021, Apple’s stock price ushered in its \\nfirst peak. 
On January 26, 2021, the opening price is \\n143.6, and the closing price is 143.16.\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Microsoft', type='Organization', properties={}), Node(id='Bill Gates', type='Person', properties={}), Node(id='Paul Allen', type='Person', properties={}), Node(id='April 4, 1975', type='Date', properties={}), Node(id='Redmond, Washington', type='Location', properties={})], relationships=[Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Bill Gates', type='Person', properties={}), type='FOUNDED_BY', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Paul Allen', type='Person', properties={}), type='FOUNDED_BY', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='April 4, 1975', type='Date', properties={}), type='FOUNDED_ON', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Redmond, Washington', type='Location', properties={}), type='HEADQUARTERED_IN', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 2, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=', the opening price is \\n143.6, and the closing price is 143.16. Later on, there is \\na small drop until February 3, 2021. The opening price \\nis 135.76, and the closing price is 133.94. Comparing \\nthe data at the same day but different year on February \\n3, we can see that the price of Apple’s stock increased \\nby 67.4% in one year. \\n5. COMPARISON WITH MICROSOFT \\nCOMPANY \\nAnother very famous company which can compete \\nwith Apple is Microsoft Company. Microsoft is an \\nAmerican multinational technology company founded \\nby Bill Gates and Paul Allen on April 4, 1975. The \\ncompany is headquartered in Redmond, Washington \\n(near Seattle), focusing on R&D, manufacturing, \\nlicensing and providing a wide range of computer \\nsoftware services. 
The most famous')),\n", + " GraphDocument(nodes=[Node(id='Microsoft', type='Organization', properties={}), Node(id='Windows Operating System', type='Product', properties={}), Node(id='Office Series Software', type='Product', properties={}), Node(id='Apple', type='Organization', properties={})], relationships=[Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Windows Operating System', type='Product', properties={}), type='PRODUCES', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Office Series Software', type='Product', properties={}), type='PRODUCES', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Apple', type='Organization', properties={}), type='COMPARED_TO', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 2, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\", \\nlicensing and providing a wide range of computer \\nsoftware services. The most famous and best-selling \\nproducts are Windows operating system and Office \\nseries software. It is the world's largest computer \\nsoftware provider and the leader of PC (Personal \\nComputer) software development in the world. On June \\n23, 2021, Microsoft's market value exceeded the $2 \\ntrillion mark. This is the second US company to enter \\nafter Apple's breakthrough of $2 trillion. In 2019, the \\nmarket value of Microsoft reached US$1 trillion for the \\nfirst time, 33 years after the original Microsoft IPO. \\nFrom 1 trillion to 2 trillion, Microsoft only took about 2 \\nyears. Comparing Microsoft and Apple, Microsoft has \\nmore patents, company environment, R&D ratio, and \\nR&D investment than Apple. Apple's profit, profit \\nmargin, revenue, etc. 
are\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Microsoft', type='Organization', properties={}), Node(id='2_Trillion', type='Value', properties={}), Node(id='Profit', type='Concept', properties={}), Node(id='Profit_Margin', type='Concept', properties={}), Node(id='Revenue', type='Concept', properties={}), Node(id='Market_Value', type='Concept', properties={}), Node(id='Epidemic', type='Event', properties={}), Node(id='Employees', type='Group', properties={}), Node(id='Home', type='Location', properties={}), Node(id='Stock_Price', type='Concept', properties={}), Node(id='March_6_2020', type='Date', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Microsoft', type='Organization', properties={}), type='MORE_R&D_INVESTMENT_THAN', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Microsoft', type='Organization', properties={}), type='MORE_PROFIT_THAN', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Microsoft', type='Organization', properties={}), type='MORE_PROFIT_MARGIN_THAN', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Microsoft', type='Organization', properties={}), type='MORE_REVENUE_THAN', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='2_Trillion', type='Value', properties={}), type='MARKET_VALUE_CAN_BREAK', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Profit', type='Concept', properties={}), type='NOT_GOOD_PERFORMANCE_IN', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Profit_Margin', type='Concept', properties={}), type='NOT_GOOD_PERFORMANCE_IN', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Enterprise-Level_Market', type='Concept', properties={}), type='IS_IN', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Epidemic', type='Event', properties={}), type='AFFECTED_BY', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Employees', type='Group', properties={}), type='LET_WORK_AT_HOME', properties={}), Relationship(source=Node(id='Employees', type='Group', properties={}), target=Node(id='Home', type='Location', properties={}), type='WORK_AT', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='Stock_Price', type='Concept', properties={}), type='CHANGED_WITH_EPIDEMIC', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id='March_6_2020', type='Date', properties={}), type='REPORT_DATE', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 2, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"&D investment than Apple. 
Apple's profit, profit \\nmargin, revenue, etc. are more than that of Microsoft. \\nThis is why Apple’s market value can break 2 trillion \\nearlier than Microsoft. On the two items of profit and \\nprofit margin, Microsoft's performance is not good. \\nAfter all, it is an enterprise-level market. If the profit \\nmargin is too high, customers are easy to lose. \\nDuring the epidemic, Microsoft let their employees \\nto work at home, Its stock price has also changed more \\nor less with the epidemic flow. In March 6, 2020 report, \\nAdvances in Economics, Business and Management Research, volume 203\\n375\\n\")),\n", + " GraphDocument(nodes=[Node(id='Microsoft', type='Organization', properties={}), Node(id='April 24, 2000', type='Date', properties={}), Node(id='February 10, 2020', type='Date', properties={}), Node(id='March 16, 2020', type='Date', properties={}), Node(id='March 23, 2020', type='Date', properties={}), Node(id='Apple', type='Organization', properties={}), Node(id='Dow', type='Index', properties={})], relationships=[Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='14.74%', type='Percentage', properties={}), type='STOCK_PRICE_DROP', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='April 24, 2000', type='Date', properties={}), type='LARGEST_SINGLE_DAY_DROP_SINCE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='15.6%', type='Percentage', properties={}), type='STOCK_PRICE_DROP', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='27%', type='Percentage', properties={}), type='STOCK_PRICE_DROP_PAST_MONTH', properties={}), Relationship(source=Node(id='Dow', type='Index', properties={}), target=Node(id='31%', type='Percentage', properties={}), type='STOCK_PRICE_DROP_PAST_MONTH', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='February 10, 2020', type='Date', properties={}), type='STOCK_PRICE_START_DECREASE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='March 16, 2020', type='Date', properties={}), type='LOWEST_PRICE_DATE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='140.0', type='Price', properties={}), type='OPENING_PRICE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='135.42', type='Price', properties={}), type='CLOSING_PRICE', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='March 23, 2020', type='Date', properties={}), type='LOWEST_PRICE_DATE', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"Microsoft's stock price plummeted 14.74% on Monday, \\nthe largest single-day drop since April 24, 2000. At that \\ntime, Microsoft's stock price fell by 15.6%. 
In the past \\nmonth, Microsoft's stock price has fallen by 27%, and \\nthe Dow has fallen by 31% [5]. \\nIn a micro perspective, during epidemic time, \\nMicrosoft’s stock start decrease since February 10, \\n2020. At the time when Apple had their its lowest price \\non March 23, Microsoft also having a bad situation, but \\nthe lowest price of Microsoft price is on March 16. The \\nopening price on that day is 140.0, the closing price is \\n135.42. Even thought the stock price of Microsoft is \\nmuch higher than Apple, but their volume is quite \\nsimilar, even Apple’s volumes are higher than \\nMicrosoft. It is because Apple\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Company', properties={}), Node(id='Microsoft', type='Company', properties={}), Node(id='Epidemic', type='Event', properties={}), Node(id=\"Apple'S Stock Price\", type='Financial_metric', properties={}), Node(id=\"Apple'S New Products\", type='Product', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Company', properties={}), target=Node(id='Microsoft', type='Company', properties={}), type='HIGHER_MARKET_VALUE_THAN', properties={}), Relationship(source=Node(id='Epidemic', type='Event', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Financial_metric', properties={}), type='AFFECTED', properties={}), Relationship(source=Node(id=\"Apple'S Stock Price\", type='Financial_metric', properties={}), target=Node(id='Apple', type='Company', properties={}), type='RECOVERED_QUICKLY', properties={}), Relationship(source=Node(id=\"Apple'S Stock Price\", type='Financial_metric', properties={}), target=Node(id='Apple', type='Company', properties={}), type='CONTINUED_TO_RISE', properties={}), Relationship(source=Node(id=\"Apple'S New Products\", type='Product', properties={}), target=Node(id=\"Apple'S Stock Price\", type='Financial_metric', properties={}), type='IMPACTED', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\"\\nsimilar, even Apple’s volumes are higher than \\nMicrosoft. It is because Apple's market value is higher \\nthan Microsoft. \\n6. OVERALL TREND \\nFor Apple, the epidemic only affected it at the \\nbeginning, but it took only four months for Apple to \\nreturn to its previous normal stock price. It can be seen \\nthat the true hidden strength behind Apple can enable \\nApple. Recovered in such a short time. The overall \\nsituation of Apple's stock price continued to rise. It only \\nfell sharply between February and June, when the \\nepidemic first began, but it quickly returned to its \\noriginal state. During the period from August to \\nNovember, the impact of Apple's new products rose \\nfirst and then fell back to normal levels. There was also \\na peak in early 2021, and then it returned to normal \\nlevels. 
The overall\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Microsoft', type='Organization', properties={}), Node(id='Early 2021', type='Time', properties={}), Node(id='December 2020', type='Time', properties={}), Node(id='September', type='Time', properties={}), Node(id='February', type='Time', properties={}), Node(id='March', type='Time', properties={}), Node(id='June', type='Time', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Early 2021', type='Time', properties={}), type='PEAK', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='67.4%', type='Percentage', properties={}), type='INCREASE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='February', type='Time', properties={}), type='DECLINE_START', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='March', type='Time', properties={}), type='DECLINE_END', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='June', type='Time', properties={}), type='RECOVERY', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='December 2020', type='Time', properties={}), type='HIGHEST_VALUE', properties={}), Relationship(source=Node(id='Microsoft', type='Organization', properties={}), target=Node(id='September', type='Time', properties={}), type='HIGHEST_VALUE', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" \\na peak in early 2021, and then it returned to normal \\nlevels. The overall level of Apple's stock increased by \\n67.4% over the same period. \\nFor Microsoft, his situation is actually similar to that \\nof the same type of Apple. At the beginning of the \\nepidemic, there was a huge decline from February to \\nMarch, but it was basically fully restored to the \\nprevious level before June. Like Apple, it took just four \\nmonths to solve the severe damage caused by the \\nepidemic, which shows that Microsoft's strength cannot \\nbe underestimated. Microsoft's overall trend continues \\nto rise. However, unlike Apple, the highest value of \\nApple's stock is in December, which is the end of 2020, \\nbut the highest value of Microsoft's stock is in \\nSeptember. 
After September, the stock declined \\nslightly, but it only fell\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Company', properties={}), Node(id='Microsoft', type='Company', properties={}), Node(id='Pandemic', type='Event', properties={})], relationships=[Relationship(source=Node(id='Microsoft', type='Company', properties={}), target=Node(id='Apple', type='Company', properties={}), type='HIGHER_STOCK_PRICE_THAN', properties={}), Relationship(source=Node(id='Apple', type='Company', properties={}), target=Node(id='Microsoft', type='Company', properties={}), type='HIGHER_TRANSACTION_VOLUME_THAN', properties={}), Relationship(source=Node(id='Pandemic', type='Event', properties={}), target=Node(id='Apple', type='Company', properties={}), type='DID_NOT_CAUSE_IRREPARABLE_BLOW_TO', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" in \\nSeptember. After September, the stock declined \\nslightly, but it only fell to a normal level, and then \\nslowly climbed up. \\n7. CONCLUSION \\nComparing Apple's stock price and Microsoft's \\nstock price, we can find that Microsoft's stock price is \\nmuch higher than Apple's, but when we compare their \\nnumber of transactions, we find that Apple's number of \\ntransactions is many times greater than that of \\nMicrosoft. This has also led to the fact that the total \\ntrading volume of Apple stock and Microsoft stock is \\nsimilar, and sometimes Apple is even higher than \\nMicrosoft. \\nTo conclude, the pandemic did not bring an \\nirreparable blow to Apple. Instead, it only took it four \\nmonths to restore its stock to its original value, or even \\nrise. This is enough to show that Apple's\")),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Organization', properties={}), Node(id='Yahoo', type='Organization', properties={}), Node(id='Verizon Media', type='Organization', properties={}), Node(id='Ziemba, W. T.', type='Person', properties={}), Node(id='Aapl', type='Stock', properties={}), Node(id='Covid-19', type='Event', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Aapl', type='Stock', properties={}), type='HAS_STOCK', properties={}), Relationship(source=Node(id='Yahoo', type='Organization', properties={}), target=Node(id='Verizon Media', type='Organization', properties={}), type='PART_OF', properties={}), Relationship(source=Node(id='Ziemba, W. T.', type='Person', properties={}), target=Node(id='Covid-19', type='Event', properties={}), type='WROTE_ABOUT', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=\" stock to its original value, or even \\nrise. 
This is enough to show that Apple's foundation is \\nstable and there are excellent strategies to enable them \\nto recover their losses in time when they faced strong \\nimpacts. In general, Apple's stock only declined during \\nthe most severe period of the pandemic, and remained \\nat a stable value for the rest of the time, and there was \\nalso a phenomenon of continuous rise. \\nREFERENCES \\n[1] Yahoo is now a part of Verizon Media. (n.d.). \\nYahoo. \\nRetrieved \\nAugust \\n29, \\n2021, \\nfrom \\nhttps://finance.yahoo.com/quote/AAPL/chart?\\\\ \\n[2] Ziemba, W. T. (2020, June 21). The COVID-19 \\nCrash in the US \\nStock Market\")),\n", + " GraphDocument(nodes=[Node(id='Ziemba', type='Person', properties={}), Node(id='Yan', type='Person', properties={}), Node(id='Camba', type='Person', properties={}), Node(id='The Covid-19 Crash In The Us Stock Market', type='Publication', properties={}), Node(id='Analysis Of The Effect Of Covid-19 On The Stock Market And Investing Strategies', type='Publication', properties={}), Node(id='The Effects Of Restrictions In Economic Activity On The Spread Of Covid-19 In The Philippines', type='Publication', properties={})], relationships=[Relationship(source=Node(id='Ziemba', type='Person', properties={}), target=Node(id='The Covid-19 Crash In The Us Stock Market', type='Publication', properties={}), type='AUTHOR', properties={}), Relationship(source=Node(id='Yan', type='Person', properties={}), target=Node(id='Analysis Of The Effect Of Covid-19 On The Stock Market And Investing Strategies', type='Publication', properties={}), type='AUTHOR', properties={}), Relationship(source=Node(id='Camba', type='Person', properties={}), target=Node(id='The Effects Of Restrictions In Economic Activity On The Spread Of Covid-19 In The Philippines', type='Publication', properties={}), type='AUTHOR', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content='2020, June 21). The COVID-19 \\nCrash in the US \\nStock Market. \\nZiemba. \\nhttps://papers.ssrn.com/sol3/papers.cfm?abstract_i\\nd=3632410 \\n[3] Yan, B. (2020, March 28). Analysis of the Effect of \\nCOVID-19 on the Stock Market and Investing \\nStrategies. \\nYan. \\nhttps://papers.ssrn.com/sol3/Papers.cfm?abstract_i\\nd=3563380 \\n[4] Camba, J. A. C. (2020). The Effects of Restrictions \\nin Economic Activity on the Spread of COVID-19 \\nin the Philippines: Insights from Apple and Google \\nMobility Indicators -The Journal of Asian Finance, \\nEconomics and Business | Korea Science. Camb')),\n", + " GraphDocument(nodes=[Node(id='The Journal Of Asian Finance, Economics And Business', type='Publication', properties={}), Node(id='Korea Science', type='Organization', properties={}), Node(id='Camba', type='Person', properties={}), Node(id='Shivaani, M. 
V.', type='Person', properties={}), Node(id='Journal Of Emerging Technologies In Accounting', type='Publication', properties={}), Node(id='Apple', type='Organization', properties={}), Node(id='Amazon', type='Organization', properties={})], relationships=[Relationship(source=Node(id='The Journal Of Asian Finance, Economics And Business', type='Publication', properties={}), target=Node(id='Korea Science', type='Organization', properties={}), type='PUBLISHED_BY', properties={}), Relationship(source=Node(id='Camba', type='Person', properties={}), target=Node(id='The Journal Of Asian Finance, Economics And Business', type='Publication', properties={}), type='AUTHOR', properties={}), Relationship(source=Node(id='Shivaani, M. V.', type='Person', properties={}), target=Node(id='Journal Of Emerging Technologies In Accounting', type='Publication', properties={}), type='AUTHOR', properties={}), Relationship(source=Node(id='Apple', type='Organization', properties={}), target=Node(id='Amazon', type='Organization', properties={}), type='COMPARED_TO', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content=' Indicators -The Journal of Asian Finance, \\nEconomics and Business | Korea Science. Camba. \\nhttps://www.koreascience.or.kr/article/JAKO2020\\n34651879125.page \\n[5] Shivaani, M. V. (n.d.). Comparing Apple to \\nAmazon: Just a Matter of Words in Machine \\nLearning \\nWorld \\n| \\nJournal \\nof \\nEmerging \\nTechnologies in Accounting. Shivaani. Retrieved \\nAugust \\n29, \\n2021, \\nfrom \\nhttps://meridian.allenpress.com/jeta/article-\\nabstract/doi/10.2308/JETA-2020-\\n045/464597/Comparing-Apple-to-Amazon-Just-a-\\nMatter-of-Words \\n[6] Apple (AAPL)')),\n", + " GraphDocument(nodes=[Node(id='Apple', type='Company', properties={}), Node(id='Microsoft', type='Company', properties={}), Node(id='Trillion-Dollar Club', type='Economic status', properties={})], relationships=[Relationship(source=Node(id='Apple', type='Company', properties={}), target=Node(id='Trillion-Dollar Club', type='Economic status', properties={}), type='FALLEN_BELOW', properties={}), Relationship(source=Node(id='Microsoft', type='Company', properties={}), target=Node(id='Trillion-Dollar Club', type='Economic status', properties={}), type='MEMBER', properties={})], source=Document(metadata={'source': '../data/Apple stock during pandemic.pdf', 'file_path': '../data/Apple stock during pandemic.pdf', 'page': 3, 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': 'EDZ', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word 2016', 'producer': 'Microsoft® Word 2016; modified using iText 5.0.6 (c) 1T3XT BVBA', 'creationDate': \"D:20211208111846+05'30'\", 'modDate': \"D:20220105193619+05'30'\", 'trapped': ''}, page_content='a-\\nMatter-of-Words \\n[6] Apple (AAPL) Vs Microsoft (MSFT): Which Is A \\nBetter Tech Stock To Buy Right Now?, 28 Jan. \\n2021, \\nwww.nasdaq.com/articles/apple-aapl-vs-\\nmicrosoft-msft%3A-which-is-a-better-tech-stock-\\nto-buy-right-now-2021-01-28. \\n[7] Osipovich Alexander. 
“Apple’s market capitalisation \\nhas fallen below trillion, and Microsoft is the only \\nremaining U.S. company in the trillion-dollar \\nclub.” Wall Street News, 24 Mar. 2020, \\ncn.wsj.com/articles/ \\nAdvances in Economics, Business and Management Research, volume 203\\n376\\n'))]" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph_documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add nodes and relations to Neo4j DB" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "graph = Neo4jGraph(url=os.environ.get('NEO4J_URI'), \n", + " database=\"neo4j\", \n", + " username=os.environ.get('NEO4J_USERNAME'), \n", + " password=os.environ.get('NEO4J_PASSWORD'))" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "graph.add_graph_documents(graph_documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get distinct nodes and relations from graph document" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "node_labels = set()\n", + "rel_labels = set()\n", + "\n", + "for gd in graph_documents:\n", + " for node in gd.nodes:\n", + " node_labels.add(node.type)\n", + " \n", + " for rel in gd.relationships:\n", + " rel_labels.add(rel.type)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "system_prompt = \"\"\"Please consolidate the following list of types into a smaller set of more general, semantically \n", + "related types. The consolidated types must be drawn from the original list; do not introduce new types. \n", + "Return a JSON object representing the mapping of original types to consolidated types. Every key is the consolidated type\n", + "and value is list of the original types that were merged into the consolidated type. Prioritize using the most generic and \n", + "repeated term when merging. If a type doesn't merge with any other type, it should still be included in the output, \n", + "mapped to itself.\n", + "\n", + "**Input:** A list of strings representing the types to be consolidated. 
These types may represent either node \n", + "labels or relationship labels Your algorithm should do appropriate groupings based on semantic similarity.\n", + "\n", + "Example 1:\n", + "Input: \n", + "[ \"Person\", \"Human\", \"People\", \"Company\", \"Organization\", \"Product\"]\n", + "Output :\n", + "[Person\": [\"Person\", \"Human\", \"People\"], Organization\": [\"Company\", \"Organization\"], Product\": [\"Product\"]]\n", + "\n", + "Example 2:\n", + "Input :\n", + "[\"CREATED_FOR\", \"CREATED_TO\", \"CREATED\", \"PLACE\", \"LOCATION\", \"VENUE\"]\n", + "Output:\n", + "[\"CREATED\": [\"CREATED_FOR\", \"CREATED_TO\", \"CREATED\"],\"PLACE\": [\"PLACE\", \"LOCATION\", \"VENUE\"]]\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get list of more general, semantically related labels from LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "parser = JsonOutputParser()\n", + "prompt = ChatPromptTemplate(messages=[(\"system\",system_prompt),(\"human\", \"{input}\")],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()})\n", + "\n", + "chain = prompt | llm | parser\n", + "nodes_dict = chain.invoke({'input':node_labels})\n", + "relation_dict = chain.invoke({'input':rel_labels})" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Economy': ['Economy', 'Economic status'],\n", + " 'List': ['List'],\n", + " 'Index': ['Index'],\n", + " 'Publication': ['Publication'],\n", + " 'Concept': ['Concept'],\n", + " 'Location': ['Location'],\n", + " 'Person': ['Person'],\n", + " 'Price': ['Price', 'Value'],\n", + " 'Time': ['Time', 'Date'],\n", + " 'Group': ['Group'],\n", + " 'Event': ['Event'],\n", + " 'Product': ['Product'],\n", + " 'Company': ['Company', 'Organization', 'Industry'],\n", + " 'License': ['License'],\n", + " 'Stock': ['Stock'],\n", + " 'Financial_metric': ['Financial_metric'],\n", + " 'Url': ['Url']}" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'RECOVERED': ['RECOVERED', 'RECOVERED_FROM'],\n", + " 'COMPONENTS_PRODUCED_IN': ['COMPONENTS_PRODUCED_IN'],\n", + " 'MENTIONS': ['MENTIONS'],\n", + " 'PRICE': ['OPENING_PRICE_132.76',\n", + " 'LOWEST_PRICE',\n", + " 'OPENING_PRICE_57.02',\n", + " 'CLOSING_PRICE_134.18',\n", + " 'OPENING_PRICE_76.07',\n", + " 'CLOSING_PRICE_56.09',\n", + " 'CLOSING_PRICE_80.01'],\n", + " 'PART_OF': ['PART_OF'],\n", + " 'MARKET_VALUE': ['MARKET_VALUE',\n", + " 'MARKET_VALUE_HIGHER_THAN',\n", + " 'MARKET_VALUE_RECORD',\n", + " 'MARKET_VALUE_EXCEEDED'],\n", + " 'FOUNDED_ON': ['FOUNDED_ON'],\n", + " 'AUTHOR': ['AUTHOR'],\n", + " 'IS': ['IS'],\n", + " 'SURPASSED': ['SURPASSED_IN_BRAND_VALUE',\n", + " 'SURPASSED_BY_APPLE_IN_BRAND_VALUE',\n", + " 'SURPASSED'],\n", + " 'TRANSACTION': ['HIGHER_TRANSACTION_NUMBER'],\n", + " 'CAUSES': ['CAUSES_SHRINKAGE',\n", + " 'INCREASE_CAUSES_STAGNATION',\n", + " 'DECREASE_CAUSES_STAGNATION',\n", + " 'DECLINE_CAUSES_STAGNATION'],\n", + " 'DATA_OBSERVED_ON': ['DATA_OBSERVED_ON'],\n", + " 'AFFECTED': ['AFFECTED_BY', 'AFFECTED'],\n", + " 'HAS': ['HAS'],\n", + " 'DISTRIBUTED_UNDER': ['DISTRIBUTED_UNDER'],\n", + " 'COMPARISON': ['VOLUME_COMPARISON',\n", + " 'STOCK_PRICE_FALL_COMPARISON',\n", + " 'COMPARED_TO'],\n", + " 
'PROFIT': ['GREATER_PROFIT_MARGIN', 'GREATER_PROFIT'],\n", + " 'RANK': ['FOURTH-LARGEST_BY_UNIT_SALES',\n", + " 'FOURTH-LARGEST',\n", + " 'LARGEST_BY_REVENUE',\n", + " 'LARGEST_COMPANY_BY_MARKET_CAP',\n", + " 'RANKED_6TH_ON'],\n", + " 'STOCK': ['STOCK_PRICE_INCREASE',\n", + " 'STOCK_PRICE_DROP',\n", + " 'STOCK_PRICE_PEAK',\n", + " 'STOCK_START_DECREASE'],\n", + " 'PUBLISHED': ['PUBLISHED_BY', 'PUBLISHED_IN'],\n", + " 'REPORTED_ON': ['REPORTED_ON'],\n", + " 'LOCATED_IN': ['LOCATED_IN', 'HEADQUARTERED_IN'],\n", + " 'FELL_OUT_OF_TRILLION_CLUB': ['FELL_OUT_OF_TRILLION_CLUB'],\n", + " 'IMPACT': ['IMPACT', 'PREDICTED_IMPACT', 'IMPACTED', 'STAGNATION_IMPACTS'],\n", + " 'DECLINE': ['DECLINE'],\n", + " 'INCREASE': ['INCREASE'],\n", + " 'RECOVERY': ['RECOVERY'],\n", + " 'SIMILAR_TRADING_VOLUME': ['SIMILAR_TRADING_VOLUME'],\n", + " 'STUDIED': ['STUDIED'],\n", + " 'ANNOUNCED': ['ANNOUNCED', 'ANNOUNCEMENT_DATE'],\n", + " 'MOST_VALUABLE_BRAND': ['MOST_VALUABLE_BRAND'],\n", + " 'AFFILIATED_WITH': ['AFFILIATED_WITH'],\n", + " 'PRODUCES': ['PRODUCES'],\n", + " 'WENT_PUBLIC_ON': ['WENT_PUBLIC_ON'],\n", + " 'REVENUE': ['GREATER_REVENUE'],\n", + " 'HIGHEST_VALUE': ['HIGHEST_VALUE'],\n", + " 'FOUNDED_BY': ['FOUNDED_BY'],\n", + " 'EXPERIENCED_FLUCTUATION': ['EXPERIENCED_FLUCTUATION'],\n", + " 'STATED_IN': ['STATED_IN'],\n", + " 'PEAK': ['PEAK'],\n", + " 'LOW': ['LOW'],\n", + " 'RECORDED_IN': ['RECORDED_IN'],\n", + " 'LARGEST_SINGLE_DAY_DROP_SINCE': ['LARGEST_SINGLE_DAY_DROP_SINCE'],\n", + " 'START_DROP': ['START_DROP']}" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relation_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "node_match = {}\n", + "relation_match = {}\n", + "\n", + "for new_label , values in nodes_dict.items() :\n", + " for old_label in values:\n", + " node_match[old_label]=new_label\n", + " \n", + "for new_label , values in relation_dict.items() :\n", + " for old_label in values:\n", + " relation_match[old_label]=new_label " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Update new node labels to database" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "for old_label, new_label in node_match.items():\n", + " graph.query(\"\"\"MATCH (n:$($label))\n", + " SET n:$($new_label)\n", + " REMOVE n:$($label)\n", + " \"\"\",\n", + " params={'label':old_label, 'new_label':new_label})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Update new relationship labels to database" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "for old_label, new_label in relation_match.items():\n", + " graph.query(\"\"\"MATCH (n)-[r:$($label)]->(m)\n", + " CREATE (n)-[r2:$($new_label)]->(m)\n", + " WITH r\n", + " DELETE r\n", + " \"\"\",\n", + " params={'label':old_label, 'new_label':new_label})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Todo -\n", + "\n", + "1. Query correction for relation updation and copying relation properties to new relation\n", + "2. 
Sending nodes and relation labels in batches to LLM if exceed X number.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experiments/nova_models_trial.ipynb b/experiments/nova_models_trial.ipynb new file mode 100644 index 000000000..6ed2405ea --- /dev/null +++ b/experiments/nova_models_trial.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error invoking the LLM: 1 validation error for ChatBedrockConverse\n", + " Value error, Could not load credentials to authenticate with AWS client. Please check that credentials in the specified profile name are valid. Bedrock error:\n", + "\n", + "You must specify a region. [type=value_error, input_value={'model': 'amazon.nova-li...sable_streaming': False}, input_type=dict]\n", + " For further information visit https://errors.pydantic.dev/2.9/v/value_error\n" + ] + }, + { + "ename": "ValidationError", + "evalue": "1 validation error for ChatBedrockConverse\n Value error, Could not load credentials to authenticate with AWS client. Please check that credentials in the specified profile name are valid. Bedrock error:\n\nYou must specify a region. [type=value_error, input_value={'model': 'amazon.nova-li...sable_streaming': False}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.9/v/value_error", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 47\u001b[0m\n\u001b[1;32m 45\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCreate list of 3 popular movies\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 47\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mllm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLLM Response:\u001b[39m\u001b[38;5;124m\"\u001b[39m, response)\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py:286\u001b[0m, in \u001b[0;36mBaseChatModel.invoke\u001b[0;34m(self, input, config, stop, **kwargs)\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21minvoke\u001b[39m(\n\u001b[1;32m 276\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28minput\u001b[39m: LanguageModelInput,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 282\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m BaseMessage:\n\u001b[1;32m 283\u001b[0m config 
\u001b[38;5;241m=\u001b[39m ensure_config(config)\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cast(\n\u001b[1;32m 285\u001b[0m ChatGeneration,\n\u001b[0;32m--> 286\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_prompt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 287\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 288\u001b[0m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 289\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcallbacks\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 290\u001b[0m \u001b[43m \u001b[49m\u001b[43mtags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtags\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 291\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmetadata\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 292\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 293\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mgenerations[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m],\n\u001b[1;32m 296\u001b[0m )\u001b[38;5;241m.\u001b[39mmessage\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py:786\u001b[0m, in \u001b[0;36mBaseChatModel.generate_prompt\u001b[0;34m(self, prompts, stop, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgenerate_prompt\u001b[39m(\n\u001b[1;32m 779\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 780\u001b[0m prompts: \u001b[38;5;28mlist\u001b[39m[PromptValue],\n\u001b[0;32m 
(...)\u001b[0m\n\u001b[1;32m 783\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 784\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m LLMResult:\n\u001b[1;32m 785\u001b[0m prompt_messages \u001b[38;5;241m=\u001b[39m [p\u001b[38;5;241m.\u001b[39mto_messages() \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m prompts]\n\u001b[0;32m--> 786\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt_messages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py:643\u001b[0m, in \u001b[0;36mBaseChatModel.generate\u001b[0;34m(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)\u001b[0m\n\u001b[1;32m 641\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_managers:\n\u001b[1;32m 642\u001b[0m run_managers[i]\u001b[38;5;241m.\u001b[39mon_llm_error(e, response\u001b[38;5;241m=\u001b[39mLLMResult(generations\u001b[38;5;241m=\u001b[39m[]))\n\u001b[0;32m--> 643\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 644\u001b[0m flattened_outputs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 645\u001b[0m LLMResult(generations\u001b[38;5;241m=\u001b[39m[res\u001b[38;5;241m.\u001b[39mgenerations], llm_output\u001b[38;5;241m=\u001b[39mres\u001b[38;5;241m.\u001b[39mllm_output) \u001b[38;5;66;03m# type: ignore[list-item]\u001b[39;00m\n\u001b[1;32m 646\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m results\n\u001b[1;32m 647\u001b[0m ]\n\u001b[1;32m 648\u001b[0m llm_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_combine_llm_outputs([res\u001b[38;5;241m.\u001b[39mllm_output \u001b[38;5;28;01mfor\u001b[39;00m res \u001b[38;5;129;01min\u001b[39;00m results])\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py:633\u001b[0m, in \u001b[0;36mBaseChatModel.generate\u001b[0;34m(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)\u001b[0m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(messages):\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 632\u001b[0m results\u001b[38;5;241m.\u001b[39mappend(\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_with_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 634\u001b[0m \u001b[43m \u001b[49m\u001b[43mm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 635\u001b[0m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 636\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_managers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 637\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 639\u001b[0m )\n\u001b[1;32m 640\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 641\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_managers:\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/language_models/chat_models.py:851\u001b[0m, in \u001b[0;36mBaseChatModel._generate_with_cache\u001b[0;34m(self, messages, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 850\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39msignature(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generate)\u001b[38;5;241m.\u001b[39mparameters\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun_manager\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m--> 851\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 852\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 853\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 855\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generate(messages, stop\u001b[38;5;241m=\u001b[39mstop, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_aws/chat_models/bedrock.py:524\u001b[0m, in \u001b[0;36mChatBedrock._generate\u001b[0;34m(self, messages, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_generate\u001b[39m(\n\u001b[1;32m 517\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 518\u001b[0m messages: List[BaseMessage],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 522\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m ChatResult:\n\u001b[1;32m 523\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbeta_use_converse_api:\n\u001b[0;32m--> 524\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_as_converse\u001b[49m\u001b[38;5;241m.\u001b[39m_generate(\n\u001b[1;32m 525\u001b[0m messages, stop\u001b[38;5;241m=\u001b[39mstop, run_manager\u001b[38;5;241m=\u001b[39mrun_manager, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 526\u001b[0m )\n\u001b[1;32m 527\u001b[0m completion \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 528\u001b[0m llm_output: Dict[\u001b[38;5;28mstr\u001b[39m, Any] \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_aws/chat_models/bedrock.py:853\u001b[0m, in \u001b[0;36mChatBedrock._as_converse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 851\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtemperature \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 852\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtemperature\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtemperature\n\u001b[0;32m--> 853\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mChatBedrockConverse\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 854\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 855\u001b[0m \u001b[43m \u001b[49m\u001b[43mregion_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregion_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 856\u001b[0m \u001b[43m \u001b[49m\u001b[43mcredentials_profile_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcredentials_profile_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 857\u001b[0m \u001b[43m \u001b[49m\u001b[43maws_access_key_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maws_access_key_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 858\u001b[0m \u001b[43m \u001b[49m\u001b[43maws_secret_access_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maws_secret_access_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 859\u001b[0m \u001b[43m \u001b[49m\u001b[43maws_session_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maws_session_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mprovider\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprovider\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mbase_url\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mendpoint_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mguardrail_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mguardrails\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_guardrails_enabled\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[call-arg]\u001b[39;49;00m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/langchain_core/load/serializable.py:125\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\"\"\"\u001b[39;00m\n\u001b[0;32m--> 125\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/envs/myenv/lib/python3.10/site-packages/pydantic/main.py:212\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 211\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 212\u001b[0m validated_self \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m validated_self:\n\u001b[1;32m 214\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 215\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mA custom validator is returning a value other than `self`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mReturning anything other than `self` from a top level model validator isn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt supported when validating via 
`__init__`.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSee the `model_validator` docs (https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 218\u001b[0m category\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 219\u001b[0m )\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for ChatBedrockConverse\n Value error, Could not load credentials to authenticate with AWS client. Please check that credentials in the specified profile name are valid. Bedrock error:\n\nYou must specify a region. [type=value_error, input_value={'model': 'amazon.nova-li...sable_streaming': False}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.9/v/value_error" + ] + } + ], + "source": [ + "#Import Necessary Modules\n", + "import boto3\n", + "from langchain_aws import ChatBedrock\n", + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "try:\n", + " aws_access_key = os.getenv(\"AWS_ACCESS_KEY_ID\")\n", + " aws_secret_key = os.getenv(\"AWS_SECRET_ACCESS_KEY\")\n", + " region_name = \"us-east-1\"\n", + " model_name = \"amazon.nova-lite-v1:0\"\n", + "\n", + " if not aws_access_key or not aws_secret_key:\n", + " raise ValueError(\"AWS credentials are missing. Ensure AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set in the .env file.\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error loading environment variables: {e}\")\n", + " raise\n", + "\n", + "#Initialize AWS Bedrock Client\n", + "try:\n", + " bedrock_client = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=region_name,\n", + " aws_access_key_id=aws_access_key,\n", + " aws_secret_access_key=aws_secret_key\n", + " )\n", + "except Exception as e:\n", + " print(f\"Error initializing Bedrock client: {e}\")\n", + " raise\n", + "\n", + "#Set Up LangChain ChatBedrock LLM\n", + "try:\n", + " llm = ChatBedrock(\n", + " client=bedrock_client,\n", + " model_id=model_name,\n", + " model_kwargs=dict(temperature=0)\n", + " )\n", + "except Exception as e:\n", + " print(f\"Error setting up ChatBedrock LLM: {e}\")\n", + " raise\n", + "\n", + "\n", + "query = \"Create list of 3 popular movies\"\n", + "try:\n", + " response = llm.invoke(query)\n", + " print(\"LLM Response:\", response)\n", + "except Exception as e:\n", + " print(f\"Error invoking the LLM: {e}\")\n", + " raise\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/frontend/src/HOC/WithVisibility.tsx b/frontend/src/HOC/WithVisibility.tsx new file mode 100644 index 000000000..8b3dd53af --- /dev/null +++ b/frontend/src/HOC/WithVisibility.tsx @@ -0,0 +1,13 @@ +import { VisibilityProps } from "../types"; + +export function withVisibility

<P>(WrappedComponent: React.ComponentType<P>) { + const VisibityControlled = (props: P & VisibilityProps) => { + if (props.isVisible === false) { + return null; + } + + return <WrappedComponent {...props} />; + }; + + return VisibityControlled; +} diff --git a/frontend/src/components/ChatBot/ChatModeToggle.tsx index a35674348..bcfc331d0 100644 --- a/frontend/src/components/ChatBot/ChatModeToggle.tsx +++ b/frontend/src/components/ChatBot/ChatModeToggle.tsx @@ -37,7 +37,7 @@ export default function ChatModeToggle({ : AvailableModes?.filter((m) => !m.mode.includes(chatModeLables['global search+vector+fulltext'])); }, [isGdsActive, isCommunityAllowed]); const menuItems = useMemo(() => { - return memoizedChatModes?.map((m) => { + return memoizedChatModes?.map((m, index) => { const handleModeChange = () => { if (chatModes.includes(m.mode)) { if (chatModes.length === 1) { @@ -49,6 +49,7 @@ export default function ChatModeToggle({ } }; return { + id: m.mode || `menu-item -${index}`, title: (
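The `withVisibility` higher-order component added in `frontend/src/HOC/WithVisibility.tsx` above returns null whenever `isVisible` is false and otherwise forwards every prop to the wrapped component. A minimal usage sketch, not part of this diff; the `Banner` component, its props, and the import path are assumptions for illustration:

```tsx
import React from 'react';
import { withVisibility } from '../HOC/WithVisibility';

// Hypothetical component used only to illustrate the HOC.
type BannerProps = { text: string };
const Banner: React.FC<BannerProps> = ({ text }) => <div className='banner'>{text}</div>;

// VisibleBanner accepts BannerProps plus the isVisible flag from VisibilityProps;
// it renders nothing when isVisible === false and the plain Banner otherwise.
const VisibleBanner = withVisibility(Banner);

// Example: <VisibleBanner text='Processing complete' isVisible={processedCount > 0} />
```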

diff --git a/frontend/src/components/Content.tsx b/frontend/src/components/Content.tsx index cbd7875f7..49ad45ce3 100644 --- a/frontend/src/components/Content.tsx +++ b/frontend/src/components/Content.tsx @@ -44,14 +44,13 @@ import PostProcessingToast from './Popups/GraphEnhancementDialog/PostProcessingC import { getChunkText } from '../services/getChunkText'; import ChunkPopUp from './Popups/ChunkPopUp'; import { isExpired, isFileReadyToProcess } from '../utils/Utils'; +import { useHasSelections } from '../hooks/useHasSelections'; const ConfirmationDialog = lazy(() => import('./Popups/LargeFilePopUp/ConfirmationDialog')); let afterFirstRender = false; const Content: React.FC = ({ - isSchema, - setIsSchema, showEnhancementDialog, toggleEnhancementDialog, setOpenConnection, @@ -83,7 +82,7 @@ const Content: React.FC = ({ alertType: 'neutral', alertMessage: '', }); - const { setClearHistoryData } = useMessageContext(); + const { setMessages } = useMessageContext(); const { filesData, setFilesData, @@ -99,12 +98,15 @@ const Content: React.FC = ({ setProcessedCount, setchatModes, model, + additionalInstructions, + setAdditionalInstructions, } = useFileContext(); const [viewPoint, setViewPoint] = useState<'tableView' | 'showGraphView' | 'chatInfoView' | 'neighborView'>( 'tableView' ); const [showDeletePopUp, setshowDeletePopUp] = useState(false); const [deleteLoading, setdeleteLoading] = useState(false); + const hasSelections = useHasSelections(selectedNodes, selectedRels); const { updateStatusForLargeFiles } = useServerSideEvent( (inMinutes, time, fileName) => { @@ -135,11 +137,21 @@ const Content: React.FC = ({ } if (processedCount === 1 && queue.isEmpty()) { (async () => { - showNormalToast(); + showNormalToast( + + ); try { const payload = isGdsActive - ? postProcessingTasks - : postProcessingTasks.filter((task) => task !== 'enable_communities'); + ? hasSelections + ? postProcessingTasks.filter((task) => task !== 'graph_schema_consolidation') + : postProcessingTasks + : hasSelections + ? 
postProcessingTasks.filter((task) => task !== 'graph_schema_consolidation' && task !== 'enable_communities') + : postProcessingTasks.filter((task) => task !== 'enable_communities'); const response = await postProcessing(userCredentials as UserCredentials, payload); if (response.data.status === 'Success') { const communityfiles = response.data?.data; @@ -185,13 +197,6 @@ const Content: React.FC = ({ afterFirstRender = true; }, [queue.items.length, userCredentials]); - useEffect(() => { - const storedSchema = localStorage.getItem('isSchema'); - if (storedSchema !== null) { - setIsSchema(JSON.parse(storedSchema)); - } - }, [isSchema]); - const handleDropdownChange = (selectedOption: OptionType | null | void) => { if (selectedOption?.value) { setModel(selectedOption?.value); @@ -217,7 +222,7 @@ const Content: React.FC = ({ } toggleChunksLoading(); }; - + const extractData = async (uid: string, isselectedRows = false, filesTobeProcess: CustomFile[]) => { if (!isselectedRows) { const fileItem = filesData.find((f) => f.id == uid); @@ -284,13 +289,17 @@ const Content: React.FC = ({ selectedRels.map((t) => t.value), fileItem.googleProjectId, fileItem.language, - fileItem.accessToken + fileItem.accessToken, + additionalInstructions ); - if (apiResponse?.status === 'Failed') { let errorobj = { error: apiResponse.error, message: apiResponse.message, fileName: apiResponse.file_name }; throw new Error(JSON.stringify(errorobj)); } else if (fileItem.size != undefined && fileItem.size < largeFileSize) { + if (apiResponse.data.message) { + const apiRes = apiResponse.data.message; + showSuccessToast(apiRes); + } setFilesData((prevfiles) => { return prevfiles.map((curfile) => { if (curfile.name == apiResponse?.data?.fileName) { @@ -371,7 +380,9 @@ const Content: React.FC = ({ const addFilesToQueue = async (remainingFiles: CustomFile[]) => { if (!remainingFiles.length) { - showNormalToast(); + showNormalToast( + + ); try { const response = await postProcessing(userCredentials as UserCredentials, postProcessingTasks); if (response.data.status === 'Success') { @@ -521,9 +532,8 @@ const Content: React.FC = ({ const handleOpenGraphClick = () => { const bloomUrl = process.env.VITE_BLOOM_URL; const uriCoded = userCredentials?.uri.replace(/:\d+$/, ''); - const connectURL = `${uriCoded?.split('//')[0]}//${userCredentials?.userName}@${uriCoded?.split('//')[1]}:${ - userCredentials?.port ?? '7687' - }`; + const connectURL = `${uriCoded?.split('//')[0]}//${userCredentials?.userName}@${uriCoded?.split('//')[1]}:${userCredentials?.port ?? '7687' + }`; const encodedURL = encodeURIComponent(connectURL); const replacedUrl = bloomUrl?.replace('{CONNECT_URL}', encodedURL); window.open(replacedUrl, '_blank'); @@ -536,6 +546,7 @@ const Content: React.FC = ({ const disconnect = () => { queue.clear(); + const date = new Date(); setProcessedCount(0); setConnectionStatus(false); localStorage.removeItem('password'); @@ -543,7 +554,22 @@ const Content: React.FC = ({ setUserCredentials({ uri: '', password: '', userName: '', database: '' }); setSelectedNodes([]); setSelectedRels([]); - setClearHistoryData(true); + localStorage.removeItem('instructions'); + setAdditionalInstructions(''); + setMessages([ + { + datetime: `${date.toLocaleDateString()} ${date.toLocaleTimeString()}`, + id: 2, + modes: { + 'graph+vector+fulltext': { + message: + ' Welcome to the Neo4j Knowledge Graph Chat. 
You can ask questions related to documents which have been completely processed.', + }, + }, + user: 'chatbot', + currentMode: 'graph+vector+fulltext', + }, + ]); setchatModes([chatModeLables['graph+vector+fulltext']]); }; @@ -554,23 +580,31 @@ const Content: React.FC = ({ setRetryLoading(false); if (response.data.status === 'Failure') { throw new Error(response.data.error); - } - const isStartFromBegining = retryoption === RETRY_OPIONS[0] || retryoption === RETRY_OPIONS[1]; - setFilesData((prev) => { - return prev.map((f) => { - return f.name === filename - ? { + } else if ( + response.data.status === 'Success' && + response.data?.message != undefined && + (response.data?.message as string).includes('Chunks are not created') + ) { + showNormalToast(response.data.message as string); + retryOnclose() + } else { + const isStartFromBegining = retryoption === RETRY_OPIONS[0] || retryoption === RETRY_OPIONS[1]; + setFilesData((prev) => { + return prev.map((f) => { + return f.name === filename + ? { ...f, status: 'Ready to Reprocess', processingProgress: isStartFromBegining ? 0 : f.processingProgress, nodesCount: isStartFromBegining ? 0 : f.nodesCount, relationshipsCount: isStartFromBegining ? 0 : f.relationshipsCount, } - : f; + : f; + }); }); - }); - showSuccessToast(response.data.message as string); - retryOnclose(); + showSuccessToast(response.data.message as string); + retryOnclose(); + } } catch (error) { setRetryLoading(false); if (error instanceof Error) { @@ -672,12 +706,14 @@ const Content: React.FC = ({ const selectedRows = childRef.current?.getSelectedRows(); if (selectedRows?.length) { const expiredFilesExists = selectedRows.some( - (c) => isFileReadyToProcess(c, true) && isExpired(c?.createdAt as Date) + (c) => c.status !== 'Ready to Reprocess' && isExpired(c?.createdAt as Date ?? new Date()) ); const largeFileExists = selectedRows.some( (c) => isFileReadyToProcess(c, true) && typeof c.size === 'number' && c.size > largeFileSize ); if (expiredFilesExists) { + setshowExpirationModal(true); + } else if (largeFileExists && isGCSActive) { setshowConfirmationModal(true); } else if (largeFileExists && isGCSActive) { setshowExpirationModal(true); @@ -685,7 +721,9 @@ const Content: React.FC = ({ handleGenerateGraph(selectedRows.filter((f) => isFileReadyToProcess(f, false))); } } else if (filesData.length) { - const expiredFileExists = filesData.some((c) => isFileReadyToProcess(c, true) && isExpired(c.createdAt as Date)); + const expiredFileExists = filesData.some( + (c) => isExpired(c?.createdAt as Date) + ); const largeFileExists = filesData.some( (c) => isFileReadyToProcess(c, true) && typeof c.size === 'number' && c.size > largeFileSize ); @@ -746,6 +784,20 @@ const Content: React.FC = ({ onClose={() => setshowConfirmationModal(false)} loading={extractLoading} selectedRows={childRef.current?.getSelectedRows() as CustomFile[]} + isLargeDocumentAlert={true} + > + + )} + {showExpirationModal && filesForProcessing.length && ( + }> + setshowExpirationModal(false)} + loading={extractLoading} + selectedRows={childRef.current?.getSelectedRows() as CustomFile[]} + isLargeDocumentAlert={false} > )} @@ -812,19 +864,17 @@ const Content: React.FC = ({ />
- {!isSchema ? ( + {!hasSelections ? ( - ) : selectedNodes.length || selectedRels.length ? ( - - ) : ( - - )} + ) : + ( + )}
- {isSchema ? ( + {hasSelections? ( - {(!selectedNodes.length || !selectedNodes.length) && 'Empty'} Graph Schema configured - {selectedNodes.length || selectedRels.length + {(hasSelections)} Graph Schema configured + {hasSelections ? `(${selectedNodes.length} Labels + ${selectedRels.length} Rel Types)` : ''} diff --git a/frontend/src/components/Graph/ResultOverview.tsx b/frontend/src/components/Graph/ResultOverview.tsx index 17fdbbc78..3e9d05121 100644 --- a/frontend/src/components/Graph/ResultOverview.tsx +++ b/frontend/src/components/Graph/ResultOverview.tsx @@ -135,7 +135,6 @@ const ResultOverview: React.FunctionComponent = ({ } - />
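The post-processing payload assembled in `Content.tsx` above drops `graph_schema_consolidation` when the user has already configured a schema (`hasSelections` from the new `useHasSelections` hook) and drops `enable_communities` when GDS is not active. A flattened sketch of that branching, equivalent to the nested ternary in the hunk; the helper name is hypothetical, while the task names and flags come from the diff:

```tsx
// Hypothetical helper expressing the same selection logic as the nested ternary in Content.tsx.
const selectPostProcessingTasks = (
  postProcessingTasks: string[],
  isGdsActive: boolean,
  hasSelections: boolean
): string[] =>
  postProcessingTasks.filter(
    (task) =>
      // skip schema consolidation when a graph schema was configured manually
      !(hasSelections && task === 'graph_schema_consolidation') &&
      // skip community detection when GDS is not active on the connected database
      !(!isGdsActive && task === 'enable_communities')
  );
```

Under that assumption, the payload passed to `postProcessing(...)` would be `selectPostProcessingTasks(postProcessingTasks, isGdsActive, hasSelections)`.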
diff --git a/frontend/src/components/Layout/DrawerDropzone.tsx b/frontend/src/components/Layout/DrawerDropzone.tsx index e937e7438..7396ff786 100644 --- a/frontend/src/components/Layout/DrawerDropzone.tsx +++ b/frontend/src/components/Layout/DrawerDropzone.tsx @@ -1,19 +1,18 @@ +import React, { useMemo, Suspense, lazy } from 'react'; import { Drawer, Flex, StatusIndicator, Typography, useMediaQuery } from '@neo4j-ndl/react'; import DropZone from '../DataSources/Local/DropZone'; -import React, { useMemo, Suspense, lazy } from 'react'; import S3Component from '../DataSources/AWS/S3Bucket'; -import { DrawerProps } from '../../types'; import GCSButton from '../DataSources/GCS/GCSButton'; import CustomAlert from '../UI/Alert'; +import FallBackDialog from '../UI/FallBackDialog'; import { useAlertContext } from '../../context/Alert'; +import { useCredentials } from '../../context/UserCredentials'; import { APP_SOURCES } from '../../utils/Constants'; import GenericButton from '../WebSources/GenericSourceButton'; import GenericModal from '../WebSources/GenericSourceModal'; -import FallBackDialog from '../UI/FallBackDialog'; -import { useCredentials } from '../../context/UserCredentials'; +import { DrawerProps } from '../../types'; const S3Modal = lazy(() => import('../DataSources/AWS/S3Modal')); const GCSModal = lazy(() => import('../DataSources/GCS/GCSModal')); - const DrawerDropzone: React.FC = ({ isExpanded, toggleS3Modal, @@ -24,189 +23,102 @@ const DrawerDropzone: React.FC = ({ showGenericModal, }) => { const { closeAlert, alertState } = useAlertContext(); - const { isReadOnlyUser, isBackendConnected } = useCredentials(); - const largedesktops = useMediaQuery(`(min-width:1440px )`); - - const isYoutubeOnlyCheck = useMemo( - () => APP_SOURCES?.includes('youtube') && !APP_SOURCES.includes('wiki') && !APP_SOURCES.includes('web'), - [APP_SOURCES] + const { isReadOnlyUser, isBackendConnected, connectionStatus } = useCredentials(); + const isLargeDesktop = useMediaQuery('(min-width:1440px)'); + const isYoutubeOnly = useMemo( + () => APP_SOURCES.includes('youtube') && !APP_SOURCES.includes('wiki') && !APP_SOURCES.includes('web'), + [] ); - const isWikipediaOnlyCheck = useMemo( - () => APP_SOURCES?.includes('wiki') && !APP_SOURCES.includes('youtube') && !APP_SOURCES.includes('web'), - [APP_SOURCES] + const isWikipediaOnly = useMemo( + () => APP_SOURCES.includes('wiki') && !APP_SOURCES.includes('youtube') && !APP_SOURCES.includes('web'), + [] ); - const iswebOnlyCheck = useMemo( - () => APP_SOURCES?.includes('web') && !APP_SOURCES.includes('youtube') && !APP_SOURCES.includes('wiki'), - [APP_SOURCES] + const isWebOnly = useMemo( + () => APP_SOURCES.includes('web') && !APP_SOURCES.includes('youtube') && !APP_SOURCES.includes('wiki'), + [] ); - + if (!isLargeDesktop) { + return null; + } return ( -
+
- {!isReadOnlyUser ? ( - - {alertState.showAlert && ( - - )} -
-
-
-
- {process.env.VITE_ENV != 'PROD' && ( - - - {!isBackendConnected ? : } - + {connectionStatus ? ( + !isReadOnlyUser ? ( + + {alertState.showAlert && ( + + )} +
+
+ {process.env.VITE_ENV !== 'PROD' && ( +
+ + Backend connection status - )} -
- {process.env.VITE_ENV != 'PROD' ? ( - <> - - {APP_SOURCES != undefined && APP_SOURCES.includes('local') && ( -
- -
- )} - {(APP_SOURCES != undefined && APP_SOURCES.includes('s3')) || - (APP_SOURCES != undefined && APP_SOURCES.includes('gcs')) ? ( - <> - {(APP_SOURCES.includes('youtube') || - APP_SOURCES.includes('wiki') || - APP_SOURCES.includes('web')) && ( -
- - -
- )} - {APP_SOURCES.includes('s3') && ( -
- - }> - - -
- )} - {APP_SOURCES.includes('gcs') && ( -
- - }> - - -
- )} - - ) : ( - <> - )} -
- - ) : ( - <> - - {APP_SOURCES != undefined && APP_SOURCES.includes('local') && ( -
- -
- )} - {((APP_SOURCES != undefined && APP_SOURCES.includes('youtube')) || - (APP_SOURCES != undefined && APP_SOURCES.includes('wiki')) || - (APP_SOURCES != undefined && APP_SOURCES.includes('web'))) && ( -
- - -
- )} - {(APP_SOURCES != undefined && APP_SOURCES.includes('s3')) || - (APP_SOURCES != undefined && APP_SOURCES.includes('gcs')) ? ( - <> - {APP_SOURCES != undefined && APP_SOURCES.includes('s3') && ( -
- - }> - - -
- )} - {APP_SOURCES != undefined && APP_SOURCES.includes('gcs') && ( -
- - -
- )} - - ) : ( - <> - )} -
- +
)} + + {APP_SOURCES.includes('local') && ( +
+ +
+ )} + {APP_SOURCES.some((source) => ['youtube', 'wiki', 'web'].includes(source)) && ( +
+ + +
+ )} + {APP_SOURCES.includes('s3') && ( +
+ + }> + + +
+ )} + {APP_SOURCES.includes('gcs') && ( +
+ + }> + + +
+ )} +
-
- + + ) : ( + + + This user account does not have permission to access or manage data sources. + + + ) ) : ( - + - This user account does not have permission to access or manage data sources. + You are not logged in. Please Login to access the content. )} @@ -214,5 +126,4 @@ const DrawerDropzone: React.FC = ({
); }; - export default DrawerDropzone; diff --git a/frontend/src/components/Layout/Header.tsx b/frontend/src/components/Layout/Header.tsx index 2cd243b29..f722440c9 100644 --- a/frontend/src/components/Layout/Header.tsx +++ b/frontend/src/components/Layout/Header.tsx @@ -11,10 +11,9 @@ import { ArrowDownTrayIconOutline, } from '@neo4j-ndl/react/icons'; import { Button, TextLink, Typography } from '@neo4j-ndl/react'; -import { Dispatch, memo, SetStateAction, useCallback, useContext, useEffect, useRef, useState } from 'react'; +import { Dispatch, memo, SetStateAction, useCallback, useContext, useRef, useState } from 'react'; import { IconButtonWithToolTip } from '../UI/IconButtonToolTip'; import { buttonCaptions, tooltips } from '../../utils/Constants'; -import { useFileContext } from '../../context/UsersFiles'; import { ThemeWrapperContext } from '../../context/ThemeWrapper'; import { useCredentials } from '../../context/UserCredentials'; import { useNavigate } from 'react-router'; @@ -39,13 +38,9 @@ const Header: React.FC = ({ chatOnly, deleteOnClick, setOpenConnecti window.open(url, '_blank'); }, []); const downloadLinkRef = useRef(null); - const { isSchema, setIsSchema } = useFileContext(); const { connectionStatus } = useCredentials(); const chatAnchor = useRef(null); const [showChatModeOption, setshowChatModeOption] = useState(false); - useEffect(() => { - setIsSchema(isSchema); - }, [isSchema]); const openChatPopout = useCallback(() => { let session = localStorage.getItem('neo4j.connection'); diff --git a/frontend/src/components/Layout/PageLayout.tsx b/frontend/src/components/Layout/PageLayout.tsx index fda1f03d8..4a1efac64 100644 --- a/frontend/src/components/Layout/PageLayout.tsx +++ b/frontend/src/components/Layout/PageLayout.tsx @@ -52,7 +52,7 @@ const PageLayout: React.FC = () => { }; const { messages, setClearHistoryData, clearHistoryData, setMessages, setIsDeleteChatLoading } = useMessageContext(); - const { isSchema, setIsSchema, setShowTextFromSchemaDialog, showTextFromSchemaDialog } = useFileContext(); + const { setShowTextFromSchemaDialog, showTextFromSchemaDialog } = useFileContext(); const { setConnectionStatus, setGdsActive, @@ -80,8 +80,26 @@ const PageLayout: React.FC = () => { setShowDisconnectButton(isModalOpen); localStorage.setItem('disconnectButtonState', isModalOpen ? 'true' : 'false'); }; - // To parse and set user credentials from session - const setUserCredentialsFromSession = (neo4jConnection: string) => { + const setUserCredentialsLocally = (credentials: any) => { + setUserCredentials(credentials); + setIsGCSActive(credentials.isGCSActive ?? 
false); + setGdsActive(credentials.isgdsActive); + setIsReadOnlyUser(credentials.isReadonlyUser); + localStorage.setItem( + 'neo4j.connection', + JSON.stringify({ + uri: credentials.uri, + user: credentials.userName, + password: btoa(credentials.password), + database: credentials.database, + userDbVectorIndex: 384, + isReadOnlyUser: credentials.isReadonlyUser, + isgdsActive: credentials.isgdsActive, + isGCSActive: credentials.isGCSActive, + }) + ); + }; + const parseSessionAndSetCredentials = (neo4jConnection: string) => { if (!neo4jConnection) { console.error('Invalid session data:', neo4jConnection); setOpenConnection((prev) => ({ ...prev, openPopUp: true })); @@ -116,20 +134,8 @@ const PageLayout: React.FC = () => { btoa(envCredentials.password) !== storedCredentials.password || envCredentials.database !== storedCredentials.database; if (isDiffCreds) { - setUserCredentials(envCredentials); - setIsGCSActive(envCredentials.isGCSActive ?? false); - localStorage.setItem( - 'neo4j.connection', - JSON.stringify({ - uri: envCredentials.uri, - user: envCredentials.userName, - password: btoa(envCredentials.password), - database: envCredentials.database, - userDbVectorIndex: 384, - isReadOnlyUser: envCredentials.isReadonlyUser, - isgdsActive: envCredentials.isgdsActive, - }) - ); + setUserCredentialsLocally(envCredentials); + setClearHistoryData(true); return true; } return false; @@ -143,48 +149,43 @@ const PageLayout: React.FC = () => { try { backendApiResponse = await envConnectionAPI(); const connectionData = backendApiResponse.data; - const envCredentials = { - uri: connectionData.data.uri, - password: atob(connectionData.data.password), - userName: connectionData.data.user_name, - database: connectionData.data.database, - isReadonlyUser: !connectionData.data.write_access, - isgdsActive: connectionData.data.gds_status, - isGCSActive: connectionData?.data?.gcs_file_cache === 'True', - }; - setIsGCSActive(connectionData?.data?.gcs_file_cache === 'True'); - if (session) { - const updated = updateSessionIfNeeded(envCredentials, session); - if (!updated) { - setUserCredentialsFromSession(session); // Use stored session if no update is needed + if (connectionData.data && connectionData.status === 'Success') { + const envCredentials = { + uri: connectionData.data.uri, + password: atob(connectionData.data.password), + userName: connectionData.data.user_name, + database: connectionData.data.database, + isReadonlyUser: !connectionData.data.write_access, + isgdsActive: connectionData.data.gds_status, + isGCSActive: connectionData?.data?.gcs_file_cache === 'True', + }; + setIsGCSActive(envCredentials.isGCSActive); + if (session) { + const updated = updateSessionIfNeeded(envCredentials, session); + if (!updated) { + parseSessionAndSetCredentials(session); + } + setConnectionStatus(Boolean(connectionData.data.graph_connection)); + setIsBackendConnected(true); + } else { + setUserCredentialsLocally(envCredentials); + setConnectionStatus(true); } - setConnectionStatus(Boolean(connectionData.data.graph_connection)); - setIsBackendConnected(true); handleDisconnectButtonState(false); } else { - setUserCredentials(envCredentials); - localStorage.setItem( - 'neo4j.connection', - JSON.stringify({ - uri: envCredentials.uri, - user: envCredentials.userName, - password: btoa(envCredentials.password), - database: envCredentials.database, - userDbVectorIndex: 384, - isReadOnlyUser: envCredentials.isReadonlyUser, - isgdsActive: envCredentials.isgdsActive, - isGCSActive: envCredentials.isGCSActive, - }) - ); - 
setConnectionStatus(true); - setGdsActive(envCredentials.isgdsActive); - setIsReadOnlyUser(envCredentials.isReadonlyUser); - handleDisconnectButtonState(false); + if (session) { + parseSessionAndSetCredentials(session); + setConnectionStatus(true); + } else { + setErrorMessage(backendApiResponse?.data?.error); + setOpenConnection((prev) => ({ ...prev, openPopUp: true })); + } + handleDisconnectButtonState(true); } } catch (error) { console.error('Error during backend API call:', error); if (session) { - setUserCredentialsFromSession(session); + parseSessionAndSetCredentials(session); setConnectionStatus(true); } else { setErrorMessage(backendApiResponse?.data?.error); @@ -287,8 +288,6 @@ const PageLayout: React.FC = () => { openTextSchema={() => { setShowTextFromSchemaDialog({ triggeredFrom: 'schemadialog', show: true }); }} - isSchema={isSchema} - setIsSchema={setIsSchema} showEnhancementDialog={showEnhancementDialog} toggleEnhancementDialog={toggleEnhancementDialog} setOpenConnection={setOpenConnection} @@ -347,8 +346,6 @@ const PageLayout: React.FC = () => { openTextSchema={() => { setShowTextFromSchemaDialog({ triggeredFrom: 'schemadialog', show: true }); }} - isSchema={isSchema} - setIsSchema={setIsSchema} showEnhancementDialog={showEnhancementDialog} toggleEnhancementDialog={toggleEnhancementDialog} setOpenConnection={setOpenConnection} diff --git a/frontend/src/components/Popups/ConnectionModal/ConnectionModal.tsx b/frontend/src/components/Popups/ConnectionModal/ConnectionModal.tsx index 2bc82735e..a8ddb3030 100644 --- a/frontend/src/components/Popups/ConnectionModal/ConnectionModal.tsx +++ b/frontend/src/components/Popups/ConnectionModal/ConnectionModal.tsx @@ -41,8 +41,15 @@ export default function ConnectionModal({ const [username, setUsername] = useState(initialusername ?? 'neo4j'); const [password, setPassword] = useState(''); const [connectionMessage, setMessage] = useState({ type: 'unknown', content: '' }); - const { setUserCredentials, userCredentials, setGdsActive, setIsReadOnlyUser, errorMessage, setIsGCSActive } = - useCredentials(); + const { + setUserCredentials, + userCredentials, + setGdsActive, + setIsReadOnlyUser, + errorMessage, + setIsGCSActive, + setShowDisconnectButton, + } = useCredentials(); const [isLoading, setIsLoading] = useState(false); const [searchParams, setSearchParams] = useSearchParams(); const [userDbVectorIndex, setUserDbVectorIndex] = useState(initialuserdbvectorindex ?? 
undefined); @@ -126,7 +133,7 @@ export default function ConnectionModal({ useEffect(() => { if (errorMessage) { - setMessage({ type: 'danger', content: errorMessage }); + setMessage({ type: 'warning', content: errorMessage }); } }, [errorMessage]); @@ -241,6 +248,7 @@ export default function ConnectionModal({ !response.data.data.chunks_exists ) { setConnectionStatus(true); + setShowDisconnectButton(true); setOpenConnection((prev) => ({ ...prev, openPopUp: false })); setMessage({ type: 'success', diff --git a/frontend/src/components/Popups/ExpirationModal/ExpiredFilesAlert.tsx b/frontend/src/components/Popups/ExpirationModal/ExpiredFilesAlert.tsx index 23738820c..2207c866a 100644 --- a/frontend/src/components/Popups/ExpirationModal/ExpiredFilesAlert.tsx +++ b/frontend/src/components/Popups/ExpirationModal/ExpiredFilesAlert.tsx @@ -7,6 +7,7 @@ import BellImage from '../../../assets/images/Stopwatch-blue.svg'; import AlertIcon from '../../Layout/AlertIcon'; import { isExpired } from '../../../utils/Utils'; import { EXPIRATION_DAYS } from '../../../utils/Constants'; +import { IconWithToolTip } from '../../UI/IconButtonToolTip'; const ExpiredFilesAlert: FC = ({ Files, handleToggle, checked }) => { return ( @@ -31,14 +32,8 @@ const ExpiredFilesAlert: FC = ({ Files, handleToggle, checked } { - if (e.target.checked) { - handleToggle(true, f.id); - } else { - handleToggle(false, f.id); - } - }} - isChecked={checked.indexOf(f.id) !== -1} + isChecked={checked.includes(f.id)} + onChange={(e) => handleToggle(e.target.checked, f.id)} htmlAttributes={{ tabIndex: -1 }} /> @@ -53,7 +48,9 @@ const ExpiredFilesAlert: FC = ({ Files, handleToggle, checked } {f.createdAt != undefined && isExpired(f.createdAt) ? ( - + + + ) : ( <> diff --git a/frontend/src/components/Popups/GraphEnhancementDialog/AdditionalInstructions/index.tsx b/frontend/src/components/Popups/GraphEnhancementDialog/AdditionalInstructions/index.tsx new file mode 100644 index 000000000..40be2c99e --- /dev/null +++ b/frontend/src/components/Popups/GraphEnhancementDialog/AdditionalInstructions/index.tsx @@ -0,0 +1,64 @@ +import { Flex, TextArea, Typography, useMediaQuery } from '@neo4j-ndl/react'; +import { buttonCaptions } from '../../../../utils/Constants'; +import { tokens } from '@neo4j-ndl/base'; +import ButtonWithToolTip from '../../../UI/ButtonWithToolTip'; +import { useCallback } from 'react'; +import { useFileContext } from '../../../../context/UsersFiles'; +import { showNormalToast } from '../../../../utils/toasts'; + +export default function AdditionalInstructionsText({ + closeEnhanceGraphSchemaDialog, +}: { + closeEnhanceGraphSchemaDialog: () => void; +}) { + const { breakpoints } = tokens; + const tablet = useMediaQuery(`(min-width:${breakpoints.xs}) and (max-width: ${breakpoints.lg})`); + const { additionalInstructions, setAdditionalInstructions } = useFileContext(); + + const clickAnalyzeIntructHandler = useCallback(async () => { + localStorage.setItem('instructions', additionalInstructions); + closeEnhanceGraphSchemaDialog(); + showNormalToast(`Successfully Applied the Instructions`); + }, [additionalInstructions]); + return ( + +
+ + + + {buttonCaptions.provideAdditionalInstructions} + + + +