From 5f2290bf9b59d4f98c3555eea2b8bcbed6c955ec Mon Sep 17 00:00:00 2001 From: aashipandya <156318202+aashipandya@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:30:24 +0530 Subject: [PATCH] Retry processing - node and rels count update condition for start from beginning (#737) * Remove TotalPages when save file on local (#684) * file_name reference and verify_ssl issue fixed (#683) * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * Remove TotalPages when save file on local (#684) * file_name reference and verify_ssl issue fixed (#683) * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * Reapply "Dockerfile changes with VITE label" This reverts commit a83e0855fbf54d2b5af009d96c4edf0bcd7ab84a. * Revert "Dockerfile changes with VITE label" This reverts commit 2840ebc9e6156c51465a9f54be72ca2d014147c2. 
* Concurrent processing of files (#665) * Update README.md * Dropped the old vector index (#652) * added cypher_queries and llm chatbot files * updated llm-chatbot-python * added llm-chatbot-python * updated llm-chatbot-python folder * Added chatbot "hybrid " mode use case * added the concurrent file processing * page refresh scenario * fixed waiting files processing issue in refresh scenario * removed boolean param * fixed processedCount issue * checkbox with waiting check * fixed the refresh scenario with processing files * processing files check * server side error * processing file count check for processing files less than batch size * processing count check to handle allselected files * created helper functions * code improvements * __ changes (#656) * DiffbotGraphTransformer doesn't need an LLMGraphTransformer (#659) Co-authored-by: jeromechoo * Removed experiments/llm-chatbot-python folder from DEV branch * reduced the password clear timeout * Removed experiments/Cypher_Queries.ipynb file from DEV branch * disabled the closed button on banner and connection dialog while API is in pending state * update delete query with entities * node id check (#663) * Status source and type filtering (#664) * status source * Name change * type change * rollback to previous working nvl version * added the alert * add BATCH_SIZE to docker * temp fixes for 0.3.1 * alert fix for less than batch size processing * new virtual env * added Hybrid Chat modes (#670) * Rename the function #657 * label and checkboxes placement changes (#675) * label and checkboxes placement changes * checkbox placement changes * Graph node filename check * env fixes with latest nvl libraries * format fixes * removed local files * Remove TotalPages when save file on local (#684) * file_name reference and verify_ssl issue fixed (#683) * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without 
embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * ndl changes * label and checkboxes placement changes (#675) * label and checkboxes placement changes * checkbox placement changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * Status source and type filtering (#664) * status source * Name change * type change * added the alert * temp fixes for 0.3.1 * label and checkboxes placement changes (#675) * label and checkboxes placement changes * checkbox placement changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes 
* chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * ndl changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * added cypher_queries and llm chatbot files * updated llm-chatbot-python * added llm-chatbot-python * updated llm-chatbot-python folder * page refresh scenario * fixed waiting files processing issue in refresh scenario * Removed experiments/llm-chatbot-python folder from DEV branch * disabled the closed button on banner and connection dialog while API is in pending state * node id check (#663) * Status source and type filtering (#664) * status source * Name change * type change * rollback to previous working nvl version * added the alert * temp fixes for 0.3.1 * label and checkboxes placement changes (#675) * label and checkboxes placement changes * checkbox placement changes 
* env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * ndl changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * Status source and type filtering (#664) * status source * Name change * type change * added the alert * temp fixes for 0.3.1 * label and checkboxes placement changes (#675) * label and checkboxes placement changes * checkbox placement changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for 
create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * ndl changes * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * env fixes with latest nvl libraries * format fixes * User flow changes for recreating supported vector index (#682) * removed the if check * Add one more check for create vector index when chunks are exist without embeddings * removed local files * condition changes * chunks exists check * chunk exists without embeddings check * vector Index issue fixed * vector index with different dimension * Update graphDB_dataAccess.py --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * property spell fix --------- Co-authored-by: vasanthasaikalluri <165021735+vasanthasaikalluri@users.noreply.github.com> Co-authored-by: Jayanth T Co-authored-by: abhishekkumar-27 <164544129+abhishekkumar-27@users.noreply.github.com> Co-authored-by: Prakriti Solankey <156313631+prakriti-solankey@users.noreply.github.com> Co-authored-by: Jerome Choo Co-authored-by: jeromechoo Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> * env changes * format fixes * set retry status * retry processing backend * added the retry icon on rows * vite 
changes in docker compose * added retry dialog * Integrated the Retry processing API * Integrated the Extract API for retry processing * Integrated ndl toast component * replaced foreach with normal for loop for better performance * types improvements * used toast component * spell fix * Issue fixed * processing changes in main * function closing fix * retry processing issue fixed * autoclosing the retry popup on retry api success * removed the retry if check * resetting the node and relationship count on retry * added the enter key events on the popups * fixed wikipedia icon on large file alert popup * setting nodes to 0 and start from last processed chunk logic changes * Retry Popup fixes * status changes for upload failed scenario * kept condition specific * changed status to reprocess from retry * Reprocess wording changes * tooltip changes * wordings and size changes * Changed status to Reprocess * updated node count for start from beginning --------- Co-authored-by: Pravesh Kumar <121786590+praveshkumar1988@users.noreply.github.com> Co-authored-by: kartikpersistent <101251502+kartikpersistent@users.noreply.github.com> Co-authored-by: Prakriti Solankey <156313631+prakriti-solankey@users.noreply.github.com> Co-authored-by: vasanthasaikalluri <165021735+vasanthasaikalluri@users.noreply.github.com> Co-authored-by: Jayanth T Co-authored-by: abhishekkumar-27 <164544129+abhishekkumar-27@users.noreply.github.com> Co-authored-by: Jerome Choo Co-authored-by: jeromechoo --- backend/src/main.py | 12 +++++++++--- backend/src/shared/constants.py | 7 ++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/backend/src/main.py b/backend/src/main.py index cc5569c0d..eebdff70f 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -5,7 +5,8 @@ QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION, - DELETE_ENTITIES_AND_START_FROM_BEGINNING) + DELETE_ENTITIES_AND_START_FROM_BEGINNING, + 
QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT) from src.shared.schema_extraction import schema_extraction_from_text from langchain_community.document_loaders import GoogleApiClient, GoogleApiYoutubeLoader from dotenv import load_dotenv @@ -342,8 +343,13 @@ def processing_source(uri, userName, password, database, model, file_name, pages obj_source_node.updated_at = end_time obj_source_node.processing_time = processed_time obj_source_node.processed_chunk = select_chunks_upto+select_chunks_with_retry - obj_source_node.node_count = node_count - obj_source_node.relationship_count = rel_count + if retry_condition == START_FROM_BEGINNING: + result = graph.query(QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT, params={"filename":file_name}) + obj_source_node.node_count = result[0]['nodes'] + obj_source_node.relationship_count = result[0]['rels'] + else: + obj_source_node.node_count = node_count + obj_source_node.relationship_count = rel_count graphDb_data_Access.update_source_node(obj_source_node) result = graphDb_data_Access.get_current_status_document_node(file_name) diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py index 8bb1c56b1..c6d404a7f 100644 --- a/backend/src/shared/constants.py +++ b/backend/src/shared/constants.py @@ -312,7 +312,12 @@ RETURN c.id as id,c.position as position ORDER BY c.position LIMIT 1 """ - +QUERY_TO_GET_NODES_AND_RELATIONS_OF_A_DOCUMENT = """ + MATCH (d:Document)<-[:PART_OF]-(:Chunk)-[:HAS_ENTITY]->(e) where d.fileName=$filename + OPTIONAL MATCH (d)<-[:PART_OF]-(:Chunk)-[:HAS_ENTITY]->(e2:!Chunk)-[rel]-(e) + RETURN count(DISTINCT e) as nodes, count(DISTINCT rel) as rels + """ + START_FROM_BEGINNING = "start_from_beginning" DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning" START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"