diff --git a/backend/requirements.txt b/backend/requirements.txt index 158458ce1..30b767939 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -69,22 +69,22 @@ jsonpath-python==1.0.6 jsonpointer==2.4 json-repair==0.25.2 kiwisolver==1.4.5 -langchain -langchain-aws -langchain-anthropic -langchain-fireworks -langchain-google-genai -langchain-community -langchain-core -langchain-experimental -langchain-google-vertexai -langchain-groq -langchain-openai -langchain-text-splitters +langchain==0.3.0 +langchain-aws==0.2.1 +langchain-anthropic==0.2.1 +langchain-fireworks==0.2.0 +langchain-google-genai==2.0.0 +langchain-community==0.3.0 +langchain-core==0.3.5 +langchain-experimental==0.3.1 +langchain-google-vertexai==2.0.1 +langchain-groq==0.2.0 +langchain-openai==0.2.0 +langchain-text-splitters==0.3.0 langdetect==1.0.9 -langsmith==0.1.83 +langsmith==0.1.128 layoutparser==0.3.4 -langserve==0.2.2 +langserve==0.3.0 #langchain-cli==0.0.25 lxml==5.1.0 MarkupSafe==2.1.5 @@ -100,7 +100,7 @@ numpy==1.26.4 omegaconf==2.3.0 onnx==1.16.1 onnxruntime==1.18.1 -openai==1.35.10 +openai==1.47.1 opencv-python==4.8.0.76 orjson==3.9.15 packaging==23.2 @@ -144,7 +144,6 @@ shapely==2.0.3 six==1.16.0 sniffio==1.3.1 soupsieve==2.5 -SQLAlchemy==2.0.28 starlette==0.37.2 sse-starlette==2.1.2 starlette-session==0.4.3 @@ -159,7 +158,7 @@ transformers==4.42.3 types-protobuf types-requests typing-inspect==0.9.0 -typing_extensions==4.9.0 +typing_extensions==4.12.2 tzdata==2024.1 unstructured==0.14.9 unstructured-client==0.23.8 diff --git a/backend/src/llm.py b/backend/src/llm.py index 505bb89fb..c2335685f 100644 --- a/backend/src/llm.py +++ b/backend/src/llm.py @@ -9,13 +9,14 @@ import concurrent.futures from concurrent.futures import ThreadPoolExecutor from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_core.prompts import ChatPromptTemplate from langchain_anthropic import ChatAnthropic from langchain_fireworks import ChatFireworks from 
langchain_aws import ChatBedrock from langchain_community.chat_models import ChatOllama import boto3 import google.auth -from src.shared.constants import MODEL_VERSIONS +from src.shared.constants import MODEL_VERSIONS, PROMPT_TO_ALL_LLMs def get_llm(model: str): @@ -28,7 +29,7 @@ def get_llm(model: str): model_name = MODEL_VERSIONS[model] llm = ChatVertexAI( model_name=model_name, - convert_system_message_to_human=True, + #convert_system_message_to_human=True, credentials=credentials, project=project_id, temperature=0, @@ -149,8 +150,9 @@ def get_graph_document_list( if "diffbot_api_key" in dir(llm): llm_transformer = llm else: - if "get_name" in dir(llm) and llm.get_name() == "ChatOllama": + if "get_name" in dir(llm) and llm.get_name() not in ["ChatOpenAI", "ChatVertexAI", "AzureChatOpenAI"]: node_properties = False + relationship_properties = False else: node_properties = ["description"] relationship_properties = ["description"] @@ -160,6 +162,8 @@ def get_graph_document_list( relationship_properties=relationship_properties, allowed_nodes=allowedNodes, allowed_relationships=allowedRelationship, + ignore_tool_usage=True, + #prompt = ChatPromptTemplate.from_messages(["system",PROMPT_TO_ALL_LLMs]) ) with ThreadPoolExecutor(max_workers=10) as executor: for chunk in combined_chunk_document_list: diff --git a/backend/src/main.py b/backend/src/main.py index 9e142a3cb..41942d500 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -433,14 +433,14 @@ def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, datab node_type= node.type if (node_id, node_type) not in distinct_nodes: distinct_nodes.add((node_id, node_type)) - #get all relations - for relation in graph_document.relationships: - relations.append(relation.type) - - node_count += len(distinct_nodes) - rel_count += len(relations) - print(f'node count internal func:{node_count}') - print(f'relation count internal func:{rel_count}') + #get all relations + 
for relation in graph_document.relationships: + relations.append(relation.type) + + node_count += len(distinct_nodes) + rel_count += len(relations) + print(f'node count internal func:{node_count}') + print(f'relation count internal func:{rel_count}') return node_count,rel_count def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition): diff --git a/backend/src/shared/constants.py b/backend/src/shared/constants.py index bbf579520..f76812553 100644 --- a/backend/src/shared/constants.py +++ b/backend/src/shared/constants.py @@ -1,7 +1,8 @@ MODEL_VERSIONS = { "openai-gpt-3.5": "gpt-3.5-turbo-0125", "gemini-1.0-pro": "gemini-1.0-pro-001", - "gemini-1.5-pro": "gemini-1.5-pro-preview-0514", + "gemini-1.5-pro": "gemini-1.5-pro-002", + "gemini-1.5-flash": "gemini-1.5-flash-002", "openai-gpt-4": "gpt-4-turbo-2024-04-09", "diffbot" : "gpt-4-turbo-2024-04-09", "openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18", @@ -9,7 +10,7 @@ "groq-llama3" : "llama3-70b-8192" } OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"] -GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro"] +GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"] GROQ_MODELS = ["groq-llama3"] BUCKET_UPLOAD = 'llm-graph-builder-upload' BUCKET_FAILED_FILE = 'llm-graph-builder-failed' @@ -92,14 +93,14 @@ CHAT_DOC_SPLIT_SIZE = 3000 CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.10 CHAT_TOKEN_CUT_OFF = { - ("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4, + ("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","gemini-1.5-flash","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4, ("openai-gpt-4","diffbot" ,'azure_ai_gpt_4o',"openai-gpt-4o", "openai-gpt-4o-mini") : 28, ("ollama_llama3") : 2 } CHAT_TOKEN_CUT_OFF = { - 
("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4, + ("openai-gpt-3.5",'azure_ai_gpt_35',"gemini-1.0-pro","gemini-1.5-pro", "gemini-1.5-flash","groq-llama3",'groq_llama3_70b','anthropic_claude_3_5_sonnet','fireworks_llama_v3_70b','bedrock_claude_3_5_sonnet', ) : 4, ("openai-gpt-4","diffbot" ,'azure_ai_gpt_4o',"openai-gpt-4o", "openai-gpt-4o-mini") : 28, ("ollama_llama3") : 2 } @@ -476,3 +477,46 @@ START_FROM_BEGINNING = "start_from_beginning" DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning" START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position" + +PROMPT_TO_ALL_LLMs = """ +"# Knowledge Graph Instructions for LLMs\n" + "## 1. Overview\n" + "You are a top-tier algorithm designed for extracting information in structured " + "formats to build a knowledge graph.\n" + "Try to capture as much information from the text as possible without " + "sacrificing accuracy. Do not add any information that is not explicitly " + "mentioned in the text.\n" + "- **Nodes** represent entities and concepts.\n" + "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n" + "accessible for a vast audience.\n" + "## 2. Labeling Nodes\n" + "- **Consistency**: Ensure you use available types for node labels.\n" + "Ensure you use basic or elementary types for node labels.\n" + "- For example, when you identify an entity representing a person, " + "always label it as **'person'**. Avoid using more specific terms " + "like 'mathematician' or 'scientist'." + "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be " + "names or human-readable identifiers found in the text.\n" + "- **Relationships** represent connections between entities or concepts.\n" + "Ensure consistency and generality in relationship types when constructing " + "knowledge graphs. 
Instead of using specific and momentary types " + "such as 'BECAME_PROFESSOR', use more general and timeless relationship types " + "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n" + "## 3. Coreference Resolution\n" + "- **Maintain Entity Consistency**: When extracting entities, it's vital to " + "ensure consistency.\n" + 'If an entity, such as "John Doe", is mentioned multiple times in the text ' + 'but is referred to by different names or pronouns (e.g., "Joe", "he"),' + "always use the most complete identifier for that entity throughout the " + 'knowledge graph. In this example, use "John Doe" as the entity ID.\n' + "Remember, the knowledge graph should be coherent and easily understandable, " + "so maintaining consistency in entity references is crucial.\n" + "## 4. Node Properties\n" + "- Dates, URLs, Time, and Numerical Values: Instead of creating separate nodes for + these elements, represent them as properties of existing nodes." + "- Example: Instead of creating a node labeled "2023-03-15" and connecting it to another node + with the relationship "BORN_ON", add a property called "born_on" to the person node with the + value "2023-03-15"." + "## 5. Strict Compliance\n" + "Adhere to the rules strictly. Non-compliance will result in termination." 
+ """ \ No newline at end of file diff --git a/backend/src/shared/schema_extraction.py b/backend/src/shared/schema_extraction.py index 27008acae..80954ba65 100644 --- a/backend/src/shared/schema_extraction.py +++ b/backend/src/shared/schema_extraction.py @@ -1,5 +1,6 @@ from typing import List -from langchain_core.pydantic_v1 import BaseModel, Field +#from langchain_core.pydantic_v1 import BaseModel, Field +from pydantic.v1 import BaseModel, Field from src.llm import get_llm from src.shared.constants import MODEL_VERSIONS from langchain_core.prompts import ChatPromptTemplate diff --git a/frontend/src/utils/Constants.ts b/frontend/src/utils/Constants.ts index cf37e9313..08948ba87 100644 --- a/frontend/src/utils/Constants.ts +++ b/frontend/src/utils/Constants.ts @@ -45,12 +45,13 @@ export const llms = 'openai-gpt-4o-mini', 'gemini-1.0-pro', 'gemini-1.5-pro', + 'gemini-1.5-flash', 'azure_ai_gpt_35', 'azure_ai_gpt_4o', 'ollama_llama3', 'groq_llama3_70b', 'anthropic_claude_3_5_sonnet', - 'fireworks_v3p1_405b', + 'fireworks_llama_v3p2_90b', 'bedrock_claude_3_5_sonnet', ];